From ce07c9889719060ae13673cf82424704ef7b3de1 Mon Sep 17 00:00:00 2001
From: Blaise Li <blaise.li__git@nsup.org>
Date: Wed, 5 Feb 2020 12:41:08 +0100
Subject: [PATCH] Not dropping "duplicates" in join_all_counts.

This ignored the index, and the index should be unique if the joining is
made on a careful selection of biotypes.
---
 Degradome-seq/Degradome-seq.snakefile | 2 +-
 PRO-seq/PRO-seq.snakefile             | 2 +-
 RNA_Seq_Cecere/RNA-seq.snakefile      | 2 +-
 Ribo-seq/Ribo-seq.snakefile           | 7 +++++--
 small_RNA-seq/small_RNA-seq.snakefile | 2 +-
 5 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/Degradome-seq/Degradome-seq.snakefile b/Degradome-seq/Degradome-seq.snakefile
index 1628527..ccd8c38 100644
--- a/Degradome-seq/Degradome-seq.snakefile
+++ b/Degradome-seq/Degradome-seq.snakefile
@@ -362,7 +362,7 @@ rule join_all_counts:
             counts_dir,
             f"all_on_{genome}", "alltypes_{orientation}_counts.txt"),
     run:
-        counts_data = pd.concat((pd.read_table(table, index_col="gene") for table in input.counts_tables)).drop_duplicates()
+        counts_data = pd.concat((pd.read_table(table, index_col="gene") for table in input.counts_tables))
         assert len(counts_data.index.unique()) == len(counts_data.index), "Some genes appear several times in the counts table."
         counts_data.index.names = ["gene"]
         counts_data.to_csv(output.counts_table, sep="\t")
diff --git a/PRO-seq/PRO-seq.snakefile b/PRO-seq/PRO-seq.snakefile
index 7e1882b..f7dca28 100644
--- a/PRO-seq/PRO-seq.snakefile
+++ b/PRO-seq/PRO-seq.snakefile
@@ -932,7 +932,7 @@ rule join_all_counts:
     wildcard_constraints:
         biotype = "|".join(JOINED_BIOTYPES)
     run:
-        counts_data = pd.concat((pd.read_table(table, index_col="gene") for table in input.counts_tables)).drop_duplicates()
+        counts_data = pd.concat((pd.read_table(table, index_col="gene") for table in input.counts_tables))
         assert len(counts_data.index.unique()) == len(counts_data.index), "Some genes appear several times in the counts table."
         counts_data.index.names = ["gene"]
         counts_data.to_csv(output.counts_table, sep="\t")
diff --git a/RNA_Seq_Cecere/RNA-seq.snakefile b/RNA_Seq_Cecere/RNA-seq.snakefile
index 348b940..fd2c89c 100644
--- a/RNA_Seq_Cecere/RNA-seq.snakefile
+++ b/RNA_Seq_Cecere/RNA-seq.snakefile
@@ -1539,7 +1539,7 @@ rule join_all_counts:
     wildcard_constraints:
         biotype = "|".join(JOINED_BIOTYPES)
     run:
-        counts_data = pd.concat((pd.read_table(table, index_col="gene") for table in input.counts_tables)).drop_duplicates()
+        counts_data = pd.concat((pd.read_table(table, index_col="gene") for table in input.counts_tables))
         assert len(counts_data.index.unique()) == len(counts_data.index), "Some genes appear several times in the counts table."
         counts_data.index.names = ["gene"]
         counts_data.to_csv(output.counts_table, sep="\t")
diff --git a/Ribo-seq/Ribo-seq.snakefile b/Ribo-seq/Ribo-seq.snakefile
index b33d2eb..fe5a3e0 100644
--- a/Ribo-seq/Ribo-seq.snakefile
+++ b/Ribo-seq/Ribo-seq.snakefile
@@ -674,7 +674,7 @@ rule trim:
         nb_trimmed =  OPJ(data_dir, "trimmed", "{lib}_{rep}_nb_trimmed.txt"),
     #threads: 2
     message:
-        "Trimming adaptor from raw data, deduplicating reads, removing random 5' {trim5}-mers and 3' {trim3}-mers for {wildcards.lib}_{wildcards.rep}."
+        "Trimming adaptor from raw data, removing random 5' {trim5}-mers and 3' {trim3}-mers for {wildcards.lib}_{wildcards.rep}."
     benchmark:
         OPJ(log_dir, "trim", "{lib}_{rep}_benchmark.txt")
     log:
@@ -1253,7 +1253,10 @@ rule join_all_counts:
         counts_table = OPJ(counts_dir,
         "all_{read_type}_on_%s" % genome, "alltypes_{orientation}_counts.txt"),
     run:
-        counts_data = pd.concat((pd.read_table(table, index_col="gene") for table in input.counts_tables)).drop_duplicates()
+        # !! drop_duplicates() is wrong: doesn't consider the index (at least in pandas < 1.0.0),
+        # so it eliminates rows having the same data pattern. For instance [0, 0, ...] (which eliminates a lot of entries)
+        # counts_data = pd.concat((pd.read_table(table, index_col="gene") for table in input.counts_tables)).drop_duplicates()
+        counts_data = pd.concat((pd.read_table(table, index_col="gene") for table in input.counts_tables))
         assert len(counts_data.index.unique()) == len(counts_data.index), "Some genes appear several times in the counts table."
         counts_data.index.names = ["gene"]
         counts_data.to_csv(output.counts_table, sep="\t")
diff --git a/small_RNA-seq/small_RNA-seq.snakefile b/small_RNA-seq/small_RNA-seq.snakefile
index bd71de7..d2a5ee4 100644
--- a/small_RNA-seq/small_RNA-seq.snakefile
+++ b/small_RNA-seq/small_RNA-seq.snakefile
@@ -1279,7 +1279,7 @@ rule join_all_feature_counts:
         ## debug:
         print("Input:", ", ".join(input))
         ##
-        counts_data = pd.concat((pd.read_table(table, index_col="gene") for table in input.counts_tables)).drop_duplicates()
+        counts_data = pd.concat((pd.read_table(table, index_col="gene") for table in input.counts_tables))
         assert len(counts_data.index.unique()) == len(counts_data.index), "Some genes appear several times in the counts table."
         counts_data.index.names = ["gene"]
         counts_data.to_csv(output.counts_table, sep="\t")
-- 
GitLab