From 3c815d521058f4d80388f8dad56384323525caec Mon Sep 17 00:00:00 2001
From: Blaise Li <blaise.li__git@nsup.org>
Date: Wed, 6 Dec 2017 11:22:25 +0100
Subject: [PATCH] Started removing duplicates when joining counts.

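I still need to check that this is the correct thing to do. Note that
DataFrame.drop_duplicates() compares column values only and ignores the
index, so rows for different genes that happen to have identical counts
are dropped as well. The other possibility would be to sum the counts of
duplicated genes using agg:
https://stackoverflow.com/a/35403903/1878788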
---
 PRO-seq/PRO-seq.snakefile             | 3 ++-
 RNA_Seq_Cecere/RNA-seq.snakefile      | 5 +++--
 small_RNA-seq/small_RNA-seq.snakefile | 1 +
 3 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/PRO-seq/PRO-seq.snakefile b/PRO-seq/PRO-seq.snakefile
index db635ae..36260c1 100644
--- a/PRO-seq/PRO-seq.snakefile
+++ b/PRO-seq/PRO-seq.snakefile
@@ -992,7 +992,8 @@ rule join_all_counts:
     output:
         counts_table = OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", "{counter}", "all_on_C_elegans", "alltypes_{orientation}_counts.txt"),
     run:
-        counts_data = pd.concat((pd.read_table(table, index_col="gene") for table in input.counts_tables))
+        counts_data = pd.concat((pd.read_table(table, index_col="gene") for table in input.counts_tables)).drop_duplicates()
+        assert len(counts_data.index.unique()) == len(counts_data.index), "Some genes appear several times in the counts table."
         counts_data.index.names = ["gene"]
         counts_data.to_csv(output.counts_table, sep="\t")
 
diff --git a/RNA_Seq_Cecere/RNA-seq.snakefile b/RNA_Seq_Cecere/RNA-seq.snakefile
index c986050..fd29436 100644
--- a/RNA_Seq_Cecere/RNA-seq.snakefile
+++ b/RNA_Seq_Cecere/RNA-seq.snakefile
@@ -258,7 +258,7 @@ rule sam2indexedbam:
     resources:
         io=45
     threads:
-        4
+        8
     wrapper:
         "file:///pasteur/homes/bli/src/bioinfo_utils/snakemake_wrappers/sam2indexedbam"
 
@@ -712,7 +712,8 @@ rule join_all_counts:
     output:
         counts_table = OPJ(mapping_dir, aligner, "mapped_C_elegans", "{counter}", "all_on_C_elegans", "alltypes_{orientation}_counts.txt"),
     run:
-        counts_data = pd.concat((pd.read_table(table, index_col="gene") for table in input.counts_tables))
+        counts_data = pd.concat((pd.read_table(table, index_col="gene") for table in input.counts_tables)).drop_duplicates()
+        assert len(counts_data.index.unique()) == len(counts_data.index), "Some genes appear several times in the counts table."
         counts_data.index.names = ["gene"]
         counts_data.to_csv(output.counts_table, sep="\t")
 
diff --git a/small_RNA-seq/small_RNA-seq.snakefile b/small_RNA-seq/small_RNA-seq.snakefile
index 72b4555..0f48274 100644
--- a/small_RNA-seq/small_RNA-seq.snakefile
+++ b/small_RNA-seq/small_RNA-seq.snakefile
@@ -1236,6 +1236,7 @@ rule gather_small_RNA_counts:
         counts_data.to_csv(output.counts_table, sep="\t")
 
 
+# TODO: when row indices are duplicated, should we drop the duplicates or sum their counts?
 rule join_si_counts:
     """concat SI_TYPES (prot_si, te_si, pseu_si, satel_si and simrep_si) into si"""
     input:
-- 
GitLab