From 3c815d521058f4d80388f8dad56384323525caec Mon Sep 17 00:00:00 2001 From: Blaise Li <blaise.li__git@nsup.org> Date: Wed, 6 Dec 2017 11:22:25 +0100 Subject: [PATCH] Started removing duplicates when joining counts. I should check to be sure that this is the correct thing to do. The other possibility would be to sum using agg: https://stackoverflow.com/a/35403903/1878788 --- PRO-seq/PRO-seq.snakefile | 3 ++- RNA_Seq_Cecere/RNA-seq.snakefile | 5 +++-- small_RNA-seq/small_RNA-seq.snakefile | 1 + 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/PRO-seq/PRO-seq.snakefile b/PRO-seq/PRO-seq.snakefile index db635ae..36260c1 100644 --- a/PRO-seq/PRO-seq.snakefile +++ b/PRO-seq/PRO-seq.snakefile @@ -992,7 +992,8 @@ rule join_all_counts: output: counts_table = OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", "{counter}", "all_on_C_elegans", "alltypes_{orientation}_counts.txt"), run: - counts_data = pd.concat((pd.read_table(table, index_col="gene") for table in input.counts_tables)) + counts_data = pd.concat((pd.read_table(table, index_col="gene") for table in input.counts_tables)).drop_duplicates() + assert len(counts_data.index.unique()) == len(counts_data.index), "Some genes appear several times in the counts table." counts_data.index.names = ["gene"] counts_data.to_csv(output.counts_table, sep="\t") diff --git a/RNA_Seq_Cecere/RNA-seq.snakefile b/RNA_Seq_Cecere/RNA-seq.snakefile index c986050..fd29436 100644 --- a/RNA_Seq_Cecere/RNA-seq.snakefile +++ b/RNA_Seq_Cecere/RNA-seq.snakefile @@ -258,7 +258,7 @@ rule sam2indexedbam: resources: io=45 threads: - 4 + 8 wrapper: "file:///pasteur/homes/bli/src/bioinfo_utils/snakemake_wrappers/sam2indexedbam" @@ -712,7 +712,8 @@ rule join_all_counts: output: counts_table = OPJ(mapping_dir, aligner, "mapped_C_elegans", "{counter}", "all_on_C_elegans", "alltypes_{orientation}_counts.txt"), run: - counts_data = pd.concat((pd.read_table(table, index_col="gene") for table in input.counts_tables)) + counts_data = pd.concat((pd.read_table(table, index_col="gene") for table in input.counts_tables)).drop_duplicates() + assert len(counts_data.index.unique()) == len(counts_data.index), "Some genes appear several times in the counts table." counts_data.index.names = ["gene"] counts_data.to_csv(output.counts_table, sep="\t") diff --git a/small_RNA-seq/small_RNA-seq.snakefile b/small_RNA-seq/small_RNA-seq.snakefile index 72b4555..0f48274 100644 --- a/small_RNA-seq/small_RNA-seq.snakefile +++ b/small_RNA-seq/small_RNA-seq.snakefile @@ -1236,6 +1236,7 @@ rule gather_small_RNA_counts: counts_data.to_csv(output.counts_table, sep="\t") +# TODO: drop duplicates or sum counts when duplicate row indices? rule join_si_counts: """concat SI_TYPES (prot_si, te_si, pseu_si, satel_si and simrep_si) into si""" input: -- GitLab