diff --git a/PRO-seq/PRO-seq.snakefile b/PRO-seq/PRO-seq.snakefile index db635ae506e2752128ed17d5b8bb246aa368b253..36260c1d9f2dbc38014d3ba1aa8e44d78b20f13f 100644 --- a/PRO-seq/PRO-seq.snakefile +++ b/PRO-seq/PRO-seq.snakefile @@ -992,7 +992,8 @@ rule join_all_counts: output: counts_table = OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", "{counter}", "all_on_C_elegans", "alltypes_{orientation}_counts.txt"), run: - counts_data = pd.concat((pd.read_table(table, index_col="gene") for table in input.counts_tables)) + counts_data = pd.concat((pd.read_table(table, index_col="gene") for table in input.counts_tables)).reset_index().drop_duplicates().set_index("gene") + assert len(counts_data.index.unique()) == len(counts_data.index), "Some genes appear several times in the counts table." counts_data.index.names = ["gene"] counts_data.to_csv(output.counts_table, sep="\t") diff --git a/RNA_Seq_Cecere/RNA-seq.snakefile b/RNA_Seq_Cecere/RNA-seq.snakefile index c986050975d2aa11e20ca79ca383259c570a26ee..fd29436770cbc4238b4c6491e3539b08a7d2ba56 100644 --- a/RNA_Seq_Cecere/RNA-seq.snakefile +++ b/RNA_Seq_Cecere/RNA-seq.snakefile @@ -258,7 +258,7 @@ rule sam2indexedbam: resources: io=45 threads: - 4 + 8 wrapper: "file:///pasteur/homes/bli/src/bioinfo_utils/snakemake_wrappers/sam2indexedbam" @@ -712,7 +712,8 @@ rule join_all_counts: output: counts_table = OPJ(mapping_dir, aligner, "mapped_C_elegans", "{counter}", "all_on_C_elegans", "alltypes_{orientation}_counts.txt"), run: - counts_data = pd.concat((pd.read_table(table, index_col="gene") for table in input.counts_tables)) + counts_data = pd.concat((pd.read_table(table, index_col="gene") for table in input.counts_tables)).reset_index().drop_duplicates().set_index("gene") + assert len(counts_data.index.unique()) == len(counts_data.index), "Some genes appear several times in the counts table." 
counts_data.index.names = ["gene"] counts_data.to_csv(output.counts_table, sep="\t") diff --git a/small_RNA-seq/small_RNA-seq.snakefile b/small_RNA-seq/small_RNA-seq.snakefile index 72b4555759e27ab2b609ccbfa79f0f85164f7dd3..0f48274898f8f0b296855848cd77e5be86105bb6 100644 --- a/small_RNA-seq/small_RNA-seq.snakefile +++ b/small_RNA-seq/small_RNA-seq.snakefile @@ -1236,6 +1236,7 @@ rule gather_small_RNA_counts: counts_data.to_csv(output.counts_table, sep="\t") +# TODO: drop duplicates or sum counts when duplicate row indices? rule join_si_counts: """concat SI_TYPES (prot_si, te_si, pseu_si, satel_si and simrep_si) into si""" input: