From ce07c9889719060ae13673cf82424704ef7b3de1 Mon Sep 17 00:00:00 2001
From: Blaise Li <blaise.li__git@nsup.org>
Date: Wed, 5 Feb 2020 12:41:08 +0100
Subject: [PATCH] Not dropping "duplicates" in join_all_counts.

This ignored the index, and index should be unique if the joining is
made on a careful selection of biotypes.
---
 Degradome-seq/Degradome-seq.snakefile | 2 +-
 PRO-seq/PRO-seq.snakefile             | 2 +-
 RNA_Seq_Cecere/RNA-seq.snakefile      | 2 +-
 Ribo-seq/Ribo-seq.snakefile           | 7 +++++--
 small_RNA-seq/small_RNA-seq.snakefile | 2 +-
 5 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/Degradome-seq/Degradome-seq.snakefile b/Degradome-seq/Degradome-seq.snakefile
index 1628527..ccd8c38 100644
--- a/Degradome-seq/Degradome-seq.snakefile
+++ b/Degradome-seq/Degradome-seq.snakefile
@@ -362,7 +362,7 @@ rule join_all_counts:
             counts_dir, f"all_on_{genome}",
             "alltypes_{orientation}_counts.txt"),
     run:
-        counts_data = pd.concat((pd.read_table(table, index_col="gene") for table in input.counts_tables)).drop_duplicates()
+        counts_data = pd.concat((pd.read_table(table, index_col="gene") for table in input.counts_tables))
         assert len(counts_data.index.unique()) == len(counts_data.index), "Some genes appear several times in the counts table."
         counts_data.index.names = ["gene"]
         counts_data.to_csv(output.counts_table, sep="\t")
diff --git a/PRO-seq/PRO-seq.snakefile b/PRO-seq/PRO-seq.snakefile
index 7e1882b..f7dca28 100644
--- a/PRO-seq/PRO-seq.snakefile
+++ b/PRO-seq/PRO-seq.snakefile
@@ -932,7 +932,7 @@ rule join_all_counts:
     wildcard_constraints:
         biotype = "|".join(JOINED_BIOTYPES)
     run:
-        counts_data = pd.concat((pd.read_table(table, index_col="gene") for table in input.counts_tables)).drop_duplicates()
+        counts_data = pd.concat((pd.read_table(table, index_col="gene") for table in input.counts_tables))
         assert len(counts_data.index.unique()) == len(counts_data.index), "Some genes appear several times in the counts table."
         counts_data.index.names = ["gene"]
         counts_data.to_csv(output.counts_table, sep="\t")
diff --git a/RNA_Seq_Cecere/RNA-seq.snakefile b/RNA_Seq_Cecere/RNA-seq.snakefile
index 348b940..fd2c89c 100644
--- a/RNA_Seq_Cecere/RNA-seq.snakefile
+++ b/RNA_Seq_Cecere/RNA-seq.snakefile
@@ -1539,7 +1539,7 @@ rule join_all_counts:
    wildcard_constraints:
        biotype = "|".join(JOINED_BIOTYPES)
    run:
-        counts_data = pd.concat((pd.read_table(table, index_col="gene") for table in input.counts_tables)).drop_duplicates()
+        counts_data = pd.concat((pd.read_table(table, index_col="gene") for table in input.counts_tables))
         assert len(counts_data.index.unique()) == len(counts_data.index), "Some genes appear several times in the counts table."
         counts_data.index.names = ["gene"]
         counts_data.to_csv(output.counts_table, sep="\t")
diff --git a/Ribo-seq/Ribo-seq.snakefile b/Ribo-seq/Ribo-seq.snakefile
index b33d2eb..fe5a3e0 100644
--- a/Ribo-seq/Ribo-seq.snakefile
+++ b/Ribo-seq/Ribo-seq.snakefile
@@ -674,7 +674,7 @@ rule trim:
         nb_trimmed = OPJ(data_dir, "trimmed", "{lib}_{rep}_nb_trimmed.txt"),
     #threads: 2
     message:
-        "Trimming adaptor from raw data, deduplicating reads, removing random 5' {trim5}-mers and 3' {trim3}-mers for {wildcards.lib}_{wildcards.rep}."
+        "Trimming adaptor from raw data, removing random 5' {trim5}-mers and 3' {trim3}-mers for {wildcards.lib}_{wildcards.rep}."
     benchmark:
         OPJ(log_dir, "trim", "{lib}_{rep}_benchmark.txt")
     log:
@@ -1253,7 +1253,10 @@ rule join_all_counts:
         counts_table = OPJ(counts_dir, "all_{read_type}_on_%s" % genome,
             "alltypes_{orientation}_counts.txt"),
     run:
-        counts_data = pd.concat((pd.read_table(table, index_col="gene") for table in input.counts_tables)).drop_duplicates()
+        # !! drop_duplicates() is wrong: it doesn't consider the index (at least in pandas < 1.0.0),
+        # so it eliminates rows having the same data pattern, for instance [0, 0, ...] (which eliminates a lot of entries).
+        # counts_data = pd.concat((pd.read_table(table, index_col="gene") for table in input.counts_tables)).drop_duplicates()
+        counts_data = pd.concat((pd.read_table(table, index_col="gene") for table in input.counts_tables))
         assert len(counts_data.index.unique()) == len(counts_data.index), "Some genes appear several times in the counts table."
         counts_data.index.names = ["gene"]
         counts_data.to_csv(output.counts_table, sep="\t")
diff --git a/small_RNA-seq/small_RNA-seq.snakefile b/small_RNA-seq/small_RNA-seq.snakefile
index bd71de7..d2a5ee4 100644
--- a/small_RNA-seq/small_RNA-seq.snakefile
+++ b/small_RNA-seq/small_RNA-seq.snakefile
@@ -1279,7 +1279,7 @@ rule join_all_feature_counts:
         ## debug:
         print("Input:", ", ".join(input))
         ##
-        counts_data = pd.concat((pd.read_table(table, index_col="gene") for table in input.counts_tables)).drop_duplicates()
+        counts_data = pd.concat((pd.read_table(table, index_col="gene") for table in input.counts_tables))
         assert len(counts_data.index.unique()) == len(counts_data.index), "Some genes appear several times in the counts table."
         counts_data.index.names = ["gene"]
         counts_data.to_csv(output.counts_table, sep="\t")
-- 
GitLab
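
For reference, a minimal pandas sketch of the failure mode described in the comment above. The gene names, library columns and count values are made up for illustration and are not taken from the pipelines; only the index-uniqueness check mirrors the assertion used in the rules.

    import pandas as pd

    # Two hypothetical per-biotype count tables indexed by gene.
    protein_coding = pd.DataFrame(
        {"lib1": [5, 0], "lib2": [3, 0]},
        index=pd.Index(["geneA", "geneB"], name="gene"))
    pseudogene = pd.DataFrame(
        {"lib1": [0, 7], "lib2": [0, 2]},
        index=pd.Index(["geneC", "geneD"], name="gene"))

    joined = pd.concat([protein_coding, pseudogene])

    # drop_duplicates() compares column values only and ignores the index:
    # geneB and geneC both hold [0, 0], so one of them is silently dropped.
    print(len(joined.drop_duplicates()))  # 3 rows: one gene lost

    # Checking index uniqueness instead keeps every gene and still catches
    # genuine duplication (the same gene counted under two biotypes).
    assert len(joined.index.unique()) == len(joined.index), \
        "Some genes appear several times in the counts table."
    print(len(joined))  # 4 rows: one per gene

Rows of all-zero counts are common in such tables, which is why the old drop_duplicates() call could remove many genes; the assertion alone is enough to guarantee a unique gene index when the joined biotypes do not overlap.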