diff --git a/Degradome-seq/Degradome-seq.snakefile b/Degradome-seq/Degradome-seq.snakefile index 16285272ec21e65bd02ce81e9b1cb35e9a58bf22..ccd8c3859f4c6a8edf021dafce767d05516bdd73 100644 --- a/Degradome-seq/Degradome-seq.snakefile +++ b/Degradome-seq/Degradome-seq.snakefile @@ -362,7 +362,7 @@ rule join_all_counts: counts_dir, f"all_on_{genome}", "alltypes_{orientation}_counts.txt"), run: - counts_data = pd.concat((pd.read_table(table, index_col="gene") for table in input.counts_tables)).drop_duplicates() + counts_data = pd.concat((pd.read_table(table, index_col="gene") for table in input.counts_tables)) assert len(counts_data.index.unique()) == len(counts_data.index), "Some genes appear several times in the counts table." counts_data.index.names = ["gene"] counts_data.to_csv(output.counts_table, sep="\t") diff --git a/PRO-seq/PRO-seq.snakefile b/PRO-seq/PRO-seq.snakefile index 7e1882b08903bab45e586ffcf94ef46ced85047d..f7dca2829dfd2babbb7475af445116aa92d96cf1 100644 --- a/PRO-seq/PRO-seq.snakefile +++ b/PRO-seq/PRO-seq.snakefile @@ -932,7 +932,7 @@ rule join_all_counts: wildcard_constraints: biotype = "|".join(JOINED_BIOTYPES) run: - counts_data = pd.concat((pd.read_table(table, index_col="gene") for table in input.counts_tables)).drop_duplicates() + counts_data = pd.concat((pd.read_table(table, index_col="gene") for table in input.counts_tables)) assert len(counts_data.index.unique()) == len(counts_data.index), "Some genes appear several times in the counts table." 
counts_data.index.names = ["gene"] counts_data.to_csv(output.counts_table, sep="\t") diff --git a/RNA_Seq_Cecere/RNA-seq.snakefile b/RNA_Seq_Cecere/RNA-seq.snakefile index 348b940ffaa270ea6d0cf08ba4c87ce7eabd0474..fd2c89c7d62c87500b7ac6c0d9a11cf3f09e8eac 100644 --- a/RNA_Seq_Cecere/RNA-seq.snakefile +++ b/RNA_Seq_Cecere/RNA-seq.snakefile @@ -1539,7 +1539,7 @@ rule join_all_counts: wildcard_constraints: biotype = "|".join(JOINED_BIOTYPES) run: - counts_data = pd.concat((pd.read_table(table, index_col="gene") for table in input.counts_tables)).drop_duplicates() + counts_data = pd.concat((pd.read_table(table, index_col="gene") for table in input.counts_tables)) assert len(counts_data.index.unique()) == len(counts_data.index), "Some genes appear several times in the counts table." counts_data.index.names = ["gene"] counts_data.to_csv(output.counts_table, sep="\t") diff --git a/Ribo-seq/Ribo-seq.snakefile b/Ribo-seq/Ribo-seq.snakefile index b33d2ebbe0c352bc01703ecc090849fb25da0cdb..fe5a3e094bea4c7905e3f1a4b085bce07d94ba39 100644 --- a/Ribo-seq/Ribo-seq.snakefile +++ b/Ribo-seq/Ribo-seq.snakefile @@ -674,7 +674,7 @@ rule trim: nb_trimmed = OPJ(data_dir, "trimmed", "{lib}_{rep}_nb_trimmed.txt"), #threads: 2 message: - "Trimming adaptor from raw data, deduplicating reads, removing random 5' {trim5}-mers and 3' {trim3}-mers for {wildcards.lib}_{wildcards.rep}." + "Trimming adaptor from raw data, removing random 5' {trim5}-mers and 3' {trim3}-mers for {wildcards.lib}_{wildcards.rep}." benchmark: OPJ(log_dir, "trim", "{lib}_{rep}_benchmark.txt") log: @@ -1253,7 +1253,10 @@ rule join_all_counts: counts_table = OPJ(counts_dir, "all_{read_type}_on_%s" % genome, "alltypes_{orientation}_counts.txt"), run: - counts_data = pd.concat((pd.read_table(table, index_col="gene") for table in input.counts_tables)).drop_duplicates() + # !! drop_duplicates() is wrong: doesn't consider the index (at least in pandas < 1.0.0), + # so it eliminates rows having the same data pattern. 
For instance [0, 0, ...] (which eliminates a lot of entries) + # counts_data = pd.concat((pd.read_table(table, index_col="gene") for table in input.counts_tables)).drop_duplicates() + counts_data = pd.concat((pd.read_table(table, index_col="gene") for table in input.counts_tables)) assert len(counts_data.index.unique()) == len(counts_data.index), "Some genes appear several times in the counts table." counts_data.index.names = ["gene"] counts_data.to_csv(output.counts_table, sep="\t") diff --git a/small_RNA-seq/small_RNA-seq.snakefile b/small_RNA-seq/small_RNA-seq.snakefile index bd71de78518db09e5aea7b390b0c0f7d4881f30c..d2a5ee467beee94c0d71db1fd4d556b5e35ebd8a 100644 --- a/small_RNA-seq/small_RNA-seq.snakefile +++ b/small_RNA-seq/small_RNA-seq.snakefile @@ -1279,7 +1279,7 @@ rule join_all_feature_counts: ## debug: print("Input:", ", ".join(input)) ## - counts_data = pd.concat((pd.read_table(table, index_col="gene") for table in input.counts_tables)).drop_duplicates() + counts_data = pd.concat((pd.read_table(table, index_col="gene") for table in input.counts_tables)) assert len(counts_data.index.unique()) == len(counts_data.index), "Some genes appear several times in the counts table." counts_data.index.names = ["gene"] counts_data.to_csv(output.counts_table, sep="\t")