Commit ce07c988 authored by Blaise Li's avatar Blaise Li
Browse files

Not dropping "duplicates" in join_all_counts.

This ignored the index, and the index should be unique if the joining is
made on a careful selection of biotypes.
parent 795c27e3
......@@ -362,7 +362,7 @@ rule join_all_counts:
counts_dir,
f"all_on_{genome}", "alltypes_{orientation}_counts.txt"),
run:
counts_data = pd.concat((pd.read_table(table, index_col="gene") for table in input.counts_tables)).drop_duplicates()
counts_data = pd.concat((pd.read_table(table, index_col="gene") for table in input.counts_tables))
assert len(counts_data.index.unique()) == len(counts_data.index), "Some genes appear several times in the counts table."
counts_data.index.names = ["gene"]
counts_data.to_csv(output.counts_table, sep="\t")
......
......@@ -932,7 +932,7 @@ rule join_all_counts:
wildcard_constraints:
biotype = "|".join(JOINED_BIOTYPES)
run:
counts_data = pd.concat((pd.read_table(table, index_col="gene") for table in input.counts_tables)).drop_duplicates()
counts_data = pd.concat((pd.read_table(table, index_col="gene") for table in input.counts_tables))
assert len(counts_data.index.unique()) == len(counts_data.index), "Some genes appear several times in the counts table."
counts_data.index.names = ["gene"]
counts_data.to_csv(output.counts_table, sep="\t")
......
......@@ -1539,7 +1539,7 @@ rule join_all_counts:
wildcard_constraints:
biotype = "|".join(JOINED_BIOTYPES)
run:
counts_data = pd.concat((pd.read_table(table, index_col="gene") for table in input.counts_tables)).drop_duplicates()
counts_data = pd.concat((pd.read_table(table, index_col="gene") for table in input.counts_tables))
assert len(counts_data.index.unique()) == len(counts_data.index), "Some genes appear several times in the counts table."
counts_data.index.names = ["gene"]
counts_data.to_csv(output.counts_table, sep="\t")
......
......@@ -674,7 +674,7 @@ rule trim:
nb_trimmed = OPJ(data_dir, "trimmed", "{lib}_{rep}_nb_trimmed.txt"),
#threads: 2
message:
"Trimming adaptor from raw data, deduplicating reads, removing random 5' {trim5}-mers and 3' {trim3}-mers for {wildcards.lib}_{wildcards.rep}."
"Trimming adaptor from raw data, removing random 5' {trim5}-mers and 3' {trim3}-mers for {wildcards.lib}_{wildcards.rep}."
benchmark:
OPJ(log_dir, "trim", "{lib}_{rep}_benchmark.txt")
log:
......@@ -1253,7 +1253,10 @@ rule join_all_counts:
counts_table = OPJ(counts_dir,
"all_{read_type}_on_%s" % genome, "alltypes_{orientation}_counts.txt"),
run:
counts_data = pd.concat((pd.read_table(table, index_col="gene") for table in input.counts_tables)).drop_duplicates()
        # !! drop_duplicates() is wrong: it doesn't consider the index (at least in pandas < 1.0.0),
        # so it eliminates rows having the same data pattern, for instance [0, 0, ...] (which eliminates a lot of entries)
# counts_data = pd.concat((pd.read_table(table, index_col="gene") for table in input.counts_tables)).drop_duplicates()
counts_data = pd.concat((pd.read_table(table, index_col="gene") for table in input.counts_tables))
assert len(counts_data.index.unique()) == len(counts_data.index), "Some genes appear several times in the counts table."
counts_data.index.names = ["gene"]
counts_data.to_csv(output.counts_table, sep="\t")
......
......@@ -1279,7 +1279,7 @@ rule join_all_feature_counts:
## debug:
print("Input:", ", ".join(input))
##
counts_data = pd.concat((pd.read_table(table, index_col="gene") for table in input.counts_tables)).drop_duplicates()
counts_data = pd.concat((pd.read_table(table, index_col="gene") for table in input.counts_tables))
assert len(counts_data.index.unique()) == len(counts_data.index), "Some genes appear several times in the counts table."
counts_data.index.names = ["gene"]
counts_data.to_csv(output.counts_table, sep="\t")
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment