Commit bd7d9415 authored by Blaise Li's avatar Blaise Li
Browse files

Fixing all_si_22G crashes.

parent 313d0ce7
......@@ -1705,57 +1705,66 @@ rule gather_small_RNA_counts:
counts_table = OPJ(
annot_counts_dir,
"all_{mapped_type}_on_%s" % genome, "{small_type}_counts.txt"),
log:
log = OPJ(log_dir, "gather_small_RNA_counts", "{mapped_type}_{small_type}.log"),
run:
# Gathering the counts data
############################
counts_files = (OPJ(
annot_counts_dir,
f"{cond_name}_{wildcards.mapped_type}_on_{genome}",
f"{wildcards.small_type}_counts.txt") for cond_name in COND_NAMES)
if wildcards.small_type == "pi":
drop = ["piRNA"]
elif wildcards.small_type == "mi":
drop = ["miRNA"]
elif wildcards.small_type in {
*{f"prot_si_{suffix}" for suffix in SI_SUFFIXES},
*{f"prot_siu_{suffix}" for suffix in SI_SUFFIXES}}:
drop = ["protein_coding_CDS", "protein_coding_UTR", "protein_coding_exon_junction", "protein_coding_pure_intron"]
elif wildcards.small_type in {
*{f"te_si_{suffix}" for suffix in SI_SUFFIXES},
*{f"te_siu_{suffix}" for suffix in SI_SUFFIXES}}:
drop = ["DNA_transposons_rmsk", "RNA_transposons_rmsk"]
elif wildcards.small_type in {
*{f"pseu_si_{suffix}" for suffix in SI_SUFFIXES},
*{f"pseu_siu_{suffix}" for suffix in SI_SUFFIXES}}:
drop = ["pseudogene"]
elif wildcards.small_type in {
*{f"satel_si_{suffix}" for suffix in SI_SUFFIXES},
*{f"satel_siu_{suffix}" for suffix in SI_SUFFIXES}}:
drop = ["satellites_rmsk"]
elif wildcards.small_type in {
*{f"simrep_si_{suffix}" for suffix in SI_SUFFIXES},
*{f"simrep_siu_{suffix}" for suffix in SI_SUFFIXES}}:
drop = ["simple_repeats_rmsk"]
elif wildcards.small_type in {"all_si", "all_siu"}:
drop = ["all_siRNA"]
else:
raise NotImplementedError("Unknown read type: %s" % wildcards.small_type)
counts_data = pd.concat(
(pd.read_table(
counts_file,
index_col=0,
na_filter=False).drop(drop, errors="ignore") for counts_file in counts_files),
axis=1).fillna(0).astype(int)
counts_data.columns = COND_NAMES
# Simple_repeat|Simple_repeat|(TTTTTTG)n:1
# Simple_repeat|Simple_repeat|(TTTTTTG)n:2
# Simple_repeat|Simple_repeat|(TTTTTTG)n:3
# Simple_repeat|Simple_repeat|(TTTTTTG)n:4
# -> Simple_repeat|Simple_repeat|(TTTTTTG)n
if wildcards.small_type in RMSK_SISIU_TYPES:
counts_data = sum_by_family(counts_data)
counts_data.index.names = ["gene"]
counts_data.to_csv(output.counts_table, sep="\t")
with open(log.log, "w") as logfile:
# Gathering the counts data
############################
counts_files = (OPJ(
annot_counts_dir,
f"{cond_name}_{wildcards.mapped_type}_on_{genome}",
f"{wildcards.small_type}_counts.txt") for cond_name in COND_NAMES)
if wildcards.small_type == "pi":
drop = ["piRNA"]
elif wildcards.small_type == "mi":
drop = ["miRNA"]
elif wildcards.small_type in {
*{f"prot_si_{suffix}" for suffix in SI_SUFFIXES},
*{f"prot_siu_{suffix}" for suffix in SI_SUFFIXES}}:
drop = ["protein_coding_CDS", "protein_coding_UTR", "protein_coding_exon_junction", "protein_coding_pure_intron"]
elif wildcards.small_type in {
*{f"te_si_{suffix}" for suffix in SI_SUFFIXES},
*{f"te_siu_{suffix}" for suffix in SI_SUFFIXES}}:
drop = ["DNA_transposons_rmsk", "RNA_transposons_rmsk"]
elif wildcards.small_type in {
*{f"pseu_si_{suffix}" for suffix in SI_SUFFIXES},
*{f"pseu_siu_{suffix}" for suffix in SI_SUFFIXES}}:
drop = ["pseudogene"]
elif wildcards.small_type in {
*{f"satel_si_{suffix}" for suffix in SI_SUFFIXES},
*{f"satel_siu_{suffix}" for suffix in SI_SUFFIXES}}:
drop = ["satellites_rmsk"]
elif wildcards.small_type in {
*{f"simrep_si_{suffix}" for suffix in SI_SUFFIXES},
*{f"simrep_siu_{suffix}" for suffix in SI_SUFFIXES}}:
drop = ["simple_repeats_rmsk"]
elif wildcards.small_type in {
*{f"all_si_{suffix}" for suffix in SI_SUFFIXES},
*{f"all_siu_{suffix}" for suffix in SI_SUFFIXES}}:
drop = [f"all_si_{suffix}RNA" for suffix in SI_SUFFIXES]
elif wildcards.small_type in {"all_si", "all_siu"}:
drop = ["all_siRNA"]
else:
# Unknown read type: all_si_22G
raise NotImplementedError("Unknown read type: %s" % wildcards.small_type)
logfile.write(f"Will drop {drop} for {wildcards.small_type}.\n")
counts_data = pd.concat(
(pd.read_table(
counts_file,
index_col=0,
na_filter=False).drop(drop, errors="ignore") for counts_file in counts_files),
axis=1).fillna(0).astype(int)
counts_data.columns = COND_NAMES
# Simple_repeat|Simple_repeat|(TTTTTTG)n:1
# Simple_repeat|Simple_repeat|(TTTTTTG)n:2
# Simple_repeat|Simple_repeat|(TTTTTTG)n:3
# Simple_repeat|Simple_repeat|(TTTTTTG)n:4
# -> Simple_repeat|Simple_repeat|(TTTTTTG)n
if wildcards.small_type in RMSK_SISIU_TYPES:
counts_data = sum_by_family(counts_data)
counts_data.index.names = ["gene"]
counts_data.to_csv(output.counts_table, sep="\t")
def counts_to_join(wildcards):
......@@ -1960,11 +1969,15 @@ rule associate_small_type:
tags_table.to_csv(output.tags_table, sep="\t")
def add_tags_column(data, tags_table, tag_name):
# TODO: check what this does with index column name (can it loose "gene"?)
def add_tags_column(data, tags_table, tag_name, logfile=None):
"""Adds a column *tag_name* to *data* based on the DataFrame *tag_table*
associating tags to row names."""
# Why "inner" ?
df = pd.concat((data, pd.read_table(tags_table, index_col=0)), join="inner", axis=1)
if logfile is not None:
logfile.write(f"Index in data is: {data.index.name}\n")
logfile.write(f"Index in df is: {df.index.name}\n")
df.columns = (*data.columns, tag_name)
return df
......@@ -2180,6 +2193,9 @@ rule compute_RPM_folds:
input.counts_table,
usecols=["gene", *column_list],
index_col="gene")
assert counts_data.index.name == "gene", f"Wrong index: {counts_data.index.name}"
# logfile.write(f"Columns in counts_data are: {counts_data.columns}")
# logfile.write(f"Index in counts_data is: {counts_data.index.name}")
logfile.write(f"Found {len(counts_data.index)} genes\n")
#logfile.write(f"Reading exon lengths from {exon_lengths_file}\n")
#exon_lengths = pd.read_table(
......@@ -2196,6 +2212,9 @@ rule compute_RPM_folds:
logfile.write(str(norm))
logfile.write("Computing counts per million non-structural mappers\n")
RPM = 1000000 * counts_data / norm
# logfile.write(f"Columns in RPM are: {RPM.columns}\n")
# logfile.write(f"Index in RPM is: {RPM.index.name}\n")
assert RPM.index.name == "gene", f"Wrong index: {RPM.index.name}"
logfile.write("Computing RPM log2 fold changes\n")
# Maybe not a very sensible way to compute the mean folds: it pairs replicates, but the pairing is arbitrary
# TODO: Should we compute the fold of the means instead?
......@@ -2212,9 +2231,28 @@ rule compute_RPM_folds:
lfc = lfc.assign(cosmid=lfc.apply(column_converter(load(dict_file)), axis=1))
with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file:
lfc = lfc.assign(name=lfc.apply(column_converter(load(dict_file)), axis=1))
logfile.write("Adding small read type info\n")
# logfile.write(f"Columns in lfc are: {lfc.columns}\n")
# logfile.write(f"Index in lfc is: {lfc.index.name}\n")
assert lfc.index.name == "gene", f"Wrong index: {lfc.index.name}"
logfile.write(f"Adding small read type info from {input.tags_table}\n")
#pd.concat((counts_data.loc[common], RPM, add_tags_column(lfc, input.tags_table, "small_type")), axis=1).to_csv(output.fold_results, sep="\t")
pd.concat((RPM, add_tags_column(lfc, input.tags_table, "small_type")), axis=1).to_csv(output.fold_results, sep="\t")
tags_table = add_tags_column(lfc, input.tags_table, "small_type", logfile)
logfile.write(f"Columns in tags_table are: {tags_table.columns}\n")
logfile.write(f"Index in tags_table is: {tags_table.index.name}\n")
lfc_idx = lfc.index
RPM_idx = RPM.index
tags_table_idx = tags_table.index
lfc_xor_RPM = lfc_idx.symmetric_difference(RPM_idx)
lfc_xor_tags_table = lfc_idx.symmetric_difference(tags_table_idx)
RPM_xor_tags_table = RPM_idx.symmetric_difference(tags_table_idx)
logfile.write(f"Index difference:\nlfc_xor_RPM: {lfc_xor_RPM}\nlfc_xor_tags_table: {lfc_xor_tags_table}\nRPM_xor_tags_table: {RPM_xor_tags_table}\n")
# with_tags = pd.concat((RPM, add_tags_column(lfc, input.tags_table, "small_type")), axis=1)
with_tags = pd.concat((RPM, tags_table), axis=1)
logfile.write(f"Columns in with_tags are: {with_tags.columns}\n")
logfile.write(f"Index in with_tags is: {with_tags.index.name}\n")
# pd.concat((RPM, add_tags_column(lfc, input.tags_table, "small_type")), axis=1).to_csv(output.fold_results, sep="\t")
logfile.write(f"Then writing to {output.fold_results}\n")
with_tags.to_csv(output.fold_results, sep="\t")
rule gather_RPM_folds:
......@@ -2233,12 +2271,17 @@ rule gather_RPM_folds:
benchmark:
OPJ(log_dir, "gather_RPM_folds", "{small_type}_benchmark.txt"),
run:
pd.DataFrame({contrast : pd.read_table(
OPJ(mapping_dir, f"RPM_folds_{size_selected}",
f"{contrast}", f"{contrast}_{wildcards.small_type}_RPM_folds.txt"),
index_col=["gene", "cosmid", "name", "small_type"],
usecols=["gene", "cosmid", "name", "small_type", "mean_log2_RPM_fold"])["mean_log2_RPM_fold"] for contrast in IP_CONTRASTS}).to_csv(
output.all_folds, sep="\t")
with open(log.log, "w") as logfile:
actual_input = [
OPJ(mapping_dir, f"RPM_folds_{size_selected}",
f"{contrast}", f"{contrast}_{wildcards.small_type}_RPM_folds.txt") for contrast in IP_CONTRASTS]
logfile.write(f"Gathering RPM folds from:\n{actual_input}\nShould be from:\n{input.fold_results}\n")
pd.DataFrame({contrast : pd.read_table(
OPJ(mapping_dir, f"RPM_folds_{size_selected}",
f"{contrast}", f"{contrast}_{wildcards.small_type}_RPM_folds.txt"),
index_col=["gene", "cosmid", "name", "small_type"],
usecols=["gene", "cosmid", "name", "small_type", "mean_log2_RPM_fold"])["mean_log2_RPM_fold"] for contrast in IP_CONTRASTS}).to_csv(
output.all_folds, sep="\t")
# TODO: update consumers with mapping_type
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment