Skip to content
Snippets Groups Projects
Commit bd7d9415 authored by Blaise Li's avatar Blaise Li
Browse files

Fixing all_si_22G crashes.

parent 313d0ce7
No related branches found
No related tags found
No related merge requests found
...@@ -1705,57 +1705,66 @@ rule gather_small_RNA_counts: ...@@ -1705,57 +1705,66 @@ rule gather_small_RNA_counts:
counts_table = OPJ( counts_table = OPJ(
annot_counts_dir, annot_counts_dir,
"all_{mapped_type}_on_%s" % genome, "{small_type}_counts.txt"), "all_{mapped_type}_on_%s" % genome, "{small_type}_counts.txt"),
log:
log = OPJ(log_dir, "gather_small_RNA_counts", "{mapped_type}_{small_type}.log"),
run: run:
# Gathering the counts data with open(log.log, "w") as logfile:
############################ # Gathering the counts data
counts_files = (OPJ( ############################
annot_counts_dir, counts_files = (OPJ(
f"{cond_name}_{wildcards.mapped_type}_on_{genome}", annot_counts_dir,
f"{wildcards.small_type}_counts.txt") for cond_name in COND_NAMES) f"{cond_name}_{wildcards.mapped_type}_on_{genome}",
if wildcards.small_type == "pi": f"{wildcards.small_type}_counts.txt") for cond_name in COND_NAMES)
drop = ["piRNA"] if wildcards.small_type == "pi":
elif wildcards.small_type == "mi": drop = ["piRNA"]
drop = ["miRNA"] elif wildcards.small_type == "mi":
elif wildcards.small_type in { drop = ["miRNA"]
*{f"prot_si_{suffix}" for suffix in SI_SUFFIXES}, elif wildcards.small_type in {
*{f"prot_siu_{suffix}" for suffix in SI_SUFFIXES}}: *{f"prot_si_{suffix}" for suffix in SI_SUFFIXES},
drop = ["protein_coding_CDS", "protein_coding_UTR", "protein_coding_exon_junction", "protein_coding_pure_intron"] *{f"prot_siu_{suffix}" for suffix in SI_SUFFIXES}}:
elif wildcards.small_type in { drop = ["protein_coding_CDS", "protein_coding_UTR", "protein_coding_exon_junction", "protein_coding_pure_intron"]
*{f"te_si_{suffix}" for suffix in SI_SUFFIXES}, elif wildcards.small_type in {
*{f"te_siu_{suffix}" for suffix in SI_SUFFIXES}}: *{f"te_si_{suffix}" for suffix in SI_SUFFIXES},
drop = ["DNA_transposons_rmsk", "RNA_transposons_rmsk"] *{f"te_siu_{suffix}" for suffix in SI_SUFFIXES}}:
elif wildcards.small_type in { drop = ["DNA_transposons_rmsk", "RNA_transposons_rmsk"]
*{f"pseu_si_{suffix}" for suffix in SI_SUFFIXES}, elif wildcards.small_type in {
*{f"pseu_siu_{suffix}" for suffix in SI_SUFFIXES}}: *{f"pseu_si_{suffix}" for suffix in SI_SUFFIXES},
drop = ["pseudogene"] *{f"pseu_siu_{suffix}" for suffix in SI_SUFFIXES}}:
elif wildcards.small_type in { drop = ["pseudogene"]
*{f"satel_si_{suffix}" for suffix in SI_SUFFIXES}, elif wildcards.small_type in {
*{f"satel_siu_{suffix}" for suffix in SI_SUFFIXES}}: *{f"satel_si_{suffix}" for suffix in SI_SUFFIXES},
drop = ["satellites_rmsk"] *{f"satel_siu_{suffix}" for suffix in SI_SUFFIXES}}:
elif wildcards.small_type in { drop = ["satellites_rmsk"]
*{f"simrep_si_{suffix}" for suffix in SI_SUFFIXES}, elif wildcards.small_type in {
*{f"simrep_siu_{suffix}" for suffix in SI_SUFFIXES}}: *{f"simrep_si_{suffix}" for suffix in SI_SUFFIXES},
drop = ["simple_repeats_rmsk"] *{f"simrep_siu_{suffix}" for suffix in SI_SUFFIXES}}:
elif wildcards.small_type in {"all_si", "all_siu"}: drop = ["simple_repeats_rmsk"]
drop = ["all_siRNA"] elif wildcards.small_type in {
else: *{f"all_si_{suffix}" for suffix in SI_SUFFIXES},
raise NotImplementedError("Unknown read type: %s" % wildcards.small_type) *{f"all_siu_{suffix}" for suffix in SI_SUFFIXES}}:
counts_data = pd.concat( drop = [f"all_si_{suffix}RNA" for suffix in SI_SUFFIXES]
(pd.read_table( elif wildcards.small_type in {"all_si", "all_siu"}:
counts_file, drop = ["all_siRNA"]
index_col=0, else:
na_filter=False).drop(drop, errors="ignore") for counts_file in counts_files), # Unknown read type: all_si_22G
axis=1).fillna(0).astype(int) raise NotImplementedError("Unknown read type: %s" % wildcards.small_type)
counts_data.columns = COND_NAMES logfile.write(f"Will drop {drop} for {wildcards.small_type}.\n")
# Simple_repeat|Simple_repeat|(TTTTTTG)n:1 counts_data = pd.concat(
# Simple_repeat|Simple_repeat|(TTTTTTG)n:2 (pd.read_table(
# Simple_repeat|Simple_repeat|(TTTTTTG)n:3 counts_file,
# Simple_repeat|Simple_repeat|(TTTTTTG)n:4 index_col=0,
# -> Simple_repeat|Simple_repeat|(TTTTTTG)n na_filter=False).drop(drop, errors="ignore") for counts_file in counts_files),
if wildcards.small_type in RMSK_SISIU_TYPES: axis=1).fillna(0).astype(int)
counts_data = sum_by_family(counts_data) counts_data.columns = COND_NAMES
counts_data.index.names = ["gene"] # Simple_repeat|Simple_repeat|(TTTTTTG)n:1
counts_data.to_csv(output.counts_table, sep="\t") # Simple_repeat|Simple_repeat|(TTTTTTG)n:2
# Simple_repeat|Simple_repeat|(TTTTTTG)n:3
# Simple_repeat|Simple_repeat|(TTTTTTG)n:4
# -> Simple_repeat|Simple_repeat|(TTTTTTG)n
if wildcards.small_type in RMSK_SISIU_TYPES:
counts_data = sum_by_family(counts_data)
counts_data.index.names = ["gene"]
counts_data.to_csv(output.counts_table, sep="\t")
def counts_to_join(wildcards): def counts_to_join(wildcards):
...@@ -1960,11 +1969,15 @@ rule associate_small_type: ...@@ -1960,11 +1969,15 @@ rule associate_small_type:
tags_table.to_csv(output.tags_table, sep="\t") tags_table.to_csv(output.tags_table, sep="\t")
def add_tags_column(data, tags_table, tag_name): # TODO: check what this does with index column name (can it lose "gene"?)
def add_tags_column(data, tags_table, tag_name, logfile=None):
"""Adds a column *tag_name* to *data* based on the DataFrame *tag_table* """Adds a column *tag_name* to *data* based on the DataFrame *tag_table*
associating tags to row names.""" associating tags to row names."""
# Why "inner" ? # Why "inner" ?
df = pd.concat((data, pd.read_table(tags_table, index_col=0)), join="inner", axis=1) df = pd.concat((data, pd.read_table(tags_table, index_col=0)), join="inner", axis=1)
if logfile is not None:
logfile.write(f"Index in data is: {data.index.name}\n")
logfile.write(f"Index in df is: {df.index.name}\n")
df.columns = (*data.columns, tag_name) df.columns = (*data.columns, tag_name)
return df return df
...@@ -2180,6 +2193,9 @@ rule compute_RPM_folds: ...@@ -2180,6 +2193,9 @@ rule compute_RPM_folds:
input.counts_table, input.counts_table,
usecols=["gene", *column_list], usecols=["gene", *column_list],
index_col="gene") index_col="gene")
assert counts_data.index.name == "gene", f"Wrong index: {counts_data.index.name}"
# logfile.write(f"Columns in counts_data are: {counts_data.columns}")
# logfile.write(f"Index in counts_data is: {counts_data.index.name}")
logfile.write(f"Found {len(counts_data.index)} genes\n") logfile.write(f"Found {len(counts_data.index)} genes\n")
#logfile.write(f"Reading exon lengths from {exon_lengths_file}\n") #logfile.write(f"Reading exon lengths from {exon_lengths_file}\n")
#exon_lengths = pd.read_table( #exon_lengths = pd.read_table(
...@@ -2196,6 +2212,9 @@ rule compute_RPM_folds: ...@@ -2196,6 +2212,9 @@ rule compute_RPM_folds:
logfile.write(str(norm)) logfile.write(str(norm))
logfile.write("Computing counts per million non-structural mappers\n") logfile.write("Computing counts per million non-structural mappers\n")
RPM = 1000000 * counts_data / norm RPM = 1000000 * counts_data / norm
# logfile.write(f"Columns in RPM are: {RPM.columns}\n")
# logfile.write(f"Index in RPM is: {RPM.index.name}\n")
assert RPM.index.name == "gene", f"Wrong index: {RPM.index.name}"
logfile.write("Computing RPM log2 fold changes\n") logfile.write("Computing RPM log2 fold changes\n")
# Maybe not a very sensible way to compute the mean folds: it pairs replicates, but the pairing is arbitrary # Maybe not a very sensible way to compute the mean folds: it pairs replicates, but the pairing is arbitrary
# TODO: Should we compute the fold of the means instead? # TODO: Should we compute the fold of the means instead?
...@@ -2212,9 +2231,28 @@ rule compute_RPM_folds: ...@@ -2212,9 +2231,28 @@ rule compute_RPM_folds:
lfc = lfc.assign(cosmid=lfc.apply(column_converter(load(dict_file)), axis=1)) lfc = lfc.assign(cosmid=lfc.apply(column_converter(load(dict_file)), axis=1))
with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file: with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file:
lfc = lfc.assign(name=lfc.apply(column_converter(load(dict_file)), axis=1)) lfc = lfc.assign(name=lfc.apply(column_converter(load(dict_file)), axis=1))
logfile.write("Adding small read type info\n") # logfile.write(f"Columns in lfc are: {lfc.columns}\n")
# logfile.write(f"Index in lfc is: {lfc.index.name}\n")
assert lfc.index.name == "gene", f"Wrong index: {lfc.index.name}"
logfile.write(f"Adding small read type info from {input.tags_table}\n")
#pd.concat((counts_data.loc[common], RPM, add_tags_column(lfc, input.tags_table, "small_type")), axis=1).to_csv(output.fold_results, sep="\t") #pd.concat((counts_data.loc[common], RPM, add_tags_column(lfc, input.tags_table, "small_type")), axis=1).to_csv(output.fold_results, sep="\t")
pd.concat((RPM, add_tags_column(lfc, input.tags_table, "small_type")), axis=1).to_csv(output.fold_results, sep="\t") tags_table = add_tags_column(lfc, input.tags_table, "small_type", logfile)
logfile.write(f"Columns in tags_table are: {tags_table.columns}\n")
logfile.write(f"Index in tags_table is: {tags_table.index.name}\n")
lfc_idx = lfc.index
RPM_idx = RPM.index
tags_table_idx = tags_table.index
lfc_xor_RPM = lfc_idx.symmetric_difference(RPM_idx)
lfc_xor_tags_table = lfc_idx.symmetric_difference(tags_table_idx)
RPM_xor_tags_table = RPM_idx.symmetric_difference(tags_table_idx)
logfile.write(f"Index difference:\nlfc_xor_RPM: {lfc_xor_RPM}\nlfc_xor_tags_table: {lfc_xor_tags_table}\nRPM_xor_tags_table: {RPM_xor_tags_table}\n")
# with_tags = pd.concat((RPM, add_tags_column(lfc, input.tags_table, "small_type")), axis=1)
with_tags = pd.concat((RPM, tags_table), axis=1)
logfile.write(f"Columns in with_tags are: {with_tags.columns}\n")
logfile.write(f"Index in with_tags is: {with_tags.index.name}\n")
# pd.concat((RPM, add_tags_column(lfc, input.tags_table, "small_type")), axis=1).to_csv(output.fold_results, sep="\t")
logfile.write(f"Then writing to {output.fold_results}\n")
with_tags.to_csv(output.fold_results, sep="\t")
rule gather_RPM_folds: rule gather_RPM_folds:
...@@ -2233,12 +2271,17 @@ rule gather_RPM_folds: ...@@ -2233,12 +2271,17 @@ rule gather_RPM_folds:
benchmark: benchmark:
OPJ(log_dir, "gather_RPM_folds", "{small_type}_benchmark.txt"), OPJ(log_dir, "gather_RPM_folds", "{small_type}_benchmark.txt"),
run: run:
pd.DataFrame({contrast : pd.read_table( with open(log.log, "w") as logfile:
OPJ(mapping_dir, f"RPM_folds_{size_selected}", actual_input = [
f"{contrast}", f"{contrast}_{wildcards.small_type}_RPM_folds.txt"), OPJ(mapping_dir, f"RPM_folds_{size_selected}",
index_col=["gene", "cosmid", "name", "small_type"], f"{contrast}", f"{contrast}_{wildcards.small_type}_RPM_folds.txt") for contrast in IP_CONTRASTS]
usecols=["gene", "cosmid", "name", "small_type", "mean_log2_RPM_fold"])["mean_log2_RPM_fold"] for contrast in IP_CONTRASTS}).to_csv( logfile.write(f"Gathering RPM folds from:\n{actual_input}\nShould be from:\n{input.fold_results}\n")
output.all_folds, sep="\t") pd.DataFrame({contrast : pd.read_table(
OPJ(mapping_dir, f"RPM_folds_{size_selected}",
f"{contrast}", f"{contrast}_{wildcards.small_type}_RPM_folds.txt"),
index_col=["gene", "cosmid", "name", "small_type"],
usecols=["gene", "cosmid", "name", "small_type", "mean_log2_RPM_fold"])["mean_log2_RPM_fold"] for contrast in IP_CONTRASTS}).to_csv(
output.all_folds, sep="\t")
# TODO: update consumers with mapping_type # TODO: update consumers with mapping_type
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment