Commit 14b5f981 authored by Blaise Li's avatar Blaise Li
Browse files

Avoid crashing on empty DESeq2 results.

parent 800dd65b
...@@ -1751,87 +1751,90 @@ rule differential_expression: ...@@ -1751,87 +1751,90 @@ rule differential_expression:
counts_and_res = OPJ( counts_and_res = OPJ(
aligner, f"mapped_{genome}", "{counter}", "deseq2_{mapped_type}", aligner, f"mapped_{genome}", "{counter}", "deseq2_{mapped_type}",
"{contrast}", "{orientation}_{biotype}", "{contrast}_counts_and_res.txt"), "{contrast}", "{orientation}_{biotype}", "{contrast}_counts_and_res.txt"),
log:
warnings = OPJ(log_dir, "differential_expression", "{counter}_{contrast}_{orientation}_{biotype}_{mapped_type}.warnings"),
threads: 4 # to limit memory usage, actually threads: 4 # to limit memory usage, actually
run: run:
counts_data = pd.read_table(input.counts_table, index_col="gene") with warn_context(log.warnings) as warn:
summaries = pd.read_table(input.summary_table, index_col=0) counts_data = pd.read_table(input.counts_table, index_col="gene")
# Running DESeq2 summaries = pd.read_table(input.summary_table, index_col=0)
################# # Running DESeq2
formula = Formula("~ lib") #################
(cond, ref) = CONTRAST2PAIR[wildcards.contrast] formula = Formula("~ lib")
if not any(counts_data[f"{ref}_{rep}"].any() for rep in REPS): (cond, ref) = CONTRAST2PAIR[wildcards.contrast]
warnings.warn( if not any(counts_data[f"{ref}_{rep}"].any() for rep in REPS):
"Reference data is all zero.\nSkipping %s_%s_%s" % ( warn(
wildcards.contrast, wildcards.orientation, wildcards.biotype)) "Reference data is all zero.\nSkipping %s_%s_%s" % (
for outfile in output: wildcards.contrast, wildcards.orientation, wildcards.biotype))
shell(f"echo 'NA' > {outfile}")
else:
contrast = StrVector(["lib", cond, ref])
try:
res, size_factors = do_deseq2(COND_NAMES, CONDITIONS, counts_data, formula=formula, contrast=contrast)
#except RRuntimeError as e:
except RuntimeError as e:
warnings.warn(
"Probably not enough usable data points to perform DESeq2 analyses:\n%s\nSkipping %s_%s_%s" % (
str(e), wildcards.contrast, wildcards.orientation, wildcards.biotype))
for outfile in output: for outfile in output:
shell(f"echo 'NA' > {outfile}") shell(f"echo 'NA' > {outfile}")
else: else:
# Determining fold-change category contrast = StrVector(["lib", cond, ref])
################################### try:
set_de_status = status_setter(LFC_CUTOFFS, "log2FoldChange") res, size_factors = do_deseq2(COND_NAMES, CONDITIONS, counts_data, formula=formula, contrast=contrast)
#counts_and_res = add_tags_column(pd.concat((counts_and_res, res), axis=1).assign(status=res.apply(set_de_status, axis=1)), input.tags_table, "small_type") #except RRuntimeError as e:
res = res.assign(status=res.apply(set_de_status, axis=1)) except RuntimeError as e:
# Converting gene IDs warn(
###################### "Probably not enough usable data points to perform DESeq2 analyses:\n%s\nSkipping %s_%s_%s" % (
with open(OPJ(convert_dir, "wormid2cosmid.pickle"), "rb") as dict_file: str(e), wildcards.contrast, wildcards.orientation, wildcards.biotype))
res = res.assign(cosmid=res.apply(column_converter(load(dict_file)), axis=1)) for outfile in output:
with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file: shell(f"echo 'NA' > {outfile}")
res = res.assign(name=res.apply(column_converter(load(dict_file)), axis=1)) else:
# Just to see if column_converter works also with named column, and not just index: # Determining fold-change category
# with open(OPJ(convert_dir, "cosmid2name.pickle"), "rb") as dict_file: ###################################
# res = res.assign(name=res.apply(column_converter(load(dict_file), "cosmid"), axis=1)) set_de_status = status_setter(LFC_CUTOFFS, "log2FoldChange")
########################################## #counts_and_res = add_tags_column(pd.concat((counts_and_res, res), axis=1).assign(status=res.apply(set_de_status, axis=1)), input.tags_table, "small_type")
# res.to_csv(output.deseq_results, sep="\t", na_rep="NA", decimal=",") res = res.assign(status=res.apply(set_de_status, axis=1))
res.to_csv(output.deseq_results, sep="\t", na_rep="NA") # Converting gene IDs
# Joining counts and DESeq2 results in a same table and determining up- or down- regulation status ######################
counts_and_res = counts_data with open(OPJ(convert_dir, "wormid2cosmid.pickle"), "rb") as dict_file:
for normalizer in SIZE_FACTORS: res = res.assign(cosmid=res.apply(column_converter(load(dict_file)), axis=1))
if normalizer == "median_ratio_to_pseudo_ref": with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file:
## Adapted from DESeq paper (doi:10.1186/gb-2010-11-10-r106) but res = res.assign(name=res.apply(column_converter(load(dict_file)), axis=1))
## add pseudo-count to compute the geometric mean, then remove it # Just to see if column_converter works also with named column, and not just index:
#pseudo_ref = (counts_data + 1).apply(gmean, axis=1) - 1 # with open(OPJ(convert_dir, "cosmid2name.pickle"), "rb") as dict_file:
#def median_ratio_to_pseudo_ref(col): # res = res.assign(name=res.apply(column_converter(load(dict_file), "cosmid"), axis=1))
# return (col / pseudo_ref).median() ##########################################
#size_factors = counts_data.apply(median_ratio_to_pseudo_ref, axis=0) # res.to_csv(output.deseq_results, sep="\t", na_rep="NA", decimal=",")
size_factors = median_ratio_to_pseudo_ref_size_factors(counts_data) res.to_csv(output.deseq_results, sep="\t", na_rep="NA")
else: # Joining counts and DESeq2 results in a same table and determining up- or down- regulation status
#raise NotImplementedError(f"{normalizer} normalization not implemented") counts_and_res = counts_data
size_factors = summaries.loc[normalizer] for normalizer in SIZE_FACTORS:
by_norm = counts_data / size_factors if normalizer == "median_ratio_to_pseudo_ref":
by_norm.columns = by_norm.columns.map(lambda s: "%s_by_%s" % (s, normalizer)) ## Adapted from DESeq paper (doi:10.1186/gb-2010-11-10-r106) but
counts_and_res = pd.concat((counts_and_res, by_norm), axis=1) ## add pseudo-count to compute the geometric mean, then remove it
#counts_and_res = add_tags_column(pd.concat((counts_and_res, res), axis=1).assign(status=res.apply(set_de_status, axis=1)), input.tags_table, "small_type") #pseudo_ref = (counts_data + 1).apply(gmean, axis=1) - 1
counts_and_res = pd.concat((counts_and_res, res), axis=1) #def median_ratio_to_pseudo_ref(col):
counts_and_res.to_csv(output.counts_and_res, sep="\t", na_rep="NA") # return (col / pseudo_ref).median()
# Saving lists of genes gaining or loosing siRNAs #size_factors = counts_data.apply(median_ratio_to_pseudo_ref, axis=0)
# diff_expressed = res.query("padj < 0.05") size_factors = median_ratio_to_pseudo_ref_size_factors(counts_data)
# up_genes = list(diff_expressed.query("log2FoldChange >= 1").index) else:
# down_genes = list(diff_expressed.query("log2FoldChange <= -1").index) #raise NotImplementedError(f"{normalizer} normalization not implemented")
up_genes = list(counts_and_res.query(f"status in {UP_STATUSES}").index) size_factors = summaries.loc[normalizer]
down_genes = list(counts_and_res.query(f"status in {DOWN_STATUSES}").index) by_norm = counts_data / size_factors
#up_genes = list(counts_and_res.query("status == 'up'").index) by_norm.columns = by_norm.columns.map(lambda s: "%s_by_%s" % (s, normalizer))
#down_genes = list(counts_and_res.query("status == 'down'").index) counts_and_res = pd.concat((counts_and_res, by_norm), axis=1)
with open(output.up_genes, "w") as up_file: #counts_and_res = add_tags_column(pd.concat((counts_and_res, res), axis=1).assign(status=res.apply(set_de_status, axis=1)), input.tags_table, "small_type")
if up_genes: counts_and_res = pd.concat((counts_and_res, res), axis=1)
up_file.write("%s\n" % "\n".join(up_genes)) counts_and_res.to_csv(output.counts_and_res, sep="\t", na_rep="NA")
else: # Saving lists of genes gaining or loosing siRNAs
up_file.truncate(0) # diff_expressed = res.query("padj < 0.05")
with open(output.down_genes, "w") as down_file: # up_genes = list(diff_expressed.query("log2FoldChange >= 1").index)
if down_genes: # down_genes = list(diff_expressed.query("log2FoldChange <= -1").index)
down_file.write("%s\n" % "\n".join(down_genes)) up_genes = list(counts_and_res.query(f"status in {UP_STATUSES}").index)
else: down_genes = list(counts_and_res.query(f"status in {DOWN_STATUSES}").index)
down_file.truncate(0) #up_genes = list(counts_and_res.query("status == 'up'").index)
#down_genes = list(counts_and_res.query("status == 'down'").index)
with open(output.up_genes, "w") as up_file:
if up_genes:
up_file.write("%s\n" % "\n".join(up_genes))
else:
up_file.truncate(0)
with open(output.down_genes, "w") as down_file:
if down_genes:
down_file.write("%s\n" % "\n".join(down_genes))
else:
down_file.truncate(0)
rule make_lfc_distrib_plot: rule make_lfc_distrib_plot:
...@@ -1910,6 +1913,7 @@ def source_fold_data(wildcards): ...@@ -1910,6 +1913,7 @@ def source_fold_data(wildcards):
OPJ(aligner, f"mapped_{genome}", OPJ(aligner, f"mapped_{genome}",
"{{counter}}", "deseq2_{{mapped_type}}", "{contrast}", "{{counter}}", "deseq2_{{mapped_type}}", "{contrast}",
"{{orientation}}_{{biotype}}", "{contrast}_counts_and_res.txt"), "{{orientation}}_{{biotype}}", "{contrast}_counts_and_res.txt"),
# OPJ("hisat2/mapped_on_C_elegans/feature_count/deseq2_on_C_elegans/{contrast}/rev_DNA_transposons_rmsk_families/{contrast}_counts_and_res.txt"),
contrast=contrasts_dict[wildcards.contrast_type])] contrast=contrasts_dict[wildcards.contrast_type])]
else: else:
return rules.differential_expression.output.counts_and_res return rules.differential_expression.output.counts_and_res
...@@ -1941,7 +1945,10 @@ rule make_contrast_lfc_boxplots: ...@@ -1941,7 +1945,10 @@ rule make_contrast_lfc_boxplots:
boxplots = OPJ(aligner, f"mapped_{genome}", "{counter}", "fold_boxplots_{mapped_type}", boxplots = OPJ(aligner, f"mapped_{genome}", "{counter}", "fold_boxplots_{mapped_type}",
"{contrast_type}_{orientation}_{biotype}_{fold_type}_{id_list}_boxplots.pdf"), "{contrast_type}_{orientation}_{biotype}_{fold_type}_{id_list}_boxplots.pdf"),
log: log:
warnings = OPJ(log_dir, "make_contrast_lfc_boxplots", aligner, f"mapped_{genome}", "{counter}", "fold_boxplots_{mapped_type}", "{contrast_type}_{orientation}_{biotype}_{fold_type}_{id_list}.warnings"), warnings = OPJ(
log_dir, "make_contrast_lfc_boxplots", aligner,
f"mapped_{genome}", "{counter}", "fold_boxplots_{mapped_type}",
"{contrast_type}_{orientation}_{biotype}_{fold_type}_{id_list}.warnings"),
params: params:
id_lists = set_id_lists, id_lists = set_id_lists,
run: run:
...@@ -1952,7 +1959,32 @@ rule make_contrast_lfc_boxplots: ...@@ -1952,7 +1959,32 @@ rule make_contrast_lfc_boxplots:
# print("Reading fold changes from:", *input.data, sep="\n") # print("Reading fold changes from:", *input.data, sep="\n")
lfcs_dict = {} lfcs_dict = {}
for (contrast, filename) in zip(contrasts_dict[wildcards.contrast_type], input.data): for (contrast, filename) in zip(contrasts_dict[wildcards.contrast_type], input.data):
lfc_data = pd.read_table(filename, index_col="gene") # Input files may come from a failed DESeq2 analysis
if test_na_file(filename):
warn(
"No %s results for %s_%s_%s_%s. Making dummy output." % (
wildcards.fold_type,
wildcards.mapped_type,
contrast,
wildcards.orientation,
wildcards.biotype))
# Should we make an empty lfc_data DataFrame instead?
# Make the file empty
open(output.boxplots, "w").close()
break
try:
lfc_data = pd.read_table(filename, index_col="gene")
except TypeError as err:
# if str(err) == "unsupported operand type(s) for -: 'str' and 'int'":
# warn(str(err))
# warn("Generating empty file.\n")
# # Make the file empty
# open(output.boxplots, "w").close()
# break
warn(
"Unexpected TypeError:\n{str(err)}\n"
"This occurred while parsing {filename}.")
raise
#print(type(lfc_data)) #print(type(lfc_data))
for (list_name, id_list) in params.id_lists.items(): for (list_name, id_list) in params.id_lists.items():
try: try:
...@@ -1968,31 +2000,33 @@ rule make_contrast_lfc_boxplots: ...@@ -1968,31 +2000,33 @@ rule make_contrast_lfc_boxplots:
#print("column has type:", type(selected_data)) #print("column has type:", type(selected_data))
#print(selected_data) #print(selected_data)
lfcs_dict[f"{contrast}_{list_name}"] = selected_data lfcs_dict[f"{contrast}_{list_name}"] = selected_data
lfcs = pd.DataFrame(lfcs_dict) else:
# lfcs = pd.DataFrame( # No break encountered in the loop
# {f"{contrast}_{list_name}" : pd.read_table(filename, index_col="gene").loc[ lfcs = pd.DataFrame(lfcs_dict)
# set(id_list)][wildcard.fold_type] for ( # lfcs = pd.DataFrame(
# contrast, filename) in zip(contrasts_dict[wildcards.contrast_type], input.data) for ( # {f"{contrast}_{list_name}" : pd.read_table(filename, index_col="gene").loc[
# list_name, id_list) in params.id_lists.items()}) # set(id_list)][wildcard.fold_type] for (
title = f"{wildcards.orientation} {wildcards.biotype} {wildcards.mapped_type} {wildcards.fold_type} folds for {wildcards.contrast_type} contrasts" # contrast, filename) in zip(contrasts_dict[wildcards.contrast_type], input.data) for (
try: # list_name, id_list) in params.id_lists.items()})
save_plot( title = f"{wildcards.orientation} {wildcards.biotype} {wildcards.mapped_type} {wildcards.fold_type} folds for {wildcards.contrast_type} contrasts"
output.boxplots, try:
plot_boxplots, save_plot(
lfcs, output.boxplots,
wildcards.fold_type, plot_boxplots,
title=title) lfcs,
except TypeError as err: wildcards.fold_type,
if str(err) == "Empty 'DataFrame': no numeric data to plot": title=title)
warn("\n".join([ except TypeError as err:
"Got TypeError:", if str(err) == "Empty 'DataFrame': no numeric data to plot":
f"{str(err)}", warn("\n".join([
f"No data to plot for {title}\n"])) "Got TypeError:",
warn("Generating empty file.\n") f"{str(err)}",
# Make the file empty f"No data to plot for {title}\n"]))
open(output.boxplots, "w").close() warn("Generating empty file.\n")
else: # Make the file empty
raise open(output.boxplots, "w").close()
else:
raise
#ax = res[res["status"] == "NS"].plot.scatter(x="baseMean", y="lfcMLE", s=2, logx=True, c="grey", label="NS") #ax = res[res["status"] == "NS"].plot.scatter(x="baseMean", y="lfcMLE", s=2, logx=True, c="grey", label="NS")
#cutoffs = [-c for c in reversed(lfc_cutoffs)] + [0] + lfc_cutoffs #cutoffs = [-c for c in reversed(lfc_cutoffs)] + [0] + lfc_cutoffs
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment