diff --git a/RNA_Seq_Cecere/RNA-seq.snakefile b/RNA_Seq_Cecere/RNA-seq.snakefile index 2cdada560f90ec7110fcf1c80fd8039bea1832dc..894c4144cfbb75f8d3c5b8d9b12e6ccf1c79088b 100644 --- a/RNA_Seq_Cecere/RNA-seq.snakefile +++ b/RNA_Seq_Cecere/RNA-seq.snakefile @@ -1751,87 +1751,90 @@ rule differential_expression: counts_and_res = OPJ( aligner, f"mapped_{genome}", "{counter}", "deseq2_{mapped_type}", "{contrast}", "{orientation}_{biotype}", "{contrast}_counts_and_res.txt"), + log: + warnings = OPJ(log_dir, "differential_expression", "{counter}_{contrast}_{orientation}_{biotype}_{mapped_type}.warnings"), threads: 4 # to limit memory usage, actually run: - counts_data = pd.read_table(input.counts_table, index_col="gene") - summaries = pd.read_table(input.summary_table, index_col=0) - # Running DESeq2 - ################# - formula = Formula("~ lib") - (cond, ref) = CONTRAST2PAIR[wildcards.contrast] - if not any(counts_data[f"{ref}_{rep}"].any() for rep in REPS): - warnings.warn( - "Reference data is all zero.\nSkipping %s_%s_%s" % ( - wildcards.contrast, wildcards.orientation, wildcards.biotype)) - for outfile in output: - shell(f"echo 'NA' > {outfile}") - else: - contrast = StrVector(["lib", cond, ref]) - try: - res, size_factors = do_deseq2(COND_NAMES, CONDITIONS, counts_data, formula=formula, contrast=contrast) - #except RRuntimeError as e: - except RuntimeError as e: - warnings.warn( - "Probably not enough usable data points to perform DESeq2 analyses:\n%s\nSkipping %s_%s_%s" % ( - str(e), wildcards.contrast, wildcards.orientation, wildcards.biotype)) + with warn_context(log.warnings) as warn: + counts_data = pd.read_table(input.counts_table, index_col="gene") + summaries = pd.read_table(input.summary_table, index_col=0) + # Running DESeq2 + ################# + formula = Formula("~ lib") + (cond, ref) = CONTRAST2PAIR[wildcards.contrast] + if not any(counts_data[f"{ref}_{rep}"].any() for rep in REPS): + warn( + "Reference data is all zero.\nSkipping %s_%s_%s" % ( + wildcards.contrast, wildcards.orientation, wildcards.biotype)) for outfile in output: shell(f"echo 'NA' > {outfile}") else: - # Determining fold-change category - ################################### - set_de_status = status_setter(LFC_CUTOFFS, "log2FoldChange") - #counts_and_res = add_tags_column(pd.concat((counts_and_res, res), axis=1).assign(status=res.apply(set_de_status, axis=1)), input.tags_table, "small_type") - res = res.assign(status=res.apply(set_de_status, axis=1)) - # Converting gene IDs - ###################### - with open(OPJ(convert_dir, "wormid2cosmid.pickle"), "rb") as dict_file: - res = res.assign(cosmid=res.apply(column_converter(load(dict_file)), axis=1)) - with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file: - res = res.assign(name=res.apply(column_converter(load(dict_file)), axis=1)) - # Just to see if column_converter works also with named column, and not just index: - # with open(OPJ(convert_dir, "cosmid2name.pickle"), "rb") as dict_file: - # res = res.assign(name=res.apply(column_converter(load(dict_file), "cosmid"), axis=1)) - ########################################## - # res.to_csv(output.deseq_results, sep="\t", na_rep="NA", decimal=",") - res.to_csv(output.deseq_results, sep="\t", na_rep="NA") - # Joining counts and DESeq2 results in a same table and determining up- or down- regulation status - counts_and_res = counts_data - for normalizer in SIZE_FACTORS: - if normalizer == "median_ratio_to_pseudo_ref": - ## Adapted from DESeq paper (doi:10.1186/gb-2010-11-10-r106) but - ## add pseudo-count to compute the geometric mean, then remove it - #pseudo_ref = (counts_data + 1).apply(gmean, axis=1) - 1 - #def median_ratio_to_pseudo_ref(col): - # return (col / pseudo_ref).median() - #size_factors = counts_data.apply(median_ratio_to_pseudo_ref, axis=0) - size_factors = median_ratio_to_pseudo_ref_size_factors(counts_data) - else: - #raise NotImplementedError(f"{normalizer} normalization not implemented") - size_factors = summaries.loc[normalizer] - by_norm = counts_data / size_factors - by_norm.columns = by_norm.columns.map(lambda s: "%s_by_%s" % (s, normalizer)) - counts_and_res = pd.concat((counts_and_res, by_norm), axis=1) - #counts_and_res = add_tags_column(pd.concat((counts_and_res, res), axis=1).assign(status=res.apply(set_de_status, axis=1)), input.tags_table, "small_type") - counts_and_res = pd.concat((counts_and_res, res), axis=1) - counts_and_res.to_csv(output.counts_and_res, sep="\t", na_rep="NA") - # Saving lists of genes gaining or loosing siRNAs - # diff_expressed = res.query("padj < 0.05") - # up_genes = list(diff_expressed.query("log2FoldChange >= 1").index) - # down_genes = list(diff_expressed.query("log2FoldChange <= -1").index) - up_genes = list(counts_and_res.query(f"status in {UP_STATUSES}").index) - down_genes = list(counts_and_res.query(f"status in {DOWN_STATUSES}").index) - #up_genes = list(counts_and_res.query("status == 'up'").index) - #down_genes = list(counts_and_res.query("status == 'down'").index) - with open(output.up_genes, "w") as up_file: - if up_genes: - up_file.write("%s\n" % "\n".join(up_genes)) - else: - up_file.truncate(0) - with open(output.down_genes, "w") as down_file: - if down_genes: - down_file.write("%s\n" % "\n".join(down_genes)) - else: - down_file.truncate(0) + contrast = StrVector(["lib", cond, ref]) + try: + res, size_factors = do_deseq2(COND_NAMES, CONDITIONS, counts_data, formula=formula, contrast=contrast) + #except RRuntimeError as e: + except RuntimeError as e: + warn( + "Probably not enough usable data points to perform DESeq2 analyses:\n%s\nSkipping %s_%s_%s" % ( + str(e), wildcards.contrast, wildcards.orientation, wildcards.biotype)) + for outfile in output: + shell(f"echo 'NA' > {outfile}") + else: + # Determining fold-change category + ################################### + set_de_status = status_setter(LFC_CUTOFFS, "log2FoldChange") + #counts_and_res = add_tags_column(pd.concat((counts_and_res, res), axis=1).assign(status=res.apply(set_de_status, axis=1)), input.tags_table, "small_type") + res = res.assign(status=res.apply(set_de_status, axis=1)) + # Converting gene IDs + ###################### + with open(OPJ(convert_dir, "wormid2cosmid.pickle"), "rb") as dict_file: + res = res.assign(cosmid=res.apply(column_converter(load(dict_file)), axis=1)) + with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file: + res = res.assign(name=res.apply(column_converter(load(dict_file)), axis=1)) + # Just to see if column_converter works also with named column, and not just index: + # with open(OPJ(convert_dir, "cosmid2name.pickle"), "rb") as dict_file: + # res = res.assign(name=res.apply(column_converter(load(dict_file), "cosmid"), axis=1)) + ########################################## + # res.to_csv(output.deseq_results, sep="\t", na_rep="NA", decimal=",") + res.to_csv(output.deseq_results, sep="\t", na_rep="NA") + # Joining counts and DESeq2 results in a same table and determining up- or down- regulation status + counts_and_res = counts_data + for normalizer in SIZE_FACTORS: + if normalizer == "median_ratio_to_pseudo_ref": + ## Adapted from DESeq paper (doi:10.1186/gb-2010-11-10-r106) but + ## add pseudo-count to compute the geometric mean, then remove it + #pseudo_ref = (counts_data + 1).apply(gmean, axis=1) - 1 + #def median_ratio_to_pseudo_ref(col): + # return (col / pseudo_ref).median() + #size_factors = counts_data.apply(median_ratio_to_pseudo_ref, axis=0) + size_factors = median_ratio_to_pseudo_ref_size_factors(counts_data) + else: + #raise NotImplementedError(f"{normalizer} normalization not implemented") + size_factors = summaries.loc[normalizer] + by_norm = counts_data / size_factors + by_norm.columns = by_norm.columns.map(lambda s: "%s_by_%s" % (s, normalizer)) + counts_and_res = pd.concat((counts_and_res, by_norm), axis=1) + #counts_and_res = add_tags_column(pd.concat((counts_and_res, res), axis=1).assign(status=res.apply(set_de_status, axis=1)), input.tags_table, "small_type") + counts_and_res = pd.concat((counts_and_res, res), axis=1) + counts_and_res.to_csv(output.counts_and_res, sep="\t", na_rep="NA") + # Saving lists of genes gaining or loosing siRNAs + # diff_expressed = res.query("padj < 0.05") + # up_genes = list(diff_expressed.query("log2FoldChange >= 1").index) + # down_genes = list(diff_expressed.query("log2FoldChange <= -1").index) + up_genes = list(counts_and_res.query(f"status in {UP_STATUSES}").index) + down_genes = list(counts_and_res.query(f"status in {DOWN_STATUSES}").index) + #up_genes = list(counts_and_res.query("status == 'up'").index) + #down_genes = list(counts_and_res.query("status == 'down'").index) + with open(output.up_genes, "w") as up_file: + if up_genes: + up_file.write("%s\n" % "\n".join(up_genes)) + else: + up_file.truncate(0) + with open(output.down_genes, "w") as down_file: + if down_genes: + down_file.write("%s\n" % "\n".join(down_genes)) + else: + down_file.truncate(0) rule make_lfc_distrib_plot: @@ -1910,6 +1913,7 @@ def source_fold_data(wildcards): OPJ(aligner, f"mapped_{genome}", "{{counter}}", "deseq2_{{mapped_type}}", "{contrast}", "{{orientation}}_{{biotype}}", "{contrast}_counts_and_res.txt"), + # OPJ("hisat2/mapped_on_C_elegans/feature_count/deseq2_on_C_elegans/{contrast}/rev_DNA_transposons_rmsk_families/{contrast}_counts_and_res.txt"), contrast=contrasts_dict[wildcards.contrast_type])] else: return rules.differential_expression.output.counts_and_res @@ -1941,7 +1945,10 @@ rule make_contrast_lfc_boxplots: boxplots = OPJ(aligner, f"mapped_{genome}", "{counter}", "fold_boxplots_{mapped_type}", "{contrast_type}_{orientation}_{biotype}_{fold_type}_{id_list}_boxplots.pdf"), log: - warnings = OPJ(log_dir, "make_contrast_lfc_boxplots", aligner, f"mapped_{genome}", "{counter}", "fold_boxplots_{mapped_type}", "{contrast_type}_{orientation}_{biotype}_{fold_type}_{id_list}.warnings"), + warnings = OPJ( + log_dir, "make_contrast_lfc_boxplots", aligner, + f"mapped_{genome}", "{counter}", "fold_boxplots_{mapped_type}", + "{contrast_type}_{orientation}_{biotype}_{fold_type}_{id_list}.warnings"), params: id_lists = set_id_lists, run: @@ -1952,7 +1959,32 @@ rule make_contrast_lfc_boxplots: # print("Reading fold changes from:", *input.data, sep="\n") lfcs_dict = {} for (contrast, filename) in zip(contrasts_dict[wildcards.contrast_type], input.data): - lfc_data = pd.read_table(filename, index_col="gene") + # Input files may come from a failed DESeq2 analysis + if test_na_file(filename): + warn( + "No %s results for %s_%s_%s_%s. Making dummy output." % ( + wildcards.fold_type, + wildcards.mapped_type, + contrast, + wildcards.orientation, + wildcards.biotype)) + # Should we make an empty lfc_data DataFrame instead? + # Make the file empty + open(output.boxplots, "w").close() + break + try: + lfc_data = pd.read_table(filename, index_col="gene") + except TypeError as err: + # if str(err) == "unsupported operand type(s) for -: 'str' and 'int'": + # warn(str(err)) + # warn("Generating empty file.\n") + # # Make the file empty + # open(output.boxplots, "w").close() + # break + warn( + "Unexpected TypeError:\n{str(err)}\n" + "This occurred while parsing {filename}.") + raise #print(type(lfc_data)) for (list_name, id_list) in params.id_lists.items(): try: @@ -1968,31 +2000,33 @@ rule make_contrast_lfc_boxplots: #print("column has type:", type(selected_data)) #print(selected_data) lfcs_dict[f"{contrast}_{list_name}"] = selected_data - lfcs = pd.DataFrame(lfcs_dict) - # lfcs = pd.DataFrame( - # {f"{contrast}_{list_name}" : pd.read_table(filename, index_col="gene").loc[ - # set(id_list)][wildcard.fold_type] for ( - # contrast, filename) in zip(contrasts_dict[wildcards.contrast_type], input.data) for ( - # list_name, id_list) in params.id_lists.items()}) - title = f"{wildcards.orientation} {wildcards.biotype} {wildcards.mapped_type} {wildcards.fold_type} folds for {wildcards.contrast_type} contrasts" - try: - save_plot( - output.boxplots, - plot_boxplots, - lfcs, - wildcards.fold_type, - title=title) - except TypeError as err: - if str(err) == "Empty 'DataFrame': no numeric data to plot": - warn("\n".join([ - "Got TypeError:", - f"{str(err)}", - f"No data to plot for {title}\n"])) - warn("Generating empty file.\n") - # Make the file empty - open(output.boxplots, "w").close() - else: - raise + else: + # No break encountered in the loop + lfcs = pd.DataFrame(lfcs_dict) + # lfcs = pd.DataFrame( + # {f"{contrast}_{list_name}" : pd.read_table(filename, index_col="gene").loc[ + # set(id_list)][wildcard.fold_type] for ( + # contrast, filename) in zip(contrasts_dict[wildcards.contrast_type], input.data) for ( + # list_name, id_list) in params.id_lists.items()}) + title = f"{wildcards.orientation} {wildcards.biotype} {wildcards.mapped_type} {wildcards.fold_type} folds for {wildcards.contrast_type} contrasts" + try: + save_plot( + output.boxplots, + plot_boxplots, + lfcs, + wildcards.fold_type, + title=title) + except TypeError as err: + if str(err) == "Empty 'DataFrame': no numeric data to plot": + warn("\n".join([ + "Got TypeError:", + f"{str(err)}", + f"No data to plot for {title}\n"])) + warn("Generating empty file.\n") + # Make the file empty + open(output.boxplots, "w").close() + else: + raise #ax = res[res["status"] == "NS"].plot.scatter(x="baseMean", y="lfcMLE", s=2, logx=True, c="grey", label="NS") #cutoffs = [-c for c in reversed(lfc_cutoffs)] + [0] + lfc_cutoffs