diff --git a/small_RNA-seq/small_RNA-seq.snakefile b/small_RNA-seq/small_RNA-seq.snakefile index b45f3bbf1aecc87cdf844fa5e244cbdc4142f51d..b4afa0f60c6661de966535cf840fd0e3b9499ef5 100644 --- a/small_RNA-seq/small_RNA-seq.snakefile +++ b/small_RNA-seq/small_RNA-seq.snakefile @@ -610,9 +610,20 @@ de_fold_heatmaps = [] ip_fold_boxplots = [] # Temporary, until used for boxplots: remapped_folds = [] +remapped_fold_boxplots = [] if contrasts_dict["ip"]: + remapped_fold_boxplots = expand( + OPJ(output_dir, "figures", "{contrast}", "by_biotypes", + "{contrast}_{read_type}_on_%s_{biotypes}_{orientation}_transcript_{fold_type}_{gene_list}_boxplots.{fig_format}" % genome), + contrast=IP_CONTRASTS, + read_type=READ_TYPES_FOR_COUNTING, + # TODO: Read this from config file + biotypes=["protein_coding_5UTR_and_protein_coding_CDS_and_protein_coding_3UTR"], + orientation=ORIENTATIONS, + fold_type=["mean_log2_RPM_fold"], + gene_list=BOXPLOT_GENE_LISTS, fig_format=FIG_FORMATS) remapped_folds = expand( - OPJ(mapping_dir, f"RPM_folds_{size_selected}", "all", "{counted_type}_mean_log2_RPM_fold.txt"), + OPJ(mapping_dir, f"RPM_folds_{size_selected}", "all", "remapped", "{counted_type}_mean_log2_RPM_fold.txt"), counted_type=REMAPPED_COUNTED), # snakemake -n OK # expand( @@ -732,6 +743,7 @@ rule all: exploratory_graphs, remapped_folds, fold_boxplots, + remapped_fold_boxplots, #expand(OPJ(output_dir, "figures", "{small_type}_unit_clustermap.{fig_format}"), small_type=SMALL_TYPES, fig_format=FIG_FORMATS), #expand(OPJ( # feature_counts_dir, @@ -1834,7 +1846,7 @@ rule compute_remapped_RPM_folds: output: fold_results = OPJ( mapping_dir, f"RPM_folds_{size_selected}", - "{contrast}", "{contrast}_{read_type}_on_%s_{biotype}_{orientation}_{feature_type}_RPM_folds.txt" % genome), + "{contrast}", "remapped", "{contrast}_{read_type}_on_%s_{biotype}_{orientation}_{feature_type}_RPM_folds.txt" % genome), log: log = OPJ(log_dir, "compute_RPM_folds", "{contrast}_{read_type}_on_%s_{biotype}_{orientation}_transcript.log" % genome), run: @@ -1886,10 +1898,10 @@ rule gather_remapped_RPM_folds: input: fold_results = expand(OPJ( mapping_dir, f"RPM_folds_{size_selected}", - "{contrast}", "{contrast}_{{read_type}}_on_%s_{{biotype}}_{{orientation}}_transcript_RPM_folds.txt" % genome), contrast=IP_CONTRASTS), + "{contrast}", "remapped", "{contrast}_{{read_type}}_on_%s_{{biotype}}_{{orientation}}_transcript_RPM_folds.txt" % genome), contrast=IP_CONTRASTS), output: all_folds = OPJ( - mapping_dir, f"RPM_folds_{size_selected}", "all", "{read_type}_on_%s_{biotype}_{orientation}_transcript_mean_log2_RPM_fold.txt" % genome), + mapping_dir, f"RPM_folds_{size_selected}", "all", "remapped", "{read_type}_on_%s_{biotype}_{orientation}_transcript_mean_log2_RPM_fold.txt" % genome), log: log = OPJ(log_dir, "gather_RPM_folds", "{read_type}_on_%s_{biotype}_{orientation}_transcript.log" % genome), run: @@ -1897,7 +1909,7 @@ rule gather_remapped_RPM_folds: # OPJ(mapping_dir, f"RPM_folds_{size_selected}", # f"{contrast}", f"{contrast}_{wildcards.counted_type}_RPM_folds.txt"), OPJ(mapping_dir, f"RPM_folds_{size_selected}", - f"{contrast}", f"{contrast}_{wildcards.read_type}_on_{genome}_{wildcards.biotype}_{wildcards.orientation}_transcript_RPM_folds.txt"), + f"{contrast}", "remapped", f"{contrast}_{wildcards.read_type}_on_{genome}_{wildcards.biotype}_{wildcards.orientation}_transcript_RPM_folds.txt"), index_col=["gene", "cosmid", "name"], usecols=["gene", "cosmid", "name", "mean_log2_RPM_fold"])["mean_log2_RPM_fold"] for contrast in IP_CONTRASTS}).to_csv( output.all_folds, sep="\t") @@ -1905,7 +1917,9 @@ rule gather_remapped_RPM_folds: # TODO add remapped_RPM_folds def source_gathered_folds(wildcards): - if wildcards.fold_type == "mean_log2_RPM_fold": + if hasattr(wildcards, "counted_type"): + return rules.gather_remapped_RPM_folds.output.all_folds + elif wildcards.fold_type == "mean_log2_RPM_fold": return rules.gather_RPM_folds.output.all_folds else: return rules.gather_DE_folds.output.all_folds @@ -3340,6 +3354,7 @@ def source_fold_data(wildcards): return rules.small_RNA_differential_expression.output.counts_and_res elif fold_type == "mean_log2_RPM_fold": if hasattr(wildcards, "contrast_type"): + # We want all contrasts of this type. #https://stackoverflow.com/a/26791923/1878788 #print("{0.small_type}".format(wildcards)) return [filename.format(wildcards) for filename in expand( @@ -3347,6 +3362,13 @@ def source_fold_data(wildcards): "RPM_folds_%s" % size_selected, "{contrast}", "{contrast}_{{0.small_type}}_RPM_folds.txt"), contrast=contrasts_dict[wildcards.contrast_type])] + elif hasattr(wildcards, "biotypes"): + return [filename.format(wildcards) for filename in expand( + OPJ(mapping_dir, + "RPM_folds_%s" % size_selected, + "{{0.contrast}}", "remapped", + "{{0.contrast}}_{{0.read_type}}_on_%s_{biotype}_{{0.orientation}}_transcript_RPM_folds.txt" % genome), + biotype=wildcards.biotypes.split("_and_"))] else: return rules.compute_RPM_folds.output.fold_results else: @@ -3382,6 +3404,7 @@ rule make_gene_list_lfc_boxplots: rule make_contrast_lfc_boxplots: + """For a given gene list, make one box for each contrast.""" input: data = source_fold_data, output: @@ -3406,6 +3429,31 @@ rule make_contrast_lfc_boxplots: title=f"{wildcards.small_type} folds for {wildcards.contrast_type} contrasts") +rule make_biotype_lfc_boxplots: + """For a given gene list, make one box for each contrast.""" + input: + data = source_fold_data, + output: + boxplots = OPJ(output_dir, "figures", "{contrast}", "by_biotypes", + "{contrast}_{read_type}_on_%s_{biotypes}_{orientation}_transcript_{fold_type}_{gene_list}_boxplots.{fig_format}" % genome) + params: + id_lists = set_id_lists, + #shell: + # """touch {output.boxplots}""" + run: + lfcs = pd.DataFrame( + {f"{biotype}_{list_name}" : pd.read_table(filename, index_col="gene").loc[ + set(id_list)]["mean_log2_RPM_fold"] for ( + biotype, filename) in zip(wildcards.biotypes.split("_and_"), input.data) for ( + list_name, id_list) in params.id_lists.items()}) + save_plot( + output.boxplots, + plot_boxplots, + lfcs, + wildcards.fold_type, + title=f"{wildcards.read_type} folds for {wildcards.biotypes} contrasts") + + ########################## # Read sequence analyses # ##########################