diff --git a/small_RNA-seq/small_RNA-seq.snakefile b/small_RNA-seq/small_RNA-seq.snakefile index 8d65830c1b571c87ef0f09e881d2d2f095a2a2f3..1be3459772c0f7185ddce39605a35140f4878f1d 100644 --- a/small_RNA-seq/small_RNA-seq.snakefile +++ b/small_RNA-seq/small_RNA-seq.snakefile @@ -517,6 +517,10 @@ TESTED_SIZE_FACTORS = ["mapped", "non_structural", "all_sisiuRNA", "piRNA", "miR DE_SIZE_FACTORS = ["non_structural", "median_ratio_to_pseudo_ref"] SIZE_FACTORS = ["non_structural", "piRNA"] #NORMALIZER = "median_ratio_to_pseudo_ref" +RPM_NORMS = config.get("rpm_norms", SIZE_FACTORS) +RPM_FOLD_TYPES = [ + f"mean_log2_RPM_by_{norm}_fold" + for norm in RPM_NORMS] # For metagene analyses #META_MARGIN = 300 @@ -646,7 +650,7 @@ wildcard_constraints: norm="|".join(TESTED_SIZE_FACTORS), lib_group="|".join(ALL_LIB_GROUPS), group_type="|".join(GROUP_TYPES), - fold_type="|".join(["mean_log2_RPM_fold", "log2FoldChange", "lfcMLE"]), + fold_type="|".join(["log2FoldChange", "lfcMLE", *RPM_FOLD_TYPES]), feature_type="|".join(["transcript", "exon"]) #ruleorder: map_on_genome > sam2indexedbam > compute_coverage > remap_on_genome > resam2indexedbam > recompute_coverage @@ -771,11 +775,14 @@ if contrasts_dict["ip"]: # TODO: Read this from config file biotypes=["_and_".join(BOXPLOT_BIOTYPES)], orientation=ORIENTATIONS, - fold_type=["mean_log2_RPM_fold"], + fold_type=RPM_FOLD_TYPES, gene_list=BOXPLOT_GENE_LISTS) + #remapped_folds = expand( + # OPJ(mapping_dir, f"RPM_folds_{size_selected}", "all", "remapped", "{counted_type}_mean_log2_RPM_fold.txt"), + # counted_type=REMAPPED_COUNTED) remapped_folds = expand( - OPJ(mapping_dir, f"RPM_folds_{size_selected}", "all", "remapped", "{counted_type}_mean_log2_RPM_fold.txt"), - counted_type=REMAPPED_COUNTED) + OPJ(mapping_dir, "RPM_by_{norm}_folds_%s" % size_selected, "all", "remapped", "{counted_type}_mean_log2_RPM_by_{norm}_fold.txt"), + counted_type=REMAPPED_COUNTED, norm=RPM_NORMS) # Should be generated as a dependency of the "all" contrast: ## TODO: check that this is generated # remapped_folds += expand( @@ -799,11 +806,11 @@ if contrasts_dict["ip"]: # contrast=IP_CONTRASTS, small_type=READ_TYPES_FOR_COUNTING, mapping_type=[f"on_{genome}"], biotype=COUNT_BIOTYPES, orientation=ORIENTATIONS), ip_fold_heatmaps = expand( OPJ("figures", "fold_heatmaps", "{small_type}_{fold_type}_heatmap.pdf"), - small_type=IP_TYPES, fold_type=["mean_log2_RPM_fold"]) + small_type=IP_TYPES, fold_type=RPM_FOLD_TYPES) ip_fold_boxplots = expand( OPJ("figures", "all_{contrast_type}", "{contrast_type}_{small_type}_{fold_type}_{gene_list}_boxplots.pdf"), - contrast_type=["ip"], small_type=IP_TYPES, fold_type=["mean_log2_RPM_fold"], + contrast_type=["ip"], small_type=IP_TYPES, fold_type=RPM_FOLD_TYPES, gene_list=BOXPLOT_GENE_LISTS) # Subdivided by biotype (to enable distinction between 5'UTR, CDS and 3'UTR) ## TODO: activate this? @@ -848,20 +855,23 @@ if DE_CONTRASTS: de_fold_boxplots = expand( OPJ("figures", "{contrast}", "{contrast}_{small_type}_{fold_type}_{gene_list}_boxplots.pdf"), - contrast=DE_CONTRASTS, small_type=DE_TYPES, fold_type=["log2FoldChange", "mean_log2_RPM_fold"], + contrast=DE_CONTRASTS, small_type=DE_TYPES, fold_type=["log2FoldChange", *RPM_FOLD_TYPES], gene_list=["all_gene_lists"]) else: de_fold_boxplots = [] if IP_CONTRASTS: - all_contrasts_folds = [ - OPJ(mapping_dir, f"RPM_folds_{size_selected}", "all", f"pimi{SI_MIN}G_mean_log2_RPM_fold.txt"), - OPJ(mapping_dir, f"RPM_folds_{size_selected}", "all", f"pimi{SI_MIN}GtRF_mean_log2_RPM_fold.txt"), - # To have RPM (folds) for transgenes (which are not in prot_si category) - OPJ(mapping_dir, f"RPM_folds_{size_selected}", "all", f"all_si_{SI_MIN}G_mean_log2_RPM_fold.txt")] + # 08/06/2023: expand by norm in both dir and file name + all_contrasts_folds = [] + for norm in RPM_NORMS: + all_contrasts_folds.extend([ + OPJ(mapping_dir, f"RPM_by_{norm}_folds_{size_selected}", "all", f"pimi{SI_MIN}G_mean_log2_RPM_by_{norm}_fold.txt"), + OPJ(mapping_dir, f"RPM_by_{norm}_folds_{size_selected}", "all", f"pimi{SI_MIN}GtRF_mean_log2_RPM_by_{norm}_fold.txt"), + # To have RPM (folds) for transgenes (which are not in prot_si category) + OPJ(mapping_dir, f"RPM_by_{norm}_folds_{size_selected}", "all", f"all_si_{SI_MIN}G_mean_log2_RPM_by_{norm}_fold.txt")]) ip_fold_boxplots_by_contrast = expand( OPJ("figures", "{contrast}", "{contrast}_{small_type}_{fold_type}_{gene_list}_boxplots.pdf"), - contrast=IP_CONTRASTS, small_type=IP_TYPES, fold_type=["mean_log2_RPM_fold"], + contrast=IP_CONTRASTS, small_type=IP_TYPES, fold_type=RPM_FOLD_TYPES, gene_list=["all_gene_lists"]) else: all_contrasts_folds = [] @@ -2299,6 +2309,34 @@ rule gather_read_counts_summaries: # repeat_bed_2_lengths(input.simrep_bed))).to_csv(output.exon_lengths, sep="\t") +if False: + rule compute_RPM: + input: + counts_table = source_small_RNA_counts, + summary_table = rules.gather_read_counts_summaries.output.summary_table, + tags_table = rules.associate_small_type.output.tags_table, + output: + RPM_table = OPJ( + annot_counts_dir, + "all_{mapped_type}_on_%s" % genome, "{small_type}_RPM.txt"), + log: + log = OPJ(log_dir, "compute_RPM_{mapped_type}", "{small_type}.log"), + benchmark: + OPJ(log_dir, "compute_RPM_{mapped_type}", "{small_type}_benchmark.txt"), + run: + with open(log.log, "w") as logfile: + logfile.write(f"Reading column counts from {input.counts_table}\n") + counts_data = pd.read_table( + input.counts_table, + index_col="gene") + logfile.write(f"Reading number of non-structural mappers from {input.summary_table}\n") + norm = pd.read_table(input.summary_table, index_col=0).loc["non_structural"] + logfile.write(str(norm)) + logfile.write("Computing counts per million non-structural mappers\n") + RPM = 1000000 * counts_data / norm + add_tags_column(RPM, input.tags_table, "small_type").to_csv(output.RPM_table, sep="\t") + + rule compute_RPM: input: counts_table = source_small_RNA_counts, @@ -2307,25 +2345,110 @@ rule compute_RPM: output: RPM_table = OPJ( annot_counts_dir, - "all_{mapped_type}_on_%s" % genome, "{small_type}_RPM.txt"), + "all_{mapped_type}_on_%s" % genome, "{small_type}_RPM_by_{norm}.txt"), log: - log = OPJ(log_dir, "compute_RPM_{mapped_type}", "{small_type}.log"), + log = OPJ(log_dir, "compute_RPM_by_{norm}_{mapped_type}", "{small_type}.log"), benchmark: - OPJ(log_dir, "compute_RPM_{mapped_type}", "{small_type}_benchmark.txt"), + OPJ(log_dir, "compute_RPM_by_{norm}_{mapped_type}", "{small_type}_benchmark.txt"), run: with open(log.log, "w") as logfile: logfile.write(f"Reading column counts from {input.counts_table}\n") counts_data = pd.read_table( input.counts_table, index_col="gene") - logfile.write(f"Reading number of non-structural mappers from {input.summary_table}\n") - norm = pd.read_table(input.summary_table, index_col=0).loc["non_structural"] + logfile.write(f"Reading number of {wildcards.norm} from {input.summary_table}\n") + norm = pd.read_table(input.summary_table, index_col=0).loc[wildcards.norm] logfile.write(str(norm)) - logfile.write("Computing counts per million non-structural mappers\n") + logfile.write(f"Computing counts per million {wildcards.norm}\n") RPM = 1000000 * counts_data / norm add_tags_column(RPM, input.tags_table, "small_type").to_csv(output.RPM_table, sep="\t") +if False: + rule compute_RPM_folds: + input: + counts_table = source_small_RNA_counts, + #exon_lengths = rules.compute_genes_exon_lengths.output.exon_lengths, + summary_table = rules.gather_read_counts_summaries.output.summary_table, + tags_table = rules.associate_small_type.output.tags_table, + output: + fold_results = OPJ( + mapping_dir, f"RPM_folds_{size_selected}", + "{contrast}", "{contrast}_{small_type}_RPM_folds.txt"), + log: + log = OPJ(log_dir, "compute_RPM_folds", "{contrast}_{small_type}.log"), + benchmark: + OPJ(log_dir, "compute_RPM_folds", "{contrast}_{small_type}_benchmark.txt"), + run: + with open(log.log, "w") as logfile: + (cond, ref) = CONTRAST2PAIR[wildcards.contrast] + logfile.write(f"cond: {cond}, ref: {ref}\n") + column_list = expand("{lib}_{rep}", lib=[ref, cond], rep=REPS) + logfile.write(f"Reading columns {column_list} from {input.counts_table}\n") + counts_data = pd.read_table( + input.counts_table, + usecols=["gene", *column_list], + index_col="gene") + assert counts_data.index.name == "gene", f"Wrong index: {counts_data.index.name}" + # logfile.write(f"Columns in counts_data are: {counts_data.columns}") + # logfile.write(f"Index in counts_data is: {counts_data.index.name}") + logfile.write(f"Found {len(counts_data.index)} genes\n") + #logfile.write(f"Reading exon lengths from {exon_lengths_file}\n") + #exon_lengths = pd.read_table( + # input.exon_lengths_file, + # index_col="gene") + #logfile.write(f"Found {len(exon_lengths.index)} genes\n") + #logfile.write("Determining set of common genes\n") + #common = counts_data.index.intersection(exon_lengths.index) + #logfile.write(f"Found {len(common)} common genes\n") + #logfile.write("Computing counts per kilobase\n") + #RPK = 1000 * counts_data.loc[common].div(exon_lengths.loc[common]["union_exon_len"], axis="index") + logfile.write(f"Reading number of non-structural mappers from {input.summary_table}\n") + norm = pd.read_table(input.summary_table, index_col=0).loc["non_structural"][column_list] + logfile.write(str(norm)) + logfile.write("Computing counts per million non-structural mappers\n") + RPM = 1000000 * counts_data / norm + # logfile.write(f"Columns in RPM are: {RPM.columns}\n") + # logfile.write(f"Index in RPM is: {RPM.index.name}\n") + assert RPM.index.name == "gene", f"Wrong index: {RPM.index.name}" + logfile.write("Computing RPM log2 fold changes\n") + # Maybe not a very sensible way to compute the mean folds: it pairs replicates, but the pairing is arbitrary + # TODO: Should we compute the fold of the means instead? + # -> No: the pairing is not necessarily arbitrary. + # The condition pairs may come from the same biological line, for instance. + # Add 0.25 as RPM "pseudo-count" (edgeR does this ? https://www.biostars.org/p/102568/#102604) + lfc = np.log2(pd.DataFrame( + {f"log2({cond}_{rep}/{ref}_{rep})" : (RPM[f"{cond}_{rep}"] + 0.25) / (RPM[f"{ref}_{rep}"] + 0.25) for rep in REPS})) + # Compute the mean + lfc = lfc.assign(mean_log2_RPM_fold=lfc.mean(axis=1)) + # Converting gene IDs + ###################### + with open(OPJ(convert_dir, "wormid2cosmid.pickle"), "rb") as dict_file: + lfc = lfc.assign(cosmid=lfc.apply(column_converter(load(dict_file)), axis=1)) + #with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file: + # lfc = lfc.assign(name=lfc.apply(column_converter(load(dict_file)), axis=1)) + lfc = lfc.assign(name=lfc.apply(column_converter(wormid2name), axis=1)) + # logfile.write(f"Columns in lfc are: {lfc.columns}\n") + # logfile.write(f"Index in lfc is: {lfc.index.name}\n") + assert lfc.index.name == "gene", f"Wrong index: {lfc.index.name}" + logfile.write(f"Adding small read type info from {input.tags_table}\n") + #pd.concat((counts_data.loc[common], RPM, add_tags_column(lfc, input.tags_table, "small_type")), axis=1).to_csv(output.fold_results, sep="\t") + lfc_with_tags = add_tags_column(lfc, input.tags_table, "small_type", logfile) + logfile.write(f"Columns in lfc_with_tags are: {lfc_with_tags.columns}\n") + logfile.write(f"Index in lfc_with_tags is: {lfc_with_tags.index.name}\n") + lfc_idx = lfc.index + RPM_idx = RPM.index + lfc_with_tags_idx = lfc_with_tags.index + lfc_xor_RPM = lfc_idx.symmetric_difference(RPM_idx) + lfc_xor_lfc_with_tags = lfc_idx.symmetric_difference(lfc_with_tags_idx) + RPM_xor_lfc_with_tags = RPM_idx.symmetric_difference(lfc_with_tags_idx) + logfile.write(f"Index difference:\nlfc_xor_RPM: {lfc_xor_RPM}\nlfc_xor_lfc_with_tags: {lfc_xor_lfc_with_tags}\nRPM_xor_lfc_with_tags: {RPM_xor_lfc_with_tags}\n") + with_tags = pd.concat((RPM, lfc_with_tags), axis=1) + logfile.write(f"Columns in with_tags are: {with_tags.columns}\n") + logfile.write(f"Index in with_tags is: {with_tags.index.name}\n") + logfile.write(f"Then writing to {output.fold_results}\n") + with_tags.to_csv(output.fold_results, sep="\t") + rule compute_RPM_folds: input: counts_table = source_small_RNA_counts, @@ -2334,12 +2457,12 @@ rule compute_RPM_folds: tags_table = rules.associate_small_type.output.tags_table, output: fold_results = OPJ( - mapping_dir, f"RPM_folds_{size_selected}", - "{contrast}", "{contrast}_{small_type}_RPM_folds.txt"), + mapping_dir, "RPM_by_{norm}_folds_%s" % size_selected, + "{contrast}", "{contrast}_{small_type}_RPM_by_{norm}_folds.txt"), log: - log = OPJ(log_dir, "compute_RPM_folds", "{contrast}_{small_type}.log"), + log = OPJ(log_dir, "compute_RPM_by_{norm}_folds", "{contrast}_{small_type}.log"), benchmark: - OPJ(log_dir, "compute_RPM_folds", "{contrast}_{small_type}_benchmark.txt"), + OPJ(log_dir, "compute_RPM_by_{norm}_folds", "{contrast}_{small_type}_benchmark.txt"), run: with open(log.log, "w") as logfile: (cond, ref) = CONTRAST2PAIR[wildcards.contrast] @@ -2364,10 +2487,10 @@ rule compute_RPM_folds: #logfile.write(f"Found {len(common)} common genes\n") #logfile.write("Computing counts per kilobase\n") #RPK = 1000 * counts_data.loc[common].div(exon_lengths.loc[common]["union_exon_len"], axis="index") - logfile.write(f"Reading number of non-structural mappers from {input.summary_table}\n") - norm = pd.read_table(input.summary_table, index_col=0).loc["non_structural"][column_list] + logfile.write(f"Reading number of {wildcards.norm} from {input.summary_table}\n") + norm = pd.read_table(input.summary_table, index_col=0).loc[wildcards.norm][column_list] logfile.write(str(norm)) - logfile.write("Computing counts per million non-structural mappers\n") + logfile.write(f"Computing counts per million {wildcards.norm}\n") RPM = 1000000 * counts_data / norm # logfile.write(f"Columns in RPM are: {RPM.columns}\n") # logfile.write(f"Index in RPM is: {RPM.index.name}\n") @@ -2411,31 +2534,61 @@ rule compute_RPM_folds: with_tags.to_csv(output.fold_results, sep="\t") +if False: + rule gather_RPM_folds: + """Gathers RPM folds across contrasts.""" + input: + fold_results = expand(OPJ( + mapping_dir, f"RPM_folds_{size_selected}", + "{contrast}", "{contrast}_{{small_type}}_RPM_folds.txt"), contrast=IP_CONTRASTS), + output: + all_folds = OPJ( + mapping_dir, f"RPM_folds_{size_selected}", "all", "{small_type}_mean_log2_RPM_fold.txt"), + # wildcard_constraints: + # small_type="si|siu|sisiu|all_si|all_siu|all_sisiu|%s"% "|".join(SMALL_TYPES + JOINED_SMALL_TYPES), + log: + log = OPJ(log_dir, "gather_RPM_folds", "{small_type}.log"), + benchmark: + OPJ(log_dir, "gather_RPM_folds", "{small_type}_benchmark.txt"), + run: + with open(log.log, "w") as logfile: + logfile.write(f"Debug: input\n{input}\n") + actual_input = [ + OPJ(mapping_dir, f"RPM_folds_{size_selected}", + f"{contrast}", f"{contrast}_{wildcards.small_type}_RPM_folds.txt") for contrast in IP_CONTRASTS] + logfile.write(f"Gathering RPM folds from:\n{actual_input}\nShould be from:\n{input.fold_results}\n") + pd.DataFrame({contrast : pd.read_table( + OPJ(mapping_dir, f"RPM_folds_{size_selected}", + f"{contrast}", f"{contrast}_{wildcards.small_type}_RPM_folds.txt"), + index_col=["gene", "cosmid", "name", "small_type"], + usecols=["gene", "cosmid", "name", "small_type", "mean_log2_RPM_fold"])["mean_log2_RPM_fold"] for contrast in IP_CONTRASTS}).to_csv( + output.all_folds, sep="\t") + rule gather_RPM_folds: """Gathers RPM folds across contrasts.""" input: fold_results = expand(OPJ( - mapping_dir, f"RPM_folds_{size_selected}", - "{contrast}", "{contrast}_{{small_type}}_RPM_folds.txt"), contrast=IP_CONTRASTS), + mapping_dir, "RPM_by_{{norm}}_folds_%s" % size_selected, + "{contrast}", "{contrast}_{{small_type}}_RPM_by_{{norm}}_folds.txt"), contrast=IP_CONTRASTS), output: all_folds = OPJ( - mapping_dir, f"RPM_folds_{size_selected}", "all", "{small_type}_mean_log2_RPM_fold.txt"), + mapping_dir, "RPM_by_{norm}_folds_%s" % size_selected, "all", "{small_type}_mean_log2_RPM_by_{norm}_fold.txt"), # wildcard_constraints: # small_type="si|siu|sisiu|all_si|all_siu|all_sisiu|%s"% "|".join(SMALL_TYPES + JOINED_SMALL_TYPES), log: - log = OPJ(log_dir, "gather_RPM_folds", "{small_type}.log"), + log = OPJ(log_dir, "gather_RPM_by_{norm}_folds", "{small_type}.log"), benchmark: - OPJ(log_dir, "gather_RPM_folds", "{small_type}_benchmark.txt"), + OPJ(log_dir, "gather_RPM_by_{norm}_folds", "{small_type}_benchmark.txt"), run: with open(log.log, "w") as logfile: logfile.write(f"Debug: input\n{input}\n") actual_input = [ - OPJ(mapping_dir, f"RPM_folds_{size_selected}", - f"{contrast}", f"{contrast}_{wildcards.small_type}_RPM_folds.txt") for contrast in IP_CONTRASTS] + OPJ(mapping_dir, f"RPM_by_{wildcards.norm}_folds_{size_selected}", + f"{contrast}", f"{contrast}_{wildcards.small_type}_RPM_by_{wildcards.norm}_folds.txt") for contrast in IP_CONTRASTS] logfile.write(f"Gathering RPM folds from:\n{actual_input}\nShould be from:\n{input.fold_results}\n") pd.DataFrame({contrast : pd.read_table( - OPJ(mapping_dir, f"RPM_folds_{size_selected}", - f"{contrast}", f"{contrast}_{wildcards.small_type}_RPM_folds.txt"), + OPJ(mapping_dir, f"RPM_by_{wildcards.norm}_folds_{size_selected}", + f"{contrast}", f"{contrast}_{wildcards.small_type}_RPM_by_{wildcards.norm}_folds.txt"), index_col=["gene", "cosmid", "name", "small_type"], usecols=["gene", "cosmid", "name", "small_type", "mean_log2_RPM_fold"])["mean_log2_RPM_fold"] for contrast in IP_CONTRASTS}).to_csv( output.all_folds, sep="\t") @@ -2445,6 +2598,67 @@ rule gather_RPM_folds: # TODO: replace counted_type with all subwildcards. Why does it work with cond_name? # Wildcards in input files cannot be determined from output files: # 'read_type' +if False: + rule compute_remapped_RPM_folds: + input: + counts_table = source_small_RNA_counts, + # bowtie2/mapped_C_elegans_klp7co/feature_count/all_18-26_and_nomap_siRNA_on_C_elegans_klp7co_genes_fwd_exon_counts.txt + #counts_table = rules.gather_feature_counts.output.counts_table, + #exon_lengths = rules.compute_genes_exon_lengths.output.exon_lengths, + summary_table = rules.gather_read_counts_summaries.output.summary_table, + #tags_table = rules.associate_small_type.output.tags_table, + output: + fold_results = OPJ( + mapping_dir, f"RPM_folds_{size_selected}", + "{contrast}", "remapped", "{contrast}_{read_type}_{mapping_type}_{biotype}_{orientation}_{feature_type}_RPM_folds.txt"), + log: + log = OPJ(log_dir, "compute_RPM_folds", "{contrast}_{read_type}_{mapping_type}_{biotype}_{orientation}_{feature_type}.log"), + benchmark: + OPJ(log_dir, "compute_RPM_folds", "{contrast}_{read_type}_{mapping_type}_{biotype}_{orientation}_{feature_type}_benchmark.txt"), + run: + with open(log.log, "w") as logfile: + (cond, ref) = CONTRAST2PAIR[wildcards.contrast] + logfile.write(f"cond: {cond}, ref: {ref}\n") + column_list = expand("{lib}_{rep}", lib=[ref, cond], rep=REPS) + logfile.write(f"Reading columns {column_list} from {input.counts_table}\n") + counts_data = pd.read_table( + input.counts_table, + usecols=["gene", *column_list], + index_col="gene") + logfile.write(f"Found {len(counts_data.index)} genes\n") + #logfile.write(f"Reading exon lengths from {exon_lengths_file}\n") + #exon_lengths = pd.read_table( + # input.exon_lengths_file, + # index_col="gene") + #logfile.write(f"Found {len(exon_lengths.index)} genes\n") + #logfile.write("Determining set of common genes\n") + #common = counts_data.index.intersection(exon_lengths.index) + #logfile.write(f"Found {len(common)} common genes\n") + #logfile.write("Computing counts per kilobase\n") + #RPK = 1000 * counts_data.loc[common].div(exon_lengths.loc[common]["union_exon_len"], axis="index") + logfile.write(f"Reading number of non-structural mappers from {input.summary_table}\n") + norm = pd.read_table(input.summary_table, index_col=0).loc["non_structural"][column_list] + logfile.write(str(norm)) + logfile.write("Computing counts per million non-structural mappers\n") + RPM = 1000000 * counts_data / norm + logfile.write("Computing RPM log2 fold changes\n") + # Add 0.25 as RPM "pseudo-count" (edgeR does this ? https://www.biostars.org/p/102568/#102604) + lfc = np.log2(pd.DataFrame( + {f"log2({cond}_{rep}/{ref}_{rep})" : (RPM[f"{cond}_{rep}"] + 0.25) / (RPM[f"{ref}_{rep}"] + 0.25) for rep in REPS})) + # Compute the mean + lfc = lfc.assign(mean_log2_RPM_fold=lfc.mean(axis=1)) + # Converting gene IDs + ###################### + with open(OPJ(convert_dir, "wormid2cosmid.pickle"), "rb") as dict_file: + lfc = lfc.assign(cosmid=lfc.apply(column_converter(load(dict_file)), axis=1)) + #with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file: + # lfc = lfc.assign(name=lfc.apply(column_converter(load(dict_file)), axis=1)) + lfc = lfc.assign(name=lfc.apply(column_converter(wormid2name), axis=1)) + #logfile.write("Adding small read type info\n") + ##pd.concat((counts_data.loc[common], RPM, add_tags_column(lfc, input.tags_table, "small_type")), axis=1).to_csv(output.fold_results, sep="\t") + #pd.concat((RPM, add_tags_column(lfc, input.tags_table, "small_type")), axis=1).to_csv(output.fold_results, sep="\t") + pd.concat((RPM, lfc), axis=1).to_csv(output.fold_results, sep="\t") + rule compute_remapped_RPM_folds: input: counts_table = source_small_RNA_counts, @@ -2455,12 +2669,12 @@ rule compute_remapped_RPM_folds: #tags_table = rules.associate_small_type.output.tags_table, output: fold_results = OPJ( - mapping_dir, f"RPM_folds_{size_selected}", - "{contrast}", "remapped", "{contrast}_{read_type}_{mapping_type}_{biotype}_{orientation}_{feature_type}_RPM_folds.txt"), + mapping_dir, "RPM_by_{norm}_folds_%s" % size_selected, + "{contrast}", "remapped", "{contrast}_{read_type}_{mapping_type}_{biotype}_{orientation}_{feature_type}_RPM_by_{norm}_folds.txt"), log: - log = OPJ(log_dir, "compute_RPM_folds", "{contrast}_{read_type}_{mapping_type}_{biotype}_{orientation}_{feature_type}.log"), + log = OPJ(log_dir, "compute_RPM_by_{norm}_folds", "{contrast}_{read_type}_{mapping_type}_{biotype}_{orientation}_{feature_type}.log"), benchmark: - OPJ(log_dir, "compute_RPM_folds", "{contrast}_{read_type}_{mapping_type}_{biotype}_{orientation}_{feature_type}_benchmark.txt"), + OPJ(log_dir, "compute_RPM_by_{norm}_folds", "{contrast}_{read_type}_{mapping_type}_{biotype}_{orientation}_{feature_type}_benchmark.txt"), run: with open(log.log, "w") as logfile: (cond, ref) = CONTRAST2PAIR[wildcards.contrast] @@ -2482,10 +2696,10 @@ rule compute_remapped_RPM_folds: #logfile.write(f"Found {len(common)} common genes\n") #logfile.write("Computing counts per kilobase\n") #RPK = 1000 * counts_data.loc[common].div(exon_lengths.loc[common]["union_exon_len"], axis="index") - logfile.write(f"Reading number of non-structural mappers from {input.summary_table}\n") - norm = pd.read_table(input.summary_table, index_col=0).loc["non_structural"][column_list] + logfile.write(f"Reading number of {wildcards.norm} from {input.summary_table}\n") + norm = pd.read_table(input.summary_table, index_col=0).loc[wildcards.norm][column_list] logfile.write(str(norm)) - logfile.write("Computing counts per million non-structural mappers\n") + logfile.write(f"Computing counts per million {wildcards.norm}\n") RPM = 1000000 * counts_data / norm logfile.write("Computing RPM log2 fold changes\n") # Add 0.25 as RPM "pseudo-count" (edgeR does this ? https://www.biostars.org/p/102568/#102604) @@ -2506,25 +2720,49 @@ rule compute_remapped_RPM_folds: pd.concat((RPM, lfc), axis=1).to_csv(output.fold_results, sep="\t") +if False: + rule gather_remapped_RPM_folds: + """Gathers RPM folds across contrasts.""" + input: + fold_results = expand(OPJ( + mapping_dir, f"RPM_folds_{size_selected}", + "{contrast}", "remapped", "{contrast}_{{read_type}}_{{mapping_type}}_{{biotype}}_{{orientation}}_{{feature_type}}_RPM_folds.txt"), contrast=IP_CONTRASTS), + output: + all_folds = OPJ( + mapping_dir, f"RPM_folds_{size_selected}", "all", "remapped", "{read_type}_{mapping_type}_{biotype}_{orientation}_{feature_type}_mean_log2_RPM_fold.txt"), + log: + log = OPJ(log_dir, "gather_RPM_folds", "{read_type}_{mapping_type}_{biotype}_{orientation}_{feature_type}.log"), + benchmark: + OPJ(log_dir, "gather_RPM_folds", "{read_type}_{mapping_type}_{biotype}_{orientation}_{feature_type}_benchmark.txt"), + run: + pd.DataFrame({contrast : pd.read_table( + # OPJ(mapping_dir, f"RPM_folds_{size_selected}", + # f"{contrast}", f"{contrast}_{wildcards.counted_type}_RPM_folds.txt"), + OPJ(mapping_dir, f"RPM_folds_{size_selected}", + f"{contrast}", "remapped", f"{contrast}_{wildcards.read_type}_{wildcards.mapping_type}_{wildcards.biotype}_{wildcards.orientation}_{wildcards.feature_type}_RPM_folds.txt"), + index_col=["gene", "cosmid", "name"], + usecols=["gene", "cosmid", "name", "mean_log2_RPM_fold"])["mean_log2_RPM_fold"] for contrast in IP_CONTRASTS}).to_csv( + output.all_folds, sep="\t") + rule gather_remapped_RPM_folds: """Gathers RPM folds across contrasts.""" input: fold_results = expand(OPJ( - mapping_dir, f"RPM_folds_{size_selected}", - "{contrast}", "remapped", "{contrast}_{{read_type}}_{{mapping_type}}_{{biotype}}_{{orientation}}_{{feature_type}}_RPM_folds.txt"), contrast=IP_CONTRASTS), + mapping_dir, "RPM_by_{{norm}}_folds_%s" % size_selected, + "{contrast}", "remapped", "{contrast}_{{read_type}}_{{mapping_type}}_{{biotype}}_{{orientation}}_{{feature_type}}_RPM_by_{{norm}}_folds.txt"), contrast=IP_CONTRASTS), output: all_folds = OPJ( - mapping_dir, f"RPM_folds_{size_selected}", "all", "remapped", "{read_type}_{mapping_type}_{biotype}_{orientation}_{feature_type}_mean_log2_RPM_fold.txt"), + mapping_dir, "RPM_by_{norm}_folds_%s" % size_selected, "all", "remapped", "{read_type}_{mapping_type}_{biotype}_{orientation}_{feature_type}_mean_log2_RPM_by_{norm}_fold.txt"), log: - log = OPJ(log_dir, "gather_RPM_folds", "{read_type}_{mapping_type}_{biotype}_{orientation}_{feature_type}.log"), + log = OPJ(log_dir, "gather_RPM_by_{norm}_folds", "{read_type}_{mapping_type}_{biotype}_{orientation}_{feature_type}.log"), benchmark: - OPJ(log_dir, "gather_RPM_folds", "{read_type}_{mapping_type}_{biotype}_{orientation}_{feature_type}_benchmark.txt"), + OPJ(log_dir, "gather_RPM_by_{norm}_folds", "{read_type}_{mapping_type}_{biotype}_{orientation}_{feature_type}_benchmark.txt"), run: pd.DataFrame({contrast : pd.read_table( # OPJ(mapping_dir, f"RPM_folds_{size_selected}", # f"{contrast}", f"{contrast}_{wildcards.counted_type}_RPM_folds.txt"), - OPJ(mapping_dir, f"RPM_folds_{size_selected}", - f"{contrast}", "remapped", f"{contrast}_{wildcards.read_type}_{wildcards.mapping_type}_{wildcards.biotype}_{wildcards.orientation}_{wildcards.feature_type}_RPM_folds.txt"), + OPJ(mapping_dir, f"RPM_by_{wildcards.norm}_folds_{size_selected}", + f"{contrast}", "remapped", f"{contrast}_{wildcards.read_type}_{wildcards.mapping_type}_{wildcards.biotype}_{wildcards.orientation}_{wildcards.feature_type}_RPM_by_{wildcards.norm}_folds.txt"), index_col=["gene", "cosmid", "name"], usecols=["gene", "cosmid", "name", "mean_log2_RPM_fold"])["mean_log2_RPM_fold"] for contrast in IP_CONTRASTS}).to_csv( output.all_folds, sep="\t") @@ -2536,7 +2774,8 @@ rule gather_remapped_RPM_folds: def source_gathered_folds(wildcards): if hasattr(wildcards, "counted_type"): return rules.gather_remapped_RPM_folds.output.all_folds - elif wildcards.fold_type == "mean_log2_RPM_fold": + elif wildcards.fold_type in RPM_FOLD_TYPES: + #elif wildcards.fold_type == "mean_log2_RPM_fold": return rules.gather_RPM_folds.output.all_folds else: return rules.gather_DE_folds.output.all_folds @@ -4024,6 +4263,40 @@ all_id_lists = dict(zip( filter(None.__ne__, map(get_id_list, config["gene_lists"])))) +#@wc_applied +#def source_fold_data(wildcards): +# fold_type = wildcards.fold_type +# if fold_type in {"log2FoldChange", "lfcMLE"}: +# if hasattr(wildcards, "contrast_type"): +# return expand( +# OPJ(mapping_dir, +# "deseq2_%s" % size_selected, "{contrast}", +# "{contrast}_{{small_type}}_counts_and_res.txt"), +# contrast=contrasts_dict[wildcards.contrast_type]) +# else: +# return rules.small_RNA_differential_expression.output.counts_and_res +# # possibly set wildcard constraints +# elif fold_type == "mean_log2_RPM_fold": +# if hasattr(wildcards, "contrast_type"): +# # We want all contrasts of this type. +# #https://stackoverflow.com/a/26791923/1878788 +# #print("{0.small_type}".format(wildcards)) +# return [filename.format(wildcards) for filename in expand( +# OPJ(mapping_dir, +# "RPM_folds_%s" % size_selected, "{contrast}", +# "{contrast}_{{0.small_type}}_RPM_folds.txt"), +# contrast=contrasts_dict[wildcards.contrast_type])] +# elif hasattr(wildcards, "biotypes"): +# return [filename.format(wildcards) for filename in expand( +# OPJ(mapping_dir, +# "RPM_folds_%s" % size_selected, +# "{{0.contrast}}", "remapped", +# "{{0.contrast}}_{{0.read_type}}_{{0.mapping_type}}_{biotype}_{{0.orientation}}_transcript_RPM_folds.txt"), +# biotype=wildcards.biotypes.split("_and_"))] +# else: +# return rules.compute_RPM_folds.output.fold_results +# else: +# raise NotImplementedError("Unknown fold type: %s" % fold_type) @wc_applied def source_fold_data(wildcards): fold_type = wildcards.fold_type @@ -4036,22 +4309,24 @@ def source_fold_data(wildcards): contrast=contrasts_dict[wildcards.contrast_type]) else: return rules.small_RNA_differential_expression.output.counts_and_res - elif fold_type == "mean_log2_RPM_fold": + # 08/06/2023: Take into account new "_by_{norm}" file name element in RPM files + # possibly set wildcard constraints + elif fold_type in RPM_FOLD_TYPES: if hasattr(wildcards, "contrast_type"): # We want all contrasts of this type. #https://stackoverflow.com/a/26791923/1878788 #print("{0.small_type}".format(wildcards)) return [filename.format(wildcards) for filename in expand( OPJ(mapping_dir, - "RPM_folds_%s" % size_selected, "{contrast}", - "{contrast}_{{0.small_type}}_RPM_folds.txt"), + "RPM_by_{{0.norm}}_folds_%s" % size_selected, "{contrast}", + "{contrast}_{{0.small_type}}_RPM_by_{{0.norm}}_folds.txt"), contrast=contrasts_dict[wildcards.contrast_type])] elif hasattr(wildcards, "biotypes"): return [filename.format(wildcards) for filename in expand( OPJ(mapping_dir, - "RPM_folds_%s" % size_selected, + "RPM_by_{{0.norm}}_folds_%s" % size_selected, "{{0.contrast}}", "remapped", - "{{0.contrast}}_{{0.read_type}}_{{0.mapping_type}}_{biotype}_{{0.orientation}}_transcript_RPM_folds.txt"), + "{{0.contrast}}_{{0.read_type}}_{{0.mapping_type}}_{biotype}_{{0.orientation}}_transcript_RPM_by_{{0.norm}}_folds.txt"), biotype=wildcards.biotypes.split("_and_"))] else: return rules.compute_RPM_folds.output.fold_results