diff --git a/PRO-seq/PRO-seq.snakefile b/PRO-seq/PRO-seq.snakefile index 4d8b565a27cd3f381a4ea90cbb9cb4f1d698741e..116aa1b5a597517fbe13227a7c7c947089d86bef 100644 --- a/PRO-seq/PRO-seq.snakefile +++ b/PRO-seq/PRO-seq.snakefile @@ -61,7 +61,7 @@ import husl from libdeseq import do_deseq2 from libhts import median_ratio_to_pseudo_ref_size_factors, status_setter, plot_lfc_distribution, plot_MA -from libworkflows import ensure_relative, cleanup_and_backup, texscape +from libworkflows import wc_applied, ensure_relative, cleanup_and_backup, texscape from libworkflows import get_chrom_sizes, column_converter from libworkflows import strip_split, file_len, last_lines, save_plot, test_na_file from libworkflows import make_id_list_getter, filter_combinator, SHELL_FUNCTIONS, warn_context @@ -79,9 +79,6 @@ ORIENTATIONS = ["fwd", "rev", "all"] COMPL = {"A" : "T", "C" : "G", "G" : "C", "T" : "A", "N" : "N"} -#FORMAT = "pdf" -FIG_FORMATS = ["pdf"] - LFC_RANGE = { "protein_coding" : (-10, 10), "DNA_transposons_rmsk" : (-10, 10), @@ -90,10 +87,12 @@ LFC_RANGE = { "simple_repeats_rmsk" : (-10, 10), "DNA_transposons_rmsk_families" : (-10, 10), "RNA_transposons_rmsk_families" : (-10, 10), + "satellites_rmsk_families" : (-10, 10), "simple_repeats_rmsk_families" : (-10, 10), "pseudogene" : (-10, 10), + "all_rmsk" : (-10, 10), + "all_rmsk_families" : (-10, 10), "alltypes" : (-10, 10)} -DE_BIOTYPES = list(LFC_RANGE.keys()) # Cutoffs in log fold change LFC_CUTOFFS = [0.5, 1, 2] UP_STATUSES = [f"up{cutoff}" for cutoff in LFC_CUTOFFS] @@ -138,10 +137,32 @@ COND_COLUMNS = pd.DataFrame(CONDITIONS).assign( cond_name=pd.Series(COND_NAMES).values).set_index("cond_name") COUNT_BIOTYPES = config["count_biotypes"] -ANNOT_BIOTYPES = config["annot_biotypes"] -#METAGENE_BIOTYPES=["protein_coding", "DNA_transposons_rmsk", "RNA_transposons_rmsk"] -#METAGENE_BIOTYPES=["protein_coding"] -METAGENE_BIOTYPES=["protein_coding", "protein_coding_5UTR", "protein_coding_CDS", "protein_coding_3UTR"] + +RMSK_BIOTYPES = [ + "DNA_transposons_rmsk", + "RNA_transposons_rmsk", + "satellites_rmsk", + "simple_repeats_rmsk"] +RMSK_FAMILIES_BIOTYPES = [ + "DNA_transposons_rmsk_families", + "RNA_transposons_rmsk_families", + "satellites_rmsk_families", + "simple_repeats_rmsk_families"] + +BIOTYPES_TO_JOIN = { + "all_rmsk": [biotype for biotype in COUNT_BIOTYPES if biotype in RMSK_BIOTYPES], + "all_rmsk_families": [biotype for biotype in COUNT_BIOTYPES if biotype in RMSK_FAMILIES_BIOTYPES], + # We only count "protein_coding", not "protein_codin_{5UTR,CDS,3UTR}" + "alltypes": [biotype for biotype in COUNT_BIOTYPES if not biotype.startswith("protein_coding_")]} +JOINED_BIOTYPES = list(BIOTYPES_TO_JOIN.keys()) +DE_BIOTYPES = [biotype for biotype in LFC_RANGE.keys() if biotype in COUNT_BIOTYPES + JOINED_BIOTYPES] + + +#ANNOT_BIOTYPES = config["annot_biotypes"] +#METAGENE_BIOTYPES = ["protein_coding", "DNA_transposons_rmsk", "RNA_transposons_rmsk"] +#METAGENE_BIOTYPES = ["protein_coding"] +#METAGENE_BIOTYPES = ["protein_coding", "protein_coding_5UTR", "protein_coding_CDS", "protein_coding_3UTR"] +METAGENE_BIOTYPES = [biotype for biotype in ["protein_coding", "protein_coding_5UTR", "protein_coding_CDS", "protein_coding_3UTR"] if biotype in COUNT_BIOTYPES] ID_LISTS = [ "lfc_statuses", "germline_specific", @@ -259,7 +280,7 @@ wildcard_constraints: lib="|".join(LIBS), rep="\d+", orientation="|".join(ORIENTATIONS), - biotype="|".join(COUNT_BIOTYPES + ANNOT_BIOTYPES + ["alltypes"]) + biotype="|".join(COUNT_BIOTYPES + JOINED_BIOTYPES) # Define functions to be used in shell portions @@ -286,9 +307,9 @@ rule all: orientation=ORIENTATIONS, biotype=DE_BIOTYPES), expand(OPJ( output_dir, "{trimmer}", "figures", aligner, "{counter}", - "{biotype}_{orientation}_PCA.{fig_format}"), + "{biotype}_{orientation}_PCA.pdf"), trimmer=TRIMMERS, counter=COUNTERS, biotype=COUNT_BIOTYPES, - orientation=ORIENTATIONS, fig_format=FIG_FORMATS), + orientation=ORIENTATIONS), #expand(OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", "{counter}", "all_on_C_elegans", "alltypes_{orientation}_counts.txt"), trimmer=TRIMMERS, counter=COUNTERS, orientation=["all"]), #expand(expand(OPJ(output_dir, "{{trimmer}}", aligner, "mapped_C_elegans", "{{counter}}", "{lib}_{rep}_on_C_elegans", "{{biotype}}_{{orientation}}_counts.txt"), filtered_product, lib=LIBS, rep=REPS), trimmer=TRIMMERS, counter=COUNTERS, biotype=COUNT_BIOTYPES, orientation=["all"]), expand(expand(OPJ( @@ -299,13 +320,13 @@ rule all: output_dir, "{{trimmer}}", aligner, "mapped_C_elegans", "{lib}_{rep}_on_C_elegans_by_{{norm_type}}_{{orientation}}.bw"), filtered_product, lib=LIBS, rep=REPS), trimmer=TRIMMERS, norm_type=NORM_TYPES, orientation=["all"]), - #expand(OPJ(output_dir, "{trimmer}", "figures", aligner, "{lib}_mean", "{orientation}_on_merged_isolated_%d_{biotype}_min_%d_meta_profile.{fig_format}" % (MIN_DIST, META_MIN_LEN)), trimmer=TRIMMERS, lib=LIBS, orientation=["all"], biotype=["protein_coding"], fig_format=FIG_FORMATS), - #expand(OPJ(output_dir, "{trimmer}", "figures", aligner, "{lib}_mean", "{orientation}_on_merged_isolated_%d_{biotype}_min_%d_meta_profile.{fig_format}" % (MIN_DIST, META_MIN_LEN)), trimmer=TRIMMERS, lib=LIBS, orientation=["all"], biotype=METAGENE_BIOTYPES, fig_format=FIG_FORMATS), + #expand(OPJ(output_dir, "{trimmer}", "figures", aligner, "{lib}_mean", "{orientation}_on_merged_isolated_%d_{biotype}_min_%d_meta_profile.pdf" % (MIN_DIST, META_MIN_LEN)), trimmer=TRIMMERS, lib=LIBS, orientation=["all"], biotype=["protein_coding"]), + #expand(OPJ(output_dir, "{trimmer}", "figures", aligner, "{lib}_mean", "{orientation}_on_merged_isolated_%d_{biotype}_min_%d_meta_profile.pdf" % (MIN_DIST, META_MIN_LEN)), trimmer=TRIMMERS, lib=LIBS, orientation=["all"], biotype=METAGENE_BIOTYPES), expand(OPJ( output_dir, "{trimmer}", "figures", aligner, "{lib}_by_{norm_type}_mean", - "{orientation}_on_merged_isolated_%d_{biotype}_min_%d_meta_profile.{fig_format}" % (MIN_DIST, META_MIN_LEN)), + "{orientation}_on_merged_isolated_%d_{biotype}_min_%d_meta_profile.pdf" % (MIN_DIST, META_MIN_LEN)), trimmer=TRIMMERS, lib=LIBS, norm_type=NORM_TYPES, orientation=["all"], - biotype=METAGENE_BIOTYPES, fig_format=FIG_FORMATS), + biotype=METAGENE_BIOTYPES), include: ensure_relative(irules["link_raw_data"], workflow.basedir) @@ -628,8 +649,8 @@ rule feature_count_reads: message: "Counting {wildcards.orientation} {wildcards.biotype} reads for {wildcards.lib}_{wildcards.rep} with featureCounts." log: - log = OPJ(log_dir, "{trimmer}", "feature_count_reads", "{lib}_{rep}.log"), - err = OPJ(log_dir, "{trimmer}", "feature_count_reads", "{lib}_{rep}.err") + log = OPJ(log_dir, "{trimmer}", "feature_count_reads", "{orientation}_{biotype}", "{lib}_{rep}.log"), + err = OPJ(log_dir, "{trimmer}", "feature_count_reads", "{orientation}_{biotype}", "{lib}_{rep}.err") shell: """ converter="/pasteur/entites/Mhe/Genomes/C_elegans/Caenorhabditis_elegans/Ensembl/WBcel235/Annotation/Genes/genes_id2name.pickle" @@ -686,12 +707,12 @@ rule do_PCA: output: #OPJ(output_dir, aligner, "mapped_C_elegans", "htseq_count", "summaries", "{biotype}_{orientation}_PCA.pdf"), #OPJ(output_dir, aligner, "mapped_C_elegans", "htseq_count", "summaries", "{biotype}_{orientation}_PCA.png"), - OPJ(output_dir, "{trimmer}", "figures", aligner, "{counter}", "{biotype}_{orientation}_PCA.{fig_format}"), + OPJ(output_dir, "{trimmer}", "figures", aligner, "{counter}", "{biotype}_{orientation}_PCA.pdf"), message: "Summarizing counts for {wildcards.biotype}_{wildcards.orientation}" threads: 12 # trying to avoid TimeoutError and "LOCKERROR: matplotlib is trying to acquire the lock [...]" log: - OPJ(log_dir, "{trimmer}", "do_PCA_{biotype}_{orientation}.log") + OPJ(log_dir, "{trimmer}", "{counter}", "do_PCA_{biotype}_{orientation}.log") run: if wildcards.counter == "htseq_count": counts_parser = parse_htseq_counts @@ -841,10 +862,55 @@ rule gather_counts: counts_data.to_csv(output.counts_table, sep="\t") +@wc_applied +def source_counts_to_join(wildcards): + """ + Determines which elementary biotype counts files should be joined to make the desired "joined" biotype. + """ + return expand( + OPJ(output_dir, "{{trimmer}}", aligner, "mapped_C_elegans", + "{{counter}}", "all_on_C_elegans", + "{biotype}_{{orientation}}_counts.txt"), + biotype=BIOTYPES_TO_JOIN[wildcards.biotype]) + + +rule join_all_counts: + """concat counts for all biotypes into all""" + input: + counts_tables = source_counts_to_join, + #counts_tables = expand(OPJ(output_dir, "{{trimmer}}", aligner, "mapped_C_elegans", "{{counter}}", "all_on_C_elegans", "{biotype}_{{orientation}}_counts.txt"), biotype=[bty for bty in COUNT_BIOTYPES if bty[-9:] != "_families"]), + # counts_tables = expand( + # OPJ(output_dir, "{{trimmer}}", aligner, "mapped_C_elegans", + # "{{counter}}", "all_on_C_elegans", + # "{biotype}_{{orientation}}_counts.txt"), + # # We only count "protein_coding", not "protein_codin_{5UTR,CDS,3UTR}" + # biotype=[b for b in COUNT_BIOTYPES if not b.startswith("protein_coding_")]), + output: + counts_table = OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", "{counter}", "all_on_C_elegans", "{biotype}_{orientation}_counts.txt"), + wildcard_constraints: + biotype = "|".join(JOINED_BIOTYPES) + run: + counts_data = pd.concat((pd.read_table(table, index_col="gene") for table in input.counts_tables)).drop_duplicates() + assert len(counts_data.index.unique()) == len(counts_data.index), "Some genes appear several times in the counts table." + counts_data.index.names = ["gene"] + counts_data.to_csv(output.counts_table, sep="\t") + + +@wc_applied +def source_counts(wildcards): + """Determines from which rule the gathered small counts should be sourced.""" + if wildcards.biotype in JOINED_BIOTYPES: + return rules.join_all_counts.output.counts_table + else: + # "Directly" from the counts gathered across libraries + return rules.gather_counts.output.counts_table + + rule compute_RPK: """For a given biotype, compute the corresponding RPK value (reads per kilobase).""" input: - counts_data = rules.gather_counts.output.counts_table, + counts_data = source_counts, + #counts_data = rules.gather_counts.output.counts_table, #counts_table = OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", "{counter}", # "all_on_C_elegans", "{biotype}_{orientation}_counts.txt"), output: @@ -852,12 +918,14 @@ rule compute_RPK: "all_on_C_elegans", "{biotype}_{orientation}_RPK.txt"), params: feature_lengths_file = OPJ(annot_dir, "union_exon_lengths.txt"), - run: - counts_data = pd.read_table(input.counts_data, index_col="gene") - feature_lengths = pd.read_table(params.feature_lengths_file, index_col="gene") - common = counts_data.index.intersection(feature_lengths.index) - rpk = 1000 * counts_data.loc[common].div(feature_lengths.loc[common]["union_exon_len"], axis="index") - rpk.to_csv(output.rpk_file, sep="\t") + # run: + # counts_data = pd.read_table(input.counts_data, index_col="gene") + # feature_lengths = pd.read_table(params.feature_lengths_file, index_col="gene") + # common = counts_data.index.intersection(feature_lengths.index) + # rpk = 1000 * counts_data.loc[common].div(feature_lengths.loc[common]["union_exon_len"], axis="index") + # rpk.to_csv(output.rpk_file, sep="\t") + wrapper: + "file:///pasteur/homes/bli/src/bioinfo_utils/snakemake_wrappers/compute_RPK" rule compute_sum_million_RPK: @@ -882,17 +950,21 @@ rule compute_TPM: "all_on_C_elegans", "{biotype}_{orientation}_TPM.txt"), # The sum must be done over all counted features wildcard_constraints: - biotype = "alltypes" - run: - rpk = pd.read_table(input.rpk_file, index_col="gene") - tpm = 1000000 * rpk / rpk.sum() - tpm.to_csv(output.tpm_file, sep="\t") + biotype = "|".join(["alltypes"]) + # run: + # rpk = pd.read_table(input.rpk_file, index_col="gene") + # tpm = 1000000 * rpk / rpk.sum() + # tpm.to_csv(output.tpm_file, sep="\t") + wrapper: + "file:///pasteur/homes/bli/src/bioinfo_utils/snakemake_wrappers/compute_TPM" +@wc_applied def source_quantif(wildcards): """Determines from which rule the gathered counts should be sourced.""" if wildcards.quantif_type == "counts": - return rules.gather_counts.output.counts_table + return source_counts(wildcards) + #return rules.gather_counts.output.counts_table elif wildcards.quantif_type == "RPK": return rules.compute_RPK.output.rpk_file elif wildcards.quantif_type == "TPM": @@ -903,7 +975,8 @@ def source_quantif(wildcards): rule compute_median_ratio_to_pseudo_ref_size_factors: input: - counts_table = rules.gather_counts.output.counts_table, + counts_table = source_counts, + #counts_table = rules.gather_counts.output.counts_table, output: median_ratios_file = OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", "{counter}", "all_on_C_elegans", "{biotype}_{orientation}_median_ratios_to_pseudo_ref.txt"), run: @@ -1004,10 +1077,10 @@ rule make_bigwig: orient_filter = bamcoverage_filter, threads: 12 # to limit memory usage, actually benchmark: - OPJ(log_dir, "{trimmer}", "make_bigwig", "{lib}_{rep}_{orientation}_benchmark.txt") + OPJ(log_dir, "{trimmer}", "make_bigwig", "{lib}_{rep}_by_{norm_type}_{orientation}_benchmark.txt") log: - log = OPJ(log_dir, "{trimmer}", "make_bigwig", "{lib}_{rep}_{orientation}.log"), - err = OPJ(log_dir, "{trimmer}", "make_bigwig", "{lib}_{rep}_{orientation}.err"), + log = OPJ(log_dir, "{trimmer}", "make_bigwig", "{lib}_{rep}_by_{norm_type}_{orientation}.log"), + err = OPJ(log_dir, "{trimmer}", "make_bigwig", "{lib}_{rep}_by_{norm_type}_{orientation}.err"), run: scale = 1 / float(pd.read_table( input.median_ratios_file, index_col=0, header=None).loc[ @@ -1050,7 +1123,7 @@ rule merge_bigwig_reps: output: bw = OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", "{lib}_mean_on_C_elegans_by_{norm_type}_{orientation}.bw"), log: - warnings = OPJ(log_dir, "merge_bigwig_reps", "{lib}_mean_on_C_elegans_by_{norm_type}_{orientation}.warnings"), + warnings = OPJ(log_dir, "{trimmer}", "merge_bigwig_reps", "{lib}_mean_on_C_elegans_by_{norm_type}_{orientation}.warnings"), threads: 2 # to limit memory usage, actually run: with warn_context(log.warnings) as warn: @@ -1089,34 +1162,6 @@ rule merge_bigwig_reps: bw.close() -rule join_all_counts: - """concat counts for all biotypes into all""" - input: - #counts_tables = expand(OPJ(output_dir, "{{trimmer}}", aligner, "mapped_C_elegans", "{{counter}}", "all_on_C_elegans", "{biotype}_{{orientation}}_counts.txt"), biotype=[bty for bty in COUNT_BIOTYPES if bty[-9:] != "_families"]), - counts_tables = expand( - OPJ(output_dir, "{{trimmer}}", aligner, "mapped_C_elegans", - "{{counter}}", "all_on_C_elegans", - "{biotype}_{{orientation}}_counts.txt"), - # We only count "protein_coding", not "protein_codin_{5UTR,CDS,3UTR}" - biotype=[b for b in COUNT_BIOTYPES if not b.startswith("protein_coding_")]), - output: - counts_table = OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", "{counter}", "all_on_C_elegans", "alltypes_{orientation}_counts.txt"), - run: - counts_data = pd.concat((pd.read_table(table, index_col="gene") for table in input.counts_tables)).drop_duplicates() - assert len(counts_data.index.unique()) == len(counts_data.index), "Some genes appear several times in the counts table." - counts_data.index.names = ["gene"] - counts_data.to_csv(output.counts_table, sep="\t") - - -def source_counts(wildcards): - """Determines from which rule the gathered small counts should be sourced.""" - if wildcards.biotype == "alltypes": - return rules.join_all_counts.output.counts_table - else: - # "Directly" from the counts gathered across libraries - return rules.gather_counts.output.counts_table - - from rpy2.robjects import Formula, StrVector from rpy2.rinterface import RRuntimeError rule differential_expression: @@ -1272,22 +1317,22 @@ rule make_MA_plot: # Metagene plots # ################## -rule gather_annotations: - input: - expand(OPJ(annot_dir, "{biotype}.gtf"), biotype=ANNOT_BIOTYPES), - output: - merged_gtf = OPJ(local_annot_dir, "all_annotations.gtf"), - merged_gtf_gz = OPJ(local_annot_dir, "all_annotations.gtf.gz"), - index = OPJ(local_annot_dir, "all_annotations.gtf.gz.tbi"), - #merged_bed = OPJ(local_annot_dir, "all_annotations.bed"), - message: - "Gathering annotations for {}".format(", ".join(ANNOT_BIOTYPES)) - shell: - """ - sort -k1,1 -k4,4n -m {input} | tee {output.merged_gtf} | bgzip > {output.merged_gtf_gz} - tabix -p gff {output.merged_gtf_gz} - #ensembl_gtf2bed.py {output.merged_gtf} > {output.merged_bed} - """ +# rule gather_annotations: +# input: +# expand(OPJ(annot_dir, "{biotype}.gtf"), biotype=ANNOT_BIOTYPES), +# output: +# merged_gtf = OPJ(local_annot_dir, "all_annotations.gtf"), +# merged_gtf_gz = OPJ(local_annot_dir, "all_annotations.gtf.gz"), +# index = OPJ(local_annot_dir, "all_annotations.gtf.gz.tbi"), +# #merged_bed = OPJ(local_annot_dir, "all_annotations.bed"), +# message: +# "Gathering annotations for {}".format(", ".join(ANNOT_BIOTYPES)) +# shell: +# """ +# sort -k1,1 -k4,4n -m {input} | tee {output.merged_gtf} | bgzip > {output.merged_gtf_gz} +# tabix -p gff {output.merged_gtf_gz} +# #ensembl_gtf2bed.py {output.merged_gtf} > {output.merged_bed} +# """ # For metagene analyses: #- Extract transcripts @@ -1606,7 +1651,7 @@ rule plot_meta_profile_mean: bigwig = rules.merge_bigwig_reps.output.bw, bed = rules.select_genes_for_meta_profile.output.out_bed, output: - OPJ(output_dir, "{trimmer}", "figures", aligner, "{lib}_by_{norm_type}_mean", "{orientation}_on_merged_isolated_%d_{biotype}_min_%d_meta_profile.{fig_format}" % (MIN_DIST, META_MIN_LEN)), + OPJ(output_dir, "{trimmer}", "figures", aligner, "{lib}_by_{norm_type}_mean", "{orientation}_on_merged_isolated_%d_{biotype}_min_%d_meta_profile.pdf" % (MIN_DIST, META_MIN_LEN)), params: meta_params = meta_params, # before = META_MARGIN, @@ -1615,8 +1660,8 @@ rule plot_meta_profile_mean: # unscaled_5 = UNSCALED_INSIDE, # unscaled_3 = UNSCALED_INSIDE, log: - plot_TSS_log = OPJ(log_dir, "{trimmer}", "plot_meta_profile", "{lib}", "{orientation}_on_merged_isolated_%d_{biotype}_min_%d.log" % (MIN_DIST, META_MIN_LEN)), - plot_TSS_err = OPJ(log_dir, "{trimmer}", "plot_meta_profile", "{lib}", "{orientation}_on_merged_isolated_%d_{biotype}_min_%d.err" % (MIN_DIST, META_MIN_LEN)), + plot_TSS_log = OPJ(log_dir, "{trimmer}", "plot_meta_profile", "{lib}", "by_{norm_type}", "{orientation}_on_merged_isolated_%d_{biotype}_min_%d.log" % (MIN_DIST, META_MIN_LEN)), + plot_TSS_err = OPJ(log_dir, "{trimmer}", "plot_meta_profile", "{lib}", "by_{norm_type}", "{orientation}_on_merged_isolated_%d_{biotype}_min_%d.err" % (MIN_DIST, META_MIN_LEN)), threads: 12 # to limit memory usage, actually run: if file_len(input.bed): diff --git a/RNA_Seq_Cecere/RNA-seq.snakefile b/RNA_Seq_Cecere/RNA-seq.snakefile index 4a2913342677a9a2810e1bc7e65f3f7e87013529..4830d61bd69a1cc81fdc1db93aaaa4c1b1ffd4b4 100644 --- a/RNA_Seq_Cecere/RNA-seq.snakefile +++ b/RNA_Seq_Cecere/RNA-seq.snakefile @@ -98,7 +98,7 @@ from libhts import ( plot_norm_correlations, plot_counts_distribution, plot_boxplots) -from libworkflows import ensure_relative, cleanup_and_backup +from libworkflows import wc_applied, ensure_relative, cleanup_and_backup from libworkflows import get_chrom_sizes, column_converter, make_id_list_getter from libworkflows import strip_split, save_plot, test_na_file, SHELL_FUNCTIONS, warn_context from libworkflows import feature_orientation2stranded @@ -122,8 +122,11 @@ LFC_RANGE = { "simple_repeats_rmsk" : (-10, 10), "DNA_transposons_rmsk_families" : (-10, 10), "RNA_transposons_rmsk_families" : (-10, 10), + "satellites_rmsk_families" : (-10, 10), "simple_repeats_rmsk_families" : (-10, 10), "pseudogene" : (-10, 10), + "all_rmsk" : (-10, 10), + "all_rmsk_families" : (-10, 10), "alltypes" : (-10, 10)} # Cutoffs in log fold change LFC_CUTOFFS = [0.5, 1, 2] @@ -197,7 +200,25 @@ COND_NAMES = ["_".join(( BIOTYPES = config["biotypes"] -DE_BIOTYPES = [biotype for biotype in LFC_RANGE.keys() if biotype in BIOTYPES + ["alltypes"]] + +RMSK_BIOTYPES = [ + "DNA_transposons_rmsk", + "RNA_transposons_rmsk", + "satellites_rmsk", + "simple_repeats_rmsk"] +RMSK_FAMILIES_BIOTYPES = [ + "DNA_transposons_rmsk_families", + "RNA_transposons_rmsk_families", + "satellites_rmsk_families", + "simple_repeats_rmsk_families"] + +BIOTYPES_TO_JOIN = { + "all_rmsk": [biotype for biotype in BIOTYPES if biotype in RMSK_BIOTYPES], + "all_rmsk_families": [biotype for biotype in BIOTYPES if biotype in RMSK_FAMILIES_BIOTYPES], + # We only count "protein_coding", not "protein_codin_{5UTR,CDS,3UTR}" + "alltypes": [biotype for biotype in BIOTYPES if not biotype.startswith("protein_coding_")]} +JOINED_BIOTYPES = list(BIOTYPES_TO_JOIN.keys()) +DE_BIOTYPES = [biotype for biotype in LFC_RANGE.keys() if biotype in BIOTYPES + JOINED_BIOTYPES] if "spike_ins" in BIOTYPES: PAIR_SCATTER_BIOTYPES = ["spike_ins"] else: @@ -251,7 +272,7 @@ minU, maxU = 1, 5 wildcard_constraints: rep="\d+", orientation="|".join(ORIENTATIONS), - biotype="|".join(BIOTYPES + ["alltypes"]), + biotype="|".join(BIOTYPES + JOINED_BIOTYPES), fold_type="|".join(["mean_log2_RPM_fold", "log2FoldChange", "lfcMLE"]), #ruleorder: join_all_counts > gather_counts @@ -293,12 +314,12 @@ counts_files = [ OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}", "all_{mapped_type}", "{biotype}_{orientation}_RPK.txt"), counter=COUNTERS, mapped_type=[f"on_{genome}", f"unique_on_{genome}"], - biotype=BIOTYPES + ["alltypes"], orientation=ORIENTATIONS), + biotype=BIOTYPES + JOINED_BIOTYPES, orientation=ORIENTATIONS), expand( OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}", "all_{mapped_type}", "{biotype}_{orientation}_counts.txt"), counter=COUNTERS, mapped_type=[f"on_{genome}", f"unique_on_{genome}"], - biotype=BIOTYPES, orientation=ORIENTATIONS), + biotype=BIOTYPES + JOINED_BIOTYPES, orientation=ORIENTATIONS), # expand( # OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}", # "summaries", f"{{lib}}_{{rep}}_on_{genome}_{{orientation}}_counts.txt"), @@ -1094,7 +1115,7 @@ rule compute_TPM: "all_{mapped_type}", "{biotype}_{orientation}_TPM.txt"), # The sum must be done over all counted features wildcard_constraints: - biotype = "alltypes" + biotype = "|".join(["alltypes"]) # run: # rpk = pd.read_table(input.rpk_file, index_col="gene") # tpm = 1000000 * rpk / rpk.sum() @@ -1465,12 +1486,24 @@ rule merge_bigwig_reps: bw.close() +@wc_applied +def source_counts_to_join(wildcards): + """ + Determines which elementary biotype counts files should be joined to make the desired "joined" biotype. + """ + return expand( + OPJ(output_dir, aligner, f"mapped_{genome}", "{{counter}}", "all_{{mapped_type}}", "{biotype}_{{orientation}}_counts.txt"), + biotype=BIOTYPES_TO_JOIN[wildcards.biotype]) + + rule join_all_counts: - """concat counts for all biotypes into all""" + """Concat counts for all biotypes into all""" input: - counts_tables = expand(OPJ(output_dir, aligner, f"mapped_{genome}", "{{counter}}", "all_{{mapped_type}}", "{biotype}_{{orientation}}_counts.txt"), biotype=BIOTYPES), + counts_tables = source_counts_to_join, output: - counts_table = OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}", "all_{mapped_type}", "alltypes_{orientation}_counts.txt"), + counts_table = OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}", "all_{mapped_type}", "{biotype}_{orientation}_counts.txt"), + wildcard_constraints: + biotype = "|".join(JOINED_BIOTYPES) run: counts_data = pd.concat((pd.read_table(table, index_col="gene") for table in input.counts_tables)).drop_duplicates() assert len(counts_data.index.unique()) == len(counts_data.index), "Some genes appear several times in the counts table." @@ -1478,12 +1511,13 @@ rule join_all_counts: counts_data.to_csv(output.counts_table, sep="\t") +@wc_applied def source_counts(wildcards): """Determines from which rule the gathered counts should be sourced.""" - if wildcards.biotype == "alltypes": + if wildcards.biotype in JOINED_BIOTYPES: ## debug - #return rules.join_all_counts.output.counts_table - return OPJ(output_dir, aligner, f"mapped_{genome}", f"{wildcards.counter}", f"all_{wildcards.mapped_type}", f"alltypes_{wildcards.orientation}_counts.txt") + return rules.join_all_counts.output.counts_table + #return OPJ(output_dir, aligner, f"mapped_{genome}", f"{wildcards.counter}", f"all_{wildcards.mapped_type}", f"alltypes_{wildcards.orientation}_counts.txt") ## else: # "Directly" from the counts gathered across libraries @@ -1497,10 +1531,10 @@ def source_counts(wildcards): # output_dir, aligner, f"mapped_{genome}", f"{wildcards.counter}", # f"all_{wildcards.mapped_type}", f"{wildcards.biotype}_{wildcards.orientation}_counts.txt")) # sys.exit(1) - #return rules.gather_counts.output.counts_table - return OPJ( - output_dir, aligner, f"mapped_{genome}", f"{wildcards.counter}", - f"all_{wildcards.mapped_type}", f"{wildcards.biotype}_{wildcards.orientation}_counts.txt") + return rules.gather_counts.output.counts_table + #return OPJ( + # output_dir, aligner, f"mapped_{genome}", f"{wildcards.counter}", + # f"all_{wildcards.mapped_type}", f"{wildcards.biotype}_{wildcards.orientation}_counts.txt") ## diff --git a/small_RNA-seq/small_RNA-seq.snakefile b/small_RNA-seq/small_RNA-seq.snakefile index f721dd3408b7cca4091b6d9d788508a6779040eb..fda01ed063c03a4bf39ae1539055decf622a25ee 100644 --- a/small_RNA-seq/small_RNA-seq.snakefile +++ b/small_RNA-seq/small_RNA-seq.snakefile @@ -28,6 +28,10 @@ major, minor = sys.version_info[:2] if major < 3 or (major == 3 and minor < 6): sys.exit("Need at least python 3.6\n") +# TODO: Try to find what the proportion of small reads map at unique position within repetitive elements. +# What is the distribution of unique reads among repetitive elements of a given family: is it evenly spread, or biased? +# Is it different if we use all small RNAs instead of just unique ones? +# For bowtie2, allow MAPQ >= 23 to get "not too ambiguously mapped reads" # TODO: implement RPM folds from feature_counts results (and also for Ribo-seq pipeline) @@ -92,7 +96,7 @@ from subprocess import Popen, PIPE, CalledProcessError # Useful for functional style from itertools import chain, combinations, product, repeat, starmap -from functools import reduce +from functools import partial, reduce from operator import or_ as union from cytoolz import concat, merge_with, take_nth, valmap @@ -156,10 +160,11 @@ from libsmallrna import PI_MIN, PI_MAX, SI_MIN, SI_MAX # Do this outside the workflow #from libhts import gtf_2_genes_exon_lengths, repeat_bed_2_lengths from libdeseq import do_deseq2 +from libhts import aligner2min_mapq from libhts import status_setter, make_empty_bigwig from libhts import median_ratio_to_pseudo_ref_size_factors, size_factor_correlations from libhts import plot_paired_scatters, plot_norm_correlations, plot_counts_distribution, plot_boxplots, plot_histo -from libworkflows import texscape, ensure_relative, cleanup_and_backup +from libworkflows import texscape, wc_applied, ensure_relative, cleanup_and_backup from libworkflows import get_chrom_sizes, column_converter, make_id_list_getter from libworkflows import read_int_from_file, strip_split, file_len, last_lines, save_plot, SHELL_FUNCTIONS from libworkflows import filter_combinator, read_feature_counts, sum_feature_counts, sum_htseq_counts, warn_context @@ -297,7 +302,28 @@ READ_TYPES_FOR_MAPPING = [ # Types of annotation features, as defined in the "gene_biotype" # GTF attribute sections of the annotation files. COUNT_BIOTYPES = config["count_biotypes"] +BOXPLOT_BIOTYPES = config.get("boxplot_biotypes", []) ANNOT_BIOTYPES = config["annot_biotypes"] + +RMSK_BIOTYPES = [ + "DNA_transposons_rmsk", + "RNA_transposons_rmsk", + "satellites_rmsk", + "simple_repeats_rmsk"] +RMSK_FAMILIES_BIOTYPES = [ + "DNA_transposons_rmsk_families", + "RNA_transposons_rmsk_families", + "satellites_rmsk_families", + "simple_repeats_rmsk_families"] + +BIOTYPES_TO_JOIN = { + "all_rmsk": [biotype for biotype in COUNT_BIOTYPES + BOXPLOT_BIOTYPES if biotype in RMSK_BIOTYPES], + "all_rmsk_families": [biotype for biotype in COUNT_BIOTYPES + BOXPLOT_BIOTYPES if biotype in RMSK_FAMILIES_BIOTYPES], + # We only count "protein_coding", not "protein_codin_{5UTR,CDS,3UTR}" + "alltypes": [biotype for biotype in COUNT_BIOTYPES + BOXPLOT_BIOTYPES if not biotype.startswith("protein_coding_")]} +# Filter out those with empty values +JOINED_BIOTYPES = [joined_biotype for (joined_biotype, biotypes) in BIOTYPES_TO_JOIN.items() if biotypes] + # cf Gu et al. (2009), supplement: # ----- # To compare 22G-RNAs derived from a gene, transposon, or pseudogene between two @@ -356,14 +382,16 @@ mapping_dir = OPJ(output_dir, aligner, f"mapped_{genome}") reads_dir = OPJ(mapping_dir, "reads") annot_counts_dir = OPJ(mapping_dir, "annotation") feature_counts_dir = OPJ(mapping_dir, "feature_count") -READ_TYPES_FOR_COUNTING = ["all_sisiuRNA", "sisiuRNA"] + list(map(lambda x: "%s%s" % (x, "RNA"), ["prot_sisiu"])) -# Actually, we don't need to enforce this: snakemake will map whatever read type is needed for counting -assert set(READ_TYPES_FOR_COUNTING) < set(READ_TYPES_FOR_MAPPING), "Cannot count reads that were not mapped." +READ_TYPES_FOR_COUNTING = ["all_sisiuRNA", "sisiuRNA", f"{size_selected}_and_nomap_siRNA", "prot_siRNA_and_prot_siuRNA"] + list(map(lambda x: "%s%s" % (x, "RNA"), ["prot_sisiu"])) # Reads remapped and counted using featureCounts REMAPPED_COUNTED = [ - f"{small_type}_on_{genome}_{biotype}_{orientation}_transcript" for ( - small_type, biotype, orientation) in product( - READ_TYPES_FOR_COUNTING, COUNT_BIOTYPES, ORIENTATIONS)] + f"{small_type}_{mapped_type}_{biotype}_{orientation}_transcript" for ( + small_type, mapped_type, biotype, orientation) in product( + READ_TYPES_FOR_COUNTING, + [f"on_{genome}", f"unique_on_{genome}"], + #[f"on_{genome}"], + COUNT_BIOTYPES + JOINED_BIOTYPES, + ORIENTATIONS)] # Used to skip some genotype x treatment x replicate number combinations # when some of them were not sequenced forbidden = {frozenset(wc_comb.items()) for wc_comb in config["missing"]} @@ -463,6 +491,20 @@ class Scale(matplotlib.patheffects.RendererBase): renderer.draw_path(gc, tpath, affine, rgbFace) +def print_wc(wildcards): + print(wildcards) + return [] + + +def check_wc(wildcards): + #if hasattr(wildcards, "read_type"): + # if wildcards.read_type.endswith("_on_C"): + # print(wildcards) + if any(attr.endswith("_on_C") for attr in vars(wildcards).values() if type(attr) == "str"): + print(wildcards) + return [] + + filtered_product = filter_combinator(product, forbidden) # Limit risks of ambiguity by imposing replicates to be numbers @@ -474,11 +516,12 @@ wildcard_constraints: min_dist="\d+", min_len="\d+", #max_len="\d+", - biotype="|".join(set(COUNT_BIOTYPES + ANNOT_BIOTYPES)), + biotype="|".join(set(COUNT_BIOTYPES + ANNOT_BIOTYPES + JOINED_BIOTYPES)), id_list="|".join(GENE_LISTS), type_set="|".join(["all", "protein_coding", "protein_coding_TE"]), + mapped_type="|".join([f"on_{genome}", f"unique_on_{genome}"]), small_type="[spm]i|pisimi|siu|prot_si|te_si|pseu_si|satel_si|simrep_si|prot_siu|te_siu|pseu_siu|satel_siu|simrep_siu|sisiu|all_si|all_siu|all_sisiu", - # read_type = "|".join(REMAPPED_COUNTED), + read_type = "|".join(REMAPPED_COUNTED + READ_TYPES_FOR_MAPPING + READ_TYPES_FOR_COUNTING + ["trimmed", "nomap"]), # read_type="|".join([ # "raw", "trimmed", "deduped", f"{size_selected}", # "nomap", "nomap_siRNA", "mapped", @@ -607,29 +650,32 @@ ip_fold_boxplots = [] remapped_folds = [] remapped_fold_boxplots = [] if contrasts_dict["ip"]: - remapped_fold_boxplots = expand( - OPJ(output_dir, "figures", "{contrast}", "by_biotypes", - "{contrast}_{read_type}_on_%s_{biotypes}_{orientation}_transcript_{fold_type}_{gene_list}_boxplots.pdf" % genome), - contrast=IP_CONTRASTS, - read_type=READ_TYPES_FOR_COUNTING, - # TODO: Read this from config file - biotypes=["protein_coding_5UTR_and_protein_coding_CDS_and_protein_coding_3UTR"], - orientation=ORIENTATIONS, - fold_type=["mean_log2_RPM_fold"], - gene_list=BOXPLOT_GENE_LISTS) + if BOXPLOT_BIOTYPES: + remapped_fold_boxplots = expand( + OPJ(output_dir, "figures", "{contrast}", "by_biotypes", + "{contrast}_{read_type}_{mapped_type}_{biotypes}_{orientation}_transcript_{fold_type}_{gene_list}_boxplots.pdf"), + contrast=IP_CONTRASTS, + read_type=READ_TYPES_FOR_COUNTING, + mapped_type=[f"on_{genome}", f"unique_on_{genome}"], + #mapped_type=[f"on_{genome}"], + # TODO: Read this from config file + biotypes=["_and_".join(BOXPLOT_BIOTYPES)], + orientation=ORIENTATIONS, + fold_type=["mean_log2_RPM_fold"], + gene_list=BOXPLOT_GENE_LISTS) remapped_folds = expand( OPJ(mapping_dir, f"RPM_folds_{size_selected}", "all", "remapped", "{counted_type}_mean_log2_RPM_fold.txt"), counted_type=REMAPPED_COUNTED), # snakemake -n OK # expand( # OPJ(mapping_dir, f"RPM_folds_{size_selected}", "all", - # "{read_type}_on_%s_{biotype}_{orientation}_transcript_mean_log2_RPM_fold.txt" % genome), - # read_type=READ_TYPES_FOR_COUNTING, biotype=COUNT_BIOTYPES, orientation=ORIENTATIONS), + # "{read_type}_{mapped_type}_{biotype}_{orientation}_transcript_mean_log2_RPM_fold.txt"), + # read_type=READ_TYPES_FOR_COUNTING, mapped_type=[f"on_{genome}"], biotype=COUNT_BIOTYPES, orientation=ORIENTATIONS), # snakemake -n OK # expand( # OPJ(mapping_dir, f"RPM_folds_{size_selected}", "{contrast}", - # "{contrast}_{small_type}_on_%s_{biotype}_{orientation}_transcript_RPM_folds.txt" % genome), - # contrast=IP_CONTRASTS, small_type=READ_TYPES_FOR_COUNTING, biotype=COUNT_BIOTYPES, orientation=ORIENTATIONS), + # "{contrast}_{small_type}_{mapped_type}_{biotype}_{orientation}_transcript_RPM_folds.txt"), + # contrast=IP_CONTRASTS, small_type=READ_TYPES_FOR_COUNTING, mapped_type=[f"on_{genome}"], biotype=COUNT_BIOTYPES, orientation=ORIENTATIONS), ip_fold_heatmaps = expand( OPJ(output_dir, "figures", "fold_heatmaps", "{small_type}_{fold_type}_heatmap.pdf"), small_type=["pisimi", "prot_si"], fold_type=["mean_log2_RPM_fold"]) @@ -733,7 +779,8 @@ rule all: read_graphs, #expand(OPJ(feature_counts_dir, "summaries", "{lib}_{rep}_nb_non_structural.txt"), filtered_product, lib=LIBS, rep=REPS), OPJ(mapping_dir, f"RPM_folds_{size_selected}", "all", "pisimi_mean_log2_RPM_fold.txt"), - expand(OPJ(mapping_dir, f"deseq2_{size_selected}", "all", "pisimi_{fold_type}.txt"), fold_type=["log2FoldChange"]), + # Not looking ad deseq2 results any more + #expand(OPJ(mapping_dir, f"deseq2_{size_selected}", "all", "pisimi_{fold_type}.txt"), fold_type=["log2FoldChange"]), #expand(OPJ(mapping_dir, "RPM_folds_%s" % size_selected, "{contrast}", "{contrast}_{small_type}_RPM_folds.txt"), contrast=IP_CONTRASTS, small_type=DE_TYPES), exploratory_graphs, remapped_folds, @@ -742,11 +789,11 @@ rule all: #expand(OPJ(output_dir, "figures", "{small_type}_unit_clustermap.pdf"), small_type=SMALL_TYPES), #expand(OPJ( # feature_counts_dir, - # "all_{read_type}_on_%s_{biotype}_{orientation}_transcript_counts.txt" % genome), read_type=READ_TYPES_FOR_COUNTING, biotype=ANNOT_BIOTYPES, orientation=ORIENTATIONS), + # "all_{read_type}_{mapped_type}_{biotype}_{orientation}_transcript_counts.txt"), read_type=READ_TYPES_FOR_COUNTING, mapped_type=[f"on_{genome}"], biotype=ANNOT_BIOTYPES, orientation=ORIENTATIONS), # expand(OPJ( # feature_counts_dir, - # "all_{small_type}_on_%s_{biotype}_{orientation}_transcript_counts.txt" % genome), - # small_type=READ_TYPES_FOR_MAPPING, biotype=set(COUNT_BIOTYPES + ANNOT_BIOTYPES), orientation=ORIENTATIONS), + # "all_{small_type}_{mapped_type}_{biotype}_{orientation}_transcript_counts.txt"), + # small_type=READ_TYPES_FOR_MAPPING, mapped_type=[f"on_{genome}"], biotype=set(COUNT_BIOTYPES + ANNOT_BIOTYPES), orientation=ORIENTATIONS), expand( OPJ(annot_counts_dir, f"all_{size_selected}_on_{genome}", "{small_type}_RPM.txt"), small_type=["mi", "prot_si", "te_si", "pseu_si", "satel_si", "simrep_si", "prot_siu", "te_siu", "pseu_siu", "pisimi"]), @@ -826,6 +873,134 @@ rule select_size_range: """ +# if int(snakemake.__version__.split(".")[0]) == 5: +# def source_fastq(wildcards): +# """Determine the fastq file corresponding to a given read type.""" +# read_type = wildcards.read_type +# if read_type == "raw": +# return OPJ(data_dir, "{wildcards.lib}_{wildcards.rep}.fastq.gz") +# # return rules.link_raw_data.output +# elif read_type == "trimmed": +# return rules.trim_and_dedup.output.trimmed +# elif read_type == size_selected: +# return rules.select_size_range.output.selected +# elif read_type == "nomap": +# return rules.map_on_genome.output.nomap_fastq +# elif read_type == "nomap_siRNA": +# return rules.extract_nomap_siRNAs.output.nomap_si +# elif read_type == "prot_siRNA": +# return rules.small_RNA_seq_annotate.output.prot_si +# elif read_type == "te_siRNA": +# return rules.small_RNA_seq_annotate.output.te_si +# elif read_type == "pseu_siRNA": +# return rules.small_RNA_seq_annotate.output.pseu_si +# elif read_type == "satel_siRNA": +# return rules.small_RNA_seq_annotate.output.satel_si +# elif read_type == "simrep_siRNA": +# return rules.small_RNA_seq_annotate.output.simrep_si +# elif read_type == "siRNA": +# return rules.join_si_reads.output.si +# elif read_type == "piRNA": +# return rules.small_RNA_seq_annotate.output.pi +# elif read_type == "miRNA": +# return rules.small_RNA_seq_annotate.output.mi +# elif read_type == "all_siRNA": +# return rules.small_RNA_seq_annotate.output.all_si +# elif read_type == "prot_siuRNA": +# return rules.small_RNA_seq_annotate.output.prot_siu +# elif read_type == "te_siuRNA": +# return rules.small_RNA_seq_annotate.output.te_siu +# elif read_type == "pseu_siuRNA": +# return rules.small_RNA_seq_annotate.output.pseu_siu +# elif read_type == "satel_siuRNA": +# return rules.small_RNA_seq_annotate.output.satel_siu +# elif read_type == "simrep_siuRNA": +# return rules.small_RNA_seq_annotate.output.simrep_siu +# elif read_type == "all_siuRNA": +# return rules.small_RNA_seq_annotate.output.all_siu +# elif read_type == "siuRNA": +# return rules.join_si_reads.output.siu +# elif read_type == "sisiuRNA": +# return rules.join_si_reads.output.sisiu +# elif read_type == "prot_sisiuRNA": +# return rules.join_si_reads.output.prot_sisiu +# elif read_type == "te_sisiuRNA": +# return rules.join_si_reads.output.te_sisiu +# elif read_type == "pseu_sisiuRNA": +# return rules.join_si_reads.output.pseu_sisiu +# elif read_type == "satel_sisiuRNA": +# return rules.join_si_reads.output.satel_sisiu +# elif read_type == "simrep_sisiuRNA": +# return rules.join_si_reads.output.simrep_sisiu +# elif read_type == "all_sisiuRNA": +# return rules.join_si_reads.output.all_sisiu +# else: +# raise NotImplementedError("Unknown read type: %s" % read_type) +# else: +# def source_fastq(wildcards): +# """Determine the fastq file corresponding to a given read type.""" +# read_type = wildcards.read_type +# if read_type == "raw": +# return rules.link_raw_data.output +# elif read_type == "trimmed": +# return rules.trim_and_dedup.output.trimmed +# elif read_type == size_selected: +# return rules.select_size_range.output.selected +# elif read_type == "nomap": +# return rules.map_on_genome.output.nomap_fastq +# elif read_type == "nomap_siRNA": +# return rules.extract_nomap_siRNAs.output.nomap_si +# elif read_type == "prot_siRNA": +# return rules.small_RNA_seq_annotate.output.prot_si +# elif read_type == "te_siRNA": +# return rules.small_RNA_seq_annotate.output.te_si +# elif read_type == "pseu_siRNA": +# return rules.small_RNA_seq_annotate.output.pseu_si +# elif read_type == "satel_siRNA": +# return rules.small_RNA_seq_annotate.output.satel_si +# elif read_type == "simrep_siRNA": +# return rules.small_RNA_seq_annotate.output.simrep_si +# elif read_type == "siRNA": +# return rules.join_si_reads.output.si +# elif read_type == "piRNA": +# return rules.small_RNA_seq_annotate.output.pi +# elif read_type == "miRNA": +# return rules.small_RNA_seq_annotate.output.mi +# elif read_type == "all_siRNA": +# return rules.small_RNA_seq_annotate.output.all_si +# elif read_type == "prot_siuRNA": +# return rules.small_RNA_seq_annotate.output.prot_siu +# elif read_type == "te_siuRNA": +# return rules.small_RNA_seq_annotate.output.te_siu +# elif read_type == "pseu_siuRNA": +# return rules.small_RNA_seq_annotate.output.pseu_siu +# elif read_type == "satel_siuRNA": +# return rules.small_RNA_seq_annotate.output.satel_siu +# elif read_type == "simrep_siuRNA": +# return rules.small_RNA_seq_annotate.output.simrep_siu +# elif read_type == "all_siuRNA": +# return rules.small_RNA_seq_annotate.output.all_siu +# elif read_type == "siuRNA": +# return rules.join_si_reads.output.siu +# elif read_type == "sisiuRNA": +# return rules.join_si_reads.output.sisiu +# elif read_type == "prot_sisiuRNA": +# return rules.join_si_reads.output.prot_sisiu +# elif read_type == "te_sisiuRNA": +# return rules.join_si_reads.output.te_sisiu +# elif read_type == "pseu_sisiuRNA": +# return rules.join_si_reads.output.pseu_sisiu +# elif read_type == "satel_sisiuRNA": +# return rules.join_si_reads.output.satel_sisiu +# elif read_type == "simrep_sisiuRNA": +# return rules.join_si_reads.output.simrep_sisiu +# elif read_type == "all_sisiuRNA": +# return rules.join_si_reads.output.all_sisiu +# else: +# raise NotImplementedError("Unknown read type: %s" % read_type) + + +@wc_applied def source_fastq(wildcards): """Determine the fastq file corresponding to a given read type.""" read_type = wildcards.read_type @@ -889,6 +1064,8 @@ def source_fastq(wildcards): raise NotImplementedError("Unknown read type: %s" % read_type) + + rule map_on_genome: input: #rules.select_size_range.output.selected, @@ -940,6 +1117,7 @@ rule remap_on_genome: """Remap small reads after first categorization following initial mapping.""" input: #rules.select_size_range.output.selected, + #fastq = wc_applied(source_fastq), fastq = source_fastq, output: sam = temp(OPJ(mapping_dir, "{lib}_{rep}", "{read_type}_on_%s.sam" % genome)), @@ -954,7 +1132,7 @@ rule remap_on_genome: # "prot_siuRNA", "te_siuRNA", "pseu_siuRNA", "satel_siuRNA", "simrep_siuRNA", # "prot_sisiuRNA", "te_sisiuRNA", "pseu_sisiuRNA", "satel_sisiuRNA", "simrep_sisiuRNA", # "all_siRNA", "all_siuRNA", "all_sisiuRNA"]) - read_type = "|".join(set(READ_TYPES_FOR_MAPPING) - {size_selected}) + read_type = "|".join(set(READ_TYPES_FOR_MAPPING + READ_TYPES_FOR_COUNTING) - {size_selected}) params: aligner = aligner, index = genome_db, @@ -987,6 +1165,7 @@ def source_sam(wildcards): rule sam2indexedbam: input: + #check_wc, sam = source_sam, output: sorted_bam = OPJ(mapping_dir, "{lib}_{rep}", "{read_type}_on_%s_sorted.bam" % genome), @@ -1045,21 +1224,22 @@ rule feature_count_reads: output: counts = OPJ( feature_counts_dir, - "{lib}_{rep}_{read_type}_on_%s_{biotype}_{orientation}_{feature_type}_counts.txt" % genome), + "{lib}_{rep}_{read_type}_{mapped_type}_{biotype}_{orientation}_{feature_type}_counts.txt"), params: + min_mapq = partial(aligner2min_mapq, aligner), stranded = feature_orientation2stranded, annot = lambda wildcards: OPJ(annot_dir, f"{wildcards.biotype}.gtf") message: - "Counting {wildcards.orientation} {wildcards.biotype} {wildcards.feature_type} reads for {wildcards.lib}_{wildcards.rep}_{wildcards.read_type} with featureCounts." + "Counting {wildcards.orientation} {wildcards.biotype} {wildcards.feature_type} reads for {wildcards.lib}_{wildcards.rep}_{wildcards.read_type}_{wildcards.mapped_type} with featureCounts." benchmark: - OPJ(log_dir, "feature_count_reads", "{lib}_{rep}_{read_type}_{biotype}_{orientation}_{feature_type}_benchmark.txt"), + OPJ(log_dir, "feature_count_reads", "{lib}_{rep}_{read_type}_{mapped_type}_{biotype}_{orientation}_{feature_type}_benchmark.txt"), log: - log = OPJ(log_dir, "feature_count_reads", "{lib}_{rep}_{read_type}_{biotype}_{orientation}_{feature_type}.log"), - err = OPJ(log_dir, "feature_count_reads", "{lib}_{rep}_{read_type}_{biotype}_{orientation}_{feature_type}.err"), + log = OPJ(log_dir, "feature_count_reads", "{lib}_{rep}_{read_type}_{mapped_type}_{biotype}_{orientation}_{feature_type}.log"), + err = OPJ(log_dir, "feature_count_reads", "{lib}_{rep}_{read_type}_{mapped_type}_{biotype}_{orientation}_{feature_type}.err"), shell: """ - tmpdir=$(TMPDIR=/var/tmp mktemp --tmpdir -d "feature_{wildcards.lib}_{wildcards.rep}_{wildcards.read_type}_{wildcards.biotype}_{wildcards.orientation}_{wildcards.feature_type}.XXXXXXXXXX") - cmd="featureCounts -a {params.annot} -o {output.counts} -t {wildcards.feature_type} -g "gene_id" -O -s {params.stranded} --fracOverlap 1 --tmpDir ${{tmpdir}} {input}" + tmpdir=$(TMPDIR=/var/tmp mktemp --tmpdir -d "feature_{wildcards.lib}_{wildcards.rep}_{wildcards.read_type}_{wildcards.mapped_type}_{wildcards.biotype}_{wildcards.orientation}_{wildcards.feature_type}.XXXXXXXXXX") + cmd="featureCounts {params.min_mapq} -a {params.annot} -o {output.counts} -t {wildcards.feature_type} -g "gene_id" -O -s {params.stranded} --fracOverlap 1 --tmpDir ${{tmpdir}} {input}" featureCounts -v 2> {log.log} echo ${{cmd}} 1>> {log.log} eval ${{cmd}} 1>> {log.log} 2> {log.err} || error_exit "featureCounts failed" @@ -1072,11 +1252,11 @@ rule summarize_feature_counts: input: expand( OPJ(feature_counts_dir, - "{{lib}}_{{rep}}_{{read_type}}_on_%s_{biotype}_{{orientation}}_{{feature_type}}_counts.txt" % genome), + "{{lib}}_{{rep}}_{{read_type}}_{{mapped_type}}_{biotype}_{{orientation}}_{{feature_type}}_counts.txt"), biotype=COUNT_BIOTYPES), output: summary = OPJ(mapping_dir, "feature_count", "summaries", - "{lib}_{rep}_{read_type}_on_%s_{orientation}_{feature_type}_counts.txt" % genome) + "{lib}_{rep}_{read_type}_{mapped_type}_{orientation}_{feature_type}_counts.txt") run: with open(output.summary, "w") as summary_file: header = "\t".join(COUNT_BIOTYPES) @@ -1085,31 +1265,40 @@ rule summarize_feature_counts: summary_file.write(f"%s_%s_%s\t%s\n" % (wildcards.lib, wildcards.rep, wildcards.orientation, sums)) +# TODO: update consumers with mapped_type rule gather_feature_counts: """For a given biotype and read type, gather counts from all libraries in one table.""" input: counts_tables = expand( OPJ(feature_counts_dir, - "{cond_name}_{{read_type}}_on_%s_{{biotype}}_{{orientation}}_{{feature_type}}_counts.txt" % genome), + "{cond_name}_{{read_type}}_{{mapped_type}}_{{biotype}}_{{orientation}}_{{feature_type}}_counts.txt"), cond_name=COND_NAMES), output: counts_table = OPJ( feature_counts_dir, - "all_{read_type}_on_%s_{biotype}_{orientation}_{feature_type}_counts.txt" % genome), + "all_{read_type}_{mapped_type}_{biotype}_{orientation}_{feature_type}_counts.txt"), wildcard_constraints: - biotype = "|".join(COUNT_BIOTYPES) + biotype = "|".join(COUNT_BIOTYPES + BOXPLOT_BIOTYPES) run: # Gathering the counts data ############################ counts_files = (OPJ( feature_counts_dir, - f"{cond_name}_{wildcards.read_type}_on_{genome}_{wildcards.biotype}_{wildcards.orientation}_{wildcards.feature_type}_counts.txt") for cond_name in COND_NAMES) + f"{cond_name}_{wildcards.read_type}_{wildcards.mapped_type}_{wildcards.biotype}_{wildcards.orientation}_{wildcards.feature_type}_counts.txt") for cond_name in COND_NAMES) counts_data = pd.concat( (read_feature_counts( counts_file, - nb_bams=len(wildcards.read_type.split("_and_"))) for counts_file in counts_files), + nb_bams=len(wildcards.read_type.split("_and_"))).sum(axis=1) for counts_file in counts_files), axis=1).fillna(0).astype(int) - counts_data.columns = COND_NAMES + ## TODO + ## debug + try: + counts_data.columns = COND_NAMES + except ValueError as e: + print(counts_data.columns) + print(COND_NAMES) + raise + ## # Simple_repeat|Simple_repeat|(TTTTTTG)n:1 # Simple_repeat|Simple_repeat|(TTTTTTG)n:2 # Simple_repeat|Simple_repeat|(TTTTTTG)n:3 @@ -1123,6 +1312,43 @@ rule gather_feature_counts: counts_data.to_csv(output.counts_table, sep="\t") +@wc_applied +def source_feature_counts_to_join(wildcards): + """ + Determines which elementary biotype counts files should be joined to make the desired "joined" biotype. + """ + ## debug + #print("dest biotype:", wildcards.biotype) + #print("source biotypes:", ",".join(BIOTYPES_TO_JOIN[wildcards.biotype])) + ## + #if BIOTYPES_TO_JOIN[wildcards.biotype]: + return expand( + OPJ(feature_counts_dir, + "all_{{read_type}}_{{mapped_type}}_{biotype}_{{orientation}}_{{feature_type}}_counts.txt"), + biotype=BIOTYPES_TO_JOIN[wildcards.biotype]) + #return [] + + +rule join_all_feature_counts: + """Concat counts for all biotypes into all""" + input: + counts_tables = source_feature_counts_to_join, + output: + counts_table = OPJ( + feature_counts_dir, + "all_{read_type}_{mapped_type}_{biotype}_{orientation}_{feature_type}_counts.txt"), + wildcard_constraints: + biotype = "|".join(JOINED_BIOTYPES) + run: + ## debug: + print("Input:", ", ".join(input)) + ## + counts_data = pd.concat((pd.read_table(table, index_col="gene") for table in input.counts_tables)).drop_duplicates() + assert len(counts_data.index.unique()) == len(counts_data.index), "Some genes appear several times in the counts table." + counts_data.index.names = ["gene"] + counts_data.to_csv(output.counts_table, sep="\t") + + rule gather_annotations: input: expand(OPJ(annot_dir, "{biotype}.gtf"), biotype=ANNOT_BIOTYPES), @@ -1503,11 +1729,15 @@ rule join_pisimi_counts: counts_data.to_csv(output.counts_table, sep="\t") +@wc_applied def source_small_RNA_counts(wildcards): """Determines from which rule the gathered small counts should be sourced.""" - # TODO: add {small_type}_on_{genome}_{biotype}_{orientation}_{feature_type} category to use the featurecounts results + # TODO: add {small_type}_{mapped_type}_{biotype}_{orientation}_{feature_type} category to use the featurecounts results if hasattr(wildcards, "biotype"): - return rules.gather_feature_counts.output.counts_table + if wildcards.biotype in JOINED_BIOTYPES: + return rules.join_all_feature_counts.output.counts_table + else: + return rules.gather_feature_counts.output.counts_table #return OPJ( # feature_counts_dir, # f"all_{wildcards.remapped_counted}_{wildcards.feature_type}_counts.txt") @@ -1829,21 +2059,23 @@ rule gather_RPM_folds: output.all_folds, sep="\t") +# TODO: update consumers with mapped_type # TODO: replace counted_type with all subwildcards. Why does it work with cond_name? # Wildcards in input files cannot be determined from output files: # 'read_type' rule compute_remapped_RPM_folds: input: - counts_table = rules.gather_feature_counts.output.counts_table, + counts_table = source_small_RNA_counts, + #counts_table = rules.gather_feature_counts.output.counts_table, #exon_lengths = rules.compute_genes_exon_lengths.output.exon_lengths, summary_table = rules.gather_read_counts_summaries.output.summary_table, #tags_table = rules.associate_small_type.output.tags_table, output: fold_results = OPJ( mapping_dir, f"RPM_folds_{size_selected}", - "{contrast}", "remapped", "{contrast}_{read_type}_on_%s_{biotype}_{orientation}_{feature_type}_RPM_folds.txt" % genome), + "{contrast}", "remapped", "{contrast}_{read_type}_{mapped_type}_{biotype}_{orientation}_{feature_type}_RPM_folds.txt"), log: - log = OPJ(log_dir, "compute_RPM_folds", "{contrast}_{read_type}_on_%s_{biotype}_{orientation}_{feature_type}.log" % genome), + log = OPJ(log_dir, "compute_RPM_folds", "{contrast}_{read_type}_{mapped_type}_{biotype}_{orientation}_{feature_type}.log"), run: with open(log.log, "w") as logfile: (cond, ref) = CONTRAST2PAIR[wildcards.contrast] @@ -1893,24 +2125,26 @@ rule gather_remapped_RPM_folds: input: fold_results = expand(OPJ( mapping_dir, f"RPM_folds_{size_selected}", - "{contrast}", "remapped", "{contrast}_{{read_type}}_on_%s_{{biotype}}_{{orientation}}_transcript_RPM_folds.txt" % genome), contrast=IP_CONTRASTS), + "{contrast}", "remapped", "{contrast}_{{read_type}}_{{mapped_type}}_{{biotype}}_{{orientation}}_transcript_RPM_folds.txt"), contrast=IP_CONTRASTS), output: all_folds = OPJ( - mapping_dir, f"RPM_folds_{size_selected}", "all", "remapped", "{read_type}_on_%s_{biotype}_{orientation}_transcript_mean_log2_RPM_fold.txt" % genome), + mapping_dir, f"RPM_folds_{size_selected}", "all", "remapped", "{read_type}_{mapped_type}_{biotype}_{orientation}_transcript_mean_log2_RPM_fold.txt"), log: - log = OPJ(log_dir, "gather_RPM_folds", "{read_type}_on_%s_{biotype}_{orientation}_transcript.log" % genome), + log = OPJ(log_dir, "gather_RPM_folds", "{read_type}_{mapped_type}_{biotype}_{orientation}_transcript.log"), run: pd.DataFrame({contrast : pd.read_table( # OPJ(mapping_dir, f"RPM_folds_{size_selected}", # f"{contrast}", f"{contrast}_{wildcards.counted_type}_RPM_folds.txt"), OPJ(mapping_dir, f"RPM_folds_{size_selected}", - f"{contrast}", "remapped", f"{contrast}_{wildcards.read_type}_on_{genome}_{wildcards.biotype}_{wildcards.orientation}_transcript_RPM_folds.txt"), + f"{contrast}", "remapped", f"{contrast}_{wildcards.read_type}_{wildcards.mapped_type}_{wildcards.biotype}_{wildcards.orientation}_transcript_RPM_folds.txt"), index_col=["gene", "cosmid", "name"], usecols=["gene", "cosmid", "name", "mean_log2_RPM_fold"])["mean_log2_RPM_fold"] for contrast in IP_CONTRASTS}).to_csv( output.all_folds, sep="\t") +# TODO update consumers of remapped_RPM_folds # TODO add remapped_RPM_folds +@wc_applied def source_gathered_folds(wildcards): if hasattr(wildcards, "counted_type"): return rules.gather_remapped_RPM_folds.output.all_folds @@ -2108,7 +2342,7 @@ rule test_size_factor: try: xlabel = "log10(normalized counts)" save_plot(pp, plot_counts_distribution, data, xlabel, - format=wildcards.fig_format, + format="pdf", title=params.counts_distrib_plot_title.format(normalizer)) except TypeError as e: if str(e) == "Empty 'DataFrame': no numeric data to plot": @@ -2236,6 +2470,7 @@ your deepTools settings and check your input files. raise +@wc_applied def source_norm_file(wildcards): if wildcards.norm == "median_ratio_to_pseudo_ref": return OPJ(mapping_dir, "annotation", @@ -3307,13 +3542,13 @@ rule plot_scatterplots: # raise NotImplementedError("Unknown contrast: %s" % wildcards.contrast) save_plot(output.pairplots, plot_paired_scatters, counts_and_res, columns=column_list, hue="status", - format=wildcards.fig_format, + format="pdf", title="Sample comparison (normalized %sRNA counts) for %s\n(size factor: %s)" % (wildcards.small_type, wildcards.contrast, wildcards.norm)) #pp = PdfPages(output.pairplots) #for (condition, column_list) in column_lists.items(): # save_plot(pp, plot_paired_scatters, # counts_and_res, column_list, "status", - # format=wildcards.fig_format, + # format="pdf", # title="Sample comparison (normalized %sRNA counts) for %s\n(size factor: %s)" % (wildcards.small_type, condition, NORMALIZER)) #pp.close() # save_plot(output.pairplot, sns.pairplot, counts_and_res, @@ -3336,6 +3571,7 @@ all_id_lists = dict(zip( filter(None.__ne__, map(get_id_list, config["gene_lists"])))) +@wc_applied def source_fold_data(wildcards): fold_type = wildcards.fold_type if fold_type in {"log2FoldChange", "lfcMLE"}: @@ -3362,7 +3598,7 @@ def source_fold_data(wildcards): OPJ(mapping_dir, "RPM_folds_%s" % size_selected, "{{0.contrast}}", "remapped", - "{{0.contrast}}_{{0.read_type}}_on_%s_{biotype}_{{0.orientation}}_transcript_RPM_folds.txt" % genome), + "{{0.contrast}}_{{0.read_type}}_{{0.mapped_type}}_{biotype}_{{0.orientation}}_transcript_RPM_folds.txt"), biotype=wildcards.biotypes.split("_and_"))] else: return rules.compute_RPM_folds.output.fold_results @@ -3427,15 +3663,17 @@ rule make_contrast_lfc_boxplots: rule make_biotype_lfc_boxplots: """For a given gene list, make one box for each contrast.""" input: + #print_wc, data = source_fold_data, output: boxplots = OPJ(output_dir, "figures", "{contrast}", "by_biotypes", - "{contrast}_{read_type}_on_%s_{biotypes}_{orientation}_transcript_{fold_type}_{gene_list}_boxplots.pdf" % genome) + "{contrast}_{read_type}_{mapped_type}_{biotypes}_{orientation}_transcript_{fold_type}_{gene_list}_boxplots.pdf") params: id_lists = set_id_lists, #shell: # """touch {output.boxplots}""" run: + # TODO: test lfcs.shape before trying to plot lfcs = pd.DataFrame( {f"{biotype}_{list_name}" : pd.read_table(filename, index_col="gene").loc[ set(id_list)]["mean_log2_RPM_fold"] for ( @@ -3446,7 +3684,7 @@ rule make_biotype_lfc_boxplots: plot_boxplots, lfcs, wildcards.fold_type, - title=f"{wildcards.read_type} folds for {wildcards.biotypes} contrasts") + title=f"{wildcards.read_type}_{wildcards.mapped_type} folds for {wildcards.biotypes} contrasts") ########################## @@ -3500,6 +3738,7 @@ def get_type_max_len(wildcards): rule compute_base_composition_along: input: reads = source_fastq, + #reads = wc_applied(source_fastq), # rules.select_size_range.output.size_selected output: composition = OPJ(data_dir, "read_stats", "{lib}_{rep}_{read_type}_base_composition_from_{position}.txt"),