diff --git a/PRO-seq/PRO-seq.snakefile b/PRO-seq/PRO-seq.snakefile index f3e80d0c2ff0a7939e421493c461bbe67458f966..cfbbfd16e2a8e6f2151d07bf0a25665bd8b229ed 100644 --- a/PRO-seq/PRO-seq.snakefile +++ b/PRO-seq/PRO-seq.snakefile @@ -59,11 +59,11 @@ import matplotlib.pyplot as plt import seaborn as sns import husl -from libdeseq import do_deseq2, +from libdeseq import do_deseq2 from libhts import median_ratio_to_pseudo_ref_size_factors, status_setter, plot_lfc_distribution, plot_MA -from libworkflows import ensure_relative, cleanup_and_backup +from libworkflows import ensure_relative, cleanup_and_backup, texscape from libworkflows import get_chrom_sizes, column_converter -from libworkflows import strip_split, last_lines, save_plot, test_na_file +from libworkflows import strip_split, file_len, last_lines, save_plot, test_na_file from libworkflows import make_id_list_getter, filter_combinator, SHELL_FUNCTIONS, warn_context from libworkflows import feature_orientation2stranded from libworkflows import read_htseq_counts, sum_htseq_counts @@ -140,8 +140,15 @@ COND_COLUMNS = pd.DataFrame(CONDITIONS).assign( COUNT_BIOTYPES = config["count_biotypes"] ANNOT_BIOTYPES = config["annot_biotypes"] #METAGENE_BIOTYPES=["protein_coding", "DNA_transposons_rmsk", "RNA_transposons_rmsk"] -METAGENE_BIOTYPES=["protein_coding"] -ID_LISTS = ["lfc_statuses", "germline_specific", "histone", "spermatogenic_Ortiz_2014", "oogenic_Ortiz_2014"] +#METAGENE_BIOTYPES=["protein_coding"] +METAGENE_BIOTYPES=["protein_coding", "protein_coding_5UTR", "protein_coding_CDS", "protein_coding_3UTR"] +ID_LISTS = [ + "lfc_statuses", + "germline_specific", + "histone", + "spermatogenic_Ortiz_2014", "oogenic_Ortiz_2014", + "piRNA_dependent_prot_si_down4_WR_RT_top200", "piRNA_dependent_prot_si_down4", + "csr1_prot_si_supertargets_common"] annot_dir = config["annot_dir"] gene_lists_dir = "/pasteur/entites/Mhe/bli/Gene_lists" avail_id_lists = set(glob(OPJ(gene_lists_dir, "*_ids.txt"))) @@ -336,7 +343,7 @@ rule trim_and_dedup: err = OPJ(log_dir, "{trimmer}", "trim_and_dedup", "{lib}_{rep}.err"), run: shell_commands = """ -THREADS="{threads}" {params.process_type}_trim_and_dedup.sh {trimmer} {input} \\ +THREADS="{threads}" {params.process_type}_trim_and_dedup.sh {wildcards.trimmer} {input} \\ {params.adapter} {params.trim5} {params.trim3} \\ {output.adapt} {output.noadapt} {log.trim} \\ {output.nb_raw} {output.nb_adapt} {output.nb_adapt_deduped} \\ @@ -345,6 +352,7 @@ THREADS="{threads}" {params.process_type}_trim_and_dedup.sh {trimmer} {input} \\ shell(shell_commands) +# TODO: Do not deduplicate, or at least do not use the noadapt_deduped: The 3' UMI is not present. rule map_on_genome: input: fastq = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_{type}_deduped.fastq.gz"), @@ -465,6 +473,7 @@ rule check_last_base: print(length, nb_reads_this_length, *[str(counter[letter] / nb_reads_this_length) for letter in "ACGTN"], sep="\t", file=output_file) +# TODO: use Python to make the plot rule plot_last_base: input: OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", "{lib}_{rep}_adapt_on_C_elegans_last_bases.txt") @@ -629,6 +638,7 @@ rule do_PCA: OPJ(output_dir, "{trimmer}", "figures", aligner, "{counter}", "{biotype}_{orientation}_PCA.{fig_format}"), message: "Summarizing counts for {wildcards.biotype}_{wildcards.orientation}" + threads: 12 # trying to avoid TimeoutError and "LOCKERROR: matplotlib is trying to acquire the lock [...]" log: OPJ(log_dir, "{trimmer}", "do_PCA_{biotype}_{orientation}.log") run: @@ -1032,7 +1042,12 @@ rule join_all_counts: """concat counts for all biotypes into all""" input: #counts_tables = expand(OPJ(output_dir, "{{trimmer}}", aligner, "mapped_C_elegans", "{{counter}}", "all_on_C_elegans", "{biotype}_{{orientation}}_counts.txt"), biotype=[bty for bty in COUNT_BIOTYPES if bty[-9:] != "_families"]), - counts_tables = expand(OPJ(output_dir, "{{trimmer}}", aligner, "mapped_C_elegans", "{{counter}}", "all_on_C_elegans", "{biotype}_{{orientation}}_counts.txt"), biotype=COUNT_BIOTYPES), + counts_tables = expand( + OPJ(output_dir, "{{trimmer}}", aligner, "mapped_C_elegans", + "{{counter}}", "all_on_C_elegans", + "{biotype}_{{orientation}}_counts.txt"), + # We only count "protein_coding", not "protein_codin_{5UTR,CDS,3UTR}" + biotype=[b for b in COUNT_BIOTYPES if not b.startswith("protein_coding_")]), output: counts_table = OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", "{counter}", "all_on_C_elegans", "alltypes_{orientation}_counts.txt"), run: @@ -1196,8 +1211,11 @@ rule make_MA_plot: group2colour=group2colour, lfc_range = params.lfc_range, fold_type = "log2FoldChange", - title="MA-plot for %s, %s_%s" % ( - wildcards.contrast, wildcards.orientation, wildcards.biotype)) + title=texscape( + "MA-plot for %s, %s_%s" % ( + wildcards.contrast, + wildcards.orientation, + wildcards.biotype))) ################## @@ -1522,6 +1540,13 @@ def meta_params(wildcards): "-m %d" % META_SCALE, "--unscaled3prime %d" % UNSCALED_INSIDE, "-a %d" % META_MARGIN]) + if biotype.startswith("protein_coding_"): + return " ".join([ + "-b %d" % 0, + "--unscaled5prime %d" % UNSCALED_INSIDE, + "-m %d" % META_SCALE, + "--unscaled3prime %d" % UNSCALED_INSIDE, + "-a %d" % 0]) else: raise NotImplementedError("Metagene analyses for %s not implemented." % biotype) @@ -1543,25 +1568,27 @@ rule plot_meta_profile_mean: plot_TSS_log = OPJ(log_dir, "{trimmer}", "plot_meta_profile", "{lib}", "{orientation}_on_merged_isolated_%d_{biotype}_min_%d.log" % (MIN_DIST, META_MIN_LEN)), plot_TSS_err = OPJ(log_dir, "{trimmer}", "plot_meta_profile", "{lib}", "{orientation}_on_merged_isolated_%d_{biotype}_min_%d.err" % (MIN_DIST, META_MIN_LEN)), threads: 12 # to limit memory usage, actually - shell: - """ - tmpdir=$(mktemp -dt "plot_meta_profile_{wildcards.trimmer}_{wildcards.lib}_{wildcards.orientation}_{wildcards.biotype}_%d.XXXXXXXXXX") - computeMatrix scale-regions -S {input.bigwig} \\ - -R {input.bed} \\ - {params.meta_params} \\ - -p 1 \\ - -out ${{tmpdir}}/meta_profile.gz \\ - --skipZeros \\ - 1> {log.plot_TSS_log} \\ - 2> {log.plot_TSS_err} \\ - || error_exit "computeMatrix failed" - plotProfile -m ${{tmpdir}}/meta_profile.gz -out {output} \\ - 1>> {log.plot_TSS_log} \\ - 2>> {log.plot_TSS_err} \\ - || error_exit "plotProfile failed" - rm -rf ${{tmpdir}} - """ % META_MIN_LEN - + run: + if file_len(input.bed): + shell("""tmpdir=$(mktemp -dt "plot_meta_profile_{wildcards.trimmer}_{wildcards.lib}_{wildcards.orientation}_{wildcards.biotype}_%d.XXXXXXXXXX") +computeMatrix scale-regions -S {input.bigwig} \\ + -R {input.bed} \\ + {params.meta_params} \\ + -p 1 \\ + -out ${{tmpdir}}/meta_profile.gz \\ + --skipZeros \\ + 1> {log.plot_TSS_log} \\ + 2> {log.plot_TSS_err} \\ + || error_exit "computeMatrix failed" +plotProfile -m ${{tmpdir}}/meta_profile.gz -out {output} \\ + 1>> {log.plot_TSS_log} \\ + 2>> {log.plot_TSS_err} \\ + || error_exit "plotProfile failed" +rm -rf ${{tmpdir}} +""" % META_MIN_LEN) + else: + warnings.warn("No regions selected in {input.bed}. Generating empty figure.\n") + shell("""> {output}""") onsuccess: print("PRO-seq analysis finished.")