Skip to content
Snippets Groups Projects
Commit cff1d2d4 authored by Blaise Li's avatar Blaise Li
Browse files

Adding more biotypes, fixing various bug.

parent 01da51a2
No related branches found
No related tags found
No related merge requests found
...@@ -59,11 +59,11 @@ import matplotlib.pyplot as plt ...@@ -59,11 +59,11 @@ import matplotlib.pyplot as plt
import seaborn as sns import seaborn as sns
import husl import husl
from libdeseq import do_deseq2, from libdeseq import do_deseq2
from libhts import median_ratio_to_pseudo_ref_size_factors, status_setter, plot_lfc_distribution, plot_MA from libhts import median_ratio_to_pseudo_ref_size_factors, status_setter, plot_lfc_distribution, plot_MA
from libworkflows import ensure_relative, cleanup_and_backup from libworkflows import ensure_relative, cleanup_and_backup, texscape
from libworkflows import get_chrom_sizes, column_converter from libworkflows import get_chrom_sizes, column_converter
from libworkflows import strip_split, last_lines, save_plot, test_na_file from libworkflows import strip_split, file_len, last_lines, save_plot, test_na_file
from libworkflows import make_id_list_getter, filter_combinator, SHELL_FUNCTIONS, warn_context from libworkflows import make_id_list_getter, filter_combinator, SHELL_FUNCTIONS, warn_context
from libworkflows import feature_orientation2stranded from libworkflows import feature_orientation2stranded
from libworkflows import read_htseq_counts, sum_htseq_counts from libworkflows import read_htseq_counts, sum_htseq_counts
...@@ -140,8 +140,15 @@ COND_COLUMNS = pd.DataFrame(CONDITIONS).assign( ...@@ -140,8 +140,15 @@ COND_COLUMNS = pd.DataFrame(CONDITIONS).assign(
COUNT_BIOTYPES = config["count_biotypes"] COUNT_BIOTYPES = config["count_biotypes"]
ANNOT_BIOTYPES = config["annot_biotypes"] ANNOT_BIOTYPES = config["annot_biotypes"]
#METAGENE_BIOTYPES=["protein_coding", "DNA_transposons_rmsk", "RNA_transposons_rmsk"] #METAGENE_BIOTYPES=["protein_coding", "DNA_transposons_rmsk", "RNA_transposons_rmsk"]
METAGENE_BIOTYPES=["protein_coding"] #METAGENE_BIOTYPES=["protein_coding"]
ID_LISTS = ["lfc_statuses", "germline_specific", "histone", "spermatogenic_Ortiz_2014", "oogenic_Ortiz_2014"] METAGENE_BIOTYPES=["protein_coding", "protein_coding_5UTR", "protein_coding_CDS", "protein_coding_3UTR"]
ID_LISTS = [
"lfc_statuses",
"germline_specific",
"histone",
"spermatogenic_Ortiz_2014", "oogenic_Ortiz_2014",
"piRNA_dependent_prot_si_down4_WR_RT_top200", "piRNA_dependent_prot_si_down4",
"csr1_prot_si_supertargets_common"]
annot_dir = config["annot_dir"] annot_dir = config["annot_dir"]
gene_lists_dir = "/pasteur/entites/Mhe/bli/Gene_lists" gene_lists_dir = "/pasteur/entites/Mhe/bli/Gene_lists"
avail_id_lists = set(glob(OPJ(gene_lists_dir, "*_ids.txt"))) avail_id_lists = set(glob(OPJ(gene_lists_dir, "*_ids.txt")))
...@@ -336,7 +343,7 @@ rule trim_and_dedup: ...@@ -336,7 +343,7 @@ rule trim_and_dedup:
err = OPJ(log_dir, "{trimmer}", "trim_and_dedup", "{lib}_{rep}.err"), err = OPJ(log_dir, "{trimmer}", "trim_and_dedup", "{lib}_{rep}.err"),
run: run:
shell_commands = """ shell_commands = """
THREADS="{threads}" {params.process_type}_trim_and_dedup.sh {trimmer} {input} \\ THREADS="{threads}" {params.process_type}_trim_and_dedup.sh {wildcards.trimmer} {input} \\
{params.adapter} {params.trim5} {params.trim3} \\ {params.adapter} {params.trim5} {params.trim3} \\
{output.adapt} {output.noadapt} {log.trim} \\ {output.adapt} {output.noadapt} {log.trim} \\
{output.nb_raw} {output.nb_adapt} {output.nb_adapt_deduped} \\ {output.nb_raw} {output.nb_adapt} {output.nb_adapt_deduped} \\
...@@ -345,6 +352,7 @@ THREADS="{threads}" {params.process_type}_trim_and_dedup.sh {trimmer} {input} \\ ...@@ -345,6 +352,7 @@ THREADS="{threads}" {params.process_type}_trim_and_dedup.sh {trimmer} {input} \\
shell(shell_commands) shell(shell_commands)
# TODO: Do not deduplicate, or at least do not use the noadapt_deduped: The 3' UMI is not present.
rule map_on_genome: rule map_on_genome:
input: input:
fastq = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_{type}_deduped.fastq.gz"), fastq = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_{type}_deduped.fastq.gz"),
...@@ -465,6 +473,7 @@ rule check_last_base: ...@@ -465,6 +473,7 @@ rule check_last_base:
print(length, nb_reads_this_length, *[str(counter[letter] / nb_reads_this_length) for letter in "ACGTN"], sep="\t", file=output_file) print(length, nb_reads_this_length, *[str(counter[letter] / nb_reads_this_length) for letter in "ACGTN"], sep="\t", file=output_file)
# TODO: use Python to make the plot
rule plot_last_base: rule plot_last_base:
input: input:
OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", "{lib}_{rep}_adapt_on_C_elegans_last_bases.txt") OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", "{lib}_{rep}_adapt_on_C_elegans_last_bases.txt")
...@@ -629,6 +638,7 @@ rule do_PCA: ...@@ -629,6 +638,7 @@ rule do_PCA:
OPJ(output_dir, "{trimmer}", "figures", aligner, "{counter}", "{biotype}_{orientation}_PCA.{fig_format}"), OPJ(output_dir, "{trimmer}", "figures", aligner, "{counter}", "{biotype}_{orientation}_PCA.{fig_format}"),
message: message:
"Summarizing counts for {wildcards.biotype}_{wildcards.orientation}" "Summarizing counts for {wildcards.biotype}_{wildcards.orientation}"
threads: 12 # trying to avoid TimeoutError and "LOCKERROR: matplotlib is trying to acquire the lock [...]"
log: log:
OPJ(log_dir, "{trimmer}", "do_PCA_{biotype}_{orientation}.log") OPJ(log_dir, "{trimmer}", "do_PCA_{biotype}_{orientation}.log")
run: run:
...@@ -1032,7 +1042,12 @@ rule join_all_counts: ...@@ -1032,7 +1042,12 @@ rule join_all_counts:
"""concat counts for all biotypes into all""" """concat counts for all biotypes into all"""
input: input:
#counts_tables = expand(OPJ(output_dir, "{{trimmer}}", aligner, "mapped_C_elegans", "{{counter}}", "all_on_C_elegans", "{biotype}_{{orientation}}_counts.txt"), biotype=[bty for bty in COUNT_BIOTYPES if bty[-9:] != "_families"]), #counts_tables = expand(OPJ(output_dir, "{{trimmer}}", aligner, "mapped_C_elegans", "{{counter}}", "all_on_C_elegans", "{biotype}_{{orientation}}_counts.txt"), biotype=[bty for bty in COUNT_BIOTYPES if bty[-9:] != "_families"]),
counts_tables = expand(OPJ(output_dir, "{{trimmer}}", aligner, "mapped_C_elegans", "{{counter}}", "all_on_C_elegans", "{biotype}_{{orientation}}_counts.txt"), biotype=COUNT_BIOTYPES), counts_tables = expand(
OPJ(output_dir, "{{trimmer}}", aligner, "mapped_C_elegans",
"{{counter}}", "all_on_C_elegans",
"{biotype}_{{orientation}}_counts.txt"),
# We only count "protein_coding", not "protein_codin_{5UTR,CDS,3UTR}"
biotype=[b for b in COUNT_BIOTYPES if not b.startswith("protein_coding_")]),
output: output:
counts_table = OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", "{counter}", "all_on_C_elegans", "alltypes_{orientation}_counts.txt"), counts_table = OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", "{counter}", "all_on_C_elegans", "alltypes_{orientation}_counts.txt"),
run: run:
...@@ -1196,8 +1211,11 @@ rule make_MA_plot: ...@@ -1196,8 +1211,11 @@ rule make_MA_plot:
group2colour=group2colour, group2colour=group2colour,
lfc_range = params.lfc_range, lfc_range = params.lfc_range,
fold_type = "log2FoldChange", fold_type = "log2FoldChange",
title="MA-plot for %s, %s_%s" % ( title=texscape(
wildcards.contrast, wildcards.orientation, wildcards.biotype)) "MA-plot for %s, %s_%s" % (
wildcards.contrast,
wildcards.orientation,
wildcards.biotype)))
################## ##################
...@@ -1522,6 +1540,13 @@ def meta_params(wildcards): ...@@ -1522,6 +1540,13 @@ def meta_params(wildcards):
"-m %d" % META_SCALE, "-m %d" % META_SCALE,
"--unscaled3prime %d" % UNSCALED_INSIDE, "--unscaled3prime %d" % UNSCALED_INSIDE,
"-a %d" % META_MARGIN]) "-a %d" % META_MARGIN])
if biotype.startswith("protein_coding_"):
return " ".join([
"-b %d" % 0,
"--unscaled5prime %d" % UNSCALED_INSIDE,
"-m %d" % META_SCALE,
"--unscaled3prime %d" % UNSCALED_INSIDE,
"-a %d" % 0])
else: else:
raise NotImplementedError("Metagene analyses for %s not implemented." % biotype) raise NotImplementedError("Metagene analyses for %s not implemented." % biotype)
...@@ -1543,25 +1568,27 @@ rule plot_meta_profile_mean: ...@@ -1543,25 +1568,27 @@ rule plot_meta_profile_mean:
plot_TSS_log = OPJ(log_dir, "{trimmer}", "plot_meta_profile", "{lib}", "{orientation}_on_merged_isolated_%d_{biotype}_min_%d.log" % (MIN_DIST, META_MIN_LEN)), plot_TSS_log = OPJ(log_dir, "{trimmer}", "plot_meta_profile", "{lib}", "{orientation}_on_merged_isolated_%d_{biotype}_min_%d.log" % (MIN_DIST, META_MIN_LEN)),
plot_TSS_err = OPJ(log_dir, "{trimmer}", "plot_meta_profile", "{lib}", "{orientation}_on_merged_isolated_%d_{biotype}_min_%d.err" % (MIN_DIST, META_MIN_LEN)), plot_TSS_err = OPJ(log_dir, "{trimmer}", "plot_meta_profile", "{lib}", "{orientation}_on_merged_isolated_%d_{biotype}_min_%d.err" % (MIN_DIST, META_MIN_LEN)),
threads: 12 # to limit memory usage, actually threads: 12 # to limit memory usage, actually
shell: run:
""" if file_len(input.bed):
tmpdir=$(mktemp -dt "plot_meta_profile_{wildcards.trimmer}_{wildcards.lib}_{wildcards.orientation}_{wildcards.biotype}_%d.XXXXXXXXXX") shell("""tmpdir=$(mktemp -dt "plot_meta_profile_{wildcards.trimmer}_{wildcards.lib}_{wildcards.orientation}_{wildcards.biotype}_%d.XXXXXXXXXX")
computeMatrix scale-regions -S {input.bigwig} \\ computeMatrix scale-regions -S {input.bigwig} \\
-R {input.bed} \\ -R {input.bed} \\
{params.meta_params} \\ {params.meta_params} \\
-p 1 \\ -p 1 \\
-out ${{tmpdir}}/meta_profile.gz \\ -out ${{tmpdir}}/meta_profile.gz \\
--skipZeros \\ --skipZeros \\
1> {log.plot_TSS_log} \\ 1> {log.plot_TSS_log} \\
2> {log.plot_TSS_err} \\ 2> {log.plot_TSS_err} \\
|| error_exit "computeMatrix failed" || error_exit "computeMatrix failed"
plotProfile -m ${{tmpdir}}/meta_profile.gz -out {output} \\ plotProfile -m ${{tmpdir}}/meta_profile.gz -out {output} \\
1>> {log.plot_TSS_log} \\ 1>> {log.plot_TSS_log} \\
2>> {log.plot_TSS_err} \\ 2>> {log.plot_TSS_err} \\
|| error_exit "plotProfile failed" || error_exit "plotProfile failed"
rm -rf ${{tmpdir}} rm -rf ${{tmpdir}}
""" % META_MIN_LEN """ % META_MIN_LEN)
else:
warnings.warn("No regions selected in {input.bed}. Generating empty figure.\n")
shell("""> {output}""")
onsuccess: onsuccess:
print("PRO-seq analysis finished.") print("PRO-seq analysis finished.")
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment