Commit cff1d2d4 authored by Blaise Li's avatar Blaise Li
Browse files

Adding more biotypes, fixing various bug.

parent 01da51a2
......@@ -59,11 +59,11 @@ import matplotlib.pyplot as plt
import seaborn as sns
import husl
from libdeseq import do_deseq2,
from libdeseq import do_deseq2
from libhts import median_ratio_to_pseudo_ref_size_factors, status_setter, plot_lfc_distribution, plot_MA
from libworkflows import ensure_relative, cleanup_and_backup
from libworkflows import ensure_relative, cleanup_and_backup, texscape
from libworkflows import get_chrom_sizes, column_converter
from libworkflows import strip_split, last_lines, save_plot, test_na_file
from libworkflows import strip_split, file_len, last_lines, save_plot, test_na_file
from libworkflows import make_id_list_getter, filter_combinator, SHELL_FUNCTIONS, warn_context
from libworkflows import feature_orientation2stranded
from libworkflows import read_htseq_counts, sum_htseq_counts
......@@ -140,8 +140,15 @@ COND_COLUMNS = pd.DataFrame(CONDITIONS).assign(
COUNT_BIOTYPES = config["count_biotypes"]
ANNOT_BIOTYPES = config["annot_biotypes"]
#METAGENE_BIOTYPES=["protein_coding", "DNA_transposons_rmsk", "RNA_transposons_rmsk"]
METAGENE_BIOTYPES=["protein_coding"]
ID_LISTS = ["lfc_statuses", "germline_specific", "histone", "spermatogenic_Ortiz_2014", "oogenic_Ortiz_2014"]
#METAGENE_BIOTYPES=["protein_coding"]
METAGENE_BIOTYPES=["protein_coding", "protein_coding_5UTR", "protein_coding_CDS", "protein_coding_3UTR"]
ID_LISTS = [
"lfc_statuses",
"germline_specific",
"histone",
"spermatogenic_Ortiz_2014", "oogenic_Ortiz_2014",
"piRNA_dependent_prot_si_down4_WR_RT_top200", "piRNA_dependent_prot_si_down4",
"csr1_prot_si_supertargets_common"]
annot_dir = config["annot_dir"]
gene_lists_dir = "/pasteur/entites/Mhe/bli/Gene_lists"
avail_id_lists = set(glob(OPJ(gene_lists_dir, "*_ids.txt")))
......@@ -336,7 +343,7 @@ rule trim_and_dedup:
err = OPJ(log_dir, "{trimmer}", "trim_and_dedup", "{lib}_{rep}.err"),
run:
shell_commands = """
THREADS="{threads}" {params.process_type}_trim_and_dedup.sh {trimmer} {input} \\
THREADS="{threads}" {params.process_type}_trim_and_dedup.sh {wildcards.trimmer} {input} \\
{params.adapter} {params.trim5} {params.trim3} \\
{output.adapt} {output.noadapt} {log.trim} \\
{output.nb_raw} {output.nb_adapt} {output.nb_adapt_deduped} \\
......@@ -345,6 +352,7 @@ THREADS="{threads}" {params.process_type}_trim_and_dedup.sh {trimmer} {input} \\
shell(shell_commands)
# TODO: Do not deduplicate, or at least do not use the noadapt_deduped: The 3' UMI is not present.
rule map_on_genome:
input:
fastq = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_{type}_deduped.fastq.gz"),
......@@ -465,6 +473,7 @@ rule check_last_base:
print(length, nb_reads_this_length, *[str(counter[letter] / nb_reads_this_length) for letter in "ACGTN"], sep="\t", file=output_file)
# TODO: use Python to make the plot
rule plot_last_base:
input:
OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", "{lib}_{rep}_adapt_on_C_elegans_last_bases.txt")
......@@ -629,6 +638,7 @@ rule do_PCA:
OPJ(output_dir, "{trimmer}", "figures", aligner, "{counter}", "{biotype}_{orientation}_PCA.{fig_format}"),
message:
"Summarizing counts for {wildcards.biotype}_{wildcards.orientation}"
threads: 12 # trying to avoid TimeoutError and "LOCKERROR: matplotlib is trying to acquire the lock [...]"
log:
OPJ(log_dir, "{trimmer}", "do_PCA_{biotype}_{orientation}.log")
run:
......@@ -1032,7 +1042,12 @@ rule join_all_counts:
"""concat counts for all biotypes into all"""
input:
#counts_tables = expand(OPJ(output_dir, "{{trimmer}}", aligner, "mapped_C_elegans", "{{counter}}", "all_on_C_elegans", "{biotype}_{{orientation}}_counts.txt"), biotype=[bty for bty in COUNT_BIOTYPES if bty[-9:] != "_families"]),
counts_tables = expand(OPJ(output_dir, "{{trimmer}}", aligner, "mapped_C_elegans", "{{counter}}", "all_on_C_elegans", "{biotype}_{{orientation}}_counts.txt"), biotype=COUNT_BIOTYPES),
counts_tables = expand(
OPJ(output_dir, "{{trimmer}}", aligner, "mapped_C_elegans",
"{{counter}}", "all_on_C_elegans",
"{biotype}_{{orientation}}_counts.txt"),
# We only count "protein_coding", not "protein_codin_{5UTR,CDS,3UTR}"
biotype=[b for b in COUNT_BIOTYPES if not b.startswith("protein_coding_")]),
output:
counts_table = OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", "{counter}", "all_on_C_elegans", "alltypes_{orientation}_counts.txt"),
run:
......@@ -1196,8 +1211,11 @@ rule make_MA_plot:
group2colour=group2colour,
lfc_range = params.lfc_range,
fold_type = "log2FoldChange",
title="MA-plot for %s, %s_%s" % (
wildcards.contrast, wildcards.orientation, wildcards.biotype))
title=texscape(
"MA-plot for %s, %s_%s" % (
wildcards.contrast,
wildcards.orientation,
wildcards.biotype)))
##################
......@@ -1522,6 +1540,13 @@ def meta_params(wildcards):
"-m %d" % META_SCALE,
"--unscaled3prime %d" % UNSCALED_INSIDE,
"-a %d" % META_MARGIN])
if biotype.startswith("protein_coding_"):
return " ".join([
"-b %d" % 0,
"--unscaled5prime %d" % UNSCALED_INSIDE,
"-m %d" % META_SCALE,
"--unscaled3prime %d" % UNSCALED_INSIDE,
"-a %d" % 0])
else:
raise NotImplementedError("Metagene analyses for %s not implemented." % biotype)
......@@ -1543,25 +1568,27 @@ rule plot_meta_profile_mean:
plot_TSS_log = OPJ(log_dir, "{trimmer}", "plot_meta_profile", "{lib}", "{orientation}_on_merged_isolated_%d_{biotype}_min_%d.log" % (MIN_DIST, META_MIN_LEN)),
plot_TSS_err = OPJ(log_dir, "{trimmer}", "plot_meta_profile", "{lib}", "{orientation}_on_merged_isolated_%d_{biotype}_min_%d.err" % (MIN_DIST, META_MIN_LEN)),
threads: 12 # to limit memory usage, actually
shell:
"""
tmpdir=$(mktemp -dt "plot_meta_profile_{wildcards.trimmer}_{wildcards.lib}_{wildcards.orientation}_{wildcards.biotype}_%d.XXXXXXXXXX")
computeMatrix scale-regions -S {input.bigwig} \\
-R {input.bed} \\
{params.meta_params} \\
-p 1 \\
-out ${{tmpdir}}/meta_profile.gz \\
--skipZeros \\
1> {log.plot_TSS_log} \\
2> {log.plot_TSS_err} \\
|| error_exit "computeMatrix failed"
plotProfile -m ${{tmpdir}}/meta_profile.gz -out {output} \\
1>> {log.plot_TSS_log} \\
2>> {log.plot_TSS_err} \\
|| error_exit "plotProfile failed"
rm -rf ${{tmpdir}}
""" % META_MIN_LEN
run:
if file_len(input.bed):
shell("""tmpdir=$(mktemp -dt "plot_meta_profile_{wildcards.trimmer}_{wildcards.lib}_{wildcards.orientation}_{wildcards.biotype}_%d.XXXXXXXXXX")
computeMatrix scale-regions -S {input.bigwig} \\
-R {input.bed} \\
{params.meta_params} \\
-p 1 \\
-out ${{tmpdir}}/meta_profile.gz \\
--skipZeros \\
1> {log.plot_TSS_log} \\
2> {log.plot_TSS_err} \\
|| error_exit "computeMatrix failed"
plotProfile -m ${{tmpdir}}/meta_profile.gz -out {output} \\
1>> {log.plot_TSS_log} \\
2>> {log.plot_TSS_err} \\
|| error_exit "plotProfile failed"
rm -rf ${{tmpdir}}
""" % META_MIN_LEN)
else:
warnings.warn("No regions selected in {input.bed}. Generating empty figure.\n")
shell("""> {output}""")
onsuccess:
print("PRO-seq analysis finished.")
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment