Commit 48ff58c3 authored by Blaise Li's avatar Blaise Li
Browse files

Fixing normalization.

The desired normalizer for bigwig files was not actually used.
parent a24e5022
......@@ -18,6 +18,17 @@ from glob import glob
from pickle import load
from fileinput import input as finput
import warnings
def formatwarning(message, category, filename, lineno, line):
"""Used to format warning messages."""
return "%s:%s: %s: %s\n" % (filename, lineno, category.__name__, message)
warnings.formatwarning = formatwarning
# Useful for functional style
from itertools import product, starmap
from operator import eq
......@@ -250,17 +261,24 @@ shell.prefix(SHELL_FUNCTIONS)
rule all:
"""This top rule is used to drive the whole workflow by taking as input its final products."""
input:
expand(OPJ(output_dir, "{trimmer}", "figures", aligner, "{counter}", "{contrast}", "{orientation}_{biotype}", "MA_with_{id_list}.pdf"), trimmer=TRIMMERS, counter=COUNTERS, contrast=CONTRASTS, orientation=ORIENTATIONS, biotype=DE_BIOTYPES, id_list=ID_LISTS),
expand(OPJ(output_dir, "{trimmer}", "figures", aligner, "{counter}", "{contrast}", "{orientation}_{biotype}", "{fold_type}_distribution.pdf"), trimmer=TRIMMERS, counter=COUNTERS, contrast=CONTRASTS, orientation=ORIENTATIONS, biotype=DE_BIOTYPES, fold_type=["log2FoldChange"]),
expand(OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", "{counter}", "deseq2", "{contrast}", "{orientation}_{biotype}", "counts_and_res.txt"), trimmer=TRIMMERS, counter=COUNTERS, contrast=CONTRASTS, orientation=ORIENTATIONS, biotype=DE_BIOTYPES),
expand(OPJ(output_dir, "{trimmer}", "figures", aligner, "{counter}", "{biotype}_{orientation}_PCA.{fig_format}"), trimmer=TRIMMERS, counter=COUNTERS, biotype=COUNT_BIOTYPES, orientation=ORIENTATIONS, fig_format=FIG_FORMATS),
expand(OPJ(
output_dir, "{trimmer}", "figures", aligner, "{counter}", "{contrast}", "{orientation}_{biotype}", "MA_with_{id_list}.pdf"), trimmer=TRIMMERS, counter=COUNTERS, contrast=CONTRASTS, orientation=ORIENTATIONS, biotype=DE_BIOTYPES, id_list=ID_LISTS),
expand(OPJ(
output_dir, "{trimmer}", "figures", aligner, "{counter}", "{contrast}", "{orientation}_{biotype}", "{fold_type}_distribution.pdf"), trimmer=TRIMMERS, counter=COUNTERS, contrast=CONTRASTS, orientation=ORIENTATIONS, biotype=DE_BIOTYPES, fold_type=["log2FoldChange"]),
expand(OPJ(
output_dir, "{trimmer}", aligner, "mapped_C_elegans", "{counter}", "deseq2", "{contrast}", "{orientation}_{biotype}", "counts_and_res.txt"), trimmer=TRIMMERS, counter=COUNTERS, contrast=CONTRASTS, orientation=ORIENTATIONS, biotype=DE_BIOTYPES),
expand(OPJ(
output_dir, "{trimmer}", "figures", aligner, "{counter}", "{biotype}_{orientation}_PCA.{fig_format}"), trimmer=TRIMMERS, counter=COUNTERS, biotype=COUNT_BIOTYPES, orientation=ORIENTATIONS, fig_format=FIG_FORMATS),
#expand(OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", "{counter}", "all_on_C_elegans", "alltypes_{orientation}_counts.txt"), trimmer=TRIMMERS, counter=COUNTERS, orientation=["all"]),
#expand(expand(OPJ(output_dir, "{{trimmer}}", aligner, "mapped_C_elegans", "{{counter}}", "{lib}_{rep}_on_C_elegans", "{{biotype}}_{{orientation}}_counts.txt"), filtered_product, lib=LIBS, rep=REPS), trimmer=TRIMMERS, counter=COUNTERS, biotype=COUNT_BIOTYPES, orientation=["all"]),
expand(expand(OPJ(output_dir, "{{trimmer}}", "figures", aligner, "{lib}_{rep}", "adapt_on_C_elegans_last_bases.pdf"), filtered_product, lib=LIBS, rep=REPS), trimmer=TRIMMERS),
expand(expand(OPJ(output_dir, "{{trimmer}}", aligner, "mapped_C_elegans", "{lib}_{rep}_on_C_elegans_by_{{norm_type}}_{{orientation}}.bw"), filtered_product, lib=LIBS, rep=REPS), trimmer=TRIMMERS, norm_type=NORM_TYPES, orientation=["all"]),
expand(expand(OPJ(
output_dir, "{{trimmer}}", "figures", aligner, "{lib}_{rep}", "adapt_on_C_elegans_last_bases.pdf"), filtered_product, lib=LIBS, rep=REPS), trimmer=TRIMMERS),
expand(expand(OPJ(
output_dir, "{{trimmer}}", aligner, "mapped_C_elegans", "{lib}_{rep}_on_C_elegans_by_{{norm_type}}_{{orientation}}.bw"), filtered_product, lib=LIBS, rep=REPS), trimmer=TRIMMERS, norm_type=NORM_TYPES, orientation=["all"]),
#expand(OPJ(output_dir, "{trimmer}", "figures", aligner, "{lib}_mean", "{orientation}_on_merged_isolated_%d_{biotype}_min_%d_meta_profile.{fig_format}" % (MIN_DIST, META_MIN_LEN)), trimmer=TRIMMERS, lib=LIBS, orientation=["all"], biotype=["protein_coding"], fig_format=FIG_FORMATS),
#expand(OPJ(output_dir, "{trimmer}", "figures", aligner, "{lib}_mean", "{orientation}_on_merged_isolated_%d_{biotype}_min_%d_meta_profile.{fig_format}" % (MIN_DIST, META_MIN_LEN)), trimmer=TRIMMERS, lib=LIBS, orientation=["all"], biotype=METAGENE_BIOTYPES, fig_format=FIG_FORMATS),
expand(OPJ(output_dir, "{trimmer}", "figures", aligner, "{lib}_by_{norm_type}_mean", "{orientation}_on_merged_isolated_%d_{biotype}_min_%d_meta_profile.{fig_format}" % (MIN_DIST, META_MIN_LEN)), trimmer=TRIMMERS, lib=LIBS, norm_type=NORM_TYPES, orientation=["all"], biotype=METAGENE_BIOTYPES, fig_format=FIG_FORMATS),
expand(OPJ(
output_dir, "{trimmer}", "figures", aligner, "{lib}_by_{norm_type}_mean", "{orientation}_on_merged_isolated_%d_{biotype}_min_%d_meta_profile.{fig_format}" % (MIN_DIST, META_MIN_LEN)), trimmer=TRIMMERS, lib=LIBS, norm_type=NORM_TYPES, orientation=["all"], biotype=METAGENE_BIOTYPES, fig_format=FIG_FORMATS),
include: ensure_relative(irules["link_raw_data"], workflow.basedir)
......@@ -298,7 +316,7 @@ rule trim_and_dedup:
err = OPJ(log_dir, "{trimmer}", "trim_and_dedup", "{lib}_{rep}.err"),
run:
shell_commands = """
THREADS="{threads}" {params.process_type}_trim_and_dedup.sh {wildcards.trimmer} {input} \\
THREADS="{threads}" {params.process_type}_trim_and_dedup.sh {trimmer} {input} \\
{params.adapter} {params.trim5} {params.trim3} \\
{output.adapt} {output.noadapt} {log.trim} \\
{output.nb_raw} {output.nb_adapt} {output.nb_adapt_deduped} \\
......@@ -738,6 +756,66 @@ rule gather_counts:
counts_data.to_csv(output.counts_table, sep="\t")
rule compute_RPK:
"""For a given biotype, compute the corresponding RPK value (reads per kilobase)."""
input:
counts_data = rules.gather_counts.output.counts_table,
#counts_table = OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", "{counter}",
# "all_on_C_elegans", "{biotype}_{orientation}_counts.txt"),
output:
rpk_file = OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", "{counter}",
"all_on_C_elegans", "{biotype}_{orientation}_RPK.txt"),
params:
feature_lengths_file = OPJ(annot_dir, "union_exon_lengths.txt"),
run:
counts_data = pd.read_table(input.counts_data, index_col="gene")
feature_lengths = pd.read_table(params.feature_lengths_file, index_col="gene")
common = counts_data.index.intersection(feature_lengths.index)
rpk = 1000 * counts_data.loc[common].div(feature_lengths.loc[common]["union_exon_len"], axis="index")
rpk.to_csv(output.rpk_file, sep="\t")
rule compute_sum_million_RPK:
input:
rpk_file = rules.compute_RPK.output.rpk_file,
output:
sum_rpk_file = OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", "{counter}",
"all_on_C_elegans", "{biotype}_{orientation}_sum_million_RPK.txt"),
run:
sum_rpk = pd.read_table(
input.rpk_file,
index_col=0).sum()
(sum_rpk / 1000000).to_csv(output.sum_rpk_file, sep="\t")
rule compute_TPM:
"""For a given biotype, compute the corresponding TPM value (reads per kilobase per million mappers)."""
input:
rpk_file = rules.compute_RPK.output.rpk_file
output:
tpm_file = OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", "{counter}",
"all_on_C_elegans", "{biotype}_{orientation}_TPM.txt"),
# The sum must be done over all counted features
wildcard_constraints:
biotype = "alltypes"
run:
rpk = pd.read_table(input.rpk_file, index_col="gene")
tpm = 1000000 * rpk / rpk.sum()
tpm.to_csv(output.tpm_file, sep="\t")
def source_quantif(wildcards):
"""Determines from which rule the gathered counts should be sourced."""
if wildcards.quantif_type == "counts":
return rules.gather_counts.output.counts_table
elif wildcards.quantif_type == "RPK":
return rules.compute_RPK.output.rpk_file
elif wildcards.quantif_type == "TPM":
return rules.compute_TPM.output.tpm_file
else:
raise NotImplementedError("%s is not yet handeled." % wildcards.quantif_type)
rule compute_median_ratio_to_pseudo_ref_size_factors:
input:
counts_table = rules.gather_counts.output.counts_table,
......@@ -767,12 +845,27 @@ def bamcoverage_filter(wildcards):
return ""
def source_normalizer(wildcards):
if wildcards.norm_type == "median_ratio_to_pseudo_ref":
return OPJ(
output_dir, f"{wildcards.trimmer}", aligner, "mapped_C_elegans", COUNTERS[0], "all_on_C_elegans",
"protein_coding_fwd_median_ratios_to_pseudo_ref.txt"),
elif wildcards.norm_type in COUNT_BIOTYPES:
return OPJ(
output_dir, f"{wildcards.trimmer}", aligner, "mapped_C_elegans", COUNTERS[0], "all_on_C_elegans",
f"{wildcards.norm_type}_fwd_sum_million_RPK.txt"),
else:
raise NotImplementedError(f"{wildcards.norm_type} normalization not implemented yet.")
# Warning: The normalization is done based on a particular count using the first counter
rule make_normalized_bigwig:
input:
bam = rules.fuse_bams.output.sorted_bam,
# TODO: use sourcing function based on norm_type
size_factor_file = source_normalizer,
#size_factor_file = rules.compute_coverage.output.coverage
median_ratios_file = OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", COUNTERS[0], "all_on_C_elegans", "protein_coding_fwd_median_ratios_to_pseudo_ref.txt"),
#median_ratios_file = OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", COUNTERS[0], "all_on_C_elegans", "protein_coding_fwd_median_ratios_to_pseudo_ref.txt"),
# TODO: compute this
#scale_factor_file = OPJ(output_dir, aligner, "mapped_C_elegans", "annotation", "all_%s_on_C_elegans" % size_selected, "pisimi_median_ratios_to_pseudo_ref.txt"),
output:
......@@ -780,7 +873,7 @@ rule make_normalized_bigwig:
#bigwig = OPJ(output_dir, aligner, "mapped_C_elegans", "{lib}_{rep}_on_C_elegans_{orientation}.bw"),
#params:
# orient_filter = bamcoverage_filter,
#threads: 12 # to limit memory usage, actually
threads: 4 # to limit memory usage, actually
benchmark:
OPJ(log_dir, "{trimmer}", "make_normalized_bigwig", "{lib}_{rep}_by_{norm_type}_{orientation}_benchmark.txt")
params:
......@@ -792,7 +885,7 @@ rule make_normalized_bigwig:
"""
bam2bigwig.sh {input.bam} {params.genome_binned} \\
{wildcards.lib}_{wildcards.rep} {wildcards.orientation} %s \\
{input.median_ratios_file} {output.bigwig_norm} \\
{input.size_factor_file} {output.bigwig_norm} \\
> {log.log} 2> {log.err} \\
|| error_exit "bam2bigwig.sh failed"
""" % LIB_TYPE[-1]
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment