Skip to content
Snippets Groups Projects
Commit 28d6e34e authored by Blaise Li's avatar Blaise Li
Browse files

Adapting TPM and RPM counting from RNA-seq.

parent 38c641d1
No related branches found
No related tags found
No related merge requests found
......@@ -208,9 +208,16 @@ counting = [
## Will be pulled in as dependencies of other needed results:
# expand(OPJ("{trimmer}", aligner, "mapped_%s" % genome, "feature_count", "{lib}_{rep}_{read_type}_on_%s" % genome, "{biotype}_{orientation}_counts.txt"), trimmer=TRIMMERS, lib=LIBS, rep=REPS, read_type=POST_TRIMMING + SIZE_SELECTED, biotype=COUNT_BIOTYPES, orientation=ORIENTATIONS),
##
expand(OPJ("{trimmer}", aligner, f"mapped_{genome}", "feature_count", "summaries", "all_{read_type}_on_%s_{orientation}_counts.txt" % genome), trimmer=TRIMMERS, read_type=POST_TRIMMING + SIZE_SELECTED + ["deduped"], orientation=ORIENTATIONS),
expand(OPJ("{trimmer}", aligner, f"mapped_{genome}", "feature_count", "all_{read_type}_on_%s" % genome, "{biotype}_{orientation}_counts.txt"), trimmer=TRIMMERS, read_type=POST_TRIMMING + SIZE_SELECTED + ["deduped"], biotype=COUNT_BIOTYPES, orientation=ORIENTATIONS),
expand(OPJ("{trimmer}", aligner, f"mapped_{genome}", "{lib}_{rep}_{read_type}_on_%s_by_{norm}_{orientation}.bw" % genome), trimmer=TRIMMERS, lib=LIBS, rep=REPS, read_type=POST_TRIMMING + SIZE_SELECTED + ["deduped"], norm=NORM_TYPES, orientation=["all"]),
expand(OPJ("{trimmer}", aligner, f"mapped_{genome}", "feature_count", "summaries", "all_{read_type}_on_%s_{orientation}_counts.txt" % genome),
trimmer=TRIMMERS, read_type=POST_TRIMMING + SIZE_SELECTED + ["deduped"], orientation=ORIENTATIONS),
expand(OPJ("{trimmer}", aligner, f"mapped_{genome}", "feature_count", "all_{read_type}_on_%s" % genome, "{biotype}_{orientation}_counts.txt"),
trimmer=TRIMMERS, read_type=POST_TRIMMING + SIZE_SELECTED + ["deduped"], biotype=COUNT_BIOTYPES, orientation=ORIENTATIONS),
expand(OPJ("{trimmer}", aligner, f"mapped_{genome}", "{lib}_{rep}_{read_type}_on_%s_by_{norm}_{orientation}.bw" % genome),
trimmer=TRIMMERS, lib=LIBS, rep=REPS, read_type=POST_TRIMMING + SIZE_SELECTED + ["deduped"], norm=NORM_TYPES, orientation=["all"]),
expand(OPJ("{trimmer}", aligner, f"mapped_{genome}", "feature_count", "all_{read_type}_on_%s" % genome, "{biotype}_{orientation}_RPM.txt"),
trimmer=TRIMMERS, read_type=["deduped"], biotype=COUNT_BIOTYPES, orientation=ORIENTATIONS),
expand(OPJ("{trimmer}", aligner, f"mapped_{genome}", "feature_count", "{lib}_mean_{read_type}_on_%s" % genome, "{lib}_mean_{biotype}_{orientation}_TPM.txt"),
trimmer=TRIMMERS, lib=LIBS, read_type=["deduped"], biotype=["protein_coding"], orientation=ORIENTATIONS),
]
#TODO:
......@@ -647,7 +654,6 @@ rule feature_count_reads:
eval ${{cmd}} 1>> {log.log} 2> {log.err} || error_exit "featureCounts failed"
rm -rf ${{tmpdir}}
cat {output.counts} | wormid2name > {output.counts_converted}
# cat {output.counts} | id2name.py ${{converter}} > {output.counts_converted}
"""
......@@ -667,10 +673,13 @@ rule summarize_feature_counts:
rule gather_read_counts_summaries:
"""Gather read count summaries across libraries: lib_rep -> all."""
input:
summary_tables = expand(OPJ("{{trimmer}}", aligner, "mapped_%s" % genome, "feature_count", "summaries", "{lib}_{rep}_{{read_type}}_on_%s_{{orientation}}_counts.txt" % genome), lib=LIBS, rep=REPS),
summary_tables = expand(OPJ("{{trimmer}}", aligner, "mapped_%s" % genome, "feature_count", "summaries",
"{lib}_{rep}_{{read_type}}_on_%s_{{orientation}}_counts.txt" % genome), lib=LIBS, rep=REPS),
output:
summary_table = OPJ("{trimmer}", aligner, "mapped_%s" % genome, "feature_count", "summaries", "all_{read_type}_on_%s_{orientation}_counts.txt" % genome),
summary_table = OPJ("{trimmer}", aligner, "mapped_%s" % genome, "feature_count", "summaries",
"all_{read_type}_on_%s_{orientation}_counts.txt" % genome),
run:
summary_files = (OPJ(
wildcards.trimmer,
......@@ -689,9 +698,11 @@ rule gather_read_counts_summaries:
rule gather_counts:
"""For a given biotype, gather counts from all libraries in one table."""
input:
counts_tables = expand(OPJ("{{trimmer}}", aligner, "mapped_%s" % genome, "feature_count", "{lib}_{rep}_{{read_type}}_on_%s" % genome, "{{biotype}}_{{orientation}}_counts.txt"), lib=LIBS, rep=REPS),
counts_tables = expand(OPJ("{{trimmer}}", aligner, "mapped_%s" % genome, "feature_count",
"{lib}_{rep}_{{read_type}}_on_%s" % genome, "{{biotype}}_{{orientation}}_counts.txt"), lib=LIBS, rep=REPS),
output:
counts_table = OPJ("{trimmer}", aligner, "mapped_%s" % genome, "feature_count", "all_{read_type}_on_%s" % genome, "{biotype}_{orientation}_counts.txt"),
counts_table = OPJ("{trimmer}", aligner, "mapped_%s" % genome, "feature_count",
"all_{read_type}_on_%s" % genome, "{biotype}_{orientation}_counts.txt"),
# wildcard_constraints:
# # Avoid ambiguity with join_all_counts
# biotype = "|".join(COUNT_BIOTYPES)
......@@ -734,6 +745,100 @@ rule gather_counts:
counts_data.to_csv(output.counts_table, sep="\t")
rule compute_RPK:
"""For a given biotype, compute the corresponding RPK value (reads per kilobase)."""
input:
# TODO: Why wildcards seems to be None?
#counts_data = rules.gather_counts.output.counts_table,
counts_data = OPJ("{trimmer}", aligner, f"mapped_{genome}", "feature_count",
"all_{read_type}_on_%s" % genome, "{biotype}_{orientation}_counts.txt"),
output:
rpk_file = OPJ("{trimmer}", aligner, f"mapped_{genome}", "feature_count",
"all_{read_type}_on_%s" % genome, "{biotype}_{orientation}_RPK.txt"),
params:
feature_lengths_file = OPJ(annot_dir, "union_exon_lengths.txt"),
wrapper:
f"file://{wrappers_dir}/compute_RPK"
rule compute_sum_million_RPK:
input:
rpk_file = rules.compute_RPK.output.rpk_file,
output:
sum_rpk_file = OPJ("{trimmer}", aligner, f"mapped_{genome}", "feature_count",
"all_{read_type}_on_%s" % genome, "{biotype}_{orientation}_sum_million_RPK.txt"),
run:
sum_rpk = pd.read_table(
input.rpk_file,
index_col=0).sum()
(sum_rpk / 1000000).to_csv(output.sum_rpk_file, sep="\t")
# Compute TPM using total number of mappers divided by genome length
# (not sum of RPK across biotypes: some mappers may not be counted)
# No, doesn't work: mappers / genome length not really comparable
# Needs to be done on all_types
rule compute_TPM:
"""For a given biotype, compute the corresponding TPM value (reads per kilobase per million mappers)."""
input:
rpk_file = rules.compute_RPK.output.rpk_file
output:
tpm_file = OPJ("{trimmer}", aligner, f"mapped_{genome}", "feature_count",
"all_{read_type}_on_%s" % genome, "{biotype}_{orientation}_TPM.txt"),
# The sum must be done over all counted features
wildcard_constraints:
biotype = "|".join(["protein_coding"])
# run:
# rpk = pd.read_table(input.rpk_file, index_col="gene")
# tpm = 1000000 * rpk / rpk.sum()
# tpm.to_csv(output.tpm_file, sep="\t")
wrapper:
f"file://{wrappers_dir}/compute_TPM"
# Useful to compute translation efficiency in the Ribo-seq pipeline
rule compute_mean_TPM:
input:
all_tmp_file = rules.compute_TPM.output.tpm_file
output:
tpm_file = OPJ("{trimmer}", aligner, f"mapped_{genome}", "feature_count",
"{lib}_mean_{read_type}_on_%s" % genome, "{lib}_mean_{biotype}_{orientation}_TPM.txt"),
wildcard_constraints:
biotype = "|".join(["protein_coding"])
run:
tpm = pd.read_table(
input.all_tmp_file, index_col="gene",
usecols=["gene", *[f"{wildcards.lib}_{rep}" for rep in REPS]])
tpm_mean = tpm.mean(axis=1)
# This is a Series
tpm_mean.name = wildcards.lib
tpm_mean.to_csv(output.tpm_file, sep="\t", header=True)
rule compute_RPM:
input:
counts_data = OPJ("{trimmer}", aligner, f"mapped_{genome}", "feature_count",
"all_{read_type}_on_%s" % genome, "{biotype}_{orientation}_counts.txt"),
#summary_table = rules.gather_read_counts_summaries.output.summary_table,
summary_table = OPJ(
"{trimmer}", aligner, f"mapped_{genome}", "feature_count", "summaries",
"all_{read_type}_on_%s_fwd_counts.txt" % genome),
output:
rpm_file = OPJ("{trimmer}", aligner, f"mapped_{genome}", "feature_count",
"all_{read_type}_on_%s" % genome, "{biotype}_{orientation}_RPM.txt"),
run:
# Reading column counts from {input.counts_table}
counts_data = pd.read_table(
input.counts_data,
index_col="gene")
# Reading number of protein_coding fwd mappers from {input.summary_table}
norm = pd.read_table(input.summary_table, index_col=0).loc["protein_coding"]
# Computing counts per million protein_coding fwd mappers
RPM = 1000000 * counts_data / norm
RPM.to_csv(output.rpm_file, sep="\t")
# TODO: add other steps found in RNA-seq pipeline ?
rule compute_median_ratio_to_pseudo_ref_size_factors:
input:
counts_table = rules.gather_counts.output.counts_table,
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment