Commit 5946a7a6 authored by Blaise Li's avatar Blaise Li
Browse files

Added "joined biotypes" like all_rmsk_families.

parent 2ef7bf3e
......@@ -61,7 +61,7 @@ import husl
from libdeseq import do_deseq2
from libhts import median_ratio_to_pseudo_ref_size_factors, status_setter, plot_lfc_distribution, plot_MA
from libworkflows import ensure_relative, cleanup_and_backup, texscape
from libworkflows import wc_applied, ensure_relative, cleanup_and_backup, texscape
from libworkflows import get_chrom_sizes, column_converter
from libworkflows import strip_split, file_len, last_lines, save_plot, test_na_file
from libworkflows import make_id_list_getter, filter_combinator, SHELL_FUNCTIONS, warn_context
......@@ -79,9 +79,6 @@ ORIENTATIONS = ["fwd", "rev", "all"]
COMPL = {"A" : "T", "C" : "G", "G" : "C", "T" : "A", "N" : "N"}
#FORMAT = "pdf"
FIG_FORMATS = ["pdf"]
LFC_RANGE = {
"protein_coding" : (-10, 10),
"DNA_transposons_rmsk" : (-10, 10),
......@@ -90,10 +87,12 @@ LFC_RANGE = {
"simple_repeats_rmsk" : (-10, 10),
"DNA_transposons_rmsk_families" : (-10, 10),
"RNA_transposons_rmsk_families" : (-10, 10),
"satellites_rmsk_families" : (-10, 10),
"simple_repeats_rmsk_families" : (-10, 10),
"pseudogene" : (-10, 10),
"all_rmsk" : (-10, 10),
"all_rmsk_families" : (-10, 10),
"alltypes" : (-10, 10)}
DE_BIOTYPES = list(LFC_RANGE.keys())
# Cutoffs in log fold change
LFC_CUTOFFS = [0.5, 1, 2]
UP_STATUSES = [f"up{cutoff}" for cutoff in LFC_CUTOFFS]
......@@ -138,10 +137,32 @@ COND_COLUMNS = pd.DataFrame(CONDITIONS).assign(
cond_name=pd.Series(COND_NAMES).values).set_index("cond_name")
COUNT_BIOTYPES = config["count_biotypes"]
ANNOT_BIOTYPES = config["annot_biotypes"]
#METAGENE_BIOTYPES=["protein_coding", "DNA_transposons_rmsk", "RNA_transposons_rmsk"]
#METAGENE_BIOTYPES=["protein_coding"]
METAGENE_BIOTYPES=["protein_coding", "protein_coding_5UTR", "protein_coding_CDS", "protein_coding_3UTR"]
RMSK_BIOTYPES = [
"DNA_transposons_rmsk",
"RNA_transposons_rmsk",
"satellites_rmsk",
"simple_repeats_rmsk"]
RMSK_FAMILIES_BIOTYPES = [
"DNA_transposons_rmsk_families",
"RNA_transposons_rmsk_families",
"satellites_rmsk_families",
"simple_repeats_rmsk_families"]
BIOTYPES_TO_JOIN = {
"all_rmsk": [biotype for biotype in COUNT_BIOTYPES if biotype in RMSK_BIOTYPES],
"all_rmsk_families": [biotype for biotype in COUNT_BIOTYPES if biotype in RMSK_FAMILIES_BIOTYPES],
# We only count "protein_coding", not "protein_coding_{5UTR,CDS,3UTR}"
"alltypes": [biotype for biotype in COUNT_BIOTYPES if not biotype.startswith("protein_coding_")]}
JOINED_BIOTYPES = list(BIOTYPES_TO_JOIN.keys())
DE_BIOTYPES = [biotype for biotype in LFC_RANGE.keys() if biotype in COUNT_BIOTYPES + JOINED_BIOTYPES]
#ANNOT_BIOTYPES = config["annot_biotypes"]
#METAGENE_BIOTYPES = ["protein_coding", "DNA_transposons_rmsk", "RNA_transposons_rmsk"]
#METAGENE_BIOTYPES = ["protein_coding"]
#METAGENE_BIOTYPES = ["protein_coding", "protein_coding_5UTR", "protein_coding_CDS", "protein_coding_3UTR"]
METAGENE_BIOTYPES = [biotype for biotype in ["protein_coding", "protein_coding_5UTR", "protein_coding_CDS", "protein_coding_3UTR"] if biotype in COUNT_BIOTYPES]
ID_LISTS = [
"lfc_statuses",
"germline_specific",
......@@ -259,7 +280,7 @@ wildcard_constraints:
lib="|".join(LIBS),
rep="\d+",
orientation="|".join(ORIENTATIONS),
biotype="|".join(COUNT_BIOTYPES + ANNOT_BIOTYPES + ["alltypes"])
biotype="|".join(COUNT_BIOTYPES + JOINED_BIOTYPES)
# Define functions to be used in shell portions
......@@ -286,9 +307,9 @@ rule all:
orientation=ORIENTATIONS, biotype=DE_BIOTYPES),
expand(OPJ(
output_dir, "{trimmer}", "figures", aligner, "{counter}",
"{biotype}_{orientation}_PCA.{fig_format}"),
"{biotype}_{orientation}_PCA.pdf"),
trimmer=TRIMMERS, counter=COUNTERS, biotype=COUNT_BIOTYPES,
orientation=ORIENTATIONS, fig_format=FIG_FORMATS),
orientation=ORIENTATIONS),
#expand(OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", "{counter}", "all_on_C_elegans", "alltypes_{orientation}_counts.txt"), trimmer=TRIMMERS, counter=COUNTERS, orientation=["all"]),
#expand(expand(OPJ(output_dir, "{{trimmer}}", aligner, "mapped_C_elegans", "{{counter}}", "{lib}_{rep}_on_C_elegans", "{{biotype}}_{{orientation}}_counts.txt"), filtered_product, lib=LIBS, rep=REPS), trimmer=TRIMMERS, counter=COUNTERS, biotype=COUNT_BIOTYPES, orientation=["all"]),
expand(expand(OPJ(
......@@ -299,13 +320,13 @@ rule all:
output_dir, "{{trimmer}}", aligner, "mapped_C_elegans",
"{lib}_{rep}_on_C_elegans_by_{{norm_type}}_{{orientation}}.bw"), filtered_product, lib=LIBS, rep=REPS),
trimmer=TRIMMERS, norm_type=NORM_TYPES, orientation=["all"]),
#expand(OPJ(output_dir, "{trimmer}", "figures", aligner, "{lib}_mean", "{orientation}_on_merged_isolated_%d_{biotype}_min_%d_meta_profile.{fig_format}" % (MIN_DIST, META_MIN_LEN)), trimmer=TRIMMERS, lib=LIBS, orientation=["all"], biotype=["protein_coding"], fig_format=FIG_FORMATS),
#expand(OPJ(output_dir, "{trimmer}", "figures", aligner, "{lib}_mean", "{orientation}_on_merged_isolated_%d_{biotype}_min_%d_meta_profile.{fig_format}" % (MIN_DIST, META_MIN_LEN)), trimmer=TRIMMERS, lib=LIBS, orientation=["all"], biotype=METAGENE_BIOTYPES, fig_format=FIG_FORMATS),
#expand(OPJ(output_dir, "{trimmer}", "figures", aligner, "{lib}_mean", "{orientation}_on_merged_isolated_%d_{biotype}_min_%d_meta_profile.pdf" % (MIN_DIST, META_MIN_LEN)), trimmer=TRIMMERS, lib=LIBS, orientation=["all"], biotype=["protein_coding"]),
#expand(OPJ(output_dir, "{trimmer}", "figures", aligner, "{lib}_mean", "{orientation}_on_merged_isolated_%d_{biotype}_min_%d_meta_profile.pdf" % (MIN_DIST, META_MIN_LEN)), trimmer=TRIMMERS, lib=LIBS, orientation=["all"], biotype=METAGENE_BIOTYPES),
expand(OPJ(
output_dir, "{trimmer}", "figures", aligner, "{lib}_by_{norm_type}_mean",
"{orientation}_on_merged_isolated_%d_{biotype}_min_%d_meta_profile.{fig_format}" % (MIN_DIST, META_MIN_LEN)),
"{orientation}_on_merged_isolated_%d_{biotype}_min_%d_meta_profile.pdf" % (MIN_DIST, META_MIN_LEN)),
trimmer=TRIMMERS, lib=LIBS, norm_type=NORM_TYPES, orientation=["all"],
biotype=METAGENE_BIOTYPES, fig_format=FIG_FORMATS),
biotype=METAGENE_BIOTYPES),
include: ensure_relative(irules["link_raw_data"], workflow.basedir)
......@@ -628,8 +649,8 @@ rule feature_count_reads:
message:
"Counting {wildcards.orientation} {wildcards.biotype} reads for {wildcards.lib}_{wildcards.rep} with featureCounts."
log:
log = OPJ(log_dir, "{trimmer}", "feature_count_reads", "{lib}_{rep}.log"),
err = OPJ(log_dir, "{trimmer}", "feature_count_reads", "{lib}_{rep}.err")
log = OPJ(log_dir, "{trimmer}", "feature_count_reads", "{orientation}_{biotype}", "{lib}_{rep}.log"),
err = OPJ(log_dir, "{trimmer}", "feature_count_reads", "{orientation}_{biotype}", "{lib}_{rep}.err")
shell:
"""
converter="/pasteur/entites/Mhe/Genomes/C_elegans/Caenorhabditis_elegans/Ensembl/WBcel235/Annotation/Genes/genes_id2name.pickle"
......@@ -686,12 +707,12 @@ rule do_PCA:
output:
#OPJ(output_dir, aligner, "mapped_C_elegans", "htseq_count", "summaries", "{biotype}_{orientation}_PCA.pdf"),
#OPJ(output_dir, aligner, "mapped_C_elegans", "htseq_count", "summaries", "{biotype}_{orientation}_PCA.png"),
OPJ(output_dir, "{trimmer}", "figures", aligner, "{counter}", "{biotype}_{orientation}_PCA.{fig_format}"),
OPJ(output_dir, "{trimmer}", "figures", aligner, "{counter}", "{biotype}_{orientation}_PCA.pdf"),
message:
"Summarizing counts for {wildcards.biotype}_{wildcards.orientation}"
threads: 12 # trying to avoid TimeoutError and "LOCKERROR: matplotlib is trying to acquire the lock [...]"
log:
OPJ(log_dir, "{trimmer}", "do_PCA_{biotype}_{orientation}.log")
OPJ(log_dir, "{trimmer}", "{counter}", "do_PCA_{biotype}_{orientation}.log")
run:
if wildcards.counter == "htseq_count":
counts_parser = parse_htseq_counts
......@@ -841,10 +862,55 @@ rule gather_counts:
counts_data.to_csv(output.counts_table, sep="\t")
@wc_applied
def source_counts_to_join(wildcards):
    """Return the per-biotype counts files composing a "joined" biotype.

    The elementary biotypes making up *wildcards.biotype* are looked up
    in BIOTYPES_TO_JOIN; one counts-table path is generated for each.
    """
    # Double braces keep {trimmer}, {counter} and {orientation} as
    # wildcards for Snakemake; only {biotype} is expanded here.
    counts_template = OPJ(
        output_dir, "{{trimmer}}", aligner, "mapped_C_elegans",
        "{{counter}}", "all_on_C_elegans",
        "{biotype}_{{orientation}}_counts.txt")
    elementary_biotypes = BIOTYPES_TO_JOIN[wildcards.biotype]
    return expand(counts_template, biotype=elementary_biotypes)
# Build the counts table of a "joined" biotype (all_rmsk,
# all_rmsk_families, alltypes) by concatenating the counts tables of
# its elementary biotypes (see BIOTYPES_TO_JOIN / source_counts_to_join).
rule join_all_counts:
    """concat counts for all biotypes into all"""
    input:
        # One counts table per elementary biotype of the joined biotype.
        counts_tables = source_counts_to_join,
        #counts_tables = expand(OPJ(output_dir, "{{trimmer}}", aligner, "mapped_C_elegans", "{{counter}}", "all_on_C_elegans", "{biotype}_{{orientation}}_counts.txt"), biotype=[bty for bty in COUNT_BIOTYPES if bty[-9:] != "_families"]),
        # counts_tables = expand(
        #     OPJ(output_dir, "{{trimmer}}", aligner, "mapped_C_elegans",
        #         "{{counter}}", "all_on_C_elegans",
        #         "{biotype}_{{orientation}}_counts.txt"),
        #     # We only count "protein_coding", not "protein_coding_{5UTR,CDS,3UTR}"
        #     biotype=[b for b in COUNT_BIOTYPES if not b.startswith("protein_coding_")]),
    output:
        counts_table = OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", "{counter}", "all_on_C_elegans", "{biotype}_{orientation}_counts.txt"),
    wildcard_constraints:
        # This rule only produces joined biotypes; elementary biotypes
        # are produced by gather_counts instead.
        biotype = "|".join(JOINED_BIOTYPES)
    run:
        # Stack the per-biotype tables row-wise; gene identifiers are
        # expected to be disjoint across biotypes, hence the assertion.
        counts_data = pd.concat((pd.read_table(table, index_col="gene") for table in input.counts_tables)).drop_duplicates()
        assert len(counts_data.index.unique()) == len(counts_data.index), "Some genes appear several times in the counts table."
        counts_data.index.names = ["gene"]
        counts_data.to_csv(output.counts_table, sep="\t")
@wc_applied
def source_counts(wildcards):
    """Pick the rule output providing the gathered counts table.

    Joined biotypes are produced by *join_all_counts*; elementary
    biotypes come straight from *gather_counts*.
    """
    if wildcards.biotype not in JOINED_BIOTYPES:
        # "Directly" from the counts gathered across libraries
        return rules.gather_counts.output.counts_table
    return rules.join_all_counts.output.counts_table
rule compute_RPK:
"""For a given biotype, compute the corresponding RPK value (reads per kilobase)."""
input:
counts_data = rules.gather_counts.output.counts_table,
counts_data = source_counts,
#counts_data = rules.gather_counts.output.counts_table,
#counts_table = OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", "{counter}",
# "all_on_C_elegans", "{biotype}_{orientation}_counts.txt"),
output:
......@@ -852,12 +918,14 @@ rule compute_RPK:
"all_on_C_elegans", "{biotype}_{orientation}_RPK.txt"),
params:
feature_lengths_file = OPJ(annot_dir, "union_exon_lengths.txt"),
run:
counts_data = pd.read_table(input.counts_data, index_col="gene")
feature_lengths = pd.read_table(params.feature_lengths_file, index_col="gene")
common = counts_data.index.intersection(feature_lengths.index)
rpk = 1000 * counts_data.loc[common].div(feature_lengths.loc[common]["union_exon_len"], axis="index")
rpk.to_csv(output.rpk_file, sep="\t")
# run:
# counts_data = pd.read_table(input.counts_data, index_col="gene")
# feature_lengths = pd.read_table(params.feature_lengths_file, index_col="gene")
# common = counts_data.index.intersection(feature_lengths.index)
# rpk = 1000 * counts_data.loc[common].div(feature_lengths.loc[common]["union_exon_len"], axis="index")
# rpk.to_csv(output.rpk_file, sep="\t")
wrapper:
"file:///pasteur/homes/bli/src/bioinfo_utils/snakemake_wrappers/compute_RPK"
rule compute_sum_million_RPK:
......@@ -882,17 +950,21 @@ rule compute_TPM:
"all_on_C_elegans", "{biotype}_{orientation}_TPM.txt"),
# The sum must be done over all counted features
wildcard_constraints:
biotype = "alltypes"
run:
rpk = pd.read_table(input.rpk_file, index_col="gene")
tpm = 1000000 * rpk / rpk.sum()
tpm.to_csv(output.tpm_file, sep="\t")
biotype = "|".join(["alltypes"])
# run:
# rpk = pd.read_table(input.rpk_file, index_col="gene")
# tpm = 1000000 * rpk / rpk.sum()
# tpm.to_csv(output.tpm_file, sep="\t")
wrapper:
"file:///pasteur/homes/bli/src/bioinfo_utils/snakemake_wrappers/compute_TPM"
@wc_applied
def source_quantif(wildcards):
"""Determines from which rule the gathered counts should be sourced."""
if wildcards.quantif_type == "counts":
return rules.gather_counts.output.counts_table
return source_counts(wildcards)
#return rules.gather_counts.output.counts_table
elif wildcards.quantif_type == "RPK":
return rules.compute_RPK.output.rpk_file
elif wildcards.quantif_type == "TPM":
......@@ -903,7 +975,8 @@ def source_quantif(wildcards):
rule compute_median_ratio_to_pseudo_ref_size_factors:
input:
counts_table = rules.gather_counts.output.counts_table,
counts_table = source_counts,
#counts_table = rules.gather_counts.output.counts_table,
output:
median_ratios_file = OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", "{counter}", "all_on_C_elegans", "{biotype}_{orientation}_median_ratios_to_pseudo_ref.txt"),
run:
......@@ -1004,10 +1077,10 @@ rule make_bigwig:
orient_filter = bamcoverage_filter,
threads: 12 # to limit memory usage, actually
benchmark:
OPJ(log_dir, "{trimmer}", "make_bigwig", "{lib}_{rep}_{orientation}_benchmark.txt")
OPJ(log_dir, "{trimmer}", "make_bigwig", "{lib}_{rep}_by_{norm_type}_{orientation}_benchmark.txt")
log:
log = OPJ(log_dir, "{trimmer}", "make_bigwig", "{lib}_{rep}_{orientation}.log"),
err = OPJ(log_dir, "{trimmer}", "make_bigwig", "{lib}_{rep}_{orientation}.err"),
log = OPJ(log_dir, "{trimmer}", "make_bigwig", "{lib}_{rep}_by_{norm_type}_{orientation}.log"),
err = OPJ(log_dir, "{trimmer}", "make_bigwig", "{lib}_{rep}_by_{norm_type}_{orientation}.err"),
run:
scale = 1 / float(pd.read_table(
input.median_ratios_file, index_col=0, header=None).loc[
......@@ -1050,7 +1123,7 @@ rule merge_bigwig_reps:
output:
bw = OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", "{lib}_mean_on_C_elegans_by_{norm_type}_{orientation}.bw"),
log:
warnings = OPJ(log_dir, "merge_bigwig_reps", "{lib}_mean_on_C_elegans_by_{norm_type}_{orientation}.warnings"),
warnings = OPJ(log_dir, "{trimmer}", "merge_bigwig_reps", "{lib}_mean_on_C_elegans_by_{norm_type}_{orientation}.warnings"),
threads: 2 # to limit memory usage, actually
run:
with warn_context(log.warnings) as warn:
......@@ -1089,34 +1162,6 @@ rule merge_bigwig_reps:
bw.close()
# Build the "alltypes" counts table by concatenating the counts tables
# of all counted biotypes (excluding the protein_coding_* sub-features,
# which would double-count protein_coding reads).
rule join_all_counts:
    """concat counts for all biotypes into all"""
    input:
        #counts_tables = expand(OPJ(output_dir, "{{trimmer}}", aligner, "mapped_C_elegans", "{{counter}}", "all_on_C_elegans", "{biotype}_{{orientation}}_counts.txt"), biotype=[bty for bty in COUNT_BIOTYPES if bty[-9:] != "_families"]),
        counts_tables = expand(
            OPJ(output_dir, "{{trimmer}}", aligner, "mapped_C_elegans",
                "{{counter}}", "all_on_C_elegans",
                "{biotype}_{{orientation}}_counts.txt"),
            # We only count "protein_coding", not "protein_coding_{5UTR,CDS,3UTR}"
            biotype=[b for b in COUNT_BIOTYPES if not b.startswith("protein_coding_")]),
    output:
        counts_table = OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", "{counter}", "all_on_C_elegans", "alltypes_{orientation}_counts.txt"),
    run:
        # Stack the per-biotype tables row-wise; gene identifiers are
        # expected to be disjoint across biotypes, hence the assertion.
        counts_data = pd.concat((pd.read_table(table, index_col="gene") for table in input.counts_tables)).drop_duplicates()
        assert len(counts_data.index.unique()) == len(counts_data.index), "Some genes appear several times in the counts table."
        counts_data.index.names = ["gene"]
        counts_data.to_csv(output.counts_table, sep="\t")
def source_counts(wildcards):
    """Choose the rule whose output provides the requested counts table.

    The "alltypes" pseudo-biotype is produced by *join_all_counts*;
    any other biotype comes straight from *gather_counts*.
    """
    if wildcards.biotype != "alltypes":
        # "Directly" from the counts gathered across libraries
        return rules.gather_counts.output.counts_table
    return rules.join_all_counts.output.counts_table
from rpy2.robjects import Formula, StrVector
from rpy2.rinterface import RRuntimeError
rule differential_expression:
......@@ -1272,22 +1317,22 @@ rule make_MA_plot:
# Metagene plots #
##################
# Merge the per-biotype GTF annotation files into a single
# position-sorted GTF, plus a bgzip-compressed, tabix-indexed copy.
rule gather_annotations:
    input:
        expand(OPJ(annot_dir, "{biotype}.gtf"), biotype=ANNOT_BIOTYPES),
    output:
        merged_gtf = OPJ(local_annot_dir, "all_annotations.gtf"),
        merged_gtf_gz = OPJ(local_annot_dir, "all_annotations.gtf.gz"),
        index = OPJ(local_annot_dir, "all_annotations.gtf.gz.tbi"),
        #merged_bed = OPJ(local_annot_dir, "all_annotations.bed"),
    message:
        "Gathering annotations for {}".format(", ".join(ANNOT_BIOTYPES))
    shell:
        """
        sort -k1,1 -k4,4n -m {input} | tee {output.merged_gtf} | bgzip > {output.merged_gtf_gz}
        tabix -p gff {output.merged_gtf_gz}
        #ensembl_gtf2bed.py {output.merged_gtf} > {output.merged_bed}
        """
# rule gather_annotations:
# input:
# expand(OPJ(annot_dir, "{biotype}.gtf"), biotype=ANNOT_BIOTYPES),
# output:
# merged_gtf = OPJ(local_annot_dir, "all_annotations.gtf"),
# merged_gtf_gz = OPJ(local_annot_dir, "all_annotations.gtf.gz"),
# index = OPJ(local_annot_dir, "all_annotations.gtf.gz.tbi"),
# #merged_bed = OPJ(local_annot_dir, "all_annotations.bed"),
# message:
# "Gathering annotations for {}".format(", ".join(ANNOT_BIOTYPES))
# shell:
# """
# sort -k1,1 -k4,4n -m {input} | tee {output.merged_gtf} | bgzip > {output.merged_gtf_gz}
# tabix -p gff {output.merged_gtf_gz}
# #ensembl_gtf2bed.py {output.merged_gtf} > {output.merged_bed}
# """
# For metagene analyses:
#- Extract transcripts
......@@ -1606,7 +1651,7 @@ rule plot_meta_profile_mean:
bigwig = rules.merge_bigwig_reps.output.bw,
bed = rules.select_genes_for_meta_profile.output.out_bed,
output:
OPJ(output_dir, "{trimmer}", "figures", aligner, "{lib}_by_{norm_type}_mean", "{orientation}_on_merged_isolated_%d_{biotype}_min_%d_meta_profile.{fig_format}" % (MIN_DIST, META_MIN_LEN)),
OPJ(output_dir, "{trimmer}", "figures", aligner, "{lib}_by_{norm_type}_mean", "{orientation}_on_merged_isolated_%d_{biotype}_min_%d_meta_profile.pdf" % (MIN_DIST, META_MIN_LEN)),
params:
meta_params = meta_params,
# before = META_MARGIN,
......@@ -1615,8 +1660,8 @@ rule plot_meta_profile_mean:
# unscaled_5 = UNSCALED_INSIDE,
# unscaled_3 = UNSCALED_INSIDE,
log:
plot_TSS_log = OPJ(log_dir, "{trimmer}", "plot_meta_profile", "{lib}", "{orientation}_on_merged_isolated_%d_{biotype}_min_%d.log" % (MIN_DIST, META_MIN_LEN)),
plot_TSS_err = OPJ(log_dir, "{trimmer}", "plot_meta_profile", "{lib}", "{orientation}_on_merged_isolated_%d_{biotype}_min_%d.err" % (MIN_DIST, META_MIN_LEN)),
plot_TSS_log = OPJ(log_dir, "{trimmer}", "plot_meta_profile", "{lib}", "by_{norm_type}", "{orientation}_on_merged_isolated_%d_{biotype}_min_%d.log" % (MIN_DIST, META_MIN_LEN)),
plot_TSS_err = OPJ(log_dir, "{trimmer}", "plot_meta_profile", "{lib}", "by_{norm_type}", "{orientation}_on_merged_isolated_%d_{biotype}_min_%d.err" % (MIN_DIST, META_MIN_LEN)),
threads: 12 # to limit memory usage, actually
run:
if file_len(input.bed):
......
......@@ -98,7 +98,7 @@ from libhts import (
plot_norm_correlations,
plot_counts_distribution,
plot_boxplots)
from libworkflows import ensure_relative, cleanup_and_backup
from libworkflows import wc_applied, ensure_relative, cleanup_and_backup
from libworkflows import get_chrom_sizes, column_converter, make_id_list_getter
from libworkflows import strip_split, save_plot, test_na_file, SHELL_FUNCTIONS, warn_context
from libworkflows import feature_orientation2stranded
......@@ -122,8 +122,11 @@ LFC_RANGE = {
"simple_repeats_rmsk" : (-10, 10),
"DNA_transposons_rmsk_families" : (-10, 10),
"RNA_transposons_rmsk_families" : (-10, 10),
"satellites_rmsk_families" : (-10, 10),
"simple_repeats_rmsk_families" : (-10, 10),
"pseudogene" : (-10, 10),
"all_rmsk" : (-10, 10),
"all_rmsk_families" : (-10, 10),
"alltypes" : (-10, 10)}
# Cutoffs in log fold change
LFC_CUTOFFS = [0.5, 1, 2]
......@@ -197,7 +200,25 @@ COND_NAMES = ["_".join((
BIOTYPES = config["biotypes"]
DE_BIOTYPES = [biotype for biotype in LFC_RANGE.keys() if biotype in BIOTYPES + ["alltypes"]]
RMSK_BIOTYPES = [
"DNA_transposons_rmsk",
"RNA_transposons_rmsk",
"satellites_rmsk",
"simple_repeats_rmsk"]
RMSK_FAMILIES_BIOTYPES = [
"DNA_transposons_rmsk_families",
"RNA_transposons_rmsk_families",
"satellites_rmsk_families",
"simple_repeats_rmsk_families"]
BIOTYPES_TO_JOIN = {
"all_rmsk": [biotype for biotype in BIOTYPES if biotype in RMSK_BIOTYPES],
"all_rmsk_families": [biotype for biotype in BIOTYPES if biotype in RMSK_FAMILIES_BIOTYPES],
# We only count "protein_coding", not "protein_coding_{5UTR,CDS,3UTR}"
"alltypes": [biotype for biotype in BIOTYPES if not biotype.startswith("protein_coding_")]}
JOINED_BIOTYPES = list(BIOTYPES_TO_JOIN.keys())
DE_BIOTYPES = [biotype for biotype in LFC_RANGE.keys() if biotype in BIOTYPES + JOINED_BIOTYPES]
if "spike_ins" in BIOTYPES:
PAIR_SCATTER_BIOTYPES = ["spike_ins"]
else:
......@@ -251,7 +272,7 @@ minU, maxU = 1, 5
wildcard_constraints:
rep="\d+",
orientation="|".join(ORIENTATIONS),
biotype="|".join(BIOTYPES + ["alltypes"]),
biotype="|".join(BIOTYPES + JOINED_BIOTYPES),
fold_type="|".join(["mean_log2_RPM_fold", "log2FoldChange", "lfcMLE"]),
#ruleorder: join_all_counts > gather_counts
......@@ -293,12 +314,12 @@ counts_files = [
OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}",
"all_{mapped_type}", "{biotype}_{orientation}_RPK.txt"),
counter=COUNTERS, mapped_type=[f"on_{genome}", f"unique_on_{genome}"],
biotype=BIOTYPES + ["alltypes"], orientation=ORIENTATIONS),
biotype=BIOTYPES + JOINED_BIOTYPES, orientation=ORIENTATIONS),
expand(
OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}",
"all_{mapped_type}", "{biotype}_{orientation}_counts.txt"),
counter=COUNTERS, mapped_type=[f"on_{genome}", f"unique_on_{genome}"],
biotype=BIOTYPES, orientation=ORIENTATIONS),
biotype=BIOTYPES + JOINED_BIOTYPES, orientation=ORIENTATIONS),
# expand(
# OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}",
# "summaries", f"{{lib}}_{{rep}}_on_{genome}_{{orientation}}_counts.txt"),
......@@ -1094,7 +1115,7 @@ rule compute_TPM:
"all_{mapped_type}", "{biotype}_{orientation}_TPM.txt"),
# The sum must be done over all counted features
wildcard_constraints:
biotype = "alltypes"
biotype = "|".join(["alltypes"])
# run:
# rpk = pd.read_table(input.rpk_file, index_col="gene")
# tpm = 1000000 * rpk / rpk.sum()
......@@ -1465,12 +1486,24 @@ rule merge_bigwig_reps:
bw.close()
@wc_applied
def source_counts_to_join(wildcards):
    """List the elementary counts tables feeding a "joined" biotype.

    Each biotype registered under *wildcards.biotype* in
    BIOTYPES_TO_JOIN contributes one counts-table path.
    """
    # Double braces keep {counter}, {mapped_type} and {orientation} as
    # wildcards for Snakemake; only {biotype} is expanded here.
    counts_template = OPJ(
        output_dir, aligner, f"mapped_{genome}",
        "{{counter}}", "all_{{mapped_type}}",
        "{biotype}_{{orientation}}_counts.txt")
    return expand(
        counts_template,
        biotype=BIOTYPES_TO_JOIN[wildcards.biotype])
rule join_all_counts:
"""concat counts for all biotypes into all"""
"""Concat counts for all biotypes into all"""
input:
counts_tables = expand(OPJ(output_dir, aligner, f"mapped_{genome}", "{{counter}}", "all_{{mapped_type}}", "{biotype}_{{orientation}}_counts.txt"), biotype=BIOTYPES),
counts_tables = source_counts_to_join,
output:
counts_table = OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}", "all_{mapped_type}", "alltypes_{orientation}_counts.txt"),
counts_table = OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}", "all_{mapped_type}", "{biotype}_{orientation}_counts.txt"),
wildcard_constraints:
biotype = "|".join(JOINED_BIOTYPES)
run:
counts_data = pd.concat((pd.read_table(table, index_col="gene") for table in input.counts_tables)).drop_duplicates()
assert len(counts_data.index.unique()) == len(counts_data.index), "Some genes appear several times in the counts table."
......@@ -1478,12 +1511,13 @@ rule join_all_counts:
counts_data.to_csv(output.counts_table, sep="\t")
@wc_applied
def source_counts(wildcards):
"""Determines from which rule the gathered counts should be sourced."""
if wildcards.biotype == "alltypes":
if wildcards.biotype in JOINED_BIOTYPES:
## debug
#return rules.join_all_counts.output.counts_table
return OPJ(output_dir, aligner, f"mapped_{genome}", f"{wildcards.counter}", f"all_{wildcards.mapped_type}", f"alltypes_{wildcards.orientation}_counts.txt")
return rules.join_all_counts.output.counts_table
#return OPJ(output_dir, aligner, f"mapped_{genome}", f"{wildcards.counter}", f"all_{wildcards.mapped_type}", f"alltypes_{wildcards.orientation}_counts.txt")
##
else:
# "Directly" from the counts gathered across libraries
......@@ -1497,10 +1531,10 @@ def source_counts(wildcards):
# output_dir, aligner, f"mapped_{genome}", f"{wildcards.counter}",
# f"all_{wildcards.mapped_type}", f"{wildcards.biotype}_{wildcards.orientation}_counts.txt"))
# sys.exit(1)
#return rules.gather_counts.output.counts_table
return OPJ(
output_dir, aligner, f"mapped_{genome}", f"{wildcards.counter}",
f"all_{wildcards.mapped_type}", f"{wildcards.biotype}_{wildcards.orientation}_counts.txt")
return rules.gather_counts.output.counts_table
#return OPJ(
# output_dir, aligner, f"mapped_{genome}", f"{wildcards.counter}",
# f"all_{wildcards.mapped_type}", f"{wildcards.biotype}_{wildcards.orientation}_counts.txt")
##
......
This diff is collapsed.
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment