Skip to content
Snippets Groups Projects
Commit 485dd97f authored by Blaise Li's avatar Blaise Li
Browse files

Folds of RPM for counts of remapped reads.

parent e69418a4
Branches
Tags
No related merge requests found
...@@ -359,7 +359,7 @@ mapping_dir = OPJ(output_dir, aligner, f"mapped_{genome}") ...@@ -359,7 +359,7 @@ mapping_dir = OPJ(output_dir, aligner, f"mapped_{genome}")
reads_dir = OPJ(mapping_dir, "reads") reads_dir = OPJ(mapping_dir, "reads")
annot_counts_dir = OPJ(mapping_dir, "annotation") annot_counts_dir = OPJ(mapping_dir, "annotation")
feature_counts_dir = OPJ(mapping_dir, "feature_count") feature_counts_dir = OPJ(mapping_dir, "feature_count")
READ_TYPES_FOR_COUNTING = [size_selected, "all_sisiuRNA", "sisiuRNA"] + list(map(lambda x: "%s%s" % (x, "RNA"), ["prot_sisiu"])) READ_TYPES_FOR_COUNTING = ["all_sisiuRNA", "sisiuRNA"] + list(map(lambda x: "%s%s" % (x, "RNA"), ["prot_sisiu"]))
# Actually, we don't need to enforce this: snakemake will map whatever read type is needed for counting # Actually, we don't need to enforce this: snakemake will map whatever read type is needed for counting
assert set(READ_TYPES_FOR_COUNTING) < set(READ_TYPES_FOR_MAPPING), "Cannot count reads that were not mapped." assert set(READ_TYPES_FOR_COUNTING) < set(READ_TYPES_FOR_MAPPING), "Cannot count reads that were not mapped."
# Reads remapped and counted using featureCounts # Reads remapped and counted using featureCounts
...@@ -480,7 +480,8 @@ wildcard_constraints: ...@@ -480,7 +480,8 @@ wildcard_constraints:
biotype="|".join(set(COUNT_BIOTYPES + ANNOT_BIOTYPES)), biotype="|".join(set(COUNT_BIOTYPES + ANNOT_BIOTYPES)),
id_list="|".join(GENE_LISTS), id_list="|".join(GENE_LISTS),
type_set="|".join(["all", "protein_coding", "protein_coding_TE"]), type_set="|".join(["all", "protein_coding", "protein_coding_TE"]),
small_type="[spm]i|pisimi|siu|prot_si|te_si|pseu_si|satel_si|simrep_si|prot_siu|te_siu|pseu_siu|satel_siu|simrep_siu|sisiu|all_si|all_siu|all_sisiu|" + "|".join(REMAPPED_COUNTED), small_type="[spm]i|pisimi|siu|prot_si|te_si|pseu_si|satel_si|simrep_si|prot_siu|te_siu|pseu_siu|satel_siu|simrep_siu|sisiu|all_si|all_siu|all_sisiu",
# read_type = "|".join(REMAPPED_COUNTED),
# read_type="|".join([ # read_type="|".join([
# "raw", "trimmed", "deduped", f"{size_selected}", # "raw", "trimmed", "deduped", f"{size_selected}",
# "nomap", "nomap_siRNA", "mapped", # "nomap", "nomap_siRNA", "mapped",
...@@ -607,7 +608,22 @@ read_graphs = [ ...@@ -607,7 +608,22 @@ read_graphs = [
ip_fold_heatmaps = [] ip_fold_heatmaps = []
de_fold_heatmaps = [] de_fold_heatmaps = []
ip_fold_boxplots = [] ip_fold_boxplots = []
# Temporary, until used for boxplots:
remapped_folds = []
if contrasts_dict["ip"]: if contrasts_dict["ip"]:
remapped_folds = expand(
OPJ(mapping_dir, f"RPM_folds_{size_selected}", "all", "{counted_type}_mean_log2_RPM_fold.txt"),
counted_type=REMAPPED_COUNTED),
# snakemake -n OK
# expand(
# OPJ(mapping_dir, f"RPM_folds_{size_selected}", "all",
# "{read_type}_on_%s_{biotype}_{orientation}_transcript_mean_log2_RPM_fold.txt" % genome),
# read_type=READ_TYPES_FOR_COUNTING, biotype=COUNT_BIOTYPES, orientation=ORIENTATIONS),
# snakemake -n OK
# expand(
# OPJ(mapping_dir, f"RPM_folds_{size_selected}", "{contrast}",
# "{contrast}_{small_type}_on_%s_{biotype}_{orientation}_transcript_RPM_folds.txt" % genome),
# contrast=IP_CONTRASTS, small_type=READ_TYPES_FOR_COUNTING, biotype=COUNT_BIOTYPES, orientation=ORIENTATIONS),
ip_fold_heatmaps = expand( ip_fold_heatmaps = expand(
OPJ(output_dir, "figures", "fold_heatmaps", "{small_type}_{fold_type}_heatmap.{fig_format}"), OPJ(output_dir, "figures", "fold_heatmaps", "{small_type}_{fold_type}_heatmap.{fig_format}"),
small_type=["pisimi", "prot_si"], fold_type=["mean_log2_RPM_fold"], fig_format=FIG_FORMATS) small_type=["pisimi", "prot_si"], fold_type=["mean_log2_RPM_fold"], fig_format=FIG_FORMATS)
...@@ -714,12 +730,9 @@ rule all: ...@@ -714,12 +730,9 @@ rule all:
expand(OPJ(mapping_dir, f"deseq2_{size_selected}", "all", "pisimi_{fold_type}.txt"), fold_type=["log2FoldChange"]), expand(OPJ(mapping_dir, f"deseq2_{size_selected}", "all", "pisimi_{fold_type}.txt"), fold_type=["log2FoldChange"]),
#expand(OPJ(mapping_dir, "RPM_folds_%s" % size_selected, "{contrast}", "{contrast}_{small_type}_RPM_folds.txt"), contrast=IP_CONTRASTS, small_type=DE_TYPES), #expand(OPJ(mapping_dir, "RPM_folds_%s" % size_selected, "{contrast}", "{contrast}_{small_type}_RPM_folds.txt"), contrast=IP_CONTRASTS, small_type=DE_TYPES),
exploratory_graphs, exploratory_graphs,
remapped_folds,
fold_boxplots, fold_boxplots,
#expand(OPJ(output_dir, "figures", "{small_type}_unit_clustermap.{fig_format}"), small_type=SMALL_TYPES, fig_format=FIG_FORMATS), #expand(OPJ(output_dir, "figures", "{small_type}_unit_clustermap.{fig_format}"), small_type=SMALL_TYPES, fig_format=FIG_FORMATS),
# snakemake -n OK
expand(OPJ(
feature_counts_dir,
"all_{small_type}_counts.txt"), small_type=REMAPPED_COUNTED),
#expand(OPJ( #expand(OPJ(
# feature_counts_dir, # feature_counts_dir,
# "all_{read_type}_on_%s_{biotype}_{orientation}_transcript_counts.txt" % genome), read_type=READ_TYPES_FOR_COUNTING, biotype=ANNOT_BIOTYPES, orientation=ORIENTATIONS), # "all_{read_type}_on_%s_{biotype}_{orientation}_transcript_counts.txt" % genome), read_type=READ_TYPES_FOR_COUNTING, biotype=ANNOT_BIOTYPES, orientation=ORIENTATIONS),
...@@ -727,11 +740,6 @@ rule all: ...@@ -727,11 +740,6 @@ rule all:
# feature_counts_dir, # feature_counts_dir,
# "all_{small_type}_on_%s_{biotype}_{orientation}_transcript_counts.txt" % genome), # "all_{small_type}_on_%s_{biotype}_{orientation}_transcript_counts.txt" % genome),
# small_type=READ_TYPES_FOR_MAPPING, biotype=set(COUNT_BIOTYPES + ANNOT_BIOTYPES), orientation=ORIENTATIONS), # small_type=READ_TYPES_FOR_MAPPING, biotype=set(COUNT_BIOTYPES + ANNOT_BIOTYPES), orientation=ORIENTATIONS),
# snakemake -n not OK (Missing input files for rule gather_small_RNA_counts)
#expand(
# OPJ(annot_counts_dir, f"all_{size_selected}_on_{genome}", "{small_type}_RPM.txt"),
# small_type=REMAPPED_COUNTED),
##
expand( expand(
OPJ(annot_counts_dir, f"all_{size_selected}_on_{genome}", "{small_type}_RPM.txt"), OPJ(annot_counts_dir, f"all_{size_selected}_on_{genome}", "{small_type}_RPM.txt"),
small_type=["mi", "prot_si", "te_si", "pseu_si", "satel_si", "simrep_si", "prot_siu", "te_siu", "pseu_siu", "pisimi"]), small_type=["mi", "prot_si", "te_si", "pseu_si", "satel_si", "simrep_si", "prot_siu", "te_siu", "pseu_siu", "pisimi"]),
...@@ -1075,8 +1083,8 @@ rule gather_feature_counts: ...@@ -1075,8 +1083,8 @@ rule gather_feature_counts:
input: input:
counts_tables = expand( counts_tables = expand(
OPJ(feature_counts_dir, OPJ(feature_counts_dir,
"{lib}_{rep}_{{read_type}}_on_%s_{{biotype}}_{{orientation}}_{{feature_type}}_counts.txt" % genome), "{cond_name}_{{read_type}}_on_%s_{{biotype}}_{{orientation}}_{{feature_type}}_counts.txt" % genome),
filtered_product, lib=LIBS, rep=REPS), cond_name=COND_NAMES),
output: output:
counts_table = OPJ( counts_table = OPJ(
feature_counts_dir, feature_counts_dir,
...@@ -1334,8 +1342,6 @@ rule join_si_reads: ...@@ -1334,8 +1342,6 @@ rule join_si_reads:
# df = pd.pd.DataFrame(speeds).set_index(0) # df = pd.pd.DataFrame(speeds).set_index(0)
# Reminder:
# annot_counts_dir = OPJ(mapping_dir, "annotation")
rule gather_small_RNA_counts: rule gather_small_RNA_counts:
"""For a given small_type, gather counts from all libraries in one table.""" """For a given small_type, gather counts from all libraries in one table."""
input: input:
...@@ -1803,6 +1809,8 @@ rule gather_RPM_folds: ...@@ -1803,6 +1809,8 @@ rule gather_RPM_folds:
output: output:
all_folds = OPJ( all_folds = OPJ(
mapping_dir, f"RPM_folds_{size_selected}", "all", "{small_type}_mean_log2_RPM_fold.txt"), mapping_dir, f"RPM_folds_{size_selected}", "all", "{small_type}_mean_log2_RPM_fold.txt"),
wildcard_constraints:
small_type="[spm]i|pisimi|siu|prot_si|te_si|pseu_si|satel_si|simrep_si|prot_siu|te_siu|pseu_siu|satel_siu|simrep_siu|sisiu|all_si|all_siu|all_sisiu"
log: log:
log = OPJ(log_dir, "gather_RPM_folds", "{small_type}.log"), log = OPJ(log_dir, "gather_RPM_folds", "{small_type}.log"),
run: run:
...@@ -1814,6 +1822,88 @@ rule gather_RPM_folds: ...@@ -1814,6 +1822,88 @@ rule gather_RPM_folds:
output.all_folds, sep="\t") output.all_folds, sep="\t")
# TODO: replace counted_type with all subwildcards. Why does it work with cond_name?
# Wildcards in input files cannot be determined from output files:
# 'read_type'
rule compute_remapped_RPM_folds:
input:
counts_table = rules.gather_feature_counts.output.counts_table,
#exon_lengths = rules.compute_genes_exon_lengths.output.exon_lengths,
summary_table = rules.gather_read_counts_summaries.output.summary_table,
#tags_table = rules.associate_small_type.output.tags_table,
output:
fold_results = OPJ(
mapping_dir, f"RPM_folds_{size_selected}",
"{contrast}", "{contrast}_{read_type}_on_%s_{biotype}_{orientation}_{feature_type}_RPM_folds.txt" % genome),
log:
log = OPJ(log_dir, "compute_RPM_folds", "{contrast}_{read_type}_on_%s_{biotype}_{orientation}_transcript.log" % genome),
run:
with open(log.log, "w") as logfile:
(cond, ref) = CONTRAST2PAIR[wildcards.contrast]
logfile.write(f"cond: {cond}, ref: {ref}\n")
column_list = expand("{lib}_{rep}", lib=[ref, cond], rep=REPS)
logfile.write(f"Reading columns {column_list} from {input.counts_table}\n")
counts_data = pd.read_table(
input.counts_table,
usecols=["gene", *column_list],
index_col="gene")
logfile.write(f"Found {len(counts_data.index)} genes\n")
#logfile.write(f"Reading exon lengths from {exon_lengths_file}\n")
#exon_lengths = pd.read_table(
# input.exon_lengths_file,
# index_col="gene")
#logfile.write(f"Found {len(exon_lengths.index)} genes\n")
#logfile.write("Determining set of common genes\n")
#common = counts_data.index.intersection(exon_lengths.index)
#logfile.write(f"Found {len(common)} common genes\n")
#logfile.write("Computing counts per kilobase\n")
#RPK = 1000 * counts_data.loc[common].div(exon_lengths.loc[common]["union_exon_len"], axis="index")
logfile.write(f"Reading number of non-structural mappers from {input.summary_table}\n")
norm = pd.read_table(input.summary_table, index_col=0).loc["non_structural"][column_list]
logfile.write(str(norm))
logfile.write("Computing counts per million non-structural mappers\n")
RPM = 1000000 * counts_data / norm
logfile.write("Computing RPM log2 fold changes\n")
# Add 0.25 as RPM "pseudo-count" (edgeR does this ? https://www.biostars.org/p/102568/#102604)
lfc = np.log2(pd.DataFrame(
{f"log2({cond}_{rep}/{ref}_{rep})" : (RPM[f"{cond}_{rep}"] + 0.25) / (RPM[f"{ref}_{rep}"] + 0.25) for rep in REPS}))
# Compute the mean
lfc = lfc.assign(mean_log2_RPM_fold=lfc.mean(axis=1))
# Converting gene IDs
######################
with open(OPJ(convert_dir, "wormid2cosmid.pickle"), "rb") as dict_file:
lfc = lfc.assign(cosmid=lfc.apply(column_converter(load(dict_file)), axis=1))
with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file:
lfc = lfc.assign(name=lfc.apply(column_converter(load(dict_file)), axis=1))
#logfile.write("Adding small read type info\n")
##pd.concat((counts_data.loc[common], RPM, add_tags_column(lfc, input.tags_table, "small_type")), axis=1).to_csv(output.fold_results, sep="\t")
#pd.concat((RPM, add_tags_column(lfc, input.tags_table, "small_type")), axis=1).to_csv(output.fold_results, sep="\t")
pd.concat((RPM, lfc), axis=1).to_csv(output.fold_results, sep="\t")
rule gather_remapped_RPM_folds:
"""Gathers RPM folds across contrasts."""
input:
fold_results = expand(OPJ(
mapping_dir, f"RPM_folds_{size_selected}",
"{contrast}", "{contrast}_{{read_type}}_on_%s_{{biotype}}_{{orientation}}_transcript_RPM_folds.txt" % genome), contrast=IP_CONTRASTS),
output:
all_folds = OPJ(
mapping_dir, f"RPM_folds_{size_selected}", "all", "{read_type}_on_%s_{biotype}_{orientation}_transcript_mean_log2_RPM_fold.txt" % genome),
log:
log = OPJ(log_dir, "gather_RPM_folds", "{read_type}_on_%s_{biotype}_{orientation}_transcript.log" % genome),
run:
pd.DataFrame({contrast : pd.read_table(
# OPJ(mapping_dir, f"RPM_folds_{size_selected}",
# f"{contrast}", f"{contrast}_{wildcards.counted_type}_RPM_folds.txt"),
OPJ(mapping_dir, f"RPM_folds_{size_selected}",
f"{contrast}", f"{contrast}_{wildcards.read_type}_on_{genome}_{wildcards.biotype}_{wildcards.orientation}_transcript_RPM_folds.txt"),
index_col=["gene", "cosmid", "name"],
usecols=["gene", "cosmid", "name", "mean_log2_RPM_fold"])["mean_log2_RPM_fold"] for contrast in IP_CONTRASTS}).to_csv(
output.all_folds, sep="\t")
# TODO add remapped_RPM_folds
def source_gathered_folds(wildcards): def source_gathered_folds(wildcards):
if wildcards.fold_type == "mean_log2_RPM_fold": if wildcards.fold_type == "mean_log2_RPM_fold":
return rules.gather_RPM_folds.output.all_folds return rules.gather_RPM_folds.output.all_folds
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment