Skip to content
Snippets Groups Projects
Commit f4ec9caf authored by Blaise Li's avatar Blaise Li
Browse files

Computation of translation efficiency difference.

parent d15736bd
No related branches found
No related tags found
No related merge requests found
...@@ -192,7 +192,7 @@ TE_LIBS = list(config["transcriptome_TPM"].keys()) ...@@ -192,7 +192,7 @@ TE_LIBS = list(config["transcriptome_TPM"].keys())
series_dict = config["series"] series_dict = config["series"]
SERIES_TYPES = list(series_dict.keys()) SERIES_TYPES = list(series_dict.keys())
genotype_series = series_dict["genotype_series"] genotype_series = series_dict["genotype_series"]
ip_series = series_dict["ip_series"] dt_series = series_dict["dt_series"]
merged_series = merge_with(concat_lists, *series_dict.values()) merged_series = merge_with(concat_lists, *series_dict.values())
ALL_SERIES = list(merged_series.keys()) ALL_SERIES = list(merged_series.keys())
#all_libs_in series = concat_lists(merge_with(concat_lists, *series_dict.values()).values()) #all_libs_in series = concat_lists(merge_with(concat_lists, *series_dict.values()).values())
...@@ -203,17 +203,17 @@ msg = "\n".join([ ...@@ -203,17 +203,17 @@ msg = "\n".join([
"Contrasts:" "Contrasts:"
", ".join([f"({cond}, {ref})" for (cond, ref) in DE_COND_PAIRS])]) ", ".join([f"({cond}, {ref})" for (cond, ref) in DE_COND_PAIRS])])
assert all([cond in LIBS and ref in LIBS for (cond, ref) in DE_COND_PAIRS]), msg assert all([cond in LIBS and ref in LIBS for (cond, ref) in DE_COND_PAIRS]), msg
IP_COND_PAIRS = config["ip_cond_pairs"] DT_COND_PAIRS = config["dt_cond_pairs"]
msg = "\n".join([ msg = "\n".join([
"Some contrats do not use known library names.", "Some contrats do not use known library names.",
"Contrasts:" "Contrasts:"
", ".join([f"({cond}, {ref})" for (cond, ref) in IP_COND_PAIRS])]) ", ".join([f"({cond}, {ref})" for (cond, ref) in DT_COND_PAIRS])])
assert all([cond in LIBS and ref in LIBS for (cond, ref) in IP_COND_PAIRS]), "" assert all([cond in LIBS and ref in LIBS for (cond, ref) in DT_COND_PAIRS]), ""
COND_PAIRS = DE_COND_PAIRS + IP_COND_PAIRS COND_PAIRS = DE_COND_PAIRS + DT_COND_PAIRS
DE_CONTRASTS = [f"{cond1}_vs_{cond2}" for [cond1, cond2] in DE_COND_PAIRS] DE_CONTRASTS = [f"{cond1}_vs_{cond2}" for [cond1, cond2] in DE_COND_PAIRS]
IP_CONTRASTS = [f"{cond1}_vs_{cond2}" for [cond1, cond2] in IP_COND_PAIRS] DT_CONTRASTS = [f"{cond1}_vs_{cond2}" for [cond1, cond2] in DT_COND_PAIRS]
contrasts_dict = {"de" : DE_CONTRASTS, "ip" : IP_CONTRASTS} contrasts_dict = {"de" : DE_CONTRASTS, "dt" : DT_CONTRASTS}
CONTRASTS = DE_CONTRASTS + IP_CONTRASTS CONTRASTS = DE_CONTRASTS + DT_CONTRASTS
CONTRAST2PAIR = dict(zip(CONTRASTS, COND_PAIRS)) CONTRAST2PAIR = dict(zip(CONTRASTS, COND_PAIRS))
MIN_LEN = config["min_len"] MIN_LEN = config["min_len"]
MAX_LEN = config["max_len"] MAX_LEN = config["max_len"]
...@@ -441,26 +441,27 @@ counts_files = [ ...@@ -441,26 +441,27 @@ counts_files = [
meta_profiles = [ meta_profiles = [
#expand(OPJ(local_annot_dir, "transcripts_{type_set}", "merged_isolated_{min_dist}.bed"), type_set=["all", "protein_coding", "protein_coding_TE"], min_dist="0 5 10 25 50 100 250 500 1000 2500 5000 10000".split()), #expand(OPJ(local_annot_dir, "transcripts_{type_set}", "merged_isolated_{min_dist}.bed"), type_set=["all", "protein_coding", "protein_coding_TE"], min_dist="0 5 10 25 50 100 250 500 1000 2500 5000 10000".split()),
#expand(OPJ(local_annot_dir, "transcripts_{type_set}", "merged_isolated_{min_dist}_{biotype}_min_{min_len}.bed"), type_set=["all", "protein_coding", "protein_coding_TE"], min_dist="0 5 10 25 50 100 250 500 1000 2500 5000 10000".split(), biotype=["protein_coding"], min_len=[str(META_MIN_LEN)]), #expand(OPJ(local_annot_dir, "transcripts_{type_set}", "merged_isolated_{min_dist}_{biotype}_min_{min_len}.bed"), type_set=["all", "protein_coding", "protein_coding_TE"], min_dist="0 5 10 25 50 100 250 500 1000 2500 5000 10000".split(), biotype=["protein_coding"], min_len=[str(META_MIN_LEN)]),
# TODO: check how to adapt to dt_series instead of ip_series
expand( expand(
OPJ(output_dir, "figures", "mean_meta_profiles_meta_scale_{meta_scale}", OPJ(output_dir, "figures", "mean_meta_profiles_meta_scale_{meta_scale}",
"{read_type}_by_{norm}_{orientation}_on_{type_set}_merged_isolated_{min_dist}_{biotype}_min_{min_len}_{series_type}_{series}_meta_profile.{fig_format}"), "{read_type}_by_{norm}_{orientation}_on_{type_set}_merged_isolated_{min_dist}_{biotype}_min_{min_len}_{series_type}_{series}_meta_profile.{fig_format}"),
meta_scale= [str(META_SCALE)], read_type=[size_selected, "RPF"], meta_scale= [str(META_SCALE)], read_type=[size_selected, "RPF"],
norm=SIZE_FACTORS, orientation=["all"], type_set=["protein_coding_TE"], min_dist=[str(MIN_DIST)], norm=SIZE_FACTORS, orientation=["all"], type_set=["protein_coding_TE"], min_dist=[str(MIN_DIST)],
biotype=["protein_coding", "DNA_transposons_rmsk", "RNA_transposons_rmsk"], min_len=[str(META_MIN_LEN)], biotype=["protein_coding", "DNA_transposons_rmsk", "RNA_transposons_rmsk"], min_len=[str(META_MIN_LEN)],
series_type=["ip_series"], series=list(ip_series.keys()), fig_format=FIG_FORMATS), series_type=["dt_series"], series=list(dt_series.keys()), fig_format=FIG_FORMATS),
expand( expand(
OPJ(output_dir, "figures", "mean_meta_profiles_meta_scale_{meta_scale}", OPJ(output_dir, "figures", "mean_meta_profiles_meta_scale_{meta_scale}",
"{read_type}_by_{norm}_{orientation}_on_{type_set}_merged_isolated_{min_dist}_{biotype}_min_{min_len}_{series_type}_{series}_meta_profile.{fig_format}"), "{read_type}_by_{norm}_{orientation}_on_{type_set}_merged_isolated_{min_dist}_{biotype}_min_{min_len}_{series_type}_{series}_meta_profile.{fig_format}"),
meta_scale= [str(META_SCALE)], read_type=[size_selected, "RPF"], meta_scale= [str(META_SCALE)], read_type=[size_selected, "RPF"],
norm=SIZE_FACTORS, orientation=["all"], type_set=["protein_coding"], min_dist=[str(MIN_DIST)], norm=SIZE_FACTORS, orientation=["all"], type_set=["protein_coding"], min_dist=[str(MIN_DIST)],
biotype=["protein_coding"], min_len=[str(META_MIN_LEN)], biotype=["protein_coding"], min_len=[str(META_MIN_LEN)],
series_type=["ip_series"], series=list(ip_series.keys()), fig_format=FIG_FORMATS), series_type=["dt_series"], series=list(dt_series.keys()), fig_format=FIG_FORMATS),
expand( expand(
OPJ(output_dir, "figures", "mean_meta_profiles_meta_scale_{meta_scale}", OPJ(output_dir, "figures", "mean_meta_profiles_meta_scale_{meta_scale}",
"{read_type}_by_{norm}_{orientation}_on_{type_set}_merged_isolated_{min_dist}_{id_list}_{series_type}_{series}_meta_profile.{fig_format}"), "{read_type}_by_{norm}_{orientation}_on_{type_set}_merged_isolated_{min_dist}_{id_list}_{series_type}_{series}_meta_profile.{fig_format}"),
meta_scale=[str(META_SCALE)], read_type=[size_selected, "RPF"], meta_scale=[str(META_SCALE)], read_type=[size_selected, "RPF"],
norm=SIZE_FACTORS, orientation=["all"], type_set=["protein_coding_TE"], min_dist=["0"], id_list=GENE_LISTS, norm=SIZE_FACTORS, orientation=["all"], type_set=["protein_coding_TE"], min_dist=["0"], id_list=GENE_LISTS,
series_type=["ip_series"], series=list(ip_series.keys()), fig_format=FIG_FORMATS), series_type=["dt_series"], series=list(dt_series.keys()), fig_format=FIG_FORMATS),
## TODO: Resolve issue with bedtools ## TODO: Resolve issue with bedtools
# expand( # expand(
# OPJ(output_dir, "figures", "{lib}_{rep}", # OPJ(output_dir, "figures", "{lib}_{rep}",
...@@ -494,7 +495,9 @@ read_graphs = [ ...@@ -494,7 +495,9 @@ read_graphs = [
lib=LIBS, rep=REPS, fig_format=FIG_FORMATS), lib=LIBS, rep=REPS, fig_format=FIG_FORMATS),
] ]
if contrasts_dict["ip"]: # TODO: check what can be done with "dt" (differential translation, a.k.a. translation efficiency difference)
# What should be done with small_type, fold_type, etc.
if contrasts_dict["dt"]:
fold_heatmaps = expand( fold_heatmaps = expand(
OPJ(output_dir, "figures", "fold_heatmaps", "{small_type}_{fold_type}_heatmap.{fig_format}"), OPJ(output_dir, "figures", "fold_heatmaps", "{small_type}_{fold_type}_heatmap.{fig_format}"),
small_type=["pisimi", "prot_si"], fold_type=["mean_log2_RPM_fold", "log2FoldChange"], fig_format=FIG_FORMATS) small_type=["pisimi", "prot_si"], fold_type=["mean_log2_RPM_fold", "log2FoldChange"], fig_format=FIG_FORMATS)
...@@ -512,9 +515,9 @@ else: ...@@ -512,9 +515,9 @@ else:
exploratory_graphs = [ exploratory_graphs = [
fold_heatmaps, fold_heatmaps,
# Large figures, not very readable # Large figures, not very readable
expand( #expand(
OPJ(output_dir, aligner, f"mapped_{genome}", f"deseq2_{size_selected}", "{contrast}", "{contrast}_{small_type}_pairplots.{fig_format}"), # OPJ(output_dir, aligner, f"mapped_{genome}", f"deseq2_{size_selected}", "{contrast}", "{contrast}_{small_type}_pairplots.{fig_format}"),
contrast=DE_CONTRASTS, small_type=DE_TYPES, fig_format=FIG_FORMATS), # contrast=DE_CONTRASTS, small_type=DE_TYPES, fig_format=FIG_FORMATS),
## TODO: debug PCA ## TODO: debug PCA
#expand( #expand(
# OPJ(output_dir, "figures", "{small_type}_{standard}_PCA.{fig_format}"), # OPJ(output_dir, "figures", "{small_type}_{standard}_PCA.{fig_format}"),
...@@ -542,7 +545,7 @@ de_fold_boxplots = expand( ...@@ -542,7 +545,7 @@ de_fold_boxplots = expand(
ip_fold_boxplots_by_contrast = expand( ip_fold_boxplots_by_contrast = expand(
OPJ(output_dir, "figures", "{contrast}", OPJ(output_dir, "figures", "{contrast}",
"{contrast}_{small_type}_{fold_type}_{gene_list}_boxplots.{fig_format}"), "{contrast}_{small_type}_{fold_type}_{gene_list}_boxplots.{fig_format}"),
contrast=IP_CONTRASTS, small_type=IP_TYPES, fold_type=["mean_log2_RPM_fold"], contrast=DT_CONTRASTS, small_type=IP_TYPES, fold_type=["mean_log2_RPM_fold"],
gene_list=["all_gene_lists"], fig_format=FIG_FORMATS) gene_list=["all_gene_lists"], fig_format=FIG_FORMATS)
fold_boxplots = [de_fold_boxplots, ip_fold_boxplots_by_contrast, ip_fold_boxplots] fold_boxplots = [de_fold_boxplots, ip_fold_boxplots_by_contrast, ip_fold_boxplots]
...@@ -580,13 +583,19 @@ rule all: ...@@ -580,13 +583,19 @@ rule all:
"deseq2_{read_type}", "deseq2_{read_type}",
"{contrast}", "{orientation}_{biotype}", "{contrast}_counts_and_res.txt"), "{contrast}", "{orientation}_{biotype}", "{contrast}_counts_and_res.txt"),
read_type=["RPF"], contrast=DE_CONTRASTS, orientation=ORIENTATIONS, biotype=["protein_coding"]), read_type=["RPF"], contrast=DE_CONTRASTS, orientation=ORIENTATIONS, biotype=["protein_coding"]),
expand(
OPJ(
counts_dir,
"diff_TE_{read_type}",
"{contrast}", "{orientation}_{biotype}", "{contrast}_diff_TE.txt"),
read_type=["RPF"], contrast=DT_CONTRASTS, orientation=["fwd"], biotype=["alltypes"]),
# rule future_all: # rule future_all:
# meta_profiles, # meta_profiles,
# read_graphs, # read_graphs,
# OPJ(output_dir, aligner, f"mapped_{genome}", f"RPM_folds_{size_selected}", "all", "pisimi_mean_log2_RPM_fold.txt"), # OPJ(output_dir, aligner, f"mapped_{genome}", f"RPM_folds_{size_selected}", "all", "pisimi_mean_log2_RPM_fold.txt"),
# expand(OPJ(output_dir, aligner, f"mapped_{genome}", f"deseq2_{size_selected}", "all", "pisimi_{fold_type}.txt"), fold_type=["log2FoldChange"]), # expand(OPJ(output_dir, aligner, f"mapped_{genome}", f"deseq2_{size_selected}", "all", "pisimi_{fold_type}.txt"), fold_type=["log2FoldChange"]),
# #expand(OPJ(output_dir, aligner, f"mapped_{genome}", "RPM_folds_%s" % size_selected, "{contrast}", "{contrast}_{small_type}_RPM_folds.txt"), contrast=IP_CONTRASTS, small_type=DE_TYPES), # #expand(OPJ(output_dir, aligner, f"mapped_{genome}", "RPM_folds_%s" % size_selected, "{contrast}", "{contrast}_{small_type}_RPM_folds.txt"), contrast=DT_CONTRASTS, small_type=DE_TYPES),
# exploratory_graphs, # exploratory_graphs,
# fold_boxplots, # fold_boxplots,
# #expand(OPJ(output_dir, "figures", "{small_type}_unit_clustermap.{fig_format}"), small_type=SMALL_TYPES, fig_format=FIG_FORMATS), # #expand(OPJ(output_dir, "figures", "{small_type}_unit_clustermap.{fig_format}"), small_type=SMALL_TYPES, fig_format=FIG_FORMATS),
...@@ -1326,6 +1335,51 @@ rule compute_translation_efficiency: ...@@ -1326,6 +1335,51 @@ rule compute_translation_efficiency:
tpm_and_teff.to_csv(output.teff_file, sep="\t", na_rep="NA") tpm_and_teff.to_csv(output.teff_file, sep="\t", na_rep="NA")
rule gather_teff:
input:
teff_files = expand(OPJ(
counts_dir,
"{lib}_mean_{{read_type}}_on_%s" % genome,
"{lib}_{{biotype}}_{{orientation}}_TE.txt"), lib=TE_LIBS),
#tags_table = rules.associate_biotype.output.tags_table,
output:
teff_file = OPJ(
counts_dir,
"all_{read_type}_on_%s" % genome, "{biotype}_{orientation}_TE.txt"),
run:
teff_data = pd.concat(
[pd.read_table(teff_file, index_col="gene", usecols=["gene", "TE"]) for teff_file in input.teff_files],
axis=1)
teff_data.columns = [f"{lib}_TE" for lib in TE_LIBS]
teff_data.index.name = "gene"
# Converting gene IDs
######################
with open(OPJ(convert_dir, "wormid2cosmid.pickle"), "rb") as dict_file:
teff_data = teff_data.assign(cosmid=teff_data.apply(
column_converter(load(dict_file)), axis=1))
with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file:
teff_data = teff_data.assign(name=teff_data.apply(
column_converter(load(dict_file)), axis=1))
#teff_data = add_tags_column(teff_data, input.tags_table, "biotype")
teff_data.to_csv(output.teff_file, sep="\t", na_rep="NA")
rule compute_translation_efficiency_difference:
input:
teff_file = rules.gather_teff.output.teff_file,
output:
diff_teff_file = OPJ(counts_dir, "diff_TE_{read_type}",
"{contrast}", "{orientation}_{biotype}", "{contrast}_diff_TE.txt"),
run:
teff_data = pd.read_table(input.teff_file, index_col="gene")
(cond, ref) = CONTRAST2PAIR[wildcards.contrast]
# TE_mut = (ribo_tpm_mut / transcritome_tpm_mut)
# TE_WT = (ribo_tpm_WT / transcritome_tpm_WT)
# log2(TE_mut / TE_WT) = log2(TE_mut) - log2(TE_WT)
diff_teff = teff_data[f"{cond}_TE"] - teff_data[f"{ref}_TE"]
diff_teff.to_csv(output.diff_teff_file, sep="\t", na_rep="NA")
rule compute_RPM: rule compute_RPM:
input: input:
counts_table = source_counts, counts_table = source_counts,
...@@ -1424,7 +1478,7 @@ rule gather_RPM_folds: ...@@ -1424,7 +1478,7 @@ rule gather_RPM_folds:
input: input:
fold_results = expand(OPJ( fold_results = expand(OPJ(
output_dir, aligner, f"mapped_{genome}", f"RPM_folds_{size_selected}", output_dir, aligner, f"mapped_{genome}", f"RPM_folds_{size_selected}",
"{contrast}", "{contrast}_{{small_type}}_RPM_folds.txt"), contrast=IP_CONTRASTS), "{contrast}", "{contrast}_{{small_type}}_RPM_folds.txt"), contrast=DT_CONTRASTS),
output: output:
all_folds = OPJ( all_folds = OPJ(
output_dir, aligner, f"mapped_{genome}", f"RPM_folds_{size_selected}", "all", "{small_type}_mean_log2_RPM_fold.txt"), output_dir, aligner, f"mapped_{genome}", f"RPM_folds_{size_selected}", "all", "{small_type}_mean_log2_RPM_fold.txt"),
...@@ -1435,7 +1489,7 @@ rule gather_RPM_folds: ...@@ -1435,7 +1489,7 @@ rule gather_RPM_folds:
OPJ(output_dir, aligner, f"mapped_{genome}", f"RPM_folds_{size_selected}", OPJ(output_dir, aligner, f"mapped_{genome}", f"RPM_folds_{size_selected}",
f"{contrast}", f"{contrast}_{wildcards.small_type}_RPM_folds.txt"), f"{contrast}", f"{contrast}_{wildcards.small_type}_RPM_folds.txt"),
index_col=["gene", "cosmid", "name", "small_type"], index_col=["gene", "cosmid", "name", "small_type"],
usecols=["gene", "cosmid", "name", "small_type", "mean_log2_RPM_fold"])["mean_log2_RPM_fold"] for contrast in IP_CONTRASTS}).to_csv( usecols=["gene", "cosmid", "name", "small_type", "mean_log2_RPM_fold"])["mean_log2_RPM_fold"] for contrast in DT_CONTRASTS}).to_csv(
output.all_folds, sep="\t") output.all_folds, sep="\t")
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment