Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
B
bioinfo_utils
Manage
Activity
Members
Labels
Plan
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container registry
Model registry
Operate
Environments
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
GitLab community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Blaise LI
bioinfo_utils
Commits
f4ec9caf
Commit
f4ec9caf
authored
7 years ago
by
Blaise Li
Browse files
Options
Downloads
Patches
Plain Diff
Computation of translation efficiency difference.
parent
d15736bd
No related branches found
No related tags found
No related merge requests found
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
Ribo-seq/Ribo-seq.snakefile
+73
-19
73 additions, 19 deletions
Ribo-seq/Ribo-seq.snakefile
with
73 additions
and
19 deletions
Ribo-seq/Ribo-seq.snakefile
+
73
−
19
View file @
f4ec9caf
...
@@ -192,7 +192,7 @@ TE_LIBS = list(config["transcriptome_TPM"].keys())
...
@@ -192,7 +192,7 @@ TE_LIBS = list(config["transcriptome_TPM"].keys())
series_dict = config["series"]
series_dict = config["series"]
SERIES_TYPES = list(series_dict.keys())
SERIES_TYPES = list(series_dict.keys())
genotype_series = series_dict["genotype_series"]
genotype_series = series_dict["genotype_series"]
ip
_series = series_dict["
ip
_series"]
dt
_series = series_dict["
dt
_series"]
merged_series = merge_with(concat_lists, *series_dict.values())
merged_series = merge_with(concat_lists, *series_dict.values())
ALL_SERIES = list(merged_series.keys())
ALL_SERIES = list(merged_series.keys())
#all_libs_in series = concat_lists(merge_with(concat_lists, *series_dict.values()).values())
#all_libs_in series = concat_lists(merge_with(concat_lists, *series_dict.values()).values())
...
@@ -203,17 +203,17 @@ msg = "\n".join([
...
@@ -203,17 +203,17 @@ msg = "\n".join([
"Contrasts:"
"Contrasts:"
", ".join([f"({cond}, {ref})" for (cond, ref) in DE_COND_PAIRS])])
", ".join([f"({cond}, {ref})" for (cond, ref) in DE_COND_PAIRS])])
assert all([cond in LIBS and ref in LIBS for (cond, ref) in DE_COND_PAIRS]), msg
assert all([cond in LIBS and ref in LIBS for (cond, ref) in DE_COND_PAIRS]), msg
IP
_COND_PAIRS = config["
ip
_cond_pairs"]
DT
_COND_PAIRS = config["
dt
_cond_pairs"]
msg = "\n".join([
msg = "\n".join([
"Some contrats do not use known library names.",
"Some contrats do not use known library names.",
"Contrasts:"
"Contrasts:"
", ".join([f"({cond}, {ref})" for (cond, ref) in
IP
_COND_PAIRS])])
", ".join([f"({cond}, {ref})" for (cond, ref) in
DT
_COND_PAIRS])])
assert all([cond in LIBS and ref in LIBS for (cond, ref) in
IP
_COND_PAIRS]), ""
assert all([cond in LIBS and ref in LIBS for (cond, ref) in
DT
_COND_PAIRS]), ""
COND_PAIRS = DE_COND_PAIRS +
IP
_COND_PAIRS
COND_PAIRS = DE_COND_PAIRS +
DT
_COND_PAIRS
DE_CONTRASTS = [f"{cond1}_vs_{cond2}" for [cond1, cond2] in DE_COND_PAIRS]
DE_CONTRASTS = [f"{cond1}_vs_{cond2}" for [cond1, cond2] in DE_COND_PAIRS]
IP
_CONTRASTS = [f"{cond1}_vs_{cond2}" for [cond1, cond2] in
IP
_COND_PAIRS]
DT
_CONTRASTS = [f"{cond1}_vs_{cond2}" for [cond1, cond2] in
DT
_COND_PAIRS]
contrasts_dict = {"de" : DE_CONTRASTS, "
ip
" :
IP
_CONTRASTS}
contrasts_dict = {"de" : DE_CONTRASTS, "
dt
" :
DT
_CONTRASTS}
CONTRASTS = DE_CONTRASTS +
IP
_CONTRASTS
CONTRASTS = DE_CONTRASTS +
DT
_CONTRASTS
CONTRAST2PAIR = dict(zip(CONTRASTS, COND_PAIRS))
CONTRAST2PAIR = dict(zip(CONTRASTS, COND_PAIRS))
MIN_LEN = config["min_len"]
MIN_LEN = config["min_len"]
MAX_LEN = config["max_len"]
MAX_LEN = config["max_len"]
...
@@ -441,26 +441,27 @@ counts_files = [
...
@@ -441,26 +441,27 @@ counts_files = [
meta_profiles = [
meta_profiles = [
#expand(OPJ(local_annot_dir, "transcripts_{type_set}", "merged_isolated_{min_dist}.bed"), type_set=["all", "protein_coding", "protein_coding_TE"], min_dist="0 5 10 25 50 100 250 500 1000 2500 5000 10000".split()),
#expand(OPJ(local_annot_dir, "transcripts_{type_set}", "merged_isolated_{min_dist}.bed"), type_set=["all", "protein_coding", "protein_coding_TE"], min_dist="0 5 10 25 50 100 250 500 1000 2500 5000 10000".split()),
#expand(OPJ(local_annot_dir, "transcripts_{type_set}", "merged_isolated_{min_dist}_{biotype}_min_{min_len}.bed"), type_set=["all", "protein_coding", "protein_coding_TE"], min_dist="0 5 10 25 50 100 250 500 1000 2500 5000 10000".split(), biotype=["protein_coding"], min_len=[str(META_MIN_LEN)]),
#expand(OPJ(local_annot_dir, "transcripts_{type_set}", "merged_isolated_{min_dist}_{biotype}_min_{min_len}.bed"), type_set=["all", "protein_coding", "protein_coding_TE"], min_dist="0 5 10 25 50 100 250 500 1000 2500 5000 10000".split(), biotype=["protein_coding"], min_len=[str(META_MIN_LEN)]),
# TODO: check how to adapt to dt_series instead of ip_series
expand(
expand(
OPJ(output_dir, "figures", "mean_meta_profiles_meta_scale_{meta_scale}",
OPJ(output_dir, "figures", "mean_meta_profiles_meta_scale_{meta_scale}",
"{read_type}_by_{norm}_{orientation}_on_{type_set}_merged_isolated_{min_dist}_{biotype}_min_{min_len}_{series_type}_{series}_meta_profile.{fig_format}"),
"{read_type}_by_{norm}_{orientation}_on_{type_set}_merged_isolated_{min_dist}_{biotype}_min_{min_len}_{series_type}_{series}_meta_profile.{fig_format}"),
meta_scale= [str(META_SCALE)], read_type=[size_selected, "RPF"],
meta_scale= [str(META_SCALE)], read_type=[size_selected, "RPF"],
norm=SIZE_FACTORS, orientation=["all"], type_set=["protein_coding_TE"], min_dist=[str(MIN_DIST)],
norm=SIZE_FACTORS, orientation=["all"], type_set=["protein_coding_TE"], min_dist=[str(MIN_DIST)],
biotype=["protein_coding", "DNA_transposons_rmsk", "RNA_transposons_rmsk"], min_len=[str(META_MIN_LEN)],
biotype=["protein_coding", "DNA_transposons_rmsk", "RNA_transposons_rmsk"], min_len=[str(META_MIN_LEN)],
series_type=["
ip
_series"], series=list(
ip
_series.keys()), fig_format=FIG_FORMATS),
series_type=["
dt
_series"], series=list(
dt
_series.keys()), fig_format=FIG_FORMATS),
expand(
expand(
OPJ(output_dir, "figures", "mean_meta_profiles_meta_scale_{meta_scale}",
OPJ(output_dir, "figures", "mean_meta_profiles_meta_scale_{meta_scale}",
"{read_type}_by_{norm}_{orientation}_on_{type_set}_merged_isolated_{min_dist}_{biotype}_min_{min_len}_{series_type}_{series}_meta_profile.{fig_format}"),
"{read_type}_by_{norm}_{orientation}_on_{type_set}_merged_isolated_{min_dist}_{biotype}_min_{min_len}_{series_type}_{series}_meta_profile.{fig_format}"),
meta_scale= [str(META_SCALE)], read_type=[size_selected, "RPF"],
meta_scale= [str(META_SCALE)], read_type=[size_selected, "RPF"],
norm=SIZE_FACTORS, orientation=["all"], type_set=["protein_coding"], min_dist=[str(MIN_DIST)],
norm=SIZE_FACTORS, orientation=["all"], type_set=["protein_coding"], min_dist=[str(MIN_DIST)],
biotype=["protein_coding"], min_len=[str(META_MIN_LEN)],
biotype=["protein_coding"], min_len=[str(META_MIN_LEN)],
series_type=["
ip
_series"], series=list(
ip
_series.keys()), fig_format=FIG_FORMATS),
series_type=["
dt
_series"], series=list(
dt
_series.keys()), fig_format=FIG_FORMATS),
expand(
expand(
OPJ(output_dir, "figures", "mean_meta_profiles_meta_scale_{meta_scale}",
OPJ(output_dir, "figures", "mean_meta_profiles_meta_scale_{meta_scale}",
"{read_type}_by_{norm}_{orientation}_on_{type_set}_merged_isolated_{min_dist}_{id_list}_{series_type}_{series}_meta_profile.{fig_format}"),
"{read_type}_by_{norm}_{orientation}_on_{type_set}_merged_isolated_{min_dist}_{id_list}_{series_type}_{series}_meta_profile.{fig_format}"),
meta_scale=[str(META_SCALE)], read_type=[size_selected, "RPF"],
meta_scale=[str(META_SCALE)], read_type=[size_selected, "RPF"],
norm=SIZE_FACTORS, orientation=["all"], type_set=["protein_coding_TE"], min_dist=["0"], id_list=GENE_LISTS,
norm=SIZE_FACTORS, orientation=["all"], type_set=["protein_coding_TE"], min_dist=["0"], id_list=GENE_LISTS,
series_type=["
ip
_series"], series=list(
ip
_series.keys()), fig_format=FIG_FORMATS),
series_type=["
dt
_series"], series=list(
dt
_series.keys()), fig_format=FIG_FORMATS),
## TODO: Resolve issue with bedtools
## TODO: Resolve issue with bedtools
# expand(
# expand(
# OPJ(output_dir, "figures", "{lib}_{rep}",
# OPJ(output_dir, "figures", "{lib}_{rep}",
...
@@ -494,7 +495,9 @@ read_graphs = [
...
@@ -494,7 +495,9 @@ read_graphs = [
lib=LIBS, rep=REPS, fig_format=FIG_FORMATS),
lib=LIBS, rep=REPS, fig_format=FIG_FORMATS),
]
]
if contrasts_dict["ip"]:
# TODO: check what can be done with "dt" (differential translation, a.k.a. translation efficiency difference)
# What should be done with small_type, fold_type, etc.
if contrasts_dict["dt"]:
fold_heatmaps = expand(
fold_heatmaps = expand(
OPJ(output_dir, "figures", "fold_heatmaps", "{small_type}_{fold_type}_heatmap.{fig_format}"),
OPJ(output_dir, "figures", "fold_heatmaps", "{small_type}_{fold_type}_heatmap.{fig_format}"),
small_type=["pisimi", "prot_si"], fold_type=["mean_log2_RPM_fold", "log2FoldChange"], fig_format=FIG_FORMATS)
small_type=["pisimi", "prot_si"], fold_type=["mean_log2_RPM_fold", "log2FoldChange"], fig_format=FIG_FORMATS)
...
@@ -512,9 +515,9 @@ else:
...
@@ -512,9 +515,9 @@ else:
exploratory_graphs = [
exploratory_graphs = [
fold_heatmaps,
fold_heatmaps,
# Large figures, not very readable
# Large figures, not very readable
expand(
#
expand(
OPJ(output_dir, aligner, f"mapped_{genome}", f"deseq2_{size_selected}", "{contrast}", "{contrast}_{small_type}_pairplots.{fig_format}"),
#
OPJ(output_dir, aligner, f"mapped_{genome}", f"deseq2_{size_selected}", "{contrast}", "{contrast}_{small_type}_pairplots.{fig_format}"),
contrast=DE_CONTRASTS, small_type=DE_TYPES, fig_format=FIG_FORMATS),
#
contrast=DE_CONTRASTS, small_type=DE_TYPES, fig_format=FIG_FORMATS),
## TODO: debug PCA
## TODO: debug PCA
#expand(
#expand(
# OPJ(output_dir, "figures", "{small_type}_{standard}_PCA.{fig_format}"),
# OPJ(output_dir, "figures", "{small_type}_{standard}_PCA.{fig_format}"),
...
@@ -542,7 +545,7 @@ de_fold_boxplots = expand(
...
@@ -542,7 +545,7 @@ de_fold_boxplots = expand(
ip_fold_boxplots_by_contrast = expand(
ip_fold_boxplots_by_contrast = expand(
OPJ(output_dir, "figures", "{contrast}",
OPJ(output_dir, "figures", "{contrast}",
"{contrast}_{small_type}_{fold_type}_{gene_list}_boxplots.{fig_format}"),
"{contrast}_{small_type}_{fold_type}_{gene_list}_boxplots.{fig_format}"),
contrast=
IP
_CONTRASTS, small_type=IP_TYPES, fold_type=["mean_log2_RPM_fold"],
contrast=
DT
_CONTRASTS, small_type=IP_TYPES, fold_type=["mean_log2_RPM_fold"],
gene_list=["all_gene_lists"], fig_format=FIG_FORMATS)
gene_list=["all_gene_lists"], fig_format=FIG_FORMATS)
fold_boxplots = [de_fold_boxplots, ip_fold_boxplots_by_contrast, ip_fold_boxplots]
fold_boxplots = [de_fold_boxplots, ip_fold_boxplots_by_contrast, ip_fold_boxplots]
...
@@ -580,13 +583,19 @@ rule all:
...
@@ -580,13 +583,19 @@ rule all:
"deseq2_{read_type}",
"deseq2_{read_type}",
"{contrast}", "{orientation}_{biotype}", "{contrast}_counts_and_res.txt"),
"{contrast}", "{orientation}_{biotype}", "{contrast}_counts_and_res.txt"),
read_type=["RPF"], contrast=DE_CONTRASTS, orientation=ORIENTATIONS, biotype=["protein_coding"]),
read_type=["RPF"], contrast=DE_CONTRASTS, orientation=ORIENTATIONS, biotype=["protein_coding"]),
expand(
OPJ(
counts_dir,
"diff_TE_{read_type}",
"{contrast}", "{orientation}_{biotype}", "{contrast}_diff_TE.txt"),
read_type=["RPF"], contrast=DT_CONTRASTS, orientation=["fwd"], biotype=["alltypes"]),
# rule future_all:
# rule future_all:
# meta_profiles,
# meta_profiles,
# read_graphs,
# read_graphs,
# OPJ(output_dir, aligner, f"mapped_{genome}", f"RPM_folds_{size_selected}", "all", "pisimi_mean_log2_RPM_fold.txt"),
# OPJ(output_dir, aligner, f"mapped_{genome}", f"RPM_folds_{size_selected}", "all", "pisimi_mean_log2_RPM_fold.txt"),
# expand(OPJ(output_dir, aligner, f"mapped_{genome}", f"deseq2_{size_selected}", "all", "pisimi_{fold_type}.txt"), fold_type=["log2FoldChange"]),
# expand(OPJ(output_dir, aligner, f"mapped_{genome}", f"deseq2_{size_selected}", "all", "pisimi_{fold_type}.txt"), fold_type=["log2FoldChange"]),
# #expand(OPJ(output_dir, aligner, f"mapped_{genome}", "RPM_folds_%s" % size_selected, "{contrast}", "{contrast}_{small_type}_RPM_folds.txt"), contrast=
IP
_CONTRASTS, small_type=DE_TYPES),
# #expand(OPJ(output_dir, aligner, f"mapped_{genome}", "RPM_folds_%s" % size_selected, "{contrast}", "{contrast}_{small_type}_RPM_folds.txt"), contrast=
DT
_CONTRASTS, small_type=DE_TYPES),
# exploratory_graphs,
# exploratory_graphs,
# fold_boxplots,
# fold_boxplots,
# #expand(OPJ(output_dir, "figures", "{small_type}_unit_clustermap.{fig_format}"), small_type=SMALL_TYPES, fig_format=FIG_FORMATS),
# #expand(OPJ(output_dir, "figures", "{small_type}_unit_clustermap.{fig_format}"), small_type=SMALL_TYPES, fig_format=FIG_FORMATS),
...
@@ -1326,6 +1335,51 @@ rule compute_translation_efficiency:
...
@@ -1326,6 +1335,51 @@ rule compute_translation_efficiency:
tpm_and_teff.to_csv(output.teff_file, sep="\t", na_rep="NA")
tpm_and_teff.to_csv(output.teff_file, sep="\t", na_rep="NA")
rule gather_teff:
input:
teff_files = expand(OPJ(
counts_dir,
"{lib}_mean_{{read_type}}_on_%s" % genome,
"{lib}_{{biotype}}_{{orientation}}_TE.txt"), lib=TE_LIBS),
#tags_table = rules.associate_biotype.output.tags_table,
output:
teff_file = OPJ(
counts_dir,
"all_{read_type}_on_%s" % genome, "{biotype}_{orientation}_TE.txt"),
run:
teff_data = pd.concat(
[pd.read_table(teff_file, index_col="gene", usecols=["gene", "TE"]) for teff_file in input.teff_files],
axis=1)
teff_data.columns = [f"{lib}_TE" for lib in TE_LIBS]
teff_data.index.name = "gene"
# Converting gene IDs
######################
with open(OPJ(convert_dir, "wormid2cosmid.pickle"), "rb") as dict_file:
teff_data = teff_data.assign(cosmid=teff_data.apply(
column_converter(load(dict_file)), axis=1))
with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file:
teff_data = teff_data.assign(name=teff_data.apply(
column_converter(load(dict_file)), axis=1))
#teff_data = add_tags_column(teff_data, input.tags_table, "biotype")
teff_data.to_csv(output.teff_file, sep="\t", na_rep="NA")
rule compute_translation_efficiency_difference:
input:
teff_file = rules.gather_teff.output.teff_file,
output:
diff_teff_file = OPJ(counts_dir, "diff_TE_{read_type}",
"{contrast}", "{orientation}_{biotype}", "{contrast}_diff_TE.txt"),
run:
teff_data = pd.read_table(input.teff_file, index_col="gene")
(cond, ref) = CONTRAST2PAIR[wildcards.contrast]
# TE_mut = (ribo_tpm_mut / transcritome_tpm_mut)
# TE_WT = (ribo_tpm_WT / transcritome_tpm_WT)
# log2(TE_mut / TE_WT) = log2(TE_mut) - log2(TE_WT)
diff_teff = teff_data[f"{cond}_TE"] - teff_data[f"{ref}_TE"]
diff_teff.to_csv(output.diff_teff_file, sep="\t", na_rep="NA")
rule compute_RPM:
rule compute_RPM:
input:
input:
counts_table = source_counts,
counts_table = source_counts,
...
@@ -1424,7 +1478,7 @@ rule gather_RPM_folds:
...
@@ -1424,7 +1478,7 @@ rule gather_RPM_folds:
input:
input:
fold_results = expand(OPJ(
fold_results = expand(OPJ(
output_dir, aligner, f"mapped_{genome}", f"RPM_folds_{size_selected}",
output_dir, aligner, f"mapped_{genome}", f"RPM_folds_{size_selected}",
"{contrast}", "{contrast}_{{small_type}}_RPM_folds.txt"), contrast=
IP
_CONTRASTS),
"{contrast}", "{contrast}_{{small_type}}_RPM_folds.txt"), contrast=
DT
_CONTRASTS),
output:
output:
all_folds = OPJ(
all_folds = OPJ(
output_dir, aligner, f"mapped_{genome}", f"RPM_folds_{size_selected}", "all", "{small_type}_mean_log2_RPM_fold.txt"),
output_dir, aligner, f"mapped_{genome}", f"RPM_folds_{size_selected}", "all", "{small_type}_mean_log2_RPM_fold.txt"),
...
@@ -1435,7 +1489,7 @@ rule gather_RPM_folds:
...
@@ -1435,7 +1489,7 @@ rule gather_RPM_folds:
OPJ(output_dir, aligner, f"mapped_{genome}", f"RPM_folds_{size_selected}",
OPJ(output_dir, aligner, f"mapped_{genome}", f"RPM_folds_{size_selected}",
f"{contrast}", f"{contrast}_{wildcards.small_type}_RPM_folds.txt"),
f"{contrast}", f"{contrast}_{wildcards.small_type}_RPM_folds.txt"),
index_col=["gene", "cosmid", "name", "small_type"],
index_col=["gene", "cosmid", "name", "small_type"],
usecols=["gene", "cosmid", "name", "small_type", "mean_log2_RPM_fold"])["mean_log2_RPM_fold"] for contrast in
IP
_CONTRASTS}).to_csv(
usecols=["gene", "cosmid", "name", "small_type", "mean_log2_RPM_fold"])["mean_log2_RPM_fold"] for contrast in
DT
_CONTRASTS}).to_csv(
output.all_folds, sep="\t")
output.all_folds, sep="\t")
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment