# Copyright (C) 2020-2023 Blaise Li
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
"""
Snakefile to analyse Degradome-seq data.
"""
#TODO: compute TPM, and compare with RNA-seq to have "degradation efficiency" (like TE in Ribo-seq pipeline)
#TODO: bigwig files
import sys
major, minor = sys.version_info[:2]
if major < 3 or (major == 3 and minor < 6):
sys.exit("Need at least python 3.6\n")
import os
import warnings
OPJ = os.path.join
from distutils.util import strtobool
from glob import glob
from pickle import load
from sys import stderr
from yaml import safe_load as yload
import numpy as np
import pandas as pd
from itertools import chain, repeat
from functools import reduce
from operator import or_ as union
from cytoolz import valmap
from smincludes import rules as irules
from smwrappers import wrappers_dir
from idconvert import gene_ids_data_dir
from libworkflows import (
cleanup_and_backup,
column_converter,
ensure_relative,
feature_orientation2stranded,
get_chrom_sizes,
read_feature_counts,
sum_by_family,
sum_feature_counts,
wc_applied)
# http://sailfish.readthedocs.io/en/master/library_type.html
LIB_TYPE = config["lib_type"]
# key: library name
# value: dictionary
# key: replicate number
# value: path to raw data
lib2raw = config["lib2raw"]
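# A (hypothetical) example of the expected structure in the configuration:
# lib2raw:
#     WT:
#         1: /path/to/WT_rep1.fastq.gz
#         2: /path/to/WT_rep2.fastq.gz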
LIBS = list(lib2raw.keys())
# Libraries for which we have matching RNA-seq data
# so that degradation efficiency can be computed
EFF_LIBS = list(config["transcriptome_TPM"].keys())
# What type of "efficency" are we computing? Degradation efficiency.
[this_TPM, ref_TPM, eff_name] = ["Degradome_TPM", "RNA_TPM", "DE"]
REPS = config["replicates"]
CONDITIONS = [{
"lib" : lib,
"rep" : rep} for rep in REPS for lib in LIBS]
# We use this in various places so that the library order is always the same:
COND_NAMES = ["_".join((
cond["lib"],
cond["rep"])) for cond in CONDITIONS]
# COND_COLUMNS = pd.DataFrame(CONDITIONS).assign(
# cond_name=pd.Series(COND_NAMES).values).set_index("cond_name")
# Differential degradation
DD_COND_PAIRS = config.get("dd_cond_pairs", [])
DD_CONTRASTS = [f"{cond1}_vs_{cond2}" for [cond1, cond2] in DD_COND_PAIRS]
CONTRAST2PAIR = dict(zip(DD_CONTRASTS, DD_COND_PAIRS))
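# For instance, with (hypothetical) dd_cond_pairs: [[mut, WT]], we get
# DD_CONTRASTS == ["mut_vs_WT"] and CONTRAST2PAIR == {"mut_vs_WT": ["mut", "WT"]}.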
# TODO: have a distinct subset of biotypes for join_all_counts that is guaranteed to be overlap-free.
BIOTYPES = config["biotypes"]
ORIENTATIONS = ["all", "fwd", "rev"]
aligner = config["aligner"]
alignment_settings = {
"bowtie2": "",
"hisat2": "",
"crac": "-k 20 --stranded --use-x-in-cigar"}
########################
# Genome configuration #
########################
# config["genome_dict"] can be either the path to a genome configuration file
# or a dict
if isinstance(config["genome_dict"], (str, bytes)):
print(f"loading {config['genome_dict']}", file=stderr)
with open(config["genome_dict"]) as fh:
genome_dict = yload(fh)
else:
genome_dict = config["genome_dict"]
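# Keys expected in the genome configuration, as used below
# (paths and values are hypothetical):
# name: my_genome
# size: /path/to/chrom_sizes.txt
# extra_chromosomes: {}                       # optional
# db:
#     hisat2: /path/to/aligner_index
# binned: /path/to/genome_binned_10nt.bed
# annot_dir: /path/to/annotations
# convert_dir: /path/to/id_conversion_dicts   # optional
# converter: /path/to/wormid2name.pickle      # optional
# gene_lists_dir: /path/to/gene_lists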
genome = genome_dict["name"]
chrom_sizes = get_chrom_sizes(genome_dict["size"])
chrom_sizes.update(valmap(int, genome_dict.get("extra_chromosomes", {})))
genomelen = sum(chrom_sizes.values())
genome_db = genome_dict["db"][aligner]
# BED file partitioning the genome into 10 nt bins
genome_binned = genome_dict["binned"]
annot_dir = genome_dict["annot_dir"]
# What is the difference between
# OPJ(convert_dir, "wormid2name.pickle") and genome_dict["converter"]?
# /!\ gene_ids_data_dir contains more conversion dicts,
# but is not influenced by genome preparation customization,
# like splitting of miRNAs into 3p and 5p.
convert_dir = genome_dict.get("convert_dir", gene_ids_data_dir)
# For wormid2name, preferentially load the one
# that might contain custom gene names, as for split miRNAs
with open(
genome_dict.get(
"converter",
OPJ(convert_dir, "wormid2name.pickle")),
"rb") as dict_file:
wormid2name = load(dict_file)
gene_lists_dir = genome_dict["gene_lists_dir"]
avail_id_lists = set(glob(OPJ(gene_lists_dir, "*_ids.txt")))
upload_on_err = strtobool(str(config.get("upload_on_err", "True")))
#output_dir = config["output_dir"]
#workdir: config["output_dir"]
output_dir = os.path.abspath(".")
log_dir = OPJ(f"logs_{genome}")
data_dir = OPJ("data")
counter = "feature_count"
counts_dir = OPJ(aligner, f"mapped_{genome}", counter)
# Limit risks of ambiguity by requiring replicates to be numbers
# and by restricting the possible forms of some other wildcards
wildcard_constraints:
lib="|".join(LIBS),
rep="\d+",
orientation="|".join(ORIENTATIONS),
localrules: all, link_raw_data
rule all:
input:
#expand(
# OPJ(data_dir, "trimmed", "{lib}_{rep}.fastq.gz"),
# lib=LIBS, rep=REPS),
# expand(
# OPJ(aligner, f"mapped_{genome}", "{lib}_{rep}_on_%s_sorted.bam" % genome),
# lib=LIBS, rep=REPS),
# expand(
# OPJ(counts_dir,
# f"all_on_{genome}", "{biotype}_{orientation}_counts.txt"),
# biotype=BIOTYPES, orientation=ORIENTATIONS),
expand(
OPJ(counts_dir, "summaries",
"all_on_%s_{orientation}_counts.txt" % genome),
orientation=ORIENTATIONS),
# expand(
# OPJ(counts_dir, f"all_on_{genome}",
# "{biotype}_{orientation}_TPM.txt"),
# biotype=["alltypes"], orientation=ORIENTATIONS),
# expand(
# OPJ(counts_dir, "{lib}_mean_on_%s" % genome,
# "{lib}_mean_{biotype}_{orientation}_TPM.txt"),
# lib=LIBS, biotype=["alltypes"], orientation=ORIENTATIONS),
# degradation_efficiency
# expand(
# OPJ(counts_dir, f"all_on_{genome}", "{biotype}_{orientation}_%s.txt" % eff_name),
# lib=LIBS, biotype=["alltypes"], orientation=ORIENTATIONS),
expand(
OPJ(counts_dir, "diff_%s" % eff_name, "{contrast}", "{orientation}_{biotype}", "{contrast}_diff_%s.txt" % eff_name),
contrast=DD_CONTRASTS, orientation=["fwd"], biotype=["alltypes", "protein_coding"]),
include: ensure_relative(irules["link_raw_data"], workflow.basedir)
rule trim_reads:
input:
raw = rules.link_raw_data.output.link,
output:
trimmed = OPJ(data_dir, "trimmed", "{lib}_{rep}.fastq.gz"),
methods = OPJ(data_dir, "trimmed", "methods", "{lib}_{rep}_trim_reads_methods.txt"),
params:
trim3 = 4,
shell:
"""
seqtk trimfq -b {params.trim3} {input.raw} | gzip > {output.trimmed}
echo "# {rule}" > {output.methods}
seqtk_version=$(seqtk 2>&1 | grep "Version" | cut -d" " -f2)
echo "Raw reads were trimmed at their low quality 3' end using seqtk (version ${{seqtk_version}}) as follows: 'seqtk trimfq -b {params.trim3}'" >> {output.methods}
"""
rule map_on_genome:
input:
fastq = rules.trim_reads.output.trimmed,
methods = rules.trim_reads.output.methods,
output:
# temp because it uses a lot of space
sam = temp(OPJ(aligner, f"mapped_{genome}", "{lib}_{rep}_on_%s.sam" % genome)),
nomap_fastq = OPJ(aligner, f"not_mapped_{genome}", "{lib}_{rep}_unmapped_on_%s.fastq.gz" % genome),
methods = OPJ(aligner, f"mapped_{genome}", "methods", "{lib}_{rep}_map_on_genome_methods.txt"),
params:
aligner = aligner,
index = genome_db,
settings = alignment_settings[aligner],
# For auto-methods
read_name = "trimmed",
message:
"Mapping {wildcards.lib}_{wildcards.rep} on %s." % genome
log:
log = OPJ(log_dir, "map_on_genome_{lib}_{rep}.log"),
err = OPJ(log_dir, "map_on_genome_{lib}_{rep}.err")
resources:
io=5
threads:
8
wrapper:
f"file://{wrappers_dir}/map_on_genome"
rule sam2indexedbam:
input:
sam = rules.map_on_genome.output.sam,
output:
sorted_bam = protected(OPJ(aligner, f"mapped_{genome}", "{lib}_{rep}_on_%s_sorted.bam" % genome)),
index = protected(OPJ(aligner, f"mapped_{genome}", "{lib}_{rep}_on_%s_sorted.bam.bai" % genome)),
message:
"Sorting and indexing sam file for {wildcards.lib}_{wildcards.rep}."
log:
log = OPJ(log_dir, "sam2indexedbam_{lib}_{rep}.log"),
err = OPJ(log_dir, "sam2indexedbam_{lib}_{rep}.err"),
threads:
8
resources:
mem_mb=4100
wrapper:
f"file://{wrappers_dir}/sam2indexedbam"
def biotype2annot(wildcards):
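    """Return the path to the annotation file to use for *wildcards.biotype*.

    Repeat-family biotypes use the same annotations as the corresponding
    repeat biotype: "<class>_rmsk_families" maps to "<class>_rmsk.gtf"
    (the 9 characters stripped below are the "_families" suffix).
    """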
if wildcards.biotype.endswith("_rmsk_families"):
biotype = wildcards.biotype[:-9]
else:
biotype = wildcards.biotype
return OPJ(annot_dir, f"{biotype}.gtf")
rule feature_count_reads:
input:
sorted_bam = rules.sam2indexedbam.output.sorted_bam,
index = rules.sam2indexedbam.output.index,
methods = rules.map_on_genome.output.methods,
output:
counts = OPJ(counts_dir, "{lib}_{rep}_on_%s" % genome, "{lib}_{rep}_{biotype}_{orientation}_counts.txt"),
counts_converted = OPJ(counts_dir, "{lib}_{rep}_on_%s" % genome, "{lib}_{rep}_{biotype}_{orientation}_counts_gene_names.txt"),
methods = OPJ(counts_dir, "{lib}_{rep}_on_%s" % genome, "methods", "{lib}_{rep}_{biotype}_{orientation}_feature_count_reads_methods.txt"),
params:
stranded = feature_orientation2stranded(LIB_TYPE),
annot = biotype2annot,
# pickled dictionary that associates gene ids to gene names
message:
"Counting {wildcards.orientation} {wildcards.biotype} reads for {wildcards.lib}_{wildcards.rep} with featureCounts."
benchmark:
OPJ(log_dir, "feature_count_reads", "{lib}_{rep}_{biotype}_{orientation}_benchmark.txt")
log:
log = OPJ(log_dir, "feature_count_reads", "{lib}_{rep}_{biotype}_{orientation}.log"),
err = OPJ(log_dir, "feature_count_reads", "{lib}_{rep}_{biotype}_{orientation}.err"),
shell:
"""
tmpdir=$(mktemp -dt "feature_{wildcards.lib}_{wildcards.rep}_{wildcards.biotype}_{wildcards.orientation}.XXXXXXXXXX")
cmd="featureCounts -a {params.annot} -o {output.counts} -t transcript -g "gene_id" -O -M --primary -s {params.stranded} --fracOverlap 0 --tmpDir ${{tmpdir}} {input.sorted_bam}"
featureCounts -v 2> {log.log}
echo ${{cmd}} 1>> {log.log}
eval ${{cmd}} 1>> {log.log} 2> {log.err} || error_exit "featureCounts failed"
rm -rf ${{tmpdir}}
cat {output.counts} | wormid2name > {output.counts_converted}
cp {input.methods} {output.methods}
echo "# {rule}" >> {output.methods}
featurecounts_version=$(featureCounts -v 2>&1 | grep featureCounts | cut -d" " -f2)
echo "Mapping on genes were counted against transcript annotations using featureCounts (version ${{featurecounts_version}}) as follows: ${{cmd}}" >> {output.methods}
"""
rule summarize_counts:
"""For a given library, write a summary of the read counts for the various biotypes."""
input:
biotype_counts_files = expand(
OPJ(counts_dir, "{{lib}}_{{rep}}_on_%s" % genome, "{{lib}}_{{rep}}_{biotype}_{{orientation}}_counts.txt"),
biotype=BIOTYPES),
output:
summary = OPJ(counts_dir, "summaries", "{lib}_{rep}_on_%s_{orientation}_counts.txt" % genome)
run:
sum_counter = sum_feature_counts
with open(output.summary, "w") as summary_file:
header = "\t".join(BIOTYPES)
summary_file.write("%s\n" % header)
sums = "\t".join((str(sum_counter(counts_file)) for counts_file in input.biotype_counts_files))
summary_file.write("%s\n" % sums)
rule gather_read_counts_summaries:
"""Gather read count summaries across libraries: lib_rep -> all."""
input:
summary_tables = expand(OPJ(
counts_dir, "summaries",
"{name}_on_%s_{{orientation}}_counts.txt" % genome), name=COND_NAMES),
output:
summary_table = OPJ(
counts_dir, "summaries",
"all_on_%s_{orientation}_counts.txt" % genome),
run:
summary_files = [OPJ(
counts_dir, "summaries",
f"{cond_name}_on_{genome}_{wildcards.orientation}_counts.txt") for cond_name in COND_NAMES]
try:
summaries = pd.concat((pd.read_table(summary_file).T.astype(int) for summary_file in summary_files), axis=1)
except ValueError:
warnings.warn("Some data may be missing. Using floats instead of ints.\n")
summaries = pd.concat((pd.read_table(summary_file).T.astype(float) for summary_file in summary_files), axis=1)
summaries.columns = COND_NAMES
summaries.to_csv(output.summary_table, na_rep="NA", sep="\t")
rule gather_counts:
"""For a given biotype, gather counts from all libraries in one table."""
input:
counts_tables = expand(
OPJ(counts_dir, "{lib}_{rep}_on_%s" % genome,
"{lib}_{rep}_{{biotype}}_{{orientation}}_counts.txt"),
lib=LIBS, rep=REPS),
output:
counts_table = OPJ(
counts_dir, f"all_on_{genome}", "{biotype}_{orientation}_counts.txt"),
wildcard_constraints:
# Avoid ambiguity with join_all_counts
biotype = "|".join(BIOTYPES)
run:
# Gathering the counts data
############################
counts_files = [OPJ(
counts_dir,
f"{cond_name}_on_{genome}",
f"{cond_name}_{wildcards.biotype}_{wildcards.orientation}_counts.txt") for cond_name in COND_NAMES]
#"{biotype}_{orientation}_counts.txt".format(biotype=wildcards.biotype, orientation=wildcards.orientation)) for cond_name in COND_NAMES)
counts_data = pd.concat(
map(read_feature_counts, counts_files),
axis=1).fillna(0).astype(int)
counts_data.columns = COND_NAMES
# Simple_repeat|Simple_repeat|(TTTTTTG)n:1
# Simple_repeat|Simple_repeat|(TTTTTTG)n:2
# Simple_repeat|Simple_repeat|(TTTTTTG)n:3
# Simple_repeat|Simple_repeat|(TTTTTTG)n:4
# -> Simple_repeat|Simple_repeat|(TTTTTTG)n
if wildcards.biotype.endswith("_rmsk_families"):
counts_data = sum_by_family(counts_data)
counts_data.index.names = ["gene"]
counts_data.to_csv(output.counts_table, na_rep="NA", sep="\t")
def make_tag_association(dfs, tag):
"""Associates a tag "tag" to the union of the indices of dataframes *dfs*."""
idx = reduce(union, (df.index for df in dfs))
return pd.DataFrame(list(zip(idx, repeat(tag)))).set_index(0)
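# Example for make_tag_association (hypothetical data): if df1 is indexed by
# ["g1", "g2"] and df2 by ["g2", "g3"], make_tag_association((df1, df2), "mytag")
# returns a single-column DataFrame mapping each of g1, g2 and g3 to "mytag".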
rule associate_biotype:
"""This rule uses the read count matrices to associate a biotype to each gene identifier."""
input:
counts_tables = expand(OPJ(
counts_dir,
"all_on_%s" % genome,
"{biotype}_all_counts.txt"), biotype=[b_name for b_name in BIOTYPES if not b_name.startswith("protein_coding_")]),
output:
tags_table = OPJ(counts_dir, "all_on_%s" % genome, "id2tags.txt"),
run:
biotype2tags_tables = {}
for biotype in BIOTYPES:
if biotype.startswith("protein_coding_"):
continue
biotype2tags_tables[biotype] = make_tag_association(
(pd.read_table(OPJ(
counts_dir,
f"all_on_{genome}",
f"{biotype}_all_counts.txt"), index_col="gene"),), biotype)
tags_table = pd.concat(chain(biotype2tags_tables.values()))
tags_table.index.names = ["gene"]
tags_table.columns = ["biotype"]
tags_table.to_csv(output.tags_table, sep="\t")
def add_tags_column(data, tags_table, tag_name):
"""Adds a column *tag_name* to *data* based on the DataFrame *tag_table*
associating tags to row names."""
# Why "inner" ?
df = pd.concat((data, pd.read_table(tags_table, index_col=0)), join="inner", axis=1)
df.columns = (*data.columns, tag_name)
return df
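# Example for add_tags_column (hypothetical names): adding a "biotype" column
# to a TPM table based on the id2tags.txt table keeps only the genes present
# in both tables (this is what the "inner" join does).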
rule join_all_counts:
"""Concatenate counts for all biotypes into "alltypes"."""
input:
counts_tables = expand(
OPJ(counts_dir,
f"all_on_{genome}", "{biotype}_{{orientation}}_counts.txt"),
biotype=BIOTYPES),
output:
counts_table = OPJ(
counts_dir,
f"all_on_{genome}", "alltypes_{orientation}_counts.txt"),
run:
counts_data = pd.concat((pd.read_table(table, index_col="gene") for table in input.counts_tables))
assert len(counts_data.index.unique()) == len(counts_data.index), "Some genes appear several times in the counts table."
counts_data.index.names = ["gene"]
counts_data.to_csv(output.counts_table, sep="\t")
@wc_applied
def source_counts(wildcards):
"""Determines from which rule the gathered counts should be sourced."""
if wildcards.biotype == "alltypes":
return rules.join_all_counts.output.counts_table
else:
# "Directly" from the counts gathered across libraries
return rules.gather_counts.output.counts_table
rule compute_RPK:
"""For a given biotype, compute the corresponding RPK value (reads per kilobase)."""
input:
counts_data = source_counts,
#counts_data = rules.gather_counts.output.counts_table,
#counts_data = OPJ(aligner, f"mapped_{genome}", "feature_count",
# f"all_on_{genome}", "{biotype}_{orientation}_counts.txt"),
output:
rpk_file = OPJ(counts_dir,
f"all_on_{genome}", "{biotype}_{orientation}_RPK.txt"),
params:
feature_lengths_file = OPJ(annot_dir, "union_exon_lengths.txt"),
# run:
# counts_data = pd.read_table(input.counts_data, index_col="gene")
# feature_lengths = pd.read_table(params.feature_lengths_file, index_col="gene")
# common = counts_data.index.intersection(feature_lengths.index)
# rpk = 1000 * counts_data.loc[common].div(feature_lengths.loc[common]["union_exon_len"], axis="index")
# rpk.to_csv(output.rpk_file, sep="\t")
wrapper:
f"file://{wrappers_dir}/compute_RPK"
# An alternative would be to compute TPM using the total number of mappers
# divided by the genome length (rather than the sum of RPK across biotypes,
# since some mappers may not be counted), but mappers / genome length is not
# really comparable to RPK. The summing therefore needs to be done on "alltypes".
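# As sketched in the commented-out run section of compute_TPM below, for a
# gene g: TPM_g = 1e6 * RPK_g / sum(RPK); the denominator must cover all
# counted features for this to be a true per-million normalization.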
rule compute_TPM:
"""For a given biotype, compute the corresponding TPM value (reads per kilobase per million mappers)."""
input:
rpk_file = rules.compute_RPK.output.rpk_file
output:
tpm_file = OPJ(counts_dir,
f"all_on_{genome}", "{biotype}_{orientation}_TPM.txt"),
# The sum must be done over all counted features
wildcard_constraints:
biotype = "|".join(["alltypes", "protein_coding"]),
# run:
# rpk = pd.read_table(input.rpk_file, index_col="gene")
# tpm = 1000000 * rpk / rpk.sum()
# tpm.to_csv(output.tpm_file, sep="\t")
wrapper:
f"file://{wrappers_dir}/compute_TPM"
# TODO: Is it better to compute per-replicate folds and average them, rather than the fold of the means (as done here)?
# Here, the matching between Degradome-seq and RNA-seq replicates is arbitrary.
# We don't even always have the same number of replicates,
# so we work with means across replicates.
rule compute_mean_TPM:
input:
        all_tpm_file = rules.compute_TPM.output.tpm_file
output:
tpm_file = OPJ(counts_dir,
"{lib}_mean_on_%s" % genome, "{lib}_mean_{biotype}_{orientation}_TPM.txt"),
run:
        tpm = pd.read_table(
            input.all_tpm_file, index_col="gene",
            usecols=["gene", *[f"{wildcards.lib}_{rep}" for rep in REPS]])
tpm_mean = tpm.mean(axis=1)
# This is a Series
tpm_mean.name = wildcards.lib
tpm_mean.to_csv(output.tpm_file, sep="\t", header=True)
# TODO
rule compute_efficiency:
"""This is actually a TPM fold between library types."""
input:
mean_tpm = rules.compute_mean_TPM.output.tpm_file,
tags_table = rules.associate_biotype.output.tags_table,
output:
eff_file = OPJ(counts_dir,
"{lib}_mean_on_%s" % genome, "{lib}_{biotype}_{orientation}_%s.txt" % eff_name),
run:
mean_tpm = pd.read_table(input.mean_tpm, index_col="gene")
transcriptome_tpm = pd.read_table(config["transcriptome_TPM"][wildcards.lib], index_col="gene")
transcriptome_tpm.columns = mean_tpm.columns
lfc = np.log2((mean_tpm + 1) / (transcriptome_tpm + 1))
        # By analogy with translation efficiency (TE) in the Ribo-seq pipeline:
        # DE_mut = (degradome_tpm_mut / transcriptome_tpm_mut)
        # DE_WT = (degradome_tpm_WT / transcriptome_tpm_WT)
        # log2(DE_mut / DE_WT) = log2(DE_mut) - log2(DE_WT)
tpm_and_eff = pd.concat([mean_tpm, transcriptome_tpm, lfc], axis=1)
tpm_and_eff.columns = [this_TPM, ref_TPM, eff_name]
tpm_and_eff.index.name = "gene"
# Converting gene IDs
######################
with open(OPJ(convert_dir, "wormid2cosmid.pickle"), "rb") as dict_file:
tpm_and_eff = tpm_and_eff.assign(cosmid=tpm_and_eff.apply(
column_converter(load(dict_file)), axis=1))
#with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file:
# tpm_and_eff = tpm_and_eff.assign(name=tpm_and_eff.apply(
# column_converter(load(dict_file)), axis=1))
tpm_and_eff = tpm_and_eff.assign(name=tpm_and_eff.apply(
column_converter(wormid2name), axis=1))
tpm_and_eff = add_tags_column(tpm_and_eff, input.tags_table, "biotype")
tpm_and_eff.to_csv(output.eff_file, sep="\t", na_rep="NA")
rule gather_efficiency:
input:
eff_files = expand(OPJ(
counts_dir,
"{lib}_mean_on_%s" % genome,
"{lib}_{{biotype}}_{{orientation}}_%s.txt" % eff_name), lib=EFF_LIBS),
#tags_table = rules.associate_biotype.output.tags_table,
output:
eff_file = OPJ(
counts_dir,
"all_on_%s" % genome, "{biotype}_{orientation}_%s.txt" % eff_name),
run:
eff_data = pd.concat(
[pd.read_table(eff_file, index_col="gene", usecols=["gene", eff_name]) for eff_file in input.eff_files],
axis=1)
eff_data.columns = [f"{lib}_{eff_name}" for lib in EFF_LIBS]
eff_data.index.name = "gene"
# Converting gene IDs
######################
with open(OPJ(convert_dir, "wormid2cosmid.pickle"), "rb") as dict_file:
eff_data = eff_data.assign(cosmid=eff_data.apply(
column_converter(load(dict_file)), axis=1))
#with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file:
# eff_data = eff_data.assign(name=eff_data.apply(
# column_converter(load(dict_file)), axis=1))
eff_data = eff_data.assign(name=eff_data.apply(
column_converter(wormid2name), axis=1))
#eff_data = add_tags_column(eff_data, input.tags_table, "biotype")
eff_data.to_csv(output.eff_file, sep="\t", na_rep="NA")
rule compute_efficiency_difference:
input:
eff_file = rules.gather_efficiency.output.eff_file,
output:
diff_eff_file = OPJ(counts_dir, "diff_%s" % eff_name,
"{contrast}", "{orientation}_{biotype}", "{contrast}_diff_%s.txt" % eff_name),
run:
eff_data = pd.read_table(input.eff_file, index_col="gene")
(cond, ref) = CONTRAST2PAIR[wildcards.contrast]
        # DE_mut = (degradome_tpm_mut / transcriptome_tpm_mut)
        # DE_WT = (degradome_tpm_WT / transcriptome_tpm_WT)
        # log2(DE_mut / DE_WT) = log2(DE_mut) - log2(DE_WT)
diff_eff = eff_data[f"{cond}_{eff_name}"] - eff_data[f"{ref}_{eff_name}"]
diff_eff.to_csv(output.diff_eff_file, sep="\t", na_rep="NA")
onsuccess:
print("Degradome-seq analysis finished.")
cleanup_and_backup(output_dir, config, delete=True)
onerror:
shell(f"rm -rf {output_dir}_err")
shell(f"cp -rp {output_dir} {output_dir}_err")
if upload_on_err:
cleanup_and_backup(output_dir + "_err", config)
print("Degradome-seq analysis failed.")