# Copyright (C) 2020-2023 Blaise Li
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.
"""
Snakefile to analyse Degradome-seq data.
"""
# TODO: compute TPM, and compare with RNA-seq to get a "degradation efficiency" (like TE in the Ribo-seq pipeline)
# TODO: bigwig files
import sys
major, minor = sys.version_info[:2]
if major < 3 or (major == 3 and minor < 6):
    sys.exit("Need at least python 3.6\n")

import os
OPJ = os.path.join
from distutils.util import strtobool
from glob import glob
from pickle import load
from sys import stderr
import warnings
from yaml import safe_load as yload
import numpy as np
import pandas as pd

from itertools import chain, repeat
from functools import reduce
from operator import or_ as union
from cytoolz import valmap

from smincludes import rules as irules
from smwrappers import wrappers_dir
from idconvert import gene_ids_data_dir
from libworkflows import (
    cleanup_and_backup,
    column_converter,
    ensure_relative,
    feature_orientation2stranded,
    get_chrom_sizes,
    read_feature_counts,
    sum_by_family,
    sum_feature_counts,
    wc_applied)
# http://sailfish.readthedocs.io/en/master/library_type.html
LIB_TYPE = config["lib_type"]
# key: library name
# value: dictionary
#   key: replicate number
#   value: path to raw data
lib2raw = config["lib2raw"]
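# A minimal sketch of the expected structure, with hypothetical library
# names and paths (the real values come from the configuration file):
# lib2raw:
#   WT:
#     1: /data/raw/WT_rep1.fastq.gz
#     2: /data/raw/WT_rep2.fastq.gz
#   mut:
#     1: /data/raw/mut_rep1.fastq.gz
#     2: /data/raw/mut_rep2.fastq.gz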
LIBS = list(lib2raw.keys())
# Libraries for which we have matching RNA-seq data
# so that degradation efficiency can be computed
EFF_LIBS = list(config["transcriptome_TPM"].keys())
# What type of "efficiency" are we computing? Degradation efficiency.
[this_TPM, ref_TPM, eff_name] = ["Degradome_TPM", "RNA_TPM", "DE"]
REPS = config["replicates"]
CONDITIONS = [{
    "lib" : lib,
    "rep" : rep} for rep in REPS for lib in LIBS]
# We use this in various places so that the library order is always the same:
COND_NAMES = ["_".join((
    cond["lib"],
    cond["rep"])) for cond in CONDITIONS]
# COND_COLUMNS = pd.DataFrame(CONDITIONS).assign(
#     cond_name=pd.Series(COND_NAMES).values).set_index("cond_name")
# Differential degradation
DD_COND_PAIRS = config.get("dd_cond_pairs", [])
DD_CONTRASTS = [f"{cond1}_vs_{cond2}" for [cond1, cond2] in DD_COND_PAIRS]
CONTRAST2PAIR = dict(zip(DD_CONTRASTS, DD_COND_PAIRS))
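# A worked example with hypothetical condition names: dd_cond_pairs set to
# [["mut", "WT"]] yields DD_CONTRASTS == ["mut_vs_WT"] and
# CONTRAST2PAIR == {"mut_vs_WT": ["mut", "WT"]}.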

# TODO: have a distinct, guaranteed overlap-free subset of biotypes for join_all_counts.
BIOTYPES = config["biotypes"]
ORIENTATIONS = ["all", "fwd", "rev"]

aligner = config["aligner"]
alignment_settings = {
    "bowtie2": "",
    "hisat2": "",
    "crac": "-k 20 --stranded --use-x-in-cigar"}

########################
# Genome configuration #
########################
# config["genome_dict"] can be either the path to a genome configuration file
# or a dict
if isinstance(config["genome_dict"], (str, bytes)):
    print(f"loading {config['genome_dict']}", file=stderr)
    with open(config["genome_dict"]) as fh:
        genome_dict = yload(fh)
else:
    genome_dict = config["genome_dict"]
genome = genome_dict["name"]
chrom_sizes = get_chrom_sizes(genome_dict["size"])
chrom_sizes.update(valmap(int, genome_dict.get("extra_chromosomes", {})))
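# For instance (hypothetical entry), a genome configuration containing
# extra_chromosomes: {"pUC19": "2686"} would add a 2686 bp plasmid
# to chrom_sizes alongside the regular chromosomes.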
genomelen = sum(chrom_sizes.values())
genome_db = genome_dict["db"][aligner]
# BED file binning the genome into 10 nt bins
genome_binned = genome_dict["binned"]
annot_dir = genome_dict["annot_dir"]
# What is the difference between
# OPJ(convert_dir, "wormid2name.pickle") and genome_dict["converter"]?
# /!\ gene_ids_data_dir contains more conversion dicts,
# but is not influenced by genome preparation customization,
# such as the splitting of miRNAs into 3p and 5p.
convert_dir = genome_dict.get("convert_dir", gene_ids_data_dir)
# For wormid2name, preferentially load the converter
# that might contain custom gene names, e.g. for split miRNAs.
with open(
        genome_dict.get(
            "converter",
            OPJ(convert_dir, "wormid2name.pickle")),
        "rb") as dict_file:
    wormid2name = load(dict_file)
gene_lists_dir = genome_dict["gene_lists_dir"]
avail_id_lists = set(glob(OPJ(gene_lists_dir, "*_ids.txt")))

upload_on_err = strtobool(str(config.get("upload_on_err", "True")))
#output_dir = config["output_dir"]
#workdir: config["output_dir"]
output_dir = os.path.abspath(".")
log_dir = OPJ(f"logs_{genome}")
data_dir = OPJ("data")

counter = "feature_count"
counts_dir = OPJ(aligner, f"mapped_{genome}", counter)
# Limit risks of ambiguity by requiring replicates to be numbers
# and by restricting the possible forms of some other wildcards

wildcard_constraints:
    lib="|".join(LIBS),
    rep=r"\d+",
    orientation="|".join(ORIENTATIONS),

localrules: all, link_raw_data

rule all:
    input:
        #expand(
        #    OPJ(data_dir, "trimmed", "{lib}_{rep}.fastq.gz"),
        #    lib=LIBS, rep=REPS),
        # expand(
        #     OPJ(aligner, f"mapped_{genome}", "{lib}_{rep}_on_%s_sorted.bam" % genome),
        #     lib=LIBS, rep=REPS),
        # expand(
        #     OPJ(counts_dir,
        #         f"all_on_{genome}", "{biotype}_{orientation}_counts.txt"),
        #     biotype=BIOTYPES, orientation=ORIENTATIONS),
        expand(
            OPJ(counts_dir, "summaries",
                "all_on_%s_{orientation}_counts.txt" % genome),
            orientation=ORIENTATIONS),
        # expand(
        #     OPJ(counts_dir, f"all_on_{genome}",
        #         "{biotype}_{orientation}_TPM.txt"),
        #     biotype=["alltypes"], orientation=ORIENTATIONS),
        # expand(
        #     OPJ(counts_dir, "{lib}_mean_on_%s" % genome,
        #         "{lib}_mean_{biotype}_{orientation}_TPM.txt"),
        #     lib=LIBS, biotype=["alltypes"], orientation=ORIENTATIONS),
        # degradation_efficiency
        # expand(
        #     OPJ(counts_dir, f"all_on_{genome}", "{biotype}_{orientation}_%s.txt" % eff_name),
        #     lib=LIBS, biotype=["alltypes"], orientation=ORIENTATIONS),
        expand(
            OPJ(counts_dir, "diff_%s" % eff_name, "{contrast}", "{orientation}_{biotype}", "{contrast}_diff_%s.txt" % eff_name),
            contrast=DD_CONTRASTS, orientation=["fwd"], biotype=["alltypes", "protein_coding"]),


include: ensure_relative(irules["link_raw_data"], workflow.basedir)


rule trim_reads:
    input:
        raw = rules.link_raw_data.output.link,
    output:
        trimmed = OPJ(data_dir, "trimmed", "{lib}_{rep}.fastq.gz"),
        methods = OPJ(data_dir, "trimmed", "methods", "{lib}_{rep}_trim_reads_methods.txt"),
    params:
        trim3 = 4,
    shell:
        """
        seqtk trimfq -e {params.trim3} {input.raw} | gzip > {output.trimmed}
        echo "# {rule}" > {output.methods}
        seqtk_version=$(seqtk 2>&1 | grep "Version" | cut -d" " -f2)
        echo "Raw reads were trimmed at their low quality 3' end using seqtk (version ${{seqtk_version}}) as follows: 'seqtk trimfq -b {params.trim3}'" >> {output.methods}
        """


rule map_on_genome:
    input:
        fastq = rules.trim_reads.output.trimmed,
        methods = rules.trim_reads.output.methods,
    output:
        # temp because it uses a lot of space
        sam = temp(OPJ(aligner, f"mapped_{genome}", "{lib}_{rep}_on_%s.sam" % genome)),
        nomap_fastq = OPJ(aligner, f"not_mapped_{genome}", "{lib}_{rep}_unmapped_on_%s.fastq.gz" % genome),
        methods = OPJ(aligner, f"mapped_{genome}", "methods", "{lib}_{rep}_map_on_genome_methods.txt"),
    params:
        aligner = aligner,
        index = genome_db,
        settings = alignment_settings[aligner],
        # For auto-methods
        read_name = "trimmed",
    message:
        "Mapping {wildcards.lib}_{wildcards.rep} on %s." % genome
    log:
        log = OPJ(log_dir, "map_on_genome_{lib}_{rep}.log"),
        err = OPJ(log_dir, "map_on_genome_{lib}_{rep}.err")
    resources:
        io=5
    threads:
        8
    wrapper:
        f"file://{wrappers_dir}/map_on_genome"


rule sam2indexedbam:
    input:
        sam = rules.map_on_genome.output.sam,
    output:
        sorted_bam = protected(OPJ(aligner, f"mapped_{genome}", "{lib}_{rep}_on_%s_sorted.bam" % genome)),
        index = protected(OPJ(aligner, f"mapped_{genome}", "{lib}_{rep}_on_%s_sorted.bam.bai" % genome)),
    message:
        "Sorting and indexing sam file for {wildcards.lib}_{wildcards.rep}."
    log:
        log = OPJ(log_dir, "sam2indexedbam_{lib}_{rep}.log"),
        err = OPJ(log_dir, "sam2indexedbam_{lib}_{rep}.err"),
    threads:
        8
    resources:
        mem_mb=4100
    wrapper:
        f"file://{wrappers_dir}/sam2indexedbam"


def biotype2annot(wildcards):
    if wildcards.biotype.endswith("_rmsk_families"):
        biotype = wildcards.biotype[:-9]
    else:
        biotype = wildcards.biotype
    return OPJ(annot_dir, f"{biotype}.gtf")
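
# For example (hypothetical biotype names): "satellites_rmsk_families"
# maps to OPJ(annot_dir, "satellites_rmsk.gtf") (the "_families" suffix
# is stripped), whereas "protein_coding" maps unchanged to
# OPJ(annot_dir, "protein_coding.gtf").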


rule feature_count_reads:
    input:
        sorted_bam = rules.sam2indexedbam.output.sorted_bam,
        index = rules.sam2indexedbam.output.index,
        methods = rules.map_on_genome.output.methods,
    output:
        counts = OPJ(counts_dir, "{lib}_{rep}_on_%s" % genome, "{lib}_{rep}_{biotype}_{orientation}_counts.txt"),
        counts_converted = OPJ(counts_dir, "{lib}_{rep}_on_%s" % genome, "{lib}_{rep}_{biotype}_{orientation}_counts_gene_names.txt"),
        methods = OPJ(counts_dir, "{lib}_{rep}_on_%s" % genome, "methods", "{lib}_{rep}_{biotype}_{orientation}_feature_count_reads_methods.txt"),
    params:
        stranded = feature_orientation2stranded(LIB_TYPE),
        annot = biotype2annot,
    message:
        "Counting {wildcards.orientation} {wildcards.biotype} reads for {wildcards.lib}_{wildcards.rep} with featureCounts."
    benchmark:
        OPJ(log_dir, "feature_count_reads", "{lib}_{rep}_{biotype}_{orientation}_benchmark.txt")
    log:
        log = OPJ(log_dir, "feature_count_reads", "{lib}_{rep}_{biotype}_{orientation}.log"),
        err = OPJ(log_dir, "feature_count_reads", "{lib}_{rep}_{biotype}_{orientation}.err"),
    shell:
        """
        tmpdir=$(mktemp -dt "feature_{wildcards.lib}_{wildcards.rep}_{wildcards.biotype}_{wildcards.orientation}.XXXXXXXXXX")
        cmd="featureCounts -a {params.annot} -o {output.counts} -t transcript -g "gene_id" -O -M --primary -s {params.stranded} --fracOverlap 0 --tmpDir ${{tmpdir}} {input.sorted_bam}"
        featureCounts -v 2> {log.log}
        echo ${{cmd}} 1>> {log.log}
        eval ${{cmd}} 1>> {log.log} 2> {log.err} || error_exit "featureCounts failed"
        rm -rf ${{tmpdir}}
        cat {output.counts} | wormid2name > {output.counts_converted}
        cp {input.methods} {output.methods}
        echo "# {rule}" >> {output.methods}
        featurecounts_version=$(featureCounts -v 2>&1 | grep featureCounts | cut -d" " -f2)
        echo "Mapping on genes were counted against transcript annotations using featureCounts (version ${{featurecounts_version}}) as follows: ${{cmd}}" >> {output.methods}
        """


rule summarize_counts:
    """For a given library, write a summary of the read counts for the various biotypes."""
    input:
        biotype_counts_files = expand(
            OPJ(counts_dir, "{{lib}}_{{rep}}_on_%s" % genome, "{{lib}}_{{rep}}_{biotype}_{{orientation}}_counts.txt"),
            biotype=BIOTYPES),
    output:
        summary = OPJ(counts_dir, "summaries", "{lib}_{rep}_on_%s_{orientation}_counts.txt" % genome)
    run:
        sum_counter = sum_feature_counts
        with open(output.summary, "w") as summary_file:
            header = "\t".join(BIOTYPES)
            summary_file.write("%s\n" % header)
            sums = "\t".join((str(sum_counter(counts_file)) for counts_file in input.biotype_counts_files))
            summary_file.write("%s\n" % sums)


rule gather_read_counts_summaries:
    """Gather read count summaries across libraries: lib_rep -> all."""
    input:
        summary_tables = expand(OPJ(
            counts_dir, "summaries",
            "{name}_on_%s_{{orientation}}_counts.txt" % genome), name=COND_NAMES),
    output:
        summary_table = OPJ(
            counts_dir, "summaries",
            "all_on_%s_{orientation}_counts.txt" % genome),
    run:
        summary_files = [OPJ(
            counts_dir, "summaries",
            f"{cond_name}_on_{genome}_{wildcards.orientation}_counts.txt") for cond_name in COND_NAMES]
        try:
            summaries = pd.concat((pd.read_table(summary_file).T.astype(int) for summary_file in summary_files), axis=1)
        except ValueError:
            warnings.warn("Some data may be missing. Using floats instead of ints.\n")
            summaries = pd.concat((pd.read_table(summary_file).T.astype(float) for summary_file in summary_files), axis=1)
        summaries.columns = COND_NAMES
        summaries.to_csv(output.summary_table, na_rep="NA", sep="\t")


rule gather_counts:
    """For a given biotype, gather counts from all libraries in one table."""
    input:
        counts_tables = expand(
            OPJ(counts_dir, "{lib}_{rep}_on_%s" % genome,
            "{lib}_{rep}_{{biotype}}_{{orientation}}_counts.txt"),
            lib=LIBS, rep=REPS),
    output:
        counts_table = OPJ(
            counts_dir, f"all_on_{genome}", "{biotype}_{orientation}_counts.txt"),
    wildcard_constraints:
        # Avoid ambiguity with join_all_counts
        biotype = "|".join(BIOTYPES)
    run:
        # Gathering the counts data
        ############################
        counts_files = [OPJ(
            counts_dir,
            f"{cond_name}_on_{genome}",
            f"{cond_name}_{wildcards.biotype}_{wildcards.orientation}_counts.txt") for cond_name in COND_NAMES]
        counts_data = pd.concat(
            map(read_feature_counts, counts_files),
            axis=1).fillna(0).astype(int)
        counts_data.columns = COND_NAMES
        # Simple_repeat|Simple_repeat|(TTTTTTG)n:1
        # Simple_repeat|Simple_repeat|(TTTTTTG)n:2
        # Simple_repeat|Simple_repeat|(TTTTTTG)n:3
        # Simple_repeat|Simple_repeat|(TTTTTTG)n:4
        # -> Simple_repeat|Simple_repeat|(TTTTTTG)n
        if wildcards.biotype.endswith("_rmsk_families"):
            counts_data = sum_by_family(counts_data)
        counts_data.index.names = ["gene"]
        counts_data.to_csv(output.counts_table, na_rep="NA", sep="\t")


def make_tag_association(dfs, tag):
    """Associates a tag "tag" to the union of the indices of dataframes *dfs*."""
    idx = reduce(union, (df.index for df in dfs))
    return pd.DataFrame(list(zip(idx, repeat(tag)))).set_index(0)
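
# A small usage sketch with hypothetical toy frames:
# make_tag_association(
#     (pd.DataFrame(index=["g1", "g2"]), pd.DataFrame(index=["g2", "g3"])),
#     "miRNA")
# returns a one-column DataFrame indexed by ["g1", "g2", "g3"],
# every value being "miRNA".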


rule associate_biotype:
    """This rule uses the read count matrices to associate a biotype to each gene identifier."""
    input:
        counts_tables = expand(OPJ(
            counts_dir,
            "all_on_%s" % genome,
            "{biotype}_all_counts.txt"), biotype=[b_name for b_name in BIOTYPES if not b_name.startswith("protein_coding_")]),
    output:
        tags_table = OPJ(counts_dir, "all_on_%s" % genome, "id2tags.txt"),
    run:
        biotype2tags_tables = {}
        for biotype in BIOTYPES:
            if biotype.startswith("protein_coding_"):
                continue
            biotype2tags_tables[biotype] = make_tag_association(
                (pd.read_table(OPJ(
                    counts_dir,
                    f"all_on_{genome}",
                    f"{biotype}_all_counts.txt"), index_col="gene"),), biotype)
        tags_table = pd.concat(chain(biotype2tags_tables.values()))
        tags_table.index.names = ["gene"]
        tags_table.columns = ["biotype"]
        tags_table.to_csv(output.tags_table, sep="\t")


def add_tags_column(data, tags_table, tag_name):
    """Adds a column *tag_name* to *data* based on the DataFrame *tag_table*
    associating tags to row names."""
    # Why "inner" ?
    df = pd.concat((data, pd.read_table(tags_table, index_col=0)), join="inner", axis=1)
    df.columns = (*data.columns, tag_name)
    return df
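
# For instance (hypothetical values): if *data* covers genes ["g1", "g2"] and
# the tags table only maps g1 -> "miRNA", the result keeps just the g1 row,
# with its original columns plus a *tag_name* column containing "miRNA".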


rule join_all_counts:
    """Concatenate counts for all biotypes into "alltypes"."""
    input:
        counts_tables = expand(
            OPJ(counts_dir,
                f"all_on_{genome}", "{biotype}_{{orientation}}_counts.txt"),
            biotype=BIOTYPES),
    output:
        counts_table = OPJ(
            counts_dir,
            f"all_on_{genome}", "alltypes_{orientation}_counts.txt"),
    run:
        counts_data = pd.concat((pd.read_table(table, index_col="gene") for table in input.counts_tables))
        assert len(counts_data.index.unique()) == len(counts_data.index), "Some genes appear several times in the counts table."
        counts_data.index.names = ["gene"]
        counts_data.to_csv(output.counts_table, sep="\t")


@wc_applied
def source_counts(wildcards):
    """Determines from which rule the gathered counts should be sourced."""
    if wildcards.biotype == "alltypes":
        return rules.join_all_counts.output.counts_table
    else:
        # "Directly" from the counts gathered across libraries
        return rules.gather_counts.output.counts_table
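
# For example, alltypes_fwd_counts.txt is sourced from join_all_counts,
# whereas protein_coding_fwd_counts.txt (assuming "protein_coding" is among
# the configured BIOTYPES) comes directly from gather_counts.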


rule compute_RPK:
    """For a given biotype, compute the corresponding RPK value (reads per kilobase)."""
    input:
        counts_data = source_counts,
        #counts_data = rules.gather_counts.output.counts_table,
        #counts_data = OPJ(aligner, f"mapped_{genome}", "feature_count",
        #    f"all_on_{genome}", "{biotype}_{orientation}_counts.txt"),
    output:
        rpk_file = OPJ(counts_dir,
            f"all_on_{genome}", "{biotype}_{orientation}_RPK.txt"),
    params:
        feature_lengths_file = OPJ(annot_dir, "union_exon_lengths.txt"),
    # run:
    #     counts_data = pd.read_table(input.counts_data, index_col="gene")
    #     feature_lengths = pd.read_table(params.feature_lengths_file, index_col="gene")
    #     common = counts_data.index.intersection(feature_lengths.index)
    #     rpk = 1000 * counts_data.loc[common].div(feature_lengths.loc[common]["union_exon_len"], axis="index")
    #     rpk.to_csv(output.rpk_file, sep="\t")
    wrapper:
        f"file://{wrappers_dir}/compute_RPK"


# An alternative would be to compute TPM using the total number of mappers
# divided by the genome length (rather than the sum of RPK across biotypes,
# which can miss some mappers). This does not work, though:
# mappers / genome length is not really comparable.
# The normalization therefore needs to be done on "alltypes".
rule compute_TPM:
    """For a given biotype, compute the corresponding TPM value (reads per kilobase per million mappers)."""
    input:
        rpk_file = rules.compute_RPK.output.rpk_file
    output:
        tpm_file = OPJ(counts_dir,
            f"all_on_{genome}", "{biotype}_{orientation}_TPM.txt"),
    # The sum must be done over all counted features
    wildcard_constraints:
        biotype = "|".join(["alltypes", "protein_coding"]),
    # run:
    #     rpk = pd.read_table(input.rpk_file, index_col="gene")
    #     tpm = 1000000 * rpk / rpk.sum()
    #     tpm.to_csv(output.tpm_file, sep="\t")
    wrapper:
        f"file://{wrappers_dir}/compute_TPM"


# TODO: Is it better to compute folds per replicate and then their mean, or the fold of the means (as done here)?
# Here, the matching between Degradome-seq and RNA-seq replicates is arbitrary.
# We don't even always have the same number of replicates,
# so we work with means across replicates.
rule compute_mean_TPM:
    input:
        all_tpm_file = rules.compute_TPM.output.tpm_file
    output:
        tpm_file = OPJ(counts_dir,
            "{lib}_mean_on_%s" % genome, "{lib}_mean_{biotype}_{orientation}_TPM.txt"),
    run:
        tpm = pd.read_table(
            input.all_tpm_file, index_col="gene",
            usecols=["gene", *[f"{wildcards.lib}_{rep}" for rep in REPS]])
        tpm_mean = tpm.mean(axis=1)
        # This is a Series
        tpm_mean.name = wildcards.lib
        tpm_mean.to_csv(output.tpm_file, sep="\t", header=True)


# TODO
rule compute_efficiency:
    """This is actually a TPM fold between library types."""
    input:
        mean_tpm = rules.compute_mean_TPM.output.tpm_file,
        tags_table = rules.associate_biotype.output.tags_table,
    output:
        eff_file = OPJ(counts_dir,
            "{lib}_mean_on_%s" % genome, "{lib}_{biotype}_{orientation}_%s.txt" % eff_name),
    run:
        mean_tpm = pd.read_table(input.mean_tpm, index_col="gene")
        transcriptome_tpm = pd.read_table(config["transcriptome_TPM"][wildcards.lib], index_col="gene")
        transcriptome_tpm.columns = mean_tpm.columns
        # A pseudo-count of 1 avoids division by zero and taking log2 of zero.
        lfc = np.log2((mean_tpm + 1) / (transcriptome_tpm + 1))
        # By analogy with translation efficiency (TE) in the Ribo-seq pipeline:
        # TE_mut = (ribo_tpm_mut / transcriptome_tpm_mut)
        # TE_WT = (ribo_tpm_WT / transcriptome_tpm_WT)
        # log2(TE_mut / TE_WT) = log2(TE_mut) - log2(TE_WT)
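        # Worked example with hypothetical values: a Degradome mean TPM of 3
        # and an RNA-seq TPM of 1 give DE = log2((3 + 1) / (1 + 1)) = 1.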
        tpm_and_eff = pd.concat([mean_tpm, transcriptome_tpm, lfc], axis=1)
        tpm_and_eff.columns = [this_TPM, ref_TPM, eff_name]
        tpm_and_eff.index.name = "gene"
        # Converting gene IDs
        ######################
        with open(OPJ(convert_dir, "wormid2cosmid.pickle"), "rb") as dict_file:
            tpm_and_eff = tpm_and_eff.assign(cosmid=tpm_and_eff.apply(
                column_converter(load(dict_file)), axis=1))
        #with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file:
        #    tpm_and_eff = tpm_and_eff.assign(name=tpm_and_eff.apply(
        #        column_converter(load(dict_file)), axis=1))
        tpm_and_eff = tpm_and_eff.assign(name=tpm_and_eff.apply(
            column_converter(wormid2name), axis=1))
        tpm_and_eff = add_tags_column(tpm_and_eff, input.tags_table, "biotype")
        tpm_and_eff.to_csv(output.eff_file, sep="\t", na_rep="NA")


rule gather_efficiency:
    input:
        eff_files = expand(OPJ(
            counts_dir,
            "{lib}_mean_on_%s" % genome,
            "{lib}_{{biotype}}_{{orientation}}_%s.txt" % eff_name), lib=EFF_LIBS),
        #tags_table = rules.associate_biotype.output.tags_table,
    output:
        eff_file = OPJ(
            counts_dir,
            "all_on_%s" % genome, "{biotype}_{orientation}_%s.txt" % eff_name),
    run:
        eff_data = pd.concat(
            [pd.read_table(eff_file, index_col="gene", usecols=["gene", eff_name]) for eff_file in input.eff_files],
            axis=1)
        eff_data.columns = [f"{lib}_{eff_name}" for lib in EFF_LIBS]
        eff_data.index.name = "gene"
        # Converting gene IDs
        ######################
        with open(OPJ(convert_dir, "wormid2cosmid.pickle"), "rb") as dict_file:
            eff_data = eff_data.assign(cosmid=eff_data.apply(
                column_converter(load(dict_file)), axis=1))
        #with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file:
        #    eff_data = eff_data.assign(name=eff_data.apply(
        #        column_converter(load(dict_file)), axis=1))
        eff_data = eff_data.assign(name=eff_data.apply(
            column_converter(wormid2name), axis=1))
        #eff_data = add_tags_column(eff_data, input.tags_table, "biotype")
        eff_data.to_csv(output.eff_file, sep="\t", na_rep="NA")


rule compute_efficiency_difference:
    input:
        eff_file = rules.gather_efficiency.output.eff_file,
    output:
        diff_eff_file = OPJ(counts_dir, "diff_%s" % eff_name,
            "{contrast}", "{orientation}_{biotype}", "{contrast}_diff_%s.txt" % eff_name),
    run:
        eff_data = pd.read_table(input.eff_file, index_col="gene")
        (cond, ref) = CONTRAST2PAIR[wildcards.contrast]
        # As for TE in the Ribo-seq pipeline:
        # TE_mut = (ribo_tpm_mut / transcriptome_tpm_mut)
        # TE_WT = (ribo_tpm_WT / transcriptome_tpm_WT)
        # log2(TE_mut / TE_WT) = log2(TE_mut) - log2(TE_WT)
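        # The per-library values are already log2 efficiencies, so the
        # subtraction below yields the log2 fold change in degradation
        # efficiency between *cond* and *ref*.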
        diff_eff = eff_data[f"{cond}_{eff_name}"] - eff_data[f"{ref}_{eff_name}"]
        diff_eff.to_csv(output.diff_eff_file, sep="\t", na_rep="NA")


onsuccess:
    print("Degradome-seq analysis finished.")
    cleanup_and_backup(output_dir, config, delete=True)

onerror:
    shell(f"rm -rf {output_dir}_err")
    shell(f"cp -rp {output_dir} {output_dir}_err")
    if upload_on_err:
        cleanup_and_backup(output_dir + "_err", config)
    print("Degradome-seq analysis failed.")