"""
Snakefile to process iCLIP data.
"""
import sys
major, minor = sys.version_info[:2]
if major < 3 or (major == 3 and minor < 6):
    sys.exit("Need at least python 3.6\n")


import os
OPJ = os.path.join
from glob import glob
from subprocess import CalledProcessError

from collections import defaultdict
from itertools import product

import matplotlib as mpl
# To be able to run the script without a defined $DISPLAY
mpl.use("PDF")
#mpl.rcParams["figure.figsize"] = 2, 4
mpl.rcParams["font.sans-serif"] = [
    "Arial", "Liberation Sans", "Bitstream Vera Sans"]
mpl.rcParams["font.family"] = "sans-serif"
#mpl.rcParams["figure.figsize"] = [16, 30]
import pandas as pd
import matplotlib.pyplot as plt

from libhts import make_empty_bigwig, median_ratio_to_pseudo_ref_size_factors, plot_histo
from libworkflows import get_chrom_sizes, cleanup_and_backup
from libworkflows import last_lines, ensure_relative, SHELL_FUNCTIONS, warn_context
from libworkflows import feature_orientation2stranded
from libworkflows import sum_by_family, read_feature_counts, sum_feature_counts
from smincludes import rules as irules
from smwrappers import wrappers_dir

# Define functions to be used in shell portions
shell.prefix(SHELL_FUNCTIONS)

aligner = config["aligner"]
genome_dict = config["genome_dict"]
genome = genome_dict["name"]
chrom_sizes = get_chrom_sizes(genome_dict["size"])
genomelen = sum(chrom_sizes.values())
genome_db = genome_dict["db"][aligner]
# bed file binning the genome in 10nt bins
genome_binned = genome_dict["binned"]
annot_dir = genome_dict["annot_dir"]
# TODO: figure out the difference between OPJ(convert_dir, "wormid2name.pickle") and genome_dict["converter"]
convert_dir = genome_dict["convert_dir"]
gene_lists_dir = genome_dict["gene_lists_dir"]
avail_id_lists = set(glob(OPJ(gene_lists_dir, "*_ids.txt")))

merged_fastq = config["merged_fastq"]
barcode_dict = config["barcode_dict"]
BARCODES = list(barcode_dict.keys())
MAX_DIFF = config["max_diff"]
#output_dir = config["output_dir"]
#workdir: config["output_dir"]
output_dir = os.path.abspath(".")
log_dir = OPJ("logs")
data_dir = OPJ("data")
demux_dir = OPJ(data_dir, f"demultiplexed_{MAX_DIFF}")
lib2raw = defaultdict(dict)
REPS = set()
for (barcode, lib_info) in barcode_dict.items():
    REPS.add(lib_info["rep"])
    lib2raw[lib_info["lib"]][lib_info["rep"]] = OPJ(demux_dir, f"{barcode}.fastq.gz")
LIBS = list(lib2raw.keys())
REPS = sorted(REPS)
CONDITIONS = [{
    "lib" : lib,
    "rep" : rep} for rep in REPS for lib in LIBS]
# We use this in various places so that the library order is always the same:
COND_NAMES = ["_".join((
    cond["lib"],
    cond["rep"])) for cond in CONDITIONS]
COND_COLUMNS = pd.DataFrame(CONDITIONS).assign(
    cond_name=pd.Series(COND_NAMES).values).set_index("cond_name")
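# For illustration (hypothetical values): with LIBS = ["WT", "mut"] and REPS = ["1", "2"],
# COND_NAMES is ["WT_1", "mut_1", "WT_2", "mut_2"] and COND_COLUMNS is a DataFrame with
# "lib" and "rep" columns indexed by these condition names.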

LIB_TYPE = config["lib_type"]
#TRIMMERS = ["fastx_clipper"]
TRIMMERS = ["cutadapt"]
#COUNTERS = ["feature_count"]
ORIENTATIONS = ["fwd", "rev", "all"]
WITH_ADAPT = ["adapt_deduped", "adapt_nodedup"]
POST_TRIMMING = ["noadapt_deduped"] + WITH_ADAPT
SIZE_RANGES = ["12-18", "21-24", "26-40", "48-52"]
SIZE_SELECTED = [f"{read_type}_{size_range}" for (read_type, size_range) in product(WITH_ADAPT, SIZE_RANGES)]
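# SIZE_SELECTED contains one entry per (adapter-containing read type, size range) pair,
# e.g. "adapt_deduped_12-18" or "adapt_nodedup_21-24".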

# TODO: Have different settings for different size ranges
# The recommended k-mer length for D. melanogaster is 20.
# However, reads shorter than the k-mer length will be ignored.
# http://crac.gforge.inria.fr/documentation/crac/#sec-2
alignment_settings = {
    #"bowtie2": "-L 6 -i S,1,0.8 -N 0",
    "bowtie2": "-L 6 -i S,1,0.8 -N 1",
    # Small RNA-seq parameters may not be compatible with --local
    #"bowtie2": "--local -L 6 -i S,1,0.8 -N 0",
    "crac": "-k 20 --stranded --use-x-in-cigar"}
# Lower stringency settings, to remap the unmapped
realignment_settings = {
    # Try with almost-default settings
    "bowtie2": "-N 1",
    # Allow more mismatches in the seed
    # Reduce minimal mismatch and gap open penalties
    #"bowtie2": "--local -L 6 -i S,1,0.8 -N 1 --mp 6,1 --rdg 4,3",
    # TODO: Find how to be less stringent with crac
    "crac": "-k 20 --stranded --use-x-in-cigar"}
#test_alignment_settings = {
#    "bowtie2": {
#        "": "-L 6 -i S,1,0.8 -N 1",
#        "": "-L 6 -i S,1,0.8 -N 1",
#        "": "-L 6 -i S,1,0.8 -N 1",
#        "": "-L 6 -i S,1,0.8 -N 1",
#        "": "-L 6 -i S,1,0.8 -N 1",
#        "": "-L 6 -i S,1,0.8 -N 1"}
#    }


# For compatibility with trim_and_dedup as used in the PRO-seq pipeline
lib2adapt = defaultdict(lambda: config["adapter"])
MAX_ADAPT_ERROR_RATE = config["max_adapt_error_rate"]

COUNT_BIOTYPES = ["protein_coding", "DNA_transposons_rmsk_families", "RNA_transposons_rmsk_families"]
SIZE_FACTORS = ["protein_coding", "median_ratio_to_pseudo_ref"]
assert set(SIZE_FACTORS).issubset(set(COUNT_BIOTYPES) | {"median_ratio_to_pseudo_ref"})
NORM_TYPES = ["protein_coding", "median_ratio_to_pseudo_ref"]
assert set(NORM_TYPES).issubset(set(SIZE_FACTORS))
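# SIZE_FACTORS constrains the possible values of the {norm} wildcard below;
# NORM_TYPES is the subset actually requested for the normalized bigwig files.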

wildcard_constraints:
    lib="|".join(LIBS),
    rep="\d+",
    orientation="|".join(ORIENTATIONS),
    norm="|".join(SIZE_FACTORS),
    #size_range="\d+-\d+"

preprocessing = [
    ## Will be pulled in as dependencies of other needed results:
    # expand(OPJ(demux_dir, "{barcode}.fastq.gz"), barcode=BARCODES),
    # expand(OPJ(data_dir, "{lib}_{rep}.fastq.gz"), lib=LIBS, rep=REPS),
    # expand(OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_{read_type}.fastq.gz"), trimmer=TRIMMERS, lib=LIBS, rep=REPS, read_type=POST_TRIMMING + SIZE_SELECTED),
    ##
    expand(OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_{read_type}_fastqc.html"), trimmer=TRIMMERS, lib=LIBS, rep=REPS, read_type=POST_TRIMMING + SIZE_SELECTED),
    expand(OPJ(data_dir, "trimmed_{trimmer}", "read_stats", "{lib}_{rep}", "{read_type}_size_distribution.pdf"), trimmer=TRIMMERS, lib=LIBS, rep=REPS, read_type=POST_TRIMMING),
]

mapping = [
    ## Will be pulled in as dependencies of other needed results:
    # expand(OPJ("{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_{read_type}_on_%s_sorted.bam" % genome), trimmer=TRIMMERS, lib=LIBS, rep=REPS, read_type=POST_TRIMMING + SIZE_SELECTED),
    ##
    expand(
        OPJ("{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_{read_type}_on_%s_samtools_stats.txt" % genome),
        trimmer=TRIMMERS, lib=LIBS, rep=REPS,
        read_type=POST_TRIMMING + SIZE_SELECTED + [f"{to_map}_unmapped" for to_map in POST_TRIMMING + SIZE_SELECTED]),
]

counting = [
    ## Will be pulled in as dependencies of other needed results:
    # expand(OPJ("{trimmer}", aligner, "mapped_%s" % genome, "feature_count", "{lib}_{rep}_{read_type}_on_%s" % genome, "{biotype}_{orientation}_counts.txt"), trimmer=TRIMMERS, lib=LIBS, rep=REPS, read_type=POST_TRIMMING + SIZE_SELECTED, biotype=COUNT_BIOTYPES, orientation=ORIENTATIONS),
    ##
    expand(OPJ("{trimmer}", aligner, f"mapped_{genome}", "feature_count", "summaries", "all_{read_type}_on_%s_{orientation}_counts.txt" % genome), trimmer=TRIMMERS, read_type=POST_TRIMMING + SIZE_SELECTED, orientation=ORIENTATIONS),
    expand(OPJ("{trimmer}", aligner, f"mapped_{genome}", "feature_count", "all_{read_type}_on_%s" % genome, "{biotype}_{orientation}_counts.txt"), trimmer=TRIMMERS, read_type=POST_TRIMMING + SIZE_SELECTED, biotype=COUNT_BIOTYPES, orientation=ORIENTATIONS),
    expand(OPJ("{trimmer}", aligner, f"mapped_{genome}", "{lib}_{rep}_{read_type}_on_%s_by_{norm}_{orientation}.bw" % genome), trimmer=TRIMMERS, lib=LIBS, rep=REPS, read_type=POST_TRIMMING + SIZE_SELECTED, norm=NORM_TYPES, orientation=["all"]),
]

#TODO:
# - Plot histogram of read type counts at successive processing steps
# - Remap unmapped with less stringency to check if we are too stringent
# (- remove deduplication step ?)
# - map and featureCount rev/fwd: fwd -> mRNA, rev -> smallRNA
# - map with CRAC, detect chimera and crosslink-induced sequencing errors
# - find cross-link sites on genes: should be 5' of antisense reads
# (otherwise, we expect mismatches at the cross-link sites: distribution of mismatch positions in the reads)
# see also https://genomebiology.biomedcentral.com/articles/10.1186/s13059-016-1130-x
rule all:
    """This top rule is used to drive the whole workflow by taking as input its final products."""
    input:
        preprocessing,
        mapping,
        counting,


#################
# Preprocessing #
#################
rule demultiplex:
    input:
        fq_in = merged_fastq,
    output:
        expand(OPJ(demux_dir, "{barcode}.fastq.gz"), barcode=BARCODES),
    params:
        demux_dir = demux_dir,
        bc_start = config["bc_start"],
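        # " -b ".join(BARCODES) yields e.g. "BC1 -b BC2 -b BC3", so that "-b {params.barcodes}"
        # in the shell command expands to one "-b" option per barcode.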
        barcodes = " -b ".join(BARCODES),
        max_diff = MAX_DIFF
    log:
        err = OPJ(log_dir, "demultiplex.err")
    benchmark:
        OPJ(log_dir, "demultiplex_benchmark.txt")
    shell:
        # qaf_demux should be available here: https://gitlab.pasteur.fr/bli/qaf_demux
        """
        qaf_demux \\
            -i {input.fq_in} -g -o {params.demux_dir} \\
            -s {params.bc_start} -b {params.barcodes} -m {params.max_diff} \\
            2> {log.err} || error_exit "qaf_demux failed"
        """

include: ensure_relative(irules["link_raw_data"], workflow.basedir)


rule trim_and_dedup:
    """The adaptor is trimmed, then reads are treated in two groups depending
    on whether the adapter was found or not. For each group the reads are
    sorted, deduplicated, and the random k-mers that helped identify
    PCR duplicates are removed at both ends"""
    input:
        rules.link_raw_data.output,
    output:
        noadapt = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_noadapt_deduped.fastq.gz"),
        adapt = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_adapt_deduped.fastq.gz"),
        adapt_nodedup = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_adapt_nodedup.fastq.gz"),
        nb_raw =  OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_nb_raw.txt"),
        nb_adapt =  OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_nb_adapt.txt"),
        nb_adapt_deduped =  OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_nb_adapt_deduped.txt"),
        nb_noadapt =  OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_nb_noadapt.txt"),
        nb_noadapt_deduped =  OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_nb_noadapt_deduped.txt"),
    params:
        adapter = lambda wildcards : lib2adapt[wildcards.lib],
        max_adapt_error_rate = MAX_ADAPT_ERROR_RATE,
        process_type = "iCLIP",
        trim5 = 8,
        trim3 = 4,
    threads: 8 # Actually, to avoid too much IO
    message:
        "Trimming adaptor from raw data using {wildcards.trimmer}, deduplicating reads, and removing 5' and 3' random n-mers for {wildcards.lib}_{wildcards.rep}."
    benchmark:
        OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_trim_benchmark.txt")
    log:
        trim = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_trim.log"),
        log = OPJ(log_dir, "{trimmer}", "trim_and_dedup", "{lib}_{rep}.log"),
        err = OPJ(log_dir, "{trimmer}", "trim_and_dedup", "{lib}_{rep}.err"),
    run:
        shell_commands = """
MAX_ERROR_RATE="{params.max_adapt_error_rate}" THREADS="{threads}" {params.process_type}_trim_and_dedup.sh {wildcards.trimmer} {input} \\
    {params.adapter} {params.trim5} {params.trim3} \\
    {output.adapt} {output.noadapt} {output.adapt_nodedup} {log.trim} \\
    {output.nb_raw} {output.nb_adapt} {output.nb_adapt_deduped} \\
    {output.nb_noadapt} {output.nb_noadapt_deduped} 1> {log.log} 2> {log.err}
"""
        shell(shell_commands)


def source_trimmed_fastq(wildcards):
    """Determine the fastq file corresponding to a given read type."""
    # remove size range
    read_type = "_".join(wildcards.read_type.split("_")[:-1])
    if read_type == "adapt_deduped":
        return rules.trim_and_dedup.output.adapt
    elif read_type == "noadapt_deduped":
        return rules.trim_and_dedup.output.noadapt
    elif read_type == "adapt_nodedup":
        return rules.trim_and_dedup.output.adapt_nodedup
    else:
        raise NotImplementedError("Unknown read type: %s" % read_type)
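# Example: for read_type "adapt_deduped_26-40", the size range is stripped and the
# "adapt" output of trim_and_dedup (adapter-containing, deduplicated reads) is returned.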


def source_fastq(wildcards):
    """Determine the fastq file corresponding to a given read type."""
    read_type = wildcards.read_type
    if read_type == "raw":
        return rules.link_raw_data.output
    elif read_type in SIZE_SELECTED:
        return rules.select_size_range.output.selected
    elif read_type == "adapt_deduped":
        return rules.trim_and_dedup.output.adapt
    elif read_type == "noadapt_deduped":
        return rules.trim_and_dedup.output.noadapt
    elif read_type == "adapt_nodedup":
        return rules.trim_and_dedup.output.adapt_nodedup
    elif read_type.endswith("unmapped"):
        return rules.map_on_genome.output.nomap_fastq
    else:
        raise NotImplementedError("Unknown read type: %s" % read_type)


def awk_size_filter(wildcards):
    """Returns the bioawk filter to select reads of type *wildcards.read_type*."""
    size_range = wildcards.read_type.split("_")[-1]
    (min_len, max_len) = size_range.split("-")
    return f"{min_len} <= length($seq) && length($seq) <= {max_len}"


rule select_size_range:
    """Select (and count) reads in the correct size range."""
    input:
        source_trimmed_fastq,
    output:
        selected = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_{read_type}.fastq.gz"),
        nb_selected = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_nb_{read_type}.txt"),
    wildcard_constraints:
        read_type = "|".join(SIZE_SELECTED)
    params:
        awk_filter = awk_size_filter,
    message:
        "Selecting {wildcards.read_type} for {wildcards.lib}_{wildcards.rep}_{wildcards.read_type}."
    shell:
        """
        bioawk -c fastx '{params.awk_filter} {{print "@"$name" "$4"\\n"$seq"\\n+\\n"$qual}}' {input} \\
            | tee >(count_fastq_reads {output.nb_selected}) \\
            | gzip > {output.selected}
        """


# for with_size_selection in [True, False]:
#     if with_size_selection:
#         rule do_fastqc_on_size_selected:
#             input:
#                 fastq = source_fastq
#             output:
#                 fastqc_out = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_{read_type}_{size_range}_fastqc.html")
#             shell:
#                 """
#                 fastqc {input.fastq}
#                 """
#     else:
#         rule do_fastqc:
#             input:
#                 fastq = source_fastq
#             output:
#                 fastqc_out = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_{read_type}_fastqc.html")
#             shell:
#                 """
#                 fastqc {input.fastq}
#                 """

rule do_fastqc:
    input:
        fastq = source_fastq,
    output:
        fastqc_out = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_{read_type}_fastqc.html")
    shell:
        """
        fastqc {input.fastq}
        """
# rule do_fastqc_on_trimmed:
#     input:
#         fastq = source_fastq,
#     output:
#         fastqc_out = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_{read_type}_fastqc.html")
#     shell:
#         """
#         fastqc {input.fastq}
#         """
# 
# 
# rule do_fastqc_on_size_selected:
#     input:
#         fastq = source_fastq,
#         #fastq = rules.select_size_range.output.selected,
#     output:
#         fastqc_out = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_{read_type}_{size_range}_fastqc.html")
#     shell:
#         """
#         fastqc {input.fastq}
#         """


rule compute_size_distribution:
    input:
        source_fastq
    output:
        OPJ(data_dir, "trimmed_{trimmer}", "read_stats", "{lib}_{rep}", "{read_type}_size_distribution.txt"),
    message:
        "Computing read size distribution for {wildcards.lib}_{wildcards.rep}_{wildcards.read_type}."
    shell:
        """
        zcat {input} | compute_size_distribution {output}
        """


rule plot_size_distribution:
    input:
        rules.compute_size_distribution.output
    output:
        OPJ(data_dir, "trimmed_{trimmer}", "read_stats", "{lib}_{rep}", "{read_type}_size_distribution.{fig_format}")
    message:
        "Plotting size distribution for trimmed {wildcards.lib}_{wildcards.rep}_{wildcards.read_type}."
    run:
        data = pd.read_table(input[0], header=None, names=("size", "count"), index_col=0)
        title = f"read size distribution for {wildcards.lib}_{wildcards.rep}_{wildcards.read_type}"
        plot_histo(output[0], data, title)


def set_alignment_settings(wildcards):
    return alignment_settings[aligner]


def set_realignment_settings(wildcards):
    return realignment_settings[aligner]


###########
# Mapping #
###########
rule map_on_genome:
    input:
        # fastq = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_{read_type}.fastq.gz"),
        fastq = source_fastq,
    output:
        # sam files take a lot of space
        sam = temp(OPJ("{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_{read_type}_on_%s.sam" % genome)),
        nomap_fastq = OPJ("{trimmer}", aligner, "info_mapping_%s" % genome, "{lib}_{rep}_{read_type}_unmapped_on_%s.fastq.gz" % genome),
    wildcard_constraints:
        read_type = "|".join(POST_TRIMMING + SIZE_SELECTED)
    params:
        aligner = aligner,
        index = genome_db,
        #settings = alignment_settings[aligner],
        settings = set_alignment_settings,
    message:
        "Mapping {wildcards.lib}_{wildcards.rep}_{wildcards.read_type} on C. elegans genome."
    log:
        log = OPJ(log_dir, "{trimmer}", aligner, "map_{read_type}_on_genome", "{lib}_{rep}.log"),
        err = OPJ(log_dir, "{trimmer}", aligner, "map_{read_type}_on_genome", "{lib}_{rep}.err"),
    threads: 12
    wrapper:
        f"file://{wrappers_dir}/map_on_genome"


rule remap_on_genome:
    input:
        # fastq = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_{read_type}.fastq.gz"),
        #fastq = rules.map_on_genome.output.nomap_fastq,
        fastq = OPJ("{trimmer}", aligner, "info_mapping_%s" % genome, "{lib}_{rep}_{read_type}_unmapped_on_%s.fastq.gz" % genome),
    output:
        # sam files take a lot of space
        sam = temp(OPJ("{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_{read_type}_unmapped_on_%s.sam" % genome)),
        nomap_fastq = OPJ("{trimmer}", aligner, "info_mapping_%s" % genome, "{lib}_{rep}_{read_type}_unmapped_unmapped_on_%s.fastq.gz" % genome),
    wildcard_constraints:
        read_type = "|".join(POST_TRIMMING + SIZE_SELECTED)
    #wildcard_constraints:
    #    read_type = "|".join([f"{to_map}_unmapped" for to_map in POST_TRIMMING + SIZE_SELECTED])
    params:
        aligner = aligner,
        index = genome_db,
        #settings = realignment_settings[aligner],
        settings = set_realignment_settings,
    message:
        "Re-mapping unmapped {wildcards.lib}_{wildcards.rep}_{wildcards.read_type} on C. elegans genome."
    log:
        log = OPJ(log_dir, "{trimmer}", aligner, "remap_{read_type}_unmapped_on_genome", "{lib}_{rep}.log"),
        err = OPJ(log_dir, "{trimmer}", aligner, "remap_{read_type}_unmapped_on_genome", "{lib}_{rep}.err"),
    threads: 12
    wrapper:
        f"file://{wrappers_dir}/map_on_genome"


rule sam2indexedbam:
    input:
        sam = OPJ("{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_{read_type}_on_%s.sam" % genome),
    output:
        sorted_bam = OPJ("{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_{read_type}_on_%s_sorted.bam" % genome),
        index = OPJ("{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_{read_type}_on_%s_sorted.bam.bai" % genome),
    message:
        "Sorting and indexing sam file for {wildcards.lib}_{wildcards.rep}_{wildcards.read_type}."
    log:
        log = OPJ(log_dir, "{trimmer}", "sam2indexedbam", "{lib}_{rep}_{read_type}.log"),
        err = OPJ(log_dir, "{trimmer}", "sam2indexedbam", "{lib}_{rep}_{read_type}.err"),
    threads:
        8
    resources:
        mem_mb=4100
    wrapper:
        f"file://{wrappers_dir}/sam2indexedbam"


rule compute_mapping_stats:
    input:
        sorted_bam = rules.sam2indexedbam.output.sorted_bam,
    output:
        stats = OPJ("{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_{read_type}_on_%s_samtools_stats.txt" % genome),
    shell:
        """samtools stats {input.sorted_bam} > {output.stats}"""


rule fuse_bams:
    """This rule fuses the two sorted bam files corresponding to the mapping
    of the reads containing the adaptor or not."""
    input:
        noadapt_sorted_bam = OPJ("{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_noadapt_on_%s_sorted.bam" % genome),
        adapt_sorted_bam = OPJ("{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_adapt_on_%s_sorted.bam" % genome),
    output:
        sorted_bam = OPJ("{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_on_%s_sorted.bam" % genome),
        bai = OPJ("{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_on_C_%s_sorted.bam.bai" % genome),
    message:
        "Fusing sorted bam files for {wildcards.lib}_{wildcards.rep}"
    log:
        log = OPJ(log_dir, "{trimmer}", "fuse_bams", "{lib}_{rep}.log"),
        err = OPJ(log_dir, "{trimmer}", "fuse_bams", "{lib}_{rep}.err"),
    shell:
        """
        samtools merge -c {output.sorted_bam} {input.noadapt_sorted_bam} {input.adapt_sorted_bam} 1> {log.log} 2> {log.err}
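        # Retry indexing until samtools index reports success.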
        indexed=""
        while [ ! ${{indexed}} ]
        do
            samtools index {output.sorted_bam} && indexed="OK"
            if [ ! ${{indexed}} ]
            then
                rm -f {output.bai}
                echo "Indexing failed. Retrying" 1>&2
            fi
        done 1>> {log.log} 2>> {log.err}
        """


def biotype2annot(wildcards):
    if wildcards.biotype.endswith("_rmsk_families"):
        biotype = wildcards.biotype[:-9]
    else:
        biotype = wildcards.biotype
    return OPJ(annot_dir, f"{biotype}.gtf")
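# Example: for biotype "DNA_transposons_rmsk_families", the trailing "_families" is stripped
# and the annotation used is OPJ(annot_dir, "DNA_transposons_rmsk.gtf").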


rule feature_count_reads:
    input:
        sorted_bam = OPJ("{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_{read_type}_on_%s_sorted.bam" % genome),
        bai = OPJ("{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_{read_type}_on_%s_sorted.bam.bai" % genome),
    output:
        counts = OPJ("{trimmer}", aligner, "mapped_%s" % genome, "feature_count", "{lib}_{rep}_{read_type}_on_%s" % genome, "{biotype}_{orientation}_counts.txt"),
        counts_converted = OPJ("{trimmer}", aligner, "mapped_%s" % genome, "feature_count", "{lib}_{rep}_{read_type}_on_%s" % genome, "{biotype}_{orientation}_counts_gene_names.txt"),
    params:
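        # feature_orientation2stranded (from libworkflows) is assumed to map LIB_TYPE and
        # the {orientation} wildcard to the featureCounts -s strandness value (0, 1 or 2).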
        stranded = feature_orientation2stranded(LIB_TYPE),
        annot = biotype2annot,
    message:
        "Counting {wildcards.orientation} {wildcards.biotype} {wildcards.read_type} reads for {wildcards.lib}_{wildcards.rep} with featureCounts."
    log:
        log = OPJ(log_dir, "{trimmer}", "feature_count_reads", "{lib}_{rep}_{read_type}.log"),
        err = OPJ(log_dir, "{trimmer}", "feature_count_reads", "{lib}_{rep}_{read_type}.err")
    shell:
        """
        # converter="/pasteur/entites/Mhe/Genomes/C_elegans/Caenorhabditis_elegans/Ensembl/WBcel235/Annotation/Genes/genes_id2name.pickle"
        tmpdir=$(mktemp -dt "feature_{wildcards.lib}_{wildcards.rep}_{wildcards.read_type}_{wildcards.biotype}_{wildcards.orientation}.XXXXXXXXXX")
        cmd="featureCounts -a {params.annot} -o {output.counts} -t transcript -g "gene_id" -O -M --primary -s {params.stranded} --fracOverlap 0 --tmpDir ${{tmpdir}} {input.sorted_bam}"
        featureCounts -v 2> {log.log}
        echo ${{cmd}} 1>> {log.log}
        eval ${{cmd}} 1>> {log.log} 2> {log.err} || error_exit "featureCounts failed"
        rm -rf ${{tmpdir}}
        cat {output.counts} | wormid2name > {output.counts_converted}
        # cat {output.counts} | id2name.py ${{converter}} > {output.counts_converted}
        """


rule summarize_feature_counts:
    """For a given library, compute the total counts for each biotype and write this in a summary table."""
    input:
        biotype_counts_files = expand(OPJ("{{trimmer}}", aligner, "mapped_%s" % genome, "feature_count", "{{lib}}_{{rep}}_{{read_type}}_on_%s" % genome, "{biotype}_{{orientation}}_counts.txt"), biotype=COUNT_BIOTYPES),
    output:
        summary = OPJ("{trimmer}", aligner, "mapped_%s" % genome, "feature_count", "summaries", "{lib}_{rep}_{read_type}_on_%s_{orientation}_counts.txt" % genome),
    run:
        sum_counter = sum_feature_counts
        with open(output.summary, "w") as summary_file:
            header = "\t".join(COUNT_BIOTYPES)
            summary_file.write("%s\n" % header)
            sums = "\t".join((str(sum_counter(counts_file)) for counts_file in input.biotype_counts_files))
            summary_file.write("%s\n" % sums)


rule gather_read_counts_summaries:
    input:
        summary_tables = expand(OPJ("{{trimmer}}", aligner, "mapped_%s" % genome, "feature_count", "summaries", "{lib}_{rep}_{{read_type}}_on_%s_{{orientation}}_counts.txt" % genome), lib=LIBS, rep=REPS),
    output:
        summary_table = OPJ("{trimmer}", aligner, "mapped_%s" % genome, "feature_count", "summaries", "all_{read_type}_on_%s_{orientation}_counts.txt" % genome),
    run:
        summary_files = (OPJ(
            wildcards.trimmer,
            aligner,
            "mapped_%s" % genome,
            "feature_count",
            "summaries",
            f"{cond_name}_{wildcards.read_type}_on_%s_{wildcards.orientation}_counts.txt" % genome) for cond_name in COND_NAMES)
        summaries = pd.concat(
            (pd.read_table(summary_file).T.astype(int) for summary_file in summary_files),
            axis=1)
        summaries.columns = COND_NAMES
        summaries.to_csv(output.summary_table, sep="\t")


rule gather_counts:
    """For a given biotype, gather counts from all libraries in one table."""
    input:
        counts_tables = expand(OPJ("{{trimmer}}", aligner, "mapped_%s" % genome, "feature_count", "{lib}_{rep}_{{read_type}}_on_%s" % genome, "{{biotype}}_{{orientation}}_counts.txt"), lib=LIBS, rep=REPS),
    output:
        counts_table = OPJ("{trimmer}", aligner, "mapped_%s" % genome, "feature_count", "all_{read_type}_on_%s" % genome, "{biotype}_{orientation}_counts.txt"),
    # wildcard_constraints:
    #     # Avoid ambiguity with join_all_counts
    #     biotype = "|".join(COUNT_BIOTYPES)
    run:
        # Gathering the counts data
        ############################
        counts_files = (OPJ(
            wildcards.trimmer,
            aligner,
            "mapped_%s" % genome,
            "feature_count",
            f"{cond_name}_{wildcards.read_type}_on_%s" % genome,
            f"{wildcards.biotype}_{wildcards.orientation}_counts.txt") for cond_name in COND_NAMES)
        # if wildcards.counter == "htseq_count":
        #     counts_data = pd.concat(
        #         map(read_htseq_counts, counts_files),
        #         axis=1).fillna(0).astype(int)
        # elif wildcards.counter == "intersect_count":
        #     counts_data = pd.concat(
        #         map(read_intersect_counts, counts_files),
        #         axis=1).fillna(0).astype(int)
        # elif wildcards.counter == "feature_count":
        #     counts_data = pd.concat(
        #         map(read_feature_counts, counts_files),
        #         axis=1).fillna(0).astype(int)
        # else:
        #     raise NotImplementedError(f"{wilcards.counter} not handled (yet?)")
        counts_data = pd.concat(
            map(read_feature_counts, counts_files),
            axis=1).fillna(0).astype(int)
        counts_data.columns = COND_NAMES
        # Simple_repeat|Simple_repeat|(TTTTTTG)n:1
        # Simple_repeat|Simple_repeat|(TTTTTTG)n:2
        # Simple_repeat|Simple_repeat|(TTTTTTG)n:3
        # Simple_repeat|Simple_repeat|(TTTTTTG)n:4
        # -> Simple_repeat|Simple_repeat|(TTTTTTG)n
        if wildcards.biotype.endswith("_rmsk_families"):
            counts_data = sum_by_family(counts_data)
        counts_data.index.names = ["gene"]
        counts_data.to_csv(output.counts_table, sep="\t")


rule compute_median_ratio_to_pseudo_ref_size_factors:
    input:
        counts_table = rules.gather_counts.output.counts_table,
    output:
        median_ratios_file = OPJ("{trimmer}", aligner, "mapped_%s" % genome, "feature_count", "all_{read_type}_on_%s" % genome, "{biotype}_{orientation}_median_ratios_to_pseudo_ref.txt"),
    run:
        counts_data = pd.read_table(
            input.counts_table,
            index_col=0,
            na_filter=False)
        # http://stackoverflow.com/a/21320592/1878788
        #median_ratios = pd.DataFrame(median_ratio_to_pseudo_ref_size_factors(counts_data)).T
        #median_ratios.index.names = ["median_ratio_to_pseudo_ref"]
        # Easier to grep when not transposed, actually:
        median_ratios = median_ratio_to_pseudo_ref_size_factors(counts_data)
        print(median_ratios)
        median_ratios.to_csv(output.median_ratios_file, sep="\t")
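
# For reference, a minimal sketch of the DESeq-like "median of ratios" computation that
# median_ratio_to_pseudo_ref_size_factors (imported from libhts) is expected to perform.
# This sketch is an assumption for documentation purposes, not the actual libhts code:
# build a pseudo-reference as the per-gene geometric mean across libraries, then take,
# for each library, the median of its gene-wise ratios to that reference.
def _median_ratio_size_factors_sketch(counts_data):
    """Hypothetical re-implementation of the median-ratio size factors (unused)."""
    import numpy as np
    # Keep genes detected in all libraries (log(0) is undefined)
    nonzero = counts_data[(counts_data > 0).all(axis=1)]
    log_counts = np.log(nonzero)
    # Per-gene geometric mean across libraries defines the pseudo-reference
    log_pseudo_ref = log_counts.mean(axis=1)
    # Per-library median of the gene-wise ratios to the pseudo-reference
    return np.exp(log_counts.sub(log_pseudo_ref, axis=0)).median(axis=0)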


def source_norm_file(wildcards):
    if wildcards.norm == "median_ratio_to_pseudo_ref":
        return OPJ(f"{wildcards.trimmer}", aligner, f"mapped_{genome}", "feature_count", f"all_{wildcards.read_type}_on_%s" % genome, "protein_coding_fwd_median_ratios_to_pseudo_ref.txt")
    else:
        return rules.summarize_feature_counts.output.summary
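# When norm is "median_ratio_to_pseudo_ref", the size factor comes from the protein_coding
# forward median-ratio table; otherwise the per-biotype summary is used and the count is
# divided by one million in make_normalized_bigwig below.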


rule make_normalized_bigwig:
    input:
        bam = rules.sam2indexedbam.output.sorted_bam,
        #bam = rules.fuse_bams.output.sorted_bam,
        # TODO: use sourcing function based on norm
        norm_file = source_norm_file,
        #size_factor_file = rules.compute_coverage.output.coverage
        #median_ratios_file = OPJ("{trimmer}", aligner, "mapped_%s" % genome, "feature_count", "all_{read_type}_on_%s" % genome, "protein_coding_fwd_median_ratios_to_pseudo_ref.txt"),
        # TODO: compute this
        #scale_factor_file = OPJ(aligner, "mapped_C_elegans", "annotation", "all_%s_on_C_elegans" % size_selected, "pisimi_median_ratios_to_pseudo_ref.txt"),
    output:
        bigwig_norm = OPJ("{trimmer}", aligner, f"mapped_{genome}", "{lib}_{rep}_{read_type}_on_%s_by_{norm}_{orientation}.bw" % genome),
    #params:
    #    orient_filter = bamcoverage_filter,
    threads: 12  # to limit memory usage, actually
    benchmark:
        OPJ(log_dir, "{trimmer}", "make_normalized_bigwig", "{lib}_{rep}_{read_type}_by_{norm}_{orientation}_benchmark.txt")
    params:
        genome_binned = genome_binned,
    log:
        log = OPJ(log_dir, "{trimmer}", "make_normalized_bigwig", "{lib}_{rep}_{read_type}_by_{norm}_{orientation}.log"),
        err = OPJ(log_dir, "{trimmer}", "make_normalized_bigwig", "{lib}_{rep}_{read_type}_by_{norm}_{orientation}.err"),
    run:
        if wildcards.norm == "median_ratio_to_pseudo_ref":
            size = float(pd.read_table(
                input.norm_file, index_col=0, header=None).loc[
                    f"{wildcards.lib}_{wildcards.rep}"])
        else:
            # We normalize by million in order not to have too small values
            size = pd.read_table(input.norm_file).T.loc[wildcards.norm][0] / 1000000
            #scale = 1 / pd.read_table(input.summary, index_col=0).loc[
            #    wildcards.norm_file].loc[f"{wildcards.lib}_{wildcards.rep}"]
        assert size >= 0, f"{size} is negative"
        if size == 0:
            make_empty_bigwig(output.bigwig_norm, chrom_sizes)
        else:
            # TODO: make this a function of deeptools version
            no_reads = """Error: The generated bedGraphFile was empty. Please adjust
    your deepTools settings and check your input files.
    """
            zero_bytes = """needLargeMem: trying to allocate 0 bytes (limit: 100000000000)
    bam2bigwig.sh: bedGraphToBigWig failed
    """
            try:
                # bam2bigwig.sh should be installed with libhts
                shell("""
                    bam2bigwig.sh {input.bam} {params.genome_binned} \\
                        {wildcards.lib}_{wildcards.rep} {wildcards.orientation} %s \\
                        %f {output.bigwig_norm} \\
                        > {log.log} 2> {log.err} \\
                        || error_exit "bam2bigwig.sh failed"
                    """ % (LIB_TYPE[-1], size))
            except CalledProcessError as e:
                if last_lines(log.err, 2) in {no_reads, zero_bytes}:
                    make_empty_bigwig(output.bigwig_norm, chrom_sizes)
                    #with open(output.bigwig_norm, "w") as bwfile:
                    #    bwfile.write("")
                else:
                    raise


onsuccess:
    print("iCLIP data analysis finished.")
    cleanup_and_backup(output_dir, config, delete=True)

onerror:
    shell(f"rm -rf {output_dir}_err")
    shell(f"cp -rp {output_dir} {output_dir}_err")
    cleanup_and_backup(output_dir + "_err", config)
    print("iCLIP data analysis failed.")