From 5390984614bddd589debfe7011b115380435c428 Mon Sep 17 00:00:00 2001
From: Blaise Li <blaise.li__git@nsup.org>
Date: Fri, 2 Feb 2018 19:04:20 +0100
Subject: [PATCH] Pipeline to process iCLIP data.

Currently goes from demultiplexing to mapping, via trimming and
deduplicating. The mapping is performed on three read types:
- adapt_nodedup (the adaptor was found, and the reads were trimmed but
  not deduplicated)
- adapt_deduped (the adaptor was found, and the reads were trimmed and
  deduplicated)
- noadapt_deduped (the adaptor was not found, and the reads were trimmed
  and deduplicated)

The trim_and_dedup script currently assumes that two low-diversity zones
are present, and ignores them for deduplication:

NNNNNGCACTANNNWWW[YYYY]NNNN
1---5 : 5' UMI
     6--11: barcode (lower diversity)
          12-14: UMI
            15-17: AT(or GC?)-rich (low diversity)
                [fragment]
                       -4 -> -1: 3' UMI
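
For reference, a minimal Python sketch (illustration only; the pipeline
itself does this with bioawk in iCLIP_trim_and_dedup.sh) of what
"ignoring" the low-diversity zones means before deduplication:

    def strip_low_diversity_zones(seq):
        # Keep the 5' UMI (1-5), the UMI at 12-14 and everything from
        # position 18 on; drop the barcode (6-11) and the low-diversity
        # zone (15-17). The same slicing is applied to the qualities.
        return seq[0:5] + seq[11:14] + seq[17:]

    # strip_low_diversity_zones("NNNNNGCACTANNNWWWYYYYNNNN")
    # -> "NNNNNNNNYYYYNNNN"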

Deduplicating based on the full read, including its end, may be a
problem because read ends tend to be of lower quality: reads with
sequencing errors escape deduplication and end up over-represented.
That is why we decided to also look at the non-deduplicated reads.
---
 CLIP/iCLIP.snakefile         | 339 +++++++++++++++++++++++++++++++++++
 CLIP/iCLIP_trim_and_dedup.sh | 203 +++++++++++++++++++++
 2 files changed, 542 insertions(+)
 create mode 100644 CLIP/iCLIP.snakefile
 create mode 100755 CLIP/iCLIP_trim_and_dedup.sh

diff --git a/CLIP/iCLIP.snakefile b/CLIP/iCLIP.snakefile
new file mode 100644
index 0000000..44505bd
--- /dev/null
+++ b/CLIP/iCLIP.snakefile
@@ -0,0 +1,339 @@
+"""
+Snakefile to process iCLIP data.
+"""
+import sys
+major, minor = sys.version_info[:2]
+if major < 3 or (major == 3 and minor < 6):
+    sys.exit("Need at least python 3.6\n")
+
+
+import os
+OPJ = os.path.join
+from glob import glob
+
+from collections import defaultdict
+
+import matplotlib as mpl
+# To be able to run the script without a defined $DISPLAY
+mpl.use("PDF")
+#mpl.rcParams["figure.figsize"] = 2, 4
+mpl.rcParams["font.sans-serif"] = [
+    "Arial", "Liberation Sans", "Bitstream Vera Sans"]
+mpl.rcParams["font.family"] = "sans-serif"
+#mpl.rcParams["figure.figsize"] = [16, 30]
+import pandas as pd
+import matplotlib.pyplot as plt
+
+from libworkflows import get_chrom_sizes
+from libworkflows import ensure_relative, SHELL_FUNCTIONS, warn_context
+from smincludes import rules as irules
+
+# recommended k-mer length for D. melanogaster is 20
+# However, reads shorter than the k-mer length will be ignored.
+# http://crac.gforge.inria.fr/documentation/crac/#sec-2
+alignment_settings = {
+    "bowtie2": "--local -L 6 -i S,1,0.8 -N 0",
+    "crac": "-k 20 --stranded --use-x-in-cigar"}
+
+# Define functions to be used in shell portions
+shell.prefix(SHELL_FUNCTIONS)
+
+aligner = config["aligner"]
+genome_dict = config["genome_dict"]
+genome = genome_dict["name"]
+chrom_sizes = get_chrom_sizes(genome_dict["size"])
+genomelen = sum(chrom_sizes.values())
+genome_db = genome_dict["db"][aligner]
+# bed file binning the genome in 10nt bins
+genome_binned = genome_dict["binned"]
+annot_dir = genome_dict["annot_dir"]
+# TODO: figure out the difference between OPJ(convert_dir, "wormid2name.pickle") and genome_dict["converter"]
+convert_dir = genome_dict["convert_dir"]
+gene_lists_dir = genome_dict["gene_lists_dir"]
+avail_id_lists = set(glob(OPJ(gene_lists_dir, "*_ids.txt")))
+
+merged_fastq = config["merged_fastq"]
+barcode_dict = config["barcode_dict"]
+BARCODES = list(barcode_dict.keys())
+MAX_DIFF = config["max_diff"]
+output_dir = config["output_dir"]
+log_dir = OPJ(output_dir, "logs")
+data_dir = OPJ(output_dir, "data")
+demux_dir = OPJ(data_dir, f"demultiplexed_{MAX_DIFF}")
+lib2raw = defaultdict(dict)
+REPS = set()
+for (barcode, lib_info) in barcode_dict.items():
+    REPS.add(lib_info["rep"])
+    lib2raw[lib_info["lib"]][lib_info["rep"]] = OPJ(demux_dir, f"{barcode}.fastq.gz")
+LIBS = list(lib2raw.keys())
+REPS = sorted(REPS)
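+# For illustration (hypothetical config values): a barcode_dict entry such as
+# {"ACGTAC": {"lib": "WT", "rep": "1"}} results in
+# lib2raw["WT"]["1"] == OPJ(demux_dir, "ACGTAC.fastq.gz")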
+
+TRIMMERS = ["fastx_clipper", "cutadapt"]
+READ_TYPES = ["noadapt_deduped", "adapt_deduped", "adapt_nodedup"]
+#TRIMMERS = ["fastx_clipper"]
+# For compatibility with trim_and_dedup as used in the PRO-seq pipeline
+lib2adapt = defaultdict(lambda: config["adapter"])
+MAX_ADAPT_ERROR_RATE = config["max_adapt_error_rate"]
+
+wildcard_constraints:
+    lib="|".join(LIBS),
+    rep=r"\d+"
+
+preprocessing = [
+    expand(OPJ(demux_dir, "{barcode}.fastq.gz"), barcode=BARCODES),
+    expand(OPJ(data_dir, "{lib}_{rep}.fastq.gz"), lib=LIBS, rep=REPS),
+    expand(OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_{read_type}.fastq.gz"), trimmer=TRIMMERS, lib=LIBS, rep=REPS, read_type=READ_TYPES),
+    expand(OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_{read_type}_fastqc.html"), trimmer=TRIMMERS, lib=LIBS, rep=REPS, read_type=READ_TYPES),
+    expand(OPJ(data_dir, "trimmed_{trimmer}", "read_stats", "{lib}_{rep}", "{read_type}_size_distribution.pdf"), trimmer=TRIMMERS, lib=LIBS, rep=REPS, read_type=READ_TYPES),
+]
+
+mapping = [
+    expand(OPJ(output_dir, "{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_{read_type}_on_%s_sorted.bam" % genome), trimmer=TRIMMERS, lib=LIBS, rep=REPS, read_type=READ_TYPES),
+    expand(OPJ(output_dir, "{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_{read_type}_on_%s_samtools_stats.txt" % genome), trimmer=TRIMMERS, lib=LIBS, rep=REPS, read_type=READ_TYPES),
+]
+
+#TODO:
+# - map with CRAC, detect chimeras and crosslink-induced sequencing errors
+# - find cross-link sites on genes: should be 5' of antisense reads
+rule all:
+    """This top rule is used to drive the whole workflow by taking as input its final products."""
+    input:
+        preprocessing,
+        mapping,
+
+
+#################
+# Preprocessing #
+#################
+rule demultiplex:
+    input:
+        fq_in = merged_fastq,
+    output:
+        expand(OPJ(demux_dir, "{barcode}.fastq.gz"), barcode=BARCODES),
+    params:
+        demux_dir = demux_dir,
+        bc_start = config["bc_start"],
+        barcodes = " ".join(BARCODES),
+        max_diff = MAX_DIFF
+    log:
+        err = OPJ(log_dir, "demultiplex.err")
+    benchmark:
+        OPJ(log_dir, "demultiplex_benchmark.txt")
+    shell:
+        """
+        qaf-demux -i {input.fq_in} -o {params.demux_dir} \\
+            -s {params.bc_start} -b {params.barcodes} -m {params.max_diff} \\
+            2> {log.err} || error_exit "qaf_demux failed"
+        """
+
+
+include: ensure_relative(irules["link_raw_data"], workflow.basedir)
+
+
+rule trim_and_dedup:
+    """The adaptor is trimmed, then reads are treated in two groups depending
+    on whether the adapter was found or not. For each group the reads are
+    sorted, deduplicated, and the random k-mers that helped identify
+    PCR duplicates are removed at both ends"""
+    input:
+        rules.link_raw_data.output,
+    output:
+        noadapt = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_noadapt_deduped.fastq.gz"),
+        adapt = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_adapt_deduped.fastq.gz"),
+        adapt_nodedup = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_adapt_nodedup.fastq.gz"),
+        nb_raw =  OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_nb_raw.txt"),
+        nb_adapt =  OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_nb_adapt.txt"),
+        nb_adapt_deduped =  OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_nb_adapt_deduped.txt"),
+        nb_noadapt =  OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_nb_noadapt.txt"),
+        nb_noadapt_deduped =  OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_nb_noadapt_deduped.txt"),
+    params:
+        adapter = lambda wildcards : lib2adapt[wildcards.lib],
+        max_adapt_error_rate = MAX_ADAPT_ERROR_RATE,
+        process_type = "iCLIP",
+        trim5 = 8,
+        trim3 = 4,
+    threads: 4  # mostly to limit the number of concurrent I/O-heavy jobs
+    message:
+        "Trimming adaptor from raw data using {wildcards.trimmer}, deduplicating reads, and removing 5' and 3' random n-mers for {wildcards.lib}_{wildcards.rep}."
+    benchmark:
+        OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_trim_benchmark.txt")
+    log:
+        trim = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_trim.log"),
+        log = OPJ(log_dir, "{trimmer}", "trim_and_dedup", "{lib}_{rep}.log"),
+        err = OPJ(log_dir, "{trimmer}", "trim_and_dedup", "{lib}_{rep}.err"),
+    run:
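+        # The positional arguments must match the usage line at the top of
+        # iCLIP_trim_and_dedup.sh: trimmer, raw fastq, adapter, nb of 5' and
+        # 3' random nt, adapt / noadapt / adapt_nodedup fastq outputs, trimmer
+        # log, then the five read-count files.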
+        shell_commands = """
+MAX_ERROR_RATE="{params.max_adapt_error_rate}" THREADS="{threads}" {params.process_type}_trim_and_dedup.sh {wildcards.trimmer} {input} \\
+    {params.adapter} {params.trim5} {params.trim3} \\
+    {output.adapt} {output.noadapt} {output.adapt_nodedup} {log.trim} \\
+    {output.nb_raw} {output.nb_adapt} {output.nb_adapt_deduped} \\
+    {output.nb_noadapt} {output.nb_noadapt_deduped} 1> {log.log} 2> {log.err}
+"""
+        shell(shell_commands)
+
+
+def source_fastq(wildcards):
+    """Determine the fastq file corresponding to a given read type."""
+    read_type = wildcards.read_type
+    if read_type == "raw":
+        return rules.link_raw_data.output
+    elif read_type == "adapt_deduped":
+        return rules.trim_and_dedup.output.adapt
+    elif read_type == "noadapt_deduped":
+        return rules.trim_and_dedup.output.noadapt
+    elif read_type == "adapt_nodedup":
+        return rules.trim_and_dedup.output.adapt_nodedup
+    else:
+        raise NotImplementedError("Unknown read type: %s" % read_type)
+
+
+rule do_fastqc:
+    input:
+        fastq = source_fastq
+    output:
+        fastqc_out = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_{read_type}_fastqc.html")
+    shell:
+        """
+        fastqc {input.fastq}
+        """
+
+
+rule compute_size_distribution:
+    input:
+        source_fastq
+    output:
+        OPJ(data_dir, "trimmed_{trimmer}", "read_stats", "{lib}_{rep}", "{read_type}_size_distribution.txt"),
+    message:
+        "Computing read size distribution for {wildcards.lib}_{wildcards.rep}_{wildcards.read_type}."
+    shell:
+        """
+        zcat {input} | compute_size_distribution {output}
+        """
+
+
+def plot_histo(outfile, data, title=None):
+    fig = plt.figure(figsize=(15,7))
+    ax = fig.add_subplot(111)
+    ax.set_xlim([data.index[0] - 0.5, data.index[-1] + 0.5])
+    #ax.set_ylim([0, 100])
+    bar_width = 0.8
+    #letter2legend = dict(zip("ACGT", "ACGT"))
+    for (read_len, count) in data.iterrows():
+        plt.bar(
+            read_len,
+            count,
+            align="center",
+            width=bar_width)
+            #color=letter2colour[letter],
+            #label=letter2legend[letter])
+    # ax.legend()  # disabled: the bar labels are currently commented out
+    ax.set_xticks(data.index)
+    ax.set_xticklabels(data.index)
+    ax.set_xlabel("read length")
+    ax.set_ylabel("number of reads")
+    plt.setp(ax.get_xticklabels(), rotation=90)
+    if title is not None:
+        plt.title(title)
+    plt.savefig(outfile)
+
+
+rule plot_size_distribution:
+    input:
+        rules.compute_size_distribution.output
+    output:
+        OPJ(data_dir, "trimmed_{trimmer}", "read_stats", "{lib}_{rep}", "{read_type}_size_distribution.{fig_format}")
+    message:
+        "Plotting size distribution for trimmed {wildcards.lib}_{wildcards.rep}_{wildcards.read_type}."
+    run:
+        data = pd.read_table(input[0], header=None, names=("size", "count"), index_col=0)
+        title = f"read size distribution for {wildcards.lib}_{wildcards.rep}_{wildcards.read_type}"
+        plot_histo(output[0], data, title)
+
+
+###########
+# Mapping #
+###########
+rule map_on_genome:
+    input:
+        # fastq = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_{read_type}.fastq.gz"),
+        fastq = source_fastq,
+    output:
+        # sam files take a lot of space
+        sam = temp(OPJ(output_dir, "{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_{read_type}_on_%s.sam" % genome)),
+        nomap_fastq = OPJ(output_dir, "{trimmer}", aligner, "info_mapping_%s" % genome, "{lib}_{rep}_{read_type}_unmapped_on_%s.fastq.gz" % genome),
+    params:
+        aligner = aligner,
+        index = genome_db,
+        settings = alignment_settings[aligner],
+    message:
+        "Mapping {wildcards.lib}_{wildcards.rep}_{wildcards.read_type} on C. elegans genome."
+    log:
+        log = OPJ(log_dir, "{trimmer}", aligner, "map_{read_type}_on_genome", "{lib}_{rep}.log"),
+        err = OPJ(log_dir, "{trimmer}", aligner, "map_{read_type}_on_genome", "{lib}_{rep}.err"),
+    threads: 12
+    wrapper:
+        "file:///pasteur/homes/bli/src/bioinfo_utils/snakemake_wrappers/map_on_genome"
+
+
+rule sam2indexedbam:
+    input:
+        sam = OPJ(output_dir, "{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_{read_type}_on_%s.sam" % genome),
+    output:
+        sorted_bam = OPJ(output_dir, "{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_{read_type}_on_%s_sorted.bam" % genome),
+        index = OPJ(output_dir, "{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_{read_type}_on_%s_sorted.bam.bai" % genome),
+    message:
+        "Sorting and indexing sam file for {wildcards.lib}_{wildcards.rep}_{wildcards.read_type}."
+    log:
+        log = OPJ(log_dir, "{trimmer}", "sam2indexedbam", "{lib}_{rep}_{read_type}.log"),
+        err = OPJ(log_dir, "{trimmer}", "sam2indexedbam", "{lib}_{rep}_{read_type}.err"),
+    threads:
+        4
+    wrapper:
+        "file:///pasteur/homes/bli/src/bioinfo_utils/snakemake_wrappers/sam2indexedbam"
+
+
+rule compute_mapping_stats:
+    input:
+        sorted_bam = rules.sam2indexedbam.output.sorted_bam,
+    output:
+        stats = OPJ(output_dir, "{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_{read_type}_on_%s_samtools_stats.txt" % genome),
+    shell:
+        """samtools stats {input.sorted_bam} > {output.stats}"""
+
+
+rule fuse_bams:
+    """This rules fuses the two sorted bam files corresponding to the mapping
+    of the reads containing the adaptor or not."""
+    input:
+        noadapt_sorted_bam = OPJ(output_dir, "{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_noadapt_on_%s_sorted.bam" % genome),
+        adapt_sorted_bam = OPJ(output_dir, "{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_adapt_on_%s_sorted.bam" % genome),
+    output:
+        sorted_bam = OPJ(output_dir, "{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_on_%s_sorted.bam" % genome),
+        bai = OPJ(output_dir, "{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_on_%s_sorted.bam.bai" % genome),
+    message:
+        "Fusing sorted bam files for {wildcards.lib}_{wildcards.rep}"
+    log:
+        log = OPJ(log_dir, "{trimmer}", "fuse_bams", "{lib}_{rep}.log"),
+        err = OPJ(log_dir, "{trimmer}", "fuse_bams", "{lib}_{rep}.err"),
+    shell:
+        """
+        samtools merge -c {output.sorted_bam} {input.noadapt_sorted_bam} {input.adapt_sorted_bam} 1> {log.log} 2> {log.err}
+        indexed=""
+        while [ ! ${{indexed}} ]
+        do
+            samtools index {output.sorted_bam} && indexed="OK"
+            if [ ! ${{indexed}} ]
+            then
+                rm -f {output.bai}
+                echo "Indexing failed. Retrying" 1>&2
+            fi
+        done 1>> {log.log} 2>> {log.err}
+        """
+
+
+onsuccess:
+    print("iCLIP data pre-processing finished.")
+
+onerror:
+    print("iCLIP data pre-processing failed.")
+
diff --git a/CLIP/iCLIP_trim_and_dedup.sh b/CLIP/iCLIP_trim_and_dedup.sh
new file mode 100755
index 0000000..9acbe20
--- /dev/null
+++ b/CLIP/iCLIP_trim_and_dedup.sh
@@ -0,0 +1,203 @@
+#!/usr/bin/env bash
+# Usage: iCLIP_trim_and_dedup.sh <trimmer> <raw fastq> <ADAPTER> <nb 5'> <nb 3'> <trimmed fastq> <untrimmed fastq> <trimmed_nodedup> <trimmer log> <nb_raw> <nb_adapt> <nb_adapt_deduped> <nb_noadapt> <nb_noadapt_deduped>
+
+# http://linuxcommand.org/wss0150.php
+PROGNAME=$(basename $0)
+
+function error_exit
+{
+#	----------------------------------------------------------------
+#	Function for exit due to fatal program error
+#		Accepts 1 argument:
+#			string containing descriptive error message
+#	----------------------------------------------------------------
+    echo "${PROGNAME}: ${1:-"Unknown Error"}" 1>&2
+    exit 1
+}
+
+# http://stackoverflow.com/questions/59895/can-a-bash-script-tell-what-directory-its-stored-in
+#PACKAGEDIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )
+
+trimmer=${1}
+
+raw_in=${2}
+
+adapt=${3}
+
+# Sum of lengths of UMIs both left and right of barcode
+fiveprime_random=${4}
+# Adding the lengths of the "low diversity zones" (6 nt barcode + 3 nt
+# AT/GC-rich zone = 9 nt), for the reads processed without deduplication
+total_fiveprime=$((${fiveprime_random}+9))
+# Length of the 3' UMI
+threeprime_random=${5}
+# For the untrimmed reads, the adapter was not found, but up to its first
+# 3 bases may still be present at the 3' end (the trimmers require a
+# minimum adapter overlap of 3), hence the extra margin.
+threeprime_random_with_margin=$((${threeprime_random}+3))
+echo ${threeprime_random_with_margin}
+
+# fastq files
+trimmed_and_dedup_out=${6}
+untrimmed_out=${7}
+trimmed_nodedup_out=${8}
+
+log=${9}
+
+# count files
+nb_raw=${10}
+nb_adapt=${11}
+nb_adapt_deduped=${12}
+nb_noadapt=${13}
+nb_noadapt_deduped=${14}
+
+count_fastq_reads()
+{
+    # $1: file in which to write the number of fastq records
+    wc -l | { read nblines; echo ${nblines} / 4 | bc > ${1}; }
+}
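+# e.g. a stream of 4000 fastq lines (1000 records) writes "1000" to ${1}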
+
+# Removing the parts that have lower diversity,
+# and are therefore more likely to contain sequencing errors (positions 6-11 and 15-17)
+# Reads should be:
+# 
+# NNNNNGCACTANNNWWW[YYYY]NNNN
+# 1---5 : 5' UMI
+#      6--11: barcode (lower diversity)
+#           12-14: UMI
+#             15-17: AT(or GC?)-rich (low diversity)
+#                 [fragment]
+#                        -4 -> -1: 3' UMI
+strip_low_qual_zones()
+{
+    bioawk -c fastx '{print $name"\t"substr($seq, 1, 5)""substr($seq, 12, 3)""substr($seq, 18)"\t"substr($qual, 1, 5)""substr($qual, 12, 3)""substr($qual, 18)}' | mawk '{print "@"$1"\n"$2"\n+\n"$3}'
+}
+
+# This script performs 2 sorting and deduplicating operations, depending on the
+# presence or absence of the adapter in the read.
+
+
+# The -s option of fastq-sort sorts the reads by their sequence.
+sort_by_seq()
+{
+    TMPDIR="/var/tmp" fastq-sort -s || error_exit "fastq-sort failed"
+}
+
+# Once the reads are sorted by sequence,
+# successive reads with the same sequence are merged,
+# keeping the best quality at each position.
+
+dedup () {
+    #${PACKAGEDIR}/remove_duplicates_from_sorted_fastq/remove_duplicates_from_sorted_fastq || error_exit "remove_duplicates_from_sorted_fastq failed"
+    #remove_duplicates_from_sorted_fastq || error_exit "remove_duplicates_from_sorted_fastq failed"
+    sort_by_seq | remove-duplicates-from-sorted-fastq || error_exit "remove_duplicates_from_sorted_fastq failed"
+}
+
+trim_random_nt()
+{
+    # $1: nb of bases to trim at 5' end
+    # $2: nb of bases to trim at 3' end
+    cutadapt -u ${1} -u -${2} - 2> /dev/null || error_exit "trim_random_nt failed"
+}
+
+process_without_deduplication () {
+    # $1: compressed fastq file in which to write the trimmed reads
+    trim_random_nt ${total_fiveprime} ${threeprime_random} | gzip > ${1}
+}
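+# i.e. everything 5' of the insert (UMIs, barcode and AT-rich zone) and the
+# 3' UMI are trimmed, but no deduplication is performed.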
+
+# This named pipe is used to avoid writing the intermediate file to disk
+# It will transmit reads that did not seem to contain the adapter to the
+# second sorting and deduplicating.
+mkfifo ${untrimmed_out}.fifo
+
+minsize_trimmed=$(echo "${fiveprime_random} + 16 + ${threeprime_random}" | bc)
+
+case "${trimmer}" in
+    "cutadapt")
+        if [ ${MAX_ERROR_RATE} ]
+        then
+            #>&2 echo "Cutadapt multithreading not working fully yet. Ignoring THREADS."
+            error_opt="-e ${MAX_ERROR_RATE}"
+        else
+            error_opt=""
+        fi
+        if [ ${THREADS} ]
+        then
+            #thread_opt="-j ${THREADS}"
+            >&2 echo "Cutadapt multithreading not working fully yet. Ignoring THREADS."
+            thread_opt=""
+        else
+            thread_opt=""
+        fi
+        # -m ${minsize_trimmed} is to discard reads that are shorter than this after trimming
+        trim_cmd="${trimmer} -a ${adapt} -m ${minsize_trimmed} ${error_opt} ${thread_opt} --untrimmed-output=${untrimmed_out}.fifo - 2> ${log}"
+        ;;
+    "fastx_clipper")
+        # -n is to keep reads with N
+        # -l is equivalent to -m in cutadapt (not sure it has effect with -C)
+        # -c is to keep only the trimmed reads
+        # -C is to keep only the non-trimmed reads
+        # -v is to have verbose things to put in the log
+        trim_cmd="tee >(${trimmer} -a ${adapt} -l ${minsize_trimmed} -C -M 3 -n > ${untrimmed_out}.fifo) | ${trimmer} -a ${adapt} -l ${minsize_trimmed} -c -M 3 -n -v 2>> ${log}"
+        ;;
+    *)
+        error_exit "Trimming adapter with ${trimmer} not implemented."
+        ;;
+esac
+
+# a second cutadapt step removes the random nucleotides that helped identify PCR duplicates.
+#dedup_trimmed()
+#{
+#    # $1: file in which to write the number of fastq records after adapter trimming
+#    # $2: file in which to write the number of fastq records after deduplication
+#    cmd="${trim_cmd} | strip_low_qual_zones | tee >(count_fastq_reads ${1}) | dedup | trim_random_nt ${fiveprime_random} ${threeprime_random} | tee >(count_fastq_reads ${2}) | gzip"
+#    echo ${cmd}
+#    eval ${cmd} > ${trimmed_and_dedup_out} || error_exit "${cmd} failed"
+#}
+dedup_trimmed()
+{
+    # $1: file in which to write the number of fastq records after adapter trimming
+    # $2: file in which to write the number of fastq records after deduplication
+    cmd="${trim_cmd} | tee >(process_without_deduplication  ${trimmed_nodedup_out}) | strip_low_qual_zones | tee >(count_fastq_reads ${1}) | dedup | trim_random_nt ${fiveprime_random} ${threeprime_random} | tee >(count_fastq_reads ${2}) | gzip"
+    echo ${cmd}
+    eval ${cmd} > ${trimmed_and_dedup_out} || error_exit "${cmd} failed"
+}
+
+dedup_untrimmed()
+{
+    # $1: file in which to write the number of fastq records after deduplication
+    cmd="cat - | strip_low_qual_zones | dedup | trim_random_nt ${fiveprime_random} ${threeprime_random_with_margin} | tee >(count_fastq_reads ${1}) | gzip"
+    echo ${cmd}
+    eval ${cmd} > ${untrimmed_out} || error_exit "${cmd} failed"
+}
+
+filetype="$(file -L ${raw_in} | cut -d " " -f2)"
+case "${filetype}" in
+    "gzip")
+        cat_cmd="zcat"
+        ;;
+    "ASCII")
+        cat_cmd="cat"
+        ;;
+    *)
+        error_exit "Unexpected file type for ${raw_in}"
+        ;;
+esac
+
+${cat_cmd} ${raw_in} \
+    | tee >(count_fastq_reads ${nb_raw}) \
+    | dedup_trimmed ${nb_adapt} ${nb_adapt_deduped} &
+pid_to_wait=$!
+cat ${untrimmed_out}.fifo \
+    | tee >(count_fastq_reads ${nb_noadapt}) \
+    | dedup_untrimmed ${nb_noadapt_deduped} || rm -f ${untrimmed_out}.fifo
+wait ${pid_to_wait}
+rm -f ${untrimmed_out}.fifo
+
+exit 0
-- 
GitLab