Deduplication of GRO-seq data is now optional.

f241ee02 · Blaise Li · cff1d2d4 · f241ee02 · f241ee02
Commit f241ee02 authored 6 years ago by Blaise Li
--- a/PRO-seq/PRO-seq.snakefile
+++ b/PRO-seq/PRO-seq.snakefile
@@ -352,11 +352,62 @@ THREADS="{threads}" {params.process_type}_trim_and_dedup.sh {wildcards.trimmer}
        shell(shell_commands)


+rule trim_only:
+    """The adaptor is trimmed, then reads are treated in two groups depending
+    on whether the adapter was found or not. For each group, the extra k-mers
+    are removed at both ends."""
+    input:
+        rules.link_raw_data.output,
+    output:
+        noadapt = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_noadapt.fastq.gz"),
+        adapt = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_adapt.fastq.gz"),
+        nb_raw =  OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_nb_raw.txt"),
+        nb_adapt =  OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_nb_adapt.txt"),
+        nb_noadapt =  OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_nb_noadapt.txt"),
+    params:
+        adapter = lambda wildcards : lib2adapt[wildcards.lib],
+        process_type = "PRO-seq",
+        trim5 = lambda wildcards : lib2UMI[wildcards.lib][0],
+        trim3 = lambda wildcards : lib2UMI[wildcards.lib][1],
+    threads: 4 # Actually, to avoid too much IO
+    message:
+        "Trimming adaptor from raw data using {wildcards.trimmer} and removing 5' and 3' random n-mers for {wildcards.lib}_{wildcards.rep}."
+    benchmark:
+        OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_trim_benchmark.txt")
+    log:
+        trim = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_trim.log"),
+        log = OPJ(log_dir, "{trimmer}", "trim_and_dedup", "{lib}_{rep}.log"),
+        err = OPJ(log_dir, "{trimmer}", "trim_and_dedup", "{lib}_{rep}.err"),
+    run:
+        shell_commands = """
+THREADS="{threads}" {params.process_type}_trim_only.sh {wildcards.trimmer} {input} \\
+    {params.adapter} {params.trim5} {params.trim3} \\
+    {output.adapt} {output.noadapt} {log.trim} \\
+    {output.nb_raw} {output.nb_adapt} {output.nb_noadapt} \\
+    1> {log.log} 2> {log.err}
+"""
+        shell(shell_commands)
+
+
+def source_fastq(wildcards):
+    """
+    Determine the correct pre-processed fastq file depending on the pipeline
+    configuration and the current wildcards.
+    """
+    if config.get("deduplicate", False):
+        return OPJ(
+            data_dir, f"trimmed_{wildcards.trimmer}",
+            f"{wildcards.lib}_{wildcards.rep}_{wildcards.type}_deduped.fastq.gz"),
+    else:
+        return OPJ(
+            data_dir, f"trimmed_{wildcards.trimmer}",
+            f"{wildcards.lib}_{wildcards.rep}_{wildcards.type}.fastq.gz"),
+
+
 # TODO: Do not deduplicate, or at least do not use the noadapt_deduped: The 3' UMI is not present.
 rule map_on_genome:
    input:
-        fastq = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_{type}_deduped.fastq.gz"),
-        #rules.trim_and_dedup.output,
+        fastq = source_fastq,
    output:
        # sam files take a lot of space
        sam = temp(OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", "{lib}_{rep}_{type}_on_C_elegans.sam")),

--- a/PRO-seq/PRO-seq_trim_only.sh
+++ b/PRO-seq/PRO-seq_trim_only.sh
+#!/usr/bin/env bash
+# Usage: PRO-seq_trim_only.sh <trimmer> <raw fastq> <ADAPTER> <nb 5'> <nb 3'> <trimmed fastq> <untrimmed fastq> <trimmer log> <nb_raw> <nb_adapt> <nb_noadapt>
+
+# http://linuxcommand.org/wss0150.php
+PROGNAME=$(basename $0)
+
+function error_exit
+{
+#	----------------------------------------------------------------
+#	Function for exit due to fatal program error
+#		Accepts 1 argument:
+#			string containing descriptive error message
+#	----------------------------------------------------------------
+    echo "${PROGNAME}: ${1:-"Unknown Error"}" 1>&2
+    exit 1
+}
+
+# http://stackoverflow.com/questions/59895/can-a-bash-script-tell-what-directory-its-stored-in
+#PACKAGEDIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )
+
+trimmer=${1}
+
+raw_in=${2}
+
+adapt=${3}
+
+fiveprime_random=${4}
+threeprime_random=${5}
+# For the untrimmed, the adaptor was not found,
+# but maybe its first 3 bases were there.
+threeprime_random_with_margin=$((${threeprime_random}+3))
+
+trimmed_out=${6}
+
+untrimmed_out=${7}
+
+log=${8}
+
+nb_raw=${9}
+nb_adapt=${10}
+nb_noadapt=${11}
+
+count_fastq_reads()
+{
+    # $1: file in which to write the number of fastq records
+    wc -l | { read nblines; echo ${nblines} / 4 | bc > ${1}; }
+}
+
+
+trim_random_nt()
+{
+    # $1: nb of bases to trim at 5' end
+    # $2: nb of bases to trim at 3' end
+    cutadapt -u ${1} -u -${2} - 2> /dev/null || error_exit "trim_random_nt failed"
+}
+
+
+# This named pipe is used to avoid writing the intermediate file to disk
+# It will transmit reads that did not seem to contain the adapter to the
+# UMI trimming step.
+mkfifo ${untrimmed_out}.fifo
+
+minsize_trimmed=$(echo "${fiveprime_random} + 16 + ${threeprime_random}" | bc)
+
+case "${trimmer}" in
+    "cutadapt")
+        if [ ${MAX_ERROR_RATE} ]
+        then
+            #>&2 echo "Cutadapt multithreading not working fully yet. Ignoring THREADS."
+            error_opt="-e ${MAX_ERROR_RATE}"
+        else
+            error_opt=""
+        fi
+        if [ ${THREADS} ]
+        then
+            #thread_opt="-j ${THREADS}"
+            >&2 echo "Cutadapt multithreading not working fully yet. Ignoring THREADS."
+            thread_opt=""
+        else
+            thread_opt=""
+        fi
+        # -m ${minsize_random} is to discard reads that are shorter than this after trimming
+        trim_cmd="${trimmer} -a ${adapt} -m ${minsize_trimmed} ${error_opt} ${thread_opt} --untrimmed-output=${untrimmed_out}.fifo - 2> ${log}"
+        ;;
+    "fastx_clipper")
+        # -n is to keep reads with N
+        # -l is equivalent to -m in cutadapt (not sure it has effect with -C)
+        # -c is to keep only the trimmed reads
+        # -C is to keep only the non-trimmed reads
+        # -v is to have verbose things to put in the log
+        trim_cmd="tee >(${trimmer} -a ${adapt} -l ${minsize_trimmed} -C -M 3 -n > ${untrimmed_out}.fifo) | ${trimmer} -a ${adapt} -l ${minsize_trimmed} -c -M 3 -n -v 2>> ${log}"
+        ;;
+    *)
+        error_exit "Trimming adapter with ${trimmer} not implemented."
+        ;;
+esac
+
+# a second cutadapt step removes the random nucleotides that helped identify PCR duplicates.
+trim_trimmed()
+{
+    # $1: file in which to write the number of fastq records after adapter trimming
+    cmd="${trim_cmd} | tee >(count_fastq_reads ${1}) | trim_random_nt ${fiveprime_random} ${threeprime_random} | gzip"
+    echo ${cmd}
+    eval ${cmd} > ${trimmed_out} || error_exit "${cmd} failed"
+}
+
+trim_untrimmed()
+{
+    cmd="trim_random_nt ${fiveprime_random} ${threeprime_random_with_margin} | gzip"
+    echo ${cmd}
+    eval ${cmd} > ${untrimmed_out} || error_exit "${cmd} failed"
+}
+
+filetype="$(file -L ${raw_in} | cut -d " " -f2)"
+case "${filetype}" in
+    "gzip")
+        cat_cmd="zcat"
+        ;;
+    "ASCII")
+        cat_cmd="cat"
+        ;;
+    *)
+        error_exit "Unexpected file type for ${raw_in}"
+        ;;
+esac
+
+${cat_cmd} ${raw_in} \
+    | tee >(count_fastq_reads ${nb_raw}) \
+    | trim_trimmed ${nb_adapt} ${nb_adapt_deduped} &
+pid_to_wait=$!
+cat ${untrimmed_out}.fifo \
+    | tee >(count_fastq_reads ${nb_noadapt}) \
+    | trim_untrimmed ${nb_noadapt_deduped} || rm -f ${untrimmed_out}.fifo
+wait ${pid_to_wait}
+rm -f ${untrimmed_out}.fifo
+
+exit 0