Homogenized GRO-seq trim_and_dedup with iCLIP.

c9217c51 · Blaise Li · efd0a96f · c9217c51 · c9217c51
Commit c9217c51 authored 7 years ago by Blaise Li
--- a/PRO-seq/PRO-seq.snakefile
+++ b/PRO-seq/PRO-seq.snakefile
@@ -270,12 +270,6 @@ rule trim_and_dedup:
    PCR duplicates are removed at both ends"""
    input:
        rules.link_raw_data.output,
-    params:
-        adapter = lambda wildcards : lib2adapt[wildcards.lib],
-        trim5 = lambda wildcards : lib2UMI[wildcards.lib][0],
-        trim3 = lambda wildcards : lib2UMI[wildcards.lib][1],
-        minsize_trimmed = lambda wildcards : str(int(lib2UMI[wildcards.lib][0]) + 16 + int(lib2UMI[wildcards.lib][1])),
-        untrimmed_tmp = lambda wildcards : OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_noadapt_tmp.fifo".format(lib=wildcards.lib, rep=wildcards.rep)),
    output:
        noadapt = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_noadapt_deduped.fastq.gz"),
        adapt = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_adapt_deduped.fastq.gz"),
@@ -284,6 +278,11 @@ rule trim_and_dedup:
        nb_adapt_deduped =  OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_nb_adapt_deduped.txt"),
        nb_noadapt =  OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_nb_noadapt.txt"),
        nb_noadapt_deduped =  OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_nb_noadapt_deduped.txt"),
+    params:
+        adapter = lambda wildcards : lib2adapt[wildcards.lib],
+        process_type = "PRO-seq",
+        trim5 = lambda wildcards : lib2UMI[wildcards.lib][0],
+        trim3 = lambda wildcards : lib2UMI[wildcards.lib][1],
    threads: 4 # Actually, to avoid too much IO
    message:
        "Trimming adaptor from raw data using {wildcards.trimmer}, deduplicating reads, and removing 5' and 3' random n-mers for {wildcards.lib}_{wildcards.rep}."
@@ -294,60 +293,13 @@ rule trim_and_dedup:
        log = OPJ(log_dir, "{trimmer}", "trim_and_dedup", "{lib}_{rep}.log"),
        err = OPJ(log_dir, "{trimmer}", "trim_and_dedup", "{lib}_{rep}.err"),
    run:
-        if True:
+        shell_commands = """
-        #if wildcards.trimmer == "cutadapt":
+THREADS="{threads}" {params.process_type}_trim_and_dedup.sh {wildcards.trimmer} {input} \\
-            shell_commands = """
+    {params.adapter} {params.trim5} {params.trim3} \\
-PRO-seq_trim_and_dedup.sh {wildcards.trimmer} {input} {params.adapter} {params.trim5} {params.trim3} {output.adapt} {output.noadapt} {log.trim} {output.nb_raw} {output.nb_adapt} {output.nb_adapt_deduped} {output.nb_noadapt} {output.nb_noadapt_deduped} 1> {log.log} 2> {log.err}
+    {output.adapt} {output.noadapt} {log.trim} \\
-# This named pipe is used to avoid writing the intermediate file to disk
+    {output.nb_raw} {output.nb_adapt} {output.nb_adapt_deduped} \\
-# It will transmit reads that did not seem to contain the adapter to the
+    {output.nb_noadapt} {output.nb_noadapt_deduped} 1> {log.log} 2> {log.err}
-# second sorting and deduplicating.
-#mkfifo {params.untrimmed_tmp}
-# -m {params.minsize_trimmed} is to discard reads that are shorter than this after trimming
-# a second cutadapt step removes the random nucleotides that helped identify PCR duplicates.
-#zcat {input} \\
-#    | tee >(count_fastq_reads {output.nb_raw}) \\
-#    | cutadapt -a {params.adapter} -m {params.minsize_trimmed} \\
-#        --untrimmed-output={params.untrimmed_tmp} - 2> {log.trim} \\
-#    | tee >(count_fastq_reads {output.nb_adapt}) \\
-#    | dedup \\
-#    | trim_random_nt {params.trim5} {params.trim3} \\
-#    | tee >(count_fastq_reads {output.nb_adapt_deduped}) \\
-#    | gzip > {output.noadapt} &
-#pid_to_wait=$!
-#cat {params.untrimmed_tmp} \\
-#    | tee >(count_fastq_reads {output.nb_noadapt}) \\
-#    | dedup \\
-#    | trim_random_nt {params.trim5} {params.trim3} \\
-#    | tee >(count_fastq_reads {output.nb_noadapt_deduped}) \\
-#    | gzip > {output.adapt} \\
-#    || rm -f {params.untrimmed_tmp}
-#wait ${{pid_to_wait}}
-#rm -f {params.untrimmed_tmp}
 """
-#        elif wildcards.trimmer == "fastx_clipper":
-#            shell_commands = """
-#zcat {input} \\
-#    | tee >(count_fastq_reads {output.nb_raw}) \\
-#    | fastx_clipper -a {params.adapter} -l {params.minsize_trimmed} \\
-#        -c -M 3 -n -v 2> {log.trim} \\
-#    | tee >(count_fastq_reads {output.nb_adapt}) \\
-#    | dedup \\
-#    | tee >(count_fastq_reads {output.nb_adapt_deduped}) \\
-#    | trim_random_nt {params.trim5} {params.trim3} \\
-#    | gzip > {output.noadapt} &
-#pid_to_wait=$!
-#zcat {input} \\
-#    | fastx_clipper -a {params.adapter} -l {params.minsize_trimmed} \\
-#        -C -M 3 -n -v 2>> {log.trim} \\
-#    | tee >(count_fastq_reads {output.nb_noadapt}) \\
-#    | dedup \\
-#    | tee >(count_fastq_reads {output.nb_noadapt_deduped}) \\
-#    | trim_random_nt {params.trim5} {params.trim3} \\
-#    | gzip > {output.adapt}
-#wait ${{pid_to_wait}}
-#"""
-#        else:
-#            raise NotImplementedError("%s is not yet handled." % wildcards.trimmer)
        shell(shell_commands)

--- a/PRO-seq/PRO-seq_trim_and_dedup.sh
+++ b/PRO-seq/PRO-seq_trim_and_dedup.sh
@@ -26,6 +26,9 @@ adapt=${3}
 fiveprime_random=${4}
 threeprime_random=${5}
+# For the untrimmed, the adaptor was not found,
+# but maybe its first 3 bases were there.
+threeprime_random_with_margin=$((${threeprime_random}+3))
 trimmed_and_dedup_out=${6}
@@ -82,8 +85,23 @@ minsize_trimmed=$(echo "${fiveprime_random} + 16 + ${threeprime_random}" | bc)
 case "${trimmer}" in
    "cutadapt")
+        if [ ${MAX_ERROR_RATE} ]
+        then
+            #>&2 echo "Cutadapt multithreading not working fully yet. Ignoring THREADS."
+            error_opt="-e ${MAX_ERROR_RATE}"
+        else
+            error_opt=""
+        fi
+        if [ ${THREADS} ]
+        then
+            #thread_opt="-j ${THREADS}"
+            >&2 echo "Cutadapt multithreading not working fully yet. Ignoring THREADS."
+            thread_opt=""
+        else
+            thread_opt=""
+        fi
        # -m ${minsize_random} is to discard reads that are shorter than this after trimming
-        trim_cmd="${trimmer} -a ${adapt} -m ${minsize_trimmed} --untrimmed-output=${untrimmed_out}.fifo - 2> ${log}"
+        trim_cmd="${trimmer} -a ${adapt} -m ${minsize_trimmed} ${error_opt} ${thread_opt} --untrimmed-output=${untrimmed_out}.fifo - 2> ${log}"
        ;;
    "fastx_clipper")
        # -n is to keep reads with N
@@ -111,12 +129,25 @@ dedup_trimmed()
 dedup_untrimmed()
 {
    # $1: file in which to write the number of fastq records after deduplication
-    cmd="cat - | dedup | trim_random_nt ${threeprime_random} ${fiveprime_random} | tee >(count_fastq_reads ${1}) | gzip"
+    cmd="cat - | dedup | trim_random_nt ${fiveprime_random} ${threeprime_random_with_margin} | tee >(count_fastq_reads ${1}) | gzip"
    echo ${cmd}
    eval ${cmd} > ${untrimmed_out} || error_exit "${cmd} failed"
 }
-zcat ${raw_in} \
+filetype="$(file -L ${raw_in} | cut -d " " -f2)"
+case "${filetype}" in
+    "gzip")
+        cat_cmd="zcat"
+        ;;
+    "ASCII")
+        cat_cmd="cat"
+        ;;
+    *)
+        error_exit "Unexpected file type for ${raw_in}"
+        ;;
+esac
+${cat_cmd} ${raw_in} \
    | tee >(count_fastq_reads ${nb_raw}) \
    | dedup_trimmed ${nb_adapt} ${nb_adapt_deduped} &
 pid_to_wait=$!