Configurable 5' and 3' UMI sizes, count reads.

4777a77c · Blaise Li · 55e1c85f · 4777a77c
Commit 4777a77c authored 8 years ago by Blaise Li
--- a/PRO-seq/PRO-seq_trim_and_dedup.sh
+++ b/PRO-seq/PRO-seq_trim_and_dedup.sh
 #!/usr/bin/env bash
-# Usage: PRO-seq_trim_and_dedup.sh <raw fastq> <ADAPTER> <trimmed fastq> <untrimmed fastq>
+# Usage: PRO-seq_trim_and_dedup.sh <raw fastq> <ADAPTER> <nb 5'> <nb 3'> <trimmed fastq> <untrimmed fastq> <cutadapt log> <nb_raw> <nb_adapt> <nb_adapt_deduped> <nb_noadapt> <nb_noadapt_deduped>

 # http://linuxcommand.org/wss0150.php
 PROGNAME=$(basename $0)
@@ -22,15 +22,31 @@ raw_in=${1}

 adapt=${2}

-trimmed_and_dedup_out=${3}
+fiveprime_random=${3}  # nb of random nucleotides at the 5' end of the reads
+threeprime_random=${4}  # nb of random nucleotides at the 3' end of the reads

-untrimmed_out=${4}
+trimmed_and_dedup_out=${5}  # gzipped output for the reads in which the adapter was found

-log=${5}
+untrimmed_out=${6}  # gzipped output for the reads without adapter; "${untrimmed_out}.fifo" is used as a named pipe
+
+log=${7}  # receives the stderr of the adapter-trimming cutadapt step
+
+nb_raw=${8}  # file in which to write the number of raw fastq records
+nb_adapt=${9}  # file for the number of records after adapter trimming
+nb_adapt_deduped=${10}  # file for the number of adapter-trimmed records after deduplication
+nb_noadapt=${11}  # file for the number of records read from the untrimmed fifo
+nb_noadapt_deduped=${12}  # file for the number of untrimmed records after deduplication

 # This script performs 2 sorting and deduplicating operations, depending on the
 # presence or absence of the adapter in the read.

+count_fastq_reads()
+{
+    # $1: file in which to write the number of fastq records read on stdin (4 lines per record)
+    wc -l | { read -r nblines; echo "$(( nblines / 4 ))" > "${1}"; }
+}
+
+
 # The -s option of fastq-sort sorts the reads by their sequence.
 sort_by_seq()
 {
@@ -40,36 +56,52 @@ sort_by_seq()
 # Once the reads are sorted by sequence,
 # successive reads with the same sequence are merged,
 # keeping the best quality at each position.
-dedup()
-{
+
+dedup () {  # stdin -> stdout filter: fastq records go through sort_by_seq, then duplicates are removed
    #${PACKAGEDIR}/remove_duplicates_from_sorted_fastq/remove_duplicates_from_sorted_fastq || error_exit "remove_duplicates_from_sorted_fastq failed"
    #remove_duplicates_from_sorted_fastq || error_exit "remove_duplicates_from_sorted_fastq failed"
-    remove-duplicates-from-sorted-fastq || error_exit "remove_duplicates_from_sorted_fastq failed"
+    sort_by_seq | remove-duplicates-from-sorted-fastq || error_exit "remove_duplicates_from_sorted_fastq failed"  # NOTE(review): unless pipefail is set elsewhere, only a failure of the last stage triggers error_exit
+}
+
+trim_random_nt()
+{
+    # $1: nb of bases to trim at 5' end (given to cutadapt as "-u $1")
+    # $2: nb of bases to trim at 3' end (given to cutadapt as "-u -$2"); fastq is read on stdin, trimmed fastq goes to stdout
+    cutadapt -u ${1} -u -${2} - || error_exit "trim_random_nt failed"
 }

+
 # This named pipe is used to avoid writing the intermediate file to disk
 # It will transmit reads that did not seem to contain the adapter to the
 # second sorting and deduplicating.
 mkfifo ${untrimmed_out}.fifo

-# -m 24 is to discard reads that are shorter than 24 after trimming
+minsize_trimmed=$(echo "${fiveprime_random} + 20 + ${threeprime_random}" | bc)
+# -m ${minsize_trimmed} is to discard reads that are shorter than this after adapter trimming
 # a second cutadapt step removes the random nucleotides that helped identify PCR duplicates.
 dedup_trimmed()
 {
-    cmd="cutadapt -a ${adapt} -m 24 --untrimmed-output=${untrimmed_out}.fifo ${raw_in} 2> ${log} | sort_by_seq | dedup | cutadapt -u -4 -u 4 - | gzip"
+    # $1: file in which to write the number of fastq records after adapter trimming (input fastq comes from stdin)
+    # $2: file in which to write the number of fastq records after deduplication and random nucleotide removal
+    cmd="cutadapt -a ${adapt} -m ${minsize_trimmed} --untrimmed-output=${untrimmed_out}.fifo - 2> ${log} | tee >(count_fastq_reads ${1}) | dedup | trim_random_nt ${fiveprime_random} ${threeprime_random} | tee >(count_fastq_reads ${2}) | gzip"
    echo ${cmd}
    eval ${cmd} > ${trimmed_and_dedup_out} || error_exit "${cmd} failed"
 }

 dedup_untrimmed()
 {
-    cmd="cat ${untrimmed_out}.fifo | sort_by_seq | dedup | cutadapt -u -4 -u 4 - | gzip"
+    # $1: file in which to write the number of fastq records after deduplication
+    # The fastq records are read from stdin, deduplicated, stripped of their
+    # random nucleotides with the same helper as in dedup_trimmed, counted,
+    # then gzipped into ${untrimmed_out}.
+    cmd="dedup | trim_random_nt ${fiveprime_random} ${threeprime_random} | tee >(count_fastq_reads ${1}) | gzip"
    echo ${cmd}
    eval ${cmd} > ${untrimmed_out} || error_exit "${cmd} failed"
 }

-dedup_trimmed &
-dedup_untrimmed || rm -f ${untrimmed_out}.fifo
+# The reads in which the adapter is found are processed in the background,
+# while the reads without adapter are consumed from the named pipe in the
+# foreground. Each stream is counted on the fly through tee.
+zcat ${raw_in} \
+    | tee >(count_fastq_reads ${nb_raw}) \
+    | dedup_trimmed ${nb_adapt} ${nb_adapt_deduped} &
+cat ${untrimmed_out}.fifo \
+    | tee >(count_fastq_reads ${nb_noadapt}) \
+    | dedup_untrimmed ${nb_noadapt_deduped} || rm -f ${untrimmed_out}.fifo
+# The foreground pipeline ends once the fifo is closed, which can happen
+# before the background pipeline has finished writing its outputs:
+# wait for it so that the script does not exit with incomplete files.
+wait
 rm -f ${untrimmed_out}.fifo

 exit 0