Script that trims and deduplicate PRO-seq reads.

There are two deduplication flows: for the reads with or without the adapter.

Script that trims and deduplicate PRO-seq reads.
84d9beba · Blaise Li · db8d78dc · 84d9beba
Commit 84d9beba authored 8 years ago by Blaise Li
--- a/PRO-seq/PRO-seq_trim_and_dedup.sh
+++ b/PRO-seq/PRO-seq_trim_and_dedup.sh
+#!/usr/bin/env bash
+# Usage: PRO-seq_trim_and_dedup.sh <raw fastq> <ADAPTER> <trimmed fastq> <untrimmed fastq>
+
+# http://linuxcommand.org/wss0150.php
+PROGNAME=$(basename $0)
+
+function error_exit
+{
+#	----------------------------------------------------------------
+#	Function for exit due to fatal program error
+#		Accepts 1 argument:
+#			string containing descriptive error message
+#	----------------------------------------------------------------
+    echo "${PROGNAME}: ${1:-"Unknown Error"}" 1>&2
+    exit 1
+}
+
+# http://stackoverflow.com/questions/59895/can-a-bash-script-tell-what-directory-its-stored-in
+#PACKAGEDIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )
+
+raw_in=${1}
+
+adapt=${2}
+
+trimmed_and_dedup_out=${3}
+
+untrimmed_out=${4}
+
+log=${5}
+
+# This script performs 2 sorting and deduplicating operations, depending on the
+# presence or absence of the adapter in the read.
+
+# The -s option of fastq-sort sorts the reads by their sequence.
+sort_by_seq()
+{
+    fastq-sort -s || error_exit "fastq-sort failed"
+}
+
+# Once the reads are sorted by sequence,
+# successive reads with the same sequence are merged,
+# keeping the best quality at each position.
+dedup()
+{
+    #${PACKAGEDIR}/remove_duplicates_from_sorted_fastq/remove_duplicates_from_sorted_fastq || error_exit "remove_duplicates_from_sorted_fastq failed"
+    #remove_duplicates_from_sorted_fastq || error_exit "remove_duplicates_from_sorted_fastq failed"
+    remove-duplicates-from-sorted-fastq || error_exit "remove_duplicates_from_sorted_fastq failed"
+}
+
+# This named pipe is used to avoid writing the intermediate file to disk
+# It will transmit reads that did not seem to contain the adapter to the
+# second sorting and deduplicating.
+mkfifo ${untrimmed_out}.fifo
+
+# -m 24 is to discard reads that are shorter than 24 after trimming
+# a second cutadapt step removes the random nucleotides that helped identify PCR duplicates.
+dedup_trimmed()
+{
+    cmd="cutadapt -a ${adapt} -m 24 --untrimmed-output=${untrimmed_out}.fifo ${raw_in} 2> ${log} | sort_by_seq | dedup | cutadapt -u -4 -u 4 - | gzip"
+    echo ${cmd}
+    eval ${cmd} > ${trimmed_and_dedup_out} || error_exit "${cmd} failed"
+}
+
+dedup_untrimmed()
+{
+    cmd="cat ${untrimmed_out}.fifo | sort_by_seq | dedup | cutadapt -u -4 -u 4 - | gzip"
+    echo ${cmd}
+    eval ${cmd} > ${untrimmed_out} || error_exit "${cmd} failed"
+}
+
+dedup_trimmed &
+dedup_untrimmed || rm -f ${untrimmed_out}.fifo
+rm -f ${untrimmed_out}.fifo
+
+exit 0