Skip to content
Snippets Groups Projects
Commit 84d9beba authored by Blaise Li's avatar Blaise Li
Browse files

Script that trims and deduplicate PRO-seq reads.

There are two deduplication flows: for the reads with or without the
adapter.
parent db8d78dc
No related branches found
No related tags found
No related merge requests found
#!/usr/bin/env bash
# Usage: PRO-seq_trim_and_dedup.sh <raw fastq> <ADAPTER> <trimmed fastq> <untrimmed fastq>
# http://linuxcommand.org/wss0150.php
PROGNAME=$(basename $0)
function error_exit
{
# ----------------------------------------------------------------
# Function for exit due to fatal program error
# Accepts 1 argument:
# string containing descriptive error message
# ----------------------------------------------------------------
echo "${PROGNAME}: ${1:-"Unknown Error"}" 1>&2
exit 1
}
# http://stackoverflow.com/questions/59895/can-a-bash-script-tell-what-directory-its-stored-in
#PACKAGEDIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )
raw_in=${1}
adapt=${2}
trimmed_and_dedup_out=${3}
untrimmed_out=${4}
log=${5}
# This script performs 2 sorting and deduplicating operations, depending on the
# presence or absence of the adapter in the read.
# The -s option of fastq-sort sorts the reads by their sequence.
sort_by_seq()
{
fastq-sort -s || error_exit "fastq-sort failed"
}
# Once the reads are sorted by sequence,
# successive reads with the same sequence are merged,
# keeping the best quality at each position.
dedup()
{
#${PACKAGEDIR}/remove_duplicates_from_sorted_fastq/remove_duplicates_from_sorted_fastq || error_exit "remove_duplicates_from_sorted_fastq failed"
#remove_duplicates_from_sorted_fastq || error_exit "remove_duplicates_from_sorted_fastq failed"
remove-duplicates-from-sorted-fastq || error_exit "remove_duplicates_from_sorted_fastq failed"
}
# This named pipe is used to avoid writing the intermediate file to disk
# It will transmit reads that did not seem to contain the adapter to the
# second sorting and deduplicating.
mkfifo ${untrimmed_out}.fifo
# -m 24 is to discard reads that are shorter than 24 after trimming
# a second cutadapt step removes the random nucleotides that helped identify PCR duplicates.
dedup_trimmed()
{
cmd="cutadapt -a ${adapt} -m 24 --untrimmed-output=${untrimmed_out}.fifo ${raw_in} 2> ${log} | sort_by_seq | dedup | cutadapt -u -4 -u 4 - | gzip"
echo ${cmd}
eval ${cmd} > ${trimmed_and_dedup_out} || error_exit "${cmd} failed"
}
dedup_untrimmed()
{
cmd="cat ${untrimmed_out}.fifo | sort_by_seq | dedup | cutadapt -u -4 -u 4 - | gzip"
echo ${cmd}
eval ${cmd} > ${untrimmed_out} || error_exit "${cmd} failed"
}
dedup_trimmed &
dedup_untrimmed || rm -f ${untrimmed_out}.fifo
rm -f ${untrimmed_out}.fifo
exit 0
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment