iCLIP_trim_and_dedup.sh 7.19 KiB
#!/usr/bin/env bash
# Usage: iCLIP_trim_and_dedup.sh <trimmer> <raw fastq> <ADAPTER> <nb 5'> <nb 3'> <trimmed fastq> <untrimmed fastq> <trimmed_nodedup> <trimmer log> <nb_raw> <nb_adapt> <nb_adapt_deduped> <nb_noadapt> <nb_noadapt_deduped>
# http://linuxcommand.org/wss0150.php
PROGNAME=$(basename $0)
function error_exit
{
# ----------------------------------------------------------------
# Function for exit due to fatal program error
# Accepts 1 argument:
# string containing descriptive error message
# ----------------------------------------------------------------
echo "${PROGNAME}: ${1:-"Unknown Error"}" 1>&2
exit 1
}
# http://stackoverflow.com/questions/59895/can-a-bash-script-tell-what-directory-its-stored-in
#PACKAGEDIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )
trimmer=${1}
raw_in=${2}
adapt=${3}
# Sum of lengths of UMIs both left and right of barcode
fiveprime_random=${4}
# Adding lenghts of "low diversity zones"
# (to process without deduplicating)
total_fiveprime=$((${fiveprime_random}+9))
# Length of the 3' UMI
threeprime_random=${5}
# For the untrimmed, the adaptor was not found,
# but maybe its first 3 bases were there.
threeprime_random_with_margin=$((${threeprime_random}+3))
echo ${threeprime_random_with_margin}
# fastq files
trimmed_and_dedup_out=${6}
untrimmed_out=${7}
trimmed_nodedup_out=${8}
log=${9}
# count files
nb_raw=${10}
nb_adapt=${11}
nb_adapt_deduped=${12}
nb_noadapt=${13}
nb_noadapt_deduped=${14}
count_fastq_reads()
{
# $1: file in which to write the number of fastq records
wc -l | { read nblines; echo ${nblines} / 4 | bc > ${1}; }
}
# Removing parts that have lower diversity,
# and therefore more likely to have sequencing errors (6-11,15-17)
# Reads should be:
#
# NNNNNGCACTANNNWWW[YYYY]NNNN
# 1---5 : 5' UMI
# 6--11: barcode (lower diversity)
# 12-14: UMI
# 15-17: AT(or GC?)-rich (low diversity)
# [fragment]
# -4 -> -1: 3' UMI
strip_low_qual_zones()
{
bioawk -c fastx '{print $name"\t"substr($seq, 1, 5)""substr($seq, 12, 3)""substr($seq, 18)"\t"substr($qual, 1, 5)""substr($qual, 12, 3)""substr($qual, 18)}' | mawk '{print "@"$1"\n"$2"\n+\n"$3}'
}
# This script performs 2 sorting and deduplicating operations, depending on the
# presence or absence of the adapter in the read.
# The -s option of fastq-sort sorts the reads by their sequence.
sort_by_seq()
{
TMPDIR="/var/tmp" fastq-sort -s || error_exit "fastq-sort failed"
}
# Once the reads are sorted by sequence,
# successive reads with the same sequence are merged,
# keeping the best quality at each position.
dedup () {
#${PACKAGEDIR}/remove_duplicates_from_sorted_fastq/remove_duplicates_from_sorted_fastq || error_exit "remove_duplicates_from_sorted_fastq failed"
#remove_duplicates_from_sorted_fastq || error_exit "remove_duplicates_from_sorted_fastq failed"
sort_by_seq | remove-duplicates-from-sorted-fastq || error_exit "remove_duplicates_from_sorted_fastq failed"
}
trim_random_nt()
{
# $1: nb of bases to trim at 5' end
# $2: nb of bases to trim at 3' end
cutadapt -u ${1} -u -${2} - 2> /dev/null || error_exit "trim_random_nt failed"
}
process_without_deduplication () {
# $1: compressed fastq file in which to write the trimmed reads
trim_random_nt ${total_fiveprime} ${threeprime_random} | gzip > ${1}
}
# This named pipe is used to avoid writing the intermediate file to disk
# It will transmit reads that did not seem to contain the adapter to the
# second sorting and deduplicating.
mkfifo ${untrimmed_out}.fifo
minsize_trimmed=$(echo "${fiveprime_random} + 16 + ${threeprime_random}" | bc)
case "${trimmer}" in
"cutadapt")
if [ ${MAX_ERROR_RATE} ]
then
#>&2 echo "Cutadapt multithreading not working fully yet. Ignoring THREADS."
error_opt="-e ${MAX_ERROR_RATE}"
else
error_opt=""
fi
if [ ${THREADS} ]
then
#thread_opt="-j ${THREADS}"
>&2 echo "Cutadapt multithreading not working fully yet. Ignoring THREADS."
thread_opt=""
else
thread_opt=""
fi
# -m ${minsize_random} is to discard reads that are shorter than this after trimming
trim_cmd="${trimmer} -a ${adapt} -m ${minsize_trimmed} ${error_opt} ${thread_opt} --untrimmed-output=${untrimmed_out}.fifo - 2> ${log}"
;;
"fastx_clipper")
# -n is to keep reads with N
# -l is equivalent to -m in cutadapt (not sure it has effect with -C)
# -c is to keep only the trimmed reads
# -C is to keep only the non-trimmed reads
# -v is to have verbose things to put in the log
trim_cmd="tee >(${trimmer} -a ${adapt} -l ${minsize_trimmed} -C -M 3 -n > ${untrimmed_out}.fifo) | ${trimmer} -a ${adapt} -l ${minsize_trimmed} -c -M 3 -n -v 2>> ${log}"
;;
*)
error_exit "Trimming adapter with ${trimmer} not implemented."
;;
esac
# a second cutadapt step removes the random nucleotides that helped identify PCR duplicates.
#dedup_trimmed()
#{
# # $1: file in which to write the number of fastq records after adapter trimming
# # $2: file in which to write the number of fastq records after deduplication
# cmd="${trim_cmd} | strip_low_qual_zones | tee >(count_fastq_reads ${1}) | dedup | trim_random_nt ${fiveprime_random} ${threeprime_random} | tee >(count_fastq_reads ${2}) | gzip"
# echo ${cmd}
# eval ${cmd} > ${trimmed_and_dedup_out} || error_exit "${cmd} failed"
#}
dedup_trimmed()
{
# $1: file in which to write the number of fastq records after adapter trimming
# $2: file in which to write the number of fastq records after deduplication
cmd="${trim_cmd} | tee >(process_without_deduplication ${trimmed_nodedup_out}) | strip_low_qual_zones | tee >(count_fastq_reads ${1}) | dedup | trim_random_nt ${fiveprime_random} ${threeprime_random} | tee >(count_fastq_reads ${2}) | gzip"
echo ${cmd}
eval ${cmd} > ${trimmed_and_dedup_out} || error_exit "${cmd} failed"
}
#dedup_untrimmed()
#{
# # $1: file in which to write the number of fastq records after deduplication
# cmd="cat - | strip_low_qual_zones | dedup | trim_random_nt ${fiveprime_random} ${threeprime_random_with_margin} | tee >(count_fastq_reads ${1}) | gzip"
# echo ${cmd}
# eval ${cmd} > ${untrimmed_out} || error_exit "${cmd} failed"
#}
dedup_untrimmed()
{
# $1: file in which to write the number of fastq records after deduplication
cmd="cat - | strip_low_qual_zones | dedup | trim_random_nt ${fiveprime_random} ${threeprime_random_with_margin} | tee >(count_fastq_reads ${1}) | gzip"
echo ${cmd}
eval ${cmd} > ${untrimmed_out} || error_exit "${cmd} failed"
}
filetype="$(file -L ${raw_in} | cut -d " " -f2)"
case "${filetype}" in
"gzip")
cat_cmd="zcat"
;;
"ASCII")
cat_cmd="cat"
;;
*)
error_exit "Unexpected file type for ${raw_in}"
;;
esac
${cat_cmd} ${raw_in} \
| tee >(count_fastq_reads ${nb_raw}) \
| dedup_trimmed ${nb_adapt} ${nb_adapt_deduped} &
pid_to_wait=$!
cat ${untrimmed_out}.fifo \
| tee >(count_fastq_reads ${nb_noadapt}) \
| dedup_untrimmed ${nb_noadapt_deduped} || rm -f ${untrimmed_out}.fifo
wait ${pid_to_wait}
rm -f ${untrimmed_out}.fifo
exit 0