Skip to content
Snippets Groups Projects
iCLIP_trim_and_dedup.sh 7.19 KiB
#!/usr/bin/env bash
# Usage: iCLIP_trim_and_dedup.sh <trimmer> <raw fastq> <ADAPTER> <nb 5'> <nb 3'> <trimmed fastq> <untrimmed fastq> <trimmed_nodedup> <trimmer log> <nb_raw> <nb_adapt> <nb_adapt_deduped> <nb_noadapt> <nb_noadapt_deduped>

# http://linuxcommand.org/wss0150.php
PROGNAME=$(basename $0)

function error_exit
{
#	----------------------------------------------------------------
#	Function for exit due to fatal program error
#		Accepts 1 argument:
#			string containing descriptive error message
#	----------------------------------------------------------------
    echo "${PROGNAME}: ${1:-"Unknown Error"}" 1>&2
    exit 1
}

# http://stackoverflow.com/questions/59895/can-a-bash-script-tell-what-directory-its-stored-in
#PACKAGEDIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )

trimmer=${1}

raw_in=${2}

adapt=${3}

# Sum of lengths of UMIs both left and right of barcode
fiveprime_random=${4}
# Adding lenghts of "low diversity zones"
# (to process without deduplicating)
total_fiveprime=$((${fiveprime_random}+9))
# Length of the 3' UMI
threeprime_random=${5}
# For the untrimmed, the adaptor was not found,
# but maybe its first 3 bases were there.
threeprime_random_with_margin=$((${threeprime_random}+3))
echo ${threeprime_random_with_margin}

# fastq files
trimmed_and_dedup_out=${6}
untrimmed_out=${7}
trimmed_nodedup_out=${8}

log=${9}

# count files
nb_raw=${10}
nb_adapt=${11}
nb_adapt_deduped=${12}
nb_noadapt=${13}
nb_noadapt_deduped=${14}

count_fastq_reads()
{
    # $1: file in which to write the number of fastq records
    wc -l | { read nblines; echo ${nblines} / 4 | bc > ${1}; }
}

# Removing parts that have lower diversity,
# and therefore more likely to have sequencing errors (6-11,15-17)
# Reads should be:
# 
# NNNNNGCACTANNNWWW[YYYY]NNNN
# 1---5 : 5' UMI
#      6--11: barcode (lower diversity)
#           12-14: UMI
#             15-17: AT(or GC?)-rich (low diversity)
#                 [fragment]
#                        -4 -> -1: 3' UMI
strip_low_qual_zones()
{
    bioawk -c fastx '{print $name"\t"substr($seq, 1, 5)""substr($seq, 12, 3)""substr($seq, 18)"\t"substr($qual, 1, 5)""substr($qual, 12, 3)""substr($qual, 18)}' | mawk '{print "@"$1"\n"$2"\n+\n"$3}'
}

# This script performs 2 sorting and deduplicating operations, depending on the
# presence or absence of the adapter in the read.


# The -s option of fastq-sort sorts the reads by their sequence.
sort_by_seq()
{
    TMPDIR="/var/tmp" fastq-sort -s || error_exit "fastq-sort failed"
}

# Once the reads are sorted by sequence,
# successive reads with the same sequence are merged,
# keeping the best quality at each position.

dedup () {
    #${PACKAGEDIR}/remove_duplicates_from_sorted_fastq/remove_duplicates_from_sorted_fastq || error_exit "remove_duplicates_from_sorted_fastq failed"
    #remove_duplicates_from_sorted_fastq || error_exit "remove_duplicates_from_sorted_fastq failed"
    sort_by_seq | remove-duplicates-from-sorted-fastq || error_exit "remove_duplicates_from_sorted_fastq failed"
}

trim_random_nt()
{
    # $1: nb of bases to trim at 5' end
    # $2: nb of bases to trim at 3' end
    cutadapt -u ${1} -u -${2} - 2> /dev/null || error_exit "trim_random_nt failed"
}

process_without_deduplication () {
    # $1: compressed fastq file in which to write the trimmed reads
    trim_random_nt ${total_fiveprime} ${threeprime_random} | gzip > ${1}
}

# This named pipe is used to avoid writing the intermediate file to disk
# It will transmit reads that did not seem to contain the adapter to the
# second sorting and deduplicating.
mkfifo ${untrimmed_out}.fifo

minsize_trimmed=$(echo "${fiveprime_random} + 16 + ${threeprime_random}" | bc)

case "${trimmer}" in
    "cutadapt")
        if [ ${MAX_ERROR_RATE} ]
        then
            #>&2 echo "Cutadapt multithreading not working fully yet. Ignoring THREADS."
            error_opt="-e ${MAX_ERROR_RATE}"
        else
            error_opt=""
        fi
        if [ ${THREADS} ]
        then
            #thread_opt="-j ${THREADS}"
            >&2 echo "Cutadapt multithreading not working fully yet. Ignoring THREADS."
            thread_opt=""
        else
            thread_opt=""
        fi
        # -m ${minsize_random} is to discard reads that are shorter than this after trimming
        trim_cmd="${trimmer} -a ${adapt} -m ${minsize_trimmed} ${error_opt} ${thread_opt} --untrimmed-output=${untrimmed_out}.fifo - 2> ${log}"
        ;;
    "fastx_clipper")
        # -n is to keep reads with N
        # -l is equivalent to -m in cutadapt (not sure it has effect with -C)
        # -c is to keep only the trimmed reads
        # -C is to keep only the non-trimmed reads
        # -v is to have verbose things to put in the log
        trim_cmd="tee >(${trimmer} -a ${adapt} -l ${minsize_trimmed} -C -M 3 -n > ${untrimmed_out}.fifo) | ${trimmer} -a ${adapt} -l ${minsize_trimmed} -c -M 3 -n -v 2>> ${log}"
        ;;
    *)
        error_exit "Trimming adapter with ${trimmer} not implemented."
        ;;
esac

# a second cutadapt step removes the random nucleotides that helped identify PCR duplicates.
#dedup_trimmed()
#{
#    # $1: file in which to write the number of fastq records after adapter trimming
#    # $2: file in which to write the number of fastq records after deduplication
#    cmd="${trim_cmd} | strip_low_qual_zones | tee >(count_fastq_reads ${1}) | dedup | trim_random_nt ${fiveprime_random} ${threeprime_random} | tee >(count_fastq_reads ${2}) | gzip"
#    echo ${cmd}
#    eval ${cmd} > ${trimmed_and_dedup_out} || error_exit "${cmd} failed"
#}
dedup_trimmed()
{
    # $1: file in which to write the number of fastq records after adapter trimming
    # $2: file in which to write the number of fastq records after deduplication
    cmd="${trim_cmd} | tee >(process_without_deduplication  ${trimmed_nodedup_out}) | strip_low_qual_zones | tee >(count_fastq_reads ${1}) | dedup | trim_random_nt ${fiveprime_random} ${threeprime_random} | tee >(count_fastq_reads ${2}) | gzip"
    echo ${cmd}
    eval ${cmd} > ${trimmed_and_dedup_out} || error_exit "${cmd} failed"
}

#dedup_untrimmed()
#{
#    # $1: file in which to write the number of fastq records after deduplication
#    cmd="cat - | strip_low_qual_zones | dedup | trim_random_nt ${fiveprime_random} ${threeprime_random_with_margin} | tee >(count_fastq_reads ${1}) | gzip"
#    echo ${cmd}
#    eval ${cmd} > ${untrimmed_out} || error_exit "${cmd} failed"
#}
dedup_untrimmed()
{
    # $1: file in which to write the number of fastq records after deduplication
    cmd="cat - | strip_low_qual_zones | dedup | trim_random_nt ${fiveprime_random} ${threeprime_random_with_margin} | tee >(count_fastq_reads ${1}) | gzip"
    echo ${cmd}
    eval ${cmd} > ${untrimmed_out} || error_exit "${cmd} failed"
}

filetype="$(file -L ${raw_in} | cut -d " " -f2)"
case "${filetype}" in
    "gzip")
        cat_cmd="zcat"
        ;;
    "ASCII")
        cat_cmd="cat"
        ;;
    *)
        error_exit "Unexpected file type for ${raw_in}"
        ;;
esac

${cat_cmd} ${raw_in} \
    | tee >(count_fastq_reads ${nb_raw}) \
    | dedup_trimmed ${nb_adapt} ${nb_adapt_deduped} &
pid_to_wait=$!
cat ${untrimmed_out}.fifo \
    | tee >(count_fastq_reads ${nb_noadapt}) \
    | dedup_untrimmed ${nb_noadapt_deduped} || rm -f ${untrimmed_out}.fifo
wait ${pid_to_wait}
rm -f ${untrimmed_out}.fifo

exit 0