#!/usr/bin/env bash
# PRO-seq_trim_and_dedup.sh
# Usage: PRO-seq_trim_and_dedup.sh <trimmer> <raw fastq> <ADAPTER> <nb 5'> <nb 3'> <trimmed fastq> <untrimmed fastq> <trimmer log> <nb_raw> <nb_adapt> <nb_adapt_deduped> <nb_noadapt> <nb_noadapt_deduped>

# http://linuxcommand.org/wss0150.php
# Name of this script without its directory part; error_exit uses it to
# prefix its messages. "$0" is quoted so that a path containing spaces or
# glob characters is passed to basename as a single argument.
PROGNAME=$(basename -- "$0")

error_exit()
{
#	----------------------------------------------------------------
#	Report a fatal error and terminate the script.
#		$1: descriptive error message (optional; "Unknown Error"
#		    is used when it is missing or empty)
#	The message goes to stderr, prefixed with the script name held
#	in PROGNAME, and the shell exits with status 1.
#	----------------------------------------------------------------
    local message="${1:-Unknown Error}"
    printf '%s: %s\n' "${PROGNAME}" "${message}" >&2
    exit 1
}

# http://stackoverflow.com/questions/59895/can-a-bash-script-tell-what-directory-its-stored-in
#PACKAGEDIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )

# Positional arguments, in the order given in the usage line.

# Program used to trim the adapter.
trimmer="${1}"
# Raw fastq input file.
raw_in="${2}"
# Adapter sequence.
adapt="${3}"

# Numbers of bases to remove at the 5' and 3' ends of the reads.
fiveprime_random="${4}"
threeprime_random="${5}"
# Reads ending up in the untrimmed output had no adapter detected, but the
# first 3 bases of the adapter may still be there: trim 3 more bases at 3'.
threeprime_random_with_margin=$(( ${threeprime_random} + 3 ))

# Output fastq files and trimmer log.
trimmed_and_dedup_out="${6}"
untrimmed_out="${7}"
log="${8}"

# Files in which the read counts of the successive steps are written.
nb_raw="${9}"
nb_adapt="${10}"
nb_adapt_deduped="${11}"
nb_noadapt="${12}"
nb_noadapt_deduped="${13}"

count_fastq_reads()
{
    # Count the fastq records arriving on stdin, assuming 4 lines per record,
    # and write that number (followed by a newline) to a file.
    # $1: file in which to write the number of fastq records
    # An incomplete trailing record is not counted (integer division).
    local nblines
    nblines=$(wc -l) || return
    # Shell arithmetic tolerates the leading blanks some wc implementations
    # print, and the quoted redirection target keeps paths containing spaces
    # in one piece.
    printf '%s\n' "$(( nblines / 4 ))" > "${1}"
}

# This script performs 2 sorting and deduplicating operations, depending on the
# presence or absence of the adapter in the read.


# Sort fastq records by their sequence: this is what the -s option of
# fastq-sort does. No file is given, so fastq-sort works on the stdin and
# stdout inherited from the caller. Any failure of fastq-sort is fatal.
sort_by_seq()
{
    if ! fastq-sort -s
    then
        error_exit "fastq-sort failed"
    fi
}

# Once the reads are sorted by sequence,
# successive reads with the same sequence are merged,
# keeping the best quality at each position.
#
# dedup sorts the fastq records read from stdin with sort_by_seq and pipes
# them into remove-duplicates-from-sorted-fastq (looked up by its bare name),
# whose output is left on stdout. A failure of either stage is fatal.
dedup () {
    local -a stage_status
    sort_by_seq | remove-duplicates-from-sorted-fastq
    # Copy PIPESTATUS right away: any further command would overwrite it.
    stage_status=("${PIPESTATUS[@]}")
    if [ "${stage_status[1]}" -ne 0 ]
    then
        error_exit "remove_duplicates_from_sorted_fastq failed"
    fi
    # sort_by_seq runs in a subshell of the pipeline: its own error_exit only
    # ends that subshell, and the pipeline status is the one of the last
    # stage. Its failure therefore has to be checked explicitly here.
    if [ "${stage_status[0]}" -ne 0 ]
    then
        error_exit "sort_by_seq failed"
    fi
}

trim_random_nt()
{
    # Remove a fixed number of bases at both ends of the reads with cutadapt.
    # The trailing "-" is given to cutadapt as its input argument.
    # $1: nb of bases to trim at 5' end
    # $2: nb of bases to trim at 3' end
    # The arguments are quoted so that they reach cutadapt unchanged (no word
    # splitting, no pathname expansion).
    # cutadapt's stderr is deliberately discarded; a failure is still fatal.
    cutadapt -u "${1}" -u "-${2}" - 2> /dev/null || error_exit "trim_random_nt failed"
}


# This named pipe is used to avoid writing the intermediate file to disk.
# It will transmit reads that did not seem to contain the adapter to the
# second sorting and deduplicating.
# A named pipe already present at that path (for instance left behind by an
# interrupted run) is reused; failing to create it is a fatal error, since
# the rest of the script relies on it.
if [[ ! -p "${untrimmed_out}.fifo" ]]
then
    mkfifo "${untrimmed_out}.fifo" || error_exit "Could not create named pipe ${untrimmed_out}.fifo"
fi

# Minimal read length required after adapter trimming: the two stretches of
# random nucleotides plus 16 more bases.
minsize_trimmed=$(echo "${fiveprime_random} + 16 + ${threeprime_random}" | bc)

# Build the adapter trimming command as a string in trim_cmd. It reads the
# fastq records on stdin, sends the reads in which the adapter was not found
# to the named pipe and appends its report to ${log}.
# NOTE(review): file names are embedded unquoted in this string, so they must
# not contain blanks or shell metacharacters if the string is later eval'ed.
case "${trimmer}" in
    "cutadapt")
        # MAX_ERROR_RATE is not set in this script (presumably provided by the
        # caller's environment); when non-empty it is passed to the -e option.
        error_opt=""
        if [[ -n "${MAX_ERROR_RATE:-}" ]]
        then
            error_opt="-e ${MAX_ERROR_RATE}"
        fi
        # THREADS is accepted but ignored for the time being, with a warning.
        #thread_opt="-j ${THREADS}"
        thread_opt=""
        if [[ -n "${THREADS:-}" ]]
        then
            >&2 echo "Cutadapt multithreading not working fully yet. Ignoring THREADS."
        fi
        # -m ${minsize_trimmed} is to discard reads that are shorter than this after trimming
        trim_cmd="${trimmer} -a ${adapt} -m ${minsize_trimmed} ${error_opt} ${thread_opt} --untrimmed-output=${untrimmed_out}.fifo - 2> ${log}"
        ;;
    "fastx_clipper")
        # The input is duplicated with tee: one clipper keeps the reads
        # without adapter (sent to the named pipe), the other one the reads
        # with adapter (left on stdout).
        # -n is to keep reads with N
        # -l is equivalent to -m in cutadapt (not sure it has effect with -C)
        # -c is to keep only the trimmed reads
        # -C is to keep only the non-trimmed reads
        # -v is to have verbose things to put in the log
        trim_cmd="tee >(${trimmer} -a ${adapt} -l ${minsize_trimmed} -C -M 3 -n > ${untrimmed_out}.fifo) | ${trimmer} -a ${adapt} -l ${minsize_trimmed} -c -M 3 -n -v 2>> ${log}"
        ;;
    *)
        error_exit "Trimming adapter with ${trimmer} not implemented."
        ;;
esac

# Process the reads in which the adapter was found: trim the adapter with
# ${trim_cmd}, deduplicate, then remove the random nucleotides that helped
# identify PCR duplicates (this is a second cutadapt step, done by
# trim_random_nt) and gzip the result into ${trimmed_and_dedup_out}.
# The command line is printed on stdout before being run.
dedup_trimmed()
{
    # stdin: fastq records to process
    # $1: file in which to write the number of fastq records after adapter trimming
    # $2: file in which to write the number of fastq records after deduplication
    local cmd nb_trimmed_file nb_deduped_file
    # The count file names end up inside a string given to eval: shell-escape
    # them so that they survive this second round of parsing.
    printf -v nb_trimmed_file '%q' "${1}"
    printf -v nb_deduped_file '%q' "${2}"
    cmd="${trim_cmd} | tee >(count_fastq_reads ${nb_trimmed_file}) | dedup | trim_random_nt ${fiveprime_random} ${threeprime_random} | tee >(count_fastq_reads ${nb_deduped_file}) | gzip"
    printf '%s\n' "${cmd}"
    # Quoting ${cmd} keeps eval from receiving a string already altered by
    # word splitting and pathname expansion.
    eval "${cmd}" > "${trimmed_and_dedup_out}" || error_exit "${cmd} failed"
}

dedup_untrimmed()
{
    # Process the reads in which the adapter was not found: deduplicate,
    # remove the random nucleotides (with the extra margin at the 3' end,
    # ${threeprime_random_with_margin}) and gzip the result into
    # ${untrimmed_out}. The command line is printed on stdout before being run.
    # stdin: fastq records to process
    # $1: file in which to write the number of fastq records after deduplication
    local cmd nb_deduped_file
    # The count file name ends up inside a string given to eval: shell-escape
    # it so that it survives this second round of parsing.
    printf -v nb_deduped_file '%q' "${1}"
    # dedup reads the stdin of this function directly, no "cat -" is needed.
    cmd="dedup | trim_random_nt ${fiveprime_random} ${threeprime_random_with_margin} | tee >(count_fastq_reads ${nb_deduped_file}) | gzip"
    printf '%s\n' "${cmd}"
    # Quoting ${cmd} keeps eval from receiving a string already altered by
    # word splitting and pathname expansion.
    eval "${cmd}" > "${untrimmed_out}" || error_exit "${cmd} failed"
}

# Remove the named pipe however the script ends, including the error_exit
# below.
trap 'rm -f -- "${untrimmed_out}.fifo"' EXIT

# Choose how to read the raw input from the first word of the brief (-b)
# description given by file(1); -L follows symbolic links. Without the
# leading "name:" part, a file name containing blanks cannot shift the word
# that is examined.
filetype="$(file -L -b "${raw_in}")"
filetype="${filetype%% *}"
case "${filetype}" in
    "gzip")
        cat_cmd="zcat"
        ;;
    "ASCII")
        cat_cmd="cat"
        ;;
    *)
        error_exit "Unexpected file type for ${raw_in}"
        ;;
esac

# First pipeline, in the background: count the raw reads, then trim and
# deduplicate the reads containing the adapter. The trimming command sends
# the other reads to the named pipe.
"${cat_cmd}" "${raw_in}" \
    | tee >(count_fastq_reads "${nb_raw}") \
    | dedup_trimmed "${nb_adapt}" "${nb_adapt_deduped}" &
pid_to_wait=$!

# Second pipeline, in the foreground: read the named pipe, count the reads
# without adapter and deduplicate them. As before, the named pipe is removed
# as soon as this pipeline fails.
status=0
cat "${untrimmed_out}.fifo" \
    | tee >(count_fastq_reads "${nb_noadapt}") \
    | dedup_untrimmed "${nb_noadapt_deduped}" \
    || { status=$?; rm -f -- "${untrimmed_out}.fifo"; }

# Report a failure of either pipeline to the caller instead of always
# exiting with 0.
wait "${pid_to_wait}" || status=$?

exit "${status}"