Skip to content
Snippets Groups Projects
Commit 4777a77c authored by Blaise Li's avatar Blaise Li
Browse files

Configurable 5' and 3' UMI sizes, count reads.

parent 55e1c85f
No related branches found
No related tags found
No related merge requests found
#!/usr/bin/env bash
# Usage: PRO-seq_trim_and_dedup.sh <raw fastq> <ADAPTER> <nb 5'> <nb 3'> <trimmed fastq> <untrimmed fastq> <cutadapt log> <nb_raw> <nb_adapt> <nb_adapt_deduped> <nb_noadapt> <nb_noadapt_deduped>
# (Former usage, before the 5' and 3' UMI sizes became configurable:
#  PRO-seq_trim_and_dedup.sh <raw fastq> <ADAPTER> <trimmed fastq> <untrimmed fastq>)
# http://linuxcommand.org/wss0150.php
# Name of this script without its directory part.
# "$0" is quoted so that a path containing spaces is handled correctly.
PROGNAME=$(basename -- "${0}")
......@@ -22,15 +22,31 @@ raw_in=${1}
# Positional parameters, in the order given in the usage line.
# Adapter sequence passed to cutadapt -a
adapt="${2}"
# Number of bases to trim at the 5' and 3' ends of the reads
fiveprime_random="${3}"
threeprime_random="${4}"
# Output files: gzipped reads (with / without adapter found) and cutadapt log
trimmed_and_dedup_out="${5}"
untrimmed_out="${6}"
log="${7}"
# Files in which the numbers of fastq records are written
nb_raw="${8}"
nb_adapt="${9}"
nb_adapt_deduped="${10}"
nb_noadapt="${11}"
nb_noadapt_deduped="${12}"
# This script performs 2 sorting and deduplicating operations, depending on the
# presence or absence of the adapter in the read.
count_fastq_reads()
{
    # Counts the fastq records read on stdin.
    # $1: file in which to write the number of fastq records
    # The number of records is the number of lines divided by 4 (integer
    # division, so an incomplete trailing record is not counted).
    local nblines
    nblines=$(wc -l) || return
    # The redirection target is quoted so that paths with spaces work.
    printf '%s\n' "$(( nblines / 4 ))" > "${1}"
}
# The -s option of fastq-sort sorts the reads by their sequence.
sort_by_seq()
{
......@@ -40,36 +56,52 @@ sort_by_seq()
# Once the reads are sorted by sequence,
# successive reads with the same sequence are merged,
# keeping the best quality at each position.
dedup()
{
    # Reads fastq records on stdin and writes the deduplicated records on
    # stdout. The records are first sorted by sequence (sort_by_seq) because
    # remove-duplicates-from-sorted-fastq expects sorted input.
    # remove-duplicates-from-sorted-fastq is called by its bare name, so it
    # has to be found in the PATH.
    sort_by_seq | remove-duplicates-from-sorted-fastq || error_exit "remove_duplicates_from_sorted_fastq failed"
}
trim_random_nt()
{
    # Removes bases from both ends of the fastq reads received on stdin and
    # writes the result on stdout.
    # $1: nb of bases to trim at 5' end
    # $2: nb of bases to trim at 3' end
    local nb_five="${1}"
    local nb_three="${2}"
    # A missing or non-numeric count would otherwise be turned into a
    # malformed option list for cutadapt (e.g. "-u -u -4 -").
    if ! [[ "${nb_five}" =~ ^[0-9]+$ && "${nb_three}" =~ ^[0-9]+$ ]]; then
        error_exit "trim_random_nt: expected two non-negative integers, got '${nb_five}' and '${nb_three}'"
        return 1
    fi
    # A positive -u value trims at the start of the read, a negative one at
    # the end.
    cutadapt -u "${nb_five}" -u "-${nb_three}" - || error_exit "trim_random_nt failed"
}
# This named pipe is used to avoid writing the intermediate file to disk.
# It will transmit reads that did not seem to contain the adapter to the
# second sorting and deduplicating.
# A fifo that already exists at this path is reused.
[[ -p "${untrimmed_out}.fifo" ]] || mkfifo "${untrimmed_out}.fifo" || error_exit "mkfifo ${untrimmed_out}.fifo failed"
# Minimal read length after adapter trimming: 20 bases in addition to the
# 5' and 3' random nucleotides that are removed afterwards.
minsize_trimmed=$(( fiveprime_random + 20 + threeprime_random ))
# -m ${minsize_trimmed} is to discard reads that are shorter than this after trimming
# a second cutadapt step removes the random nucleotides that helped identify PCR duplicates.
dedup_trimmed()
{
    # Processes the reads in which the adapter was found.
    # Raw fastq records are read on stdin; cutadapt sends the reads without
    # adapter to ${untrimmed_out}.fifo and its report to ${log}. The trimmed
    # reads are deduplicated, their random 5' and 3' nucleotides are removed,
    # and the result is gzipped into ${trimmed_and_dedup_out}.
    # $1: file in which to write the number of fastq records after adapter trimming
    # $2: file in which to write the number of fastq records after deduplication
    # The command is built as a string so that it can be both displayed and
    # executed; %q quotes every value so that paths containing spaces or
    # other special characters survive the eval.
    local cmd
    printf -v cmd 'cutadapt -a %q -m %q --untrimmed-output=%q - 2> %q | tee >(count_fastq_reads %q) | dedup | trim_random_nt %q %q | tee >(count_fastq_reads %q) | gzip' \
        "${adapt}" "${minsize_trimmed}" "${untrimmed_out}.fifo" "${log}" \
        "${1}" "${fiveprime_random}" "${threeprime_random}" "${2}"
    printf '%s\n' "${cmd}"
    eval "${cmd}" > "${trimmed_and_dedup_out}" || error_exit "${cmd} failed"
}
dedup_untrimmed()
{
    # Processes the reads in which the adapter was not found.
    # Fastq records are read on stdin, deduplicated, their random 5' and 3'
    # nucleotides are removed, and the result is gzipped into
    # ${untrimmed_out}.
    # $1: file in which to write the number of fastq records after deduplication
    # The command is built as a string so that it can be both displayed and
    # executed; %q quotes every value so that paths containing spaces or
    # other special characters survive the eval.
    local cmd
    printf -v cmd 'cat - | dedup | cutadapt -u -%q -u %q - | tee >(count_fastq_reads %q) | gzip' \
        "${threeprime_random}" "${fiveprime_random}" "${1}"
    printf '%s\n' "${cmd}"
    eval "${cmd}" > "${untrimmed_out}" || error_exit "${cmd} failed"
}
# First branch, run in the background: the raw reads are counted and given to
# dedup_trimmed, whose cutadapt step writes the reads without adapter into
# the named pipe.
zcat "${raw_in}" \
    | tee >(count_fastq_reads "${nb_raw}") \
    | dedup_trimmed "${nb_adapt}" "${nb_adapt_deduped}" &
# Second branch, run in the foreground: the reads without adapter are read
# from the named pipe, counted and given to dedup_untrimmed.
cat "${untrimmed_out}.fifo" \
    | tee >(count_fastq_reads "${nb_noadapt}") \
    | dedup_untrimmed "${nb_noadapt_deduped}"
# The named pipe reaches its end before the background branch has finished
# deduplicating and compressing: wait for it, so that the script does not
# exit while ${trimmed_and_dedup_out} is still being written.
wait
# The named pipe is removed whether the steps above succeeded or not.
rm -f -- "${untrimmed_out}.fifo"
# NOTE(review): failures of the two branches are not reflected in the exit
# status (this was already the case) - confirm that callers check the outputs.
exit 0
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment