diff --git a/CLIP/iCLIP_trim_and_dedup.sh b/CLIP/iCLIP_trim_and_dedup.sh index dca315ad5b9d4c42749c89d6966eddf3ad9b89ce..9acbe2049ce4121164fcf5d837c519be310a2572 100755 --- a/CLIP/iCLIP_trim_and_dedup.sh +++ b/CLIP/iCLIP_trim_and_dedup.sh @@ -71,11 +71,6 @@ strip_low_qual_zones() { bioawk -c fastx '{print $name"\t"substr($seq, 1, 5)""substr($seq, 12, 3)""substr($seq, 18)"\t"substr($qual, 1, 5)""substr($qual, 12, 3)""substr($qual, 18)}' | mawk '{print "@"$1"\n"$2"\n+\n"$3}' } -# Don't forget to remove ${total_fiveprime} and not ${fiveprime_random} when no stripping is done: -no_strip() -{ - bioawk -c fastx '{print "@"$name"\n"$seq"\n+\n"$qual}' -} # This script performs 2 sorting and deduplicating operations, depending on the # presence or absence of the adapter in the read. @@ -162,8 +157,7 @@ dedup_trimmed() { # $1: file in which to write the number of fastq records after adapter trimming # $2: file in which to write the number of fastq records after deduplication - #cmd="${trim_cmd} | tee >(process_without_deduplication ${trimmed_nodedup_out}) | strip_low_qual_zones | tee >(count_fastq_reads ${1}) | dedup | trim_random_nt ${fiveprime_random} ${threeprime_random} | tee >(count_fastq_reads ${2}) | gzip" - cmd="${trim_cmd} | tee >(process_without_deduplication ${trimmed_nodedup_out}) | no_strip | tee >(count_fastq_reads ${1}) | dedup | trim_random_nt ${total_fiveprime} ${threeprime_random} | tee >(count_fastq_reads ${2}) | gzip" + cmd="${trim_cmd} | tee >(process_without_deduplication ${trimmed_nodedup_out}) | strip_low_qual_zones | tee >(count_fastq_reads ${1}) | dedup | trim_random_nt ${fiveprime_random} ${threeprime_random} | tee >(count_fastq_reads ${2}) | gzip" echo ${cmd} eval ${cmd} > ${trimmed_and_dedup_out} || error_exit "${cmd} failed" } diff --git a/CLIP/iCLIP_trim_and_dedup_nostrip.sh b/CLIP/iCLIP_trim_and_dedup_nostrip.sh new file mode 100755 index 0000000000000000000000000000000000000000..dca315ad5b9d4c42749c89d6966eddf3ad9b89ce --- /dev/null +++ b/CLIP/iCLIP_trim_and_dedup_nostrip.sh @@ -0,0 +1,209 @@ +#!/usr/bin/env bash +# Usage: iCLIP_trim_and_dedup.sh <trimmer> <raw fastq> <ADAPTER> <nb 5'> <nb 3'> <trimmed fastq> <untrimmed fastq> <trimmed_nodedup> <trimmer log> <nb_raw> <nb_adapt> <nb_adapt_deduped> <nb_noadapt> <nb_noadapt_deduped> + +# http://linuxcommand.org/wss0150.php +PROGNAME=$(basename $0) + +function error_exit +{ +# ---------------------------------------------------------------- +# Function for exit due to fatal program error +# Accepts 1 argument: +# string containing descriptive error message +# ---------------------------------------------------------------- + echo "${PROGNAME}: ${1:-"Unknown Error"}" 1>&2 + exit 1 +} + +# http://stackoverflow.com/questions/59895/can-a-bash-script-tell-what-directory-its-stored-in +#PACKAGEDIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd ) + +trimmer=${1} + +raw_in=${2} + +adapt=${3} + +# Sum of lengths of UMIs both left and right of barcode +fiveprime_random=${4} +# Adding lenghts of "low diversity zones" +# (to process without deduplicating) +total_fiveprime=$((${fiveprime_random}+9)) +# Length of the 3' UMI +threeprime_random=${5} +# For the untrimmed, the adaptor was not found, +# but maybe its first 3 bases were there. +threeprime_random_with_margin=$((${threeprime_random}+3)) +echo ${threeprime_random_with_margin} + +# fastq files +trimmed_and_dedup_out=${6} +untrimmed_out=${7} +trimmed_nodedup_out=${8} + +log=${9} + +# count files +nb_raw=${10} +nb_adapt=${11} +nb_adapt_deduped=${12} +nb_noadapt=${13} +nb_noadapt_deduped=${14} + +count_fastq_reads() +{ + # $1: file in which to write the number of fastq records + wc -l | { read nblines; echo ${nblines} / 4 | bc > ${1}; } +} + +# Removing parts that have lower diversity, +# and therefore more likely to have sequencing errors (6-11,15-17) +# Reads should be: +# +# NNNNNGCACTANNNWWW[YYYY]NNNN +# 1---5 : 5' UMI +# 6--11: barcode (lower diversity) +# 12-14: UMI +# 15-17: AT(or GC?)-rich (low diversity) +# [fragment] +# -4 -> -1: 3' UMI +strip_low_qual_zones() +{ + bioawk -c fastx '{print $name"\t"substr($seq, 1, 5)""substr($seq, 12, 3)""substr($seq, 18)"\t"substr($qual, 1, 5)""substr($qual, 12, 3)""substr($qual, 18)}' | mawk '{print "@"$1"\n"$2"\n+\n"$3}' +} +# Don't forget to remove ${total_fiveprime} and not ${fiveprime_random} when no stripping is done: +no_strip() +{ + bioawk -c fastx '{print "@"$name"\n"$seq"\n+\n"$qual}' +} + +# This script performs 2 sorting and deduplicating operations, depending on the +# presence or absence of the adapter in the read. + + +# The -s option of fastq-sort sorts the reads by their sequence. +sort_by_seq() +{ + TMPDIR="/var/tmp" fastq-sort -s || error_exit "fastq-sort failed" +} + +# Once the reads are sorted by sequence, +# successive reads with the same sequence are merged, +# keeping the best quality at each position. + +dedup () { + #${PACKAGEDIR}/remove_duplicates_from_sorted_fastq/remove_duplicates_from_sorted_fastq || error_exit "remove_duplicates_from_sorted_fastq failed" + #remove_duplicates_from_sorted_fastq || error_exit "remove_duplicates_from_sorted_fastq failed" + sort_by_seq | remove-duplicates-from-sorted-fastq || error_exit "remove_duplicates_from_sorted_fastq failed" +} + +trim_random_nt() +{ + # $1: nb of bases to trim at 5' end + # $2: nb of bases to trim at 3' end + cutadapt -u ${1} -u -${2} - 2> /dev/null || error_exit "trim_random_nt failed" +} + +process_without_deduplication () { + # $1: compressed fastq file in which to write the trimmed reads + trim_random_nt ${total_fiveprime} ${threeprime_random} | gzip > ${1} +} + +# This named pipe is used to avoid writing the intermediate file to disk +# It will transmit reads that did not seem to contain the adapter to the +# second sorting and deduplicating. +mkfifo ${untrimmed_out}.fifo + +minsize_trimmed=$(echo "${fiveprime_random} + 16 + ${threeprime_random}" | bc) + +case "${trimmer}" in + "cutadapt") + if [ ${MAX_ERROR_RATE} ] + then + #>&2 echo "Cutadapt multithreading not working fully yet. Ignoring THREADS." + error_opt="-e ${MAX_ERROR_RATE}" + else + error_opt="" + fi + if [ ${THREADS} ] + then + #thread_opt="-j ${THREADS}" + >&2 echo "Cutadapt multithreading not working fully yet. Ignoring THREADS." + thread_opt="" + else + thread_opt="" + fi + # -m ${minsize_random} is to discard reads that are shorter than this after trimming + trim_cmd="${trimmer} -a ${adapt} -m ${minsize_trimmed} ${error_opt} ${thread_opt} --untrimmed-output=${untrimmed_out}.fifo - 2> ${log}" + ;; + "fastx_clipper") + # -n is to keep reads with N + # -l is equivalent to -m in cutadapt (not sure it has effect with -C) + # -c is to keep only the trimmed reads + # -C is to keep only the non-trimmed reads + # -v is to have verbose things to put in the log + trim_cmd="tee >(${trimmer} -a ${adapt} -l ${minsize_trimmed} -C -M 3 -n > ${untrimmed_out}.fifo) | ${trimmer} -a ${adapt} -l ${minsize_trimmed} -c -M 3 -n -v 2>> ${log}" + ;; + *) + error_exit "Trimming adapter with ${trimmer} not implemented." + ;; +esac + +# a second cutadapt step removes the random nucleotides that helped identify PCR duplicates. +#dedup_trimmed() +#{ +# # $1: file in which to write the number of fastq records after adapter trimming +# # $2: file in which to write the number of fastq records after deduplication +# cmd="${trim_cmd} | strip_low_qual_zones | tee >(count_fastq_reads ${1}) | dedup | trim_random_nt ${fiveprime_random} ${threeprime_random} | tee >(count_fastq_reads ${2}) | gzip" +# echo ${cmd} +# eval ${cmd} > ${trimmed_and_dedup_out} || error_exit "${cmd} failed" +#} +dedup_trimmed() +{ + # $1: file in which to write the number of fastq records after adapter trimming + # $2: file in which to write the number of fastq records after deduplication + #cmd="${trim_cmd} | tee >(process_without_deduplication ${trimmed_nodedup_out}) | strip_low_qual_zones | tee >(count_fastq_reads ${1}) | dedup | trim_random_nt ${fiveprime_random} ${threeprime_random} | tee >(count_fastq_reads ${2}) | gzip" + cmd="${trim_cmd} | tee >(process_without_deduplication ${trimmed_nodedup_out}) | no_strip | tee >(count_fastq_reads ${1}) | dedup | trim_random_nt ${total_fiveprime} ${threeprime_random} | tee >(count_fastq_reads ${2}) | gzip" + echo ${cmd} + eval ${cmd} > ${trimmed_and_dedup_out} || error_exit "${cmd} failed" +} + +#dedup_untrimmed() +#{ +# # $1: file in which to write the number of fastq records after deduplication +# cmd="cat - | strip_low_qual_zones | dedup | trim_random_nt ${fiveprime_random} ${threeprime_random_with_margin} | tee >(count_fastq_reads ${1}) | gzip" +# echo ${cmd} +# eval ${cmd} > ${untrimmed_out} || error_exit "${cmd} failed" +#} +dedup_untrimmed() +{ + # $1: file in which to write the number of fastq records after deduplication + cmd="cat - | strip_low_qual_zones | dedup | trim_random_nt ${fiveprime_random} ${threeprime_random_with_margin} | tee >(count_fastq_reads ${1}) | gzip" + echo ${cmd} + eval ${cmd} > ${untrimmed_out} || error_exit "${cmd} failed" +} + +filetype="$(file -L ${raw_in} | cut -d " " -f2)" +case "${filetype}" in + "gzip") + cat_cmd="zcat" + ;; + "ASCII") + cat_cmd="cat" + ;; + *) + error_exit "Unexpected file type for ${raw_in}" + ;; +esac + +${cat_cmd} ${raw_in} \ + | tee >(count_fastq_reads ${nb_raw}) \ + | dedup_trimmed ${nb_adapt} ${nb_adapt_deduped} & +pid_to_wait=$! +cat ${untrimmed_out}.fifo \ + | tee >(count_fastq_reads ${nb_noadapt}) \ + | dedup_untrimmed ${nb_noadapt_deduped} || rm -f ${untrimmed_out}.fifo +wait ${pid_to_wait} +rm -f ${untrimmed_out}.fifo + +exit 0 diff --git a/PRO-seq_trim_and_dedup.sh b/PRO-seq_trim_and_dedup.sh new file mode 120000 index 0000000000000000000000000000000000000000..eebf79afbaa345bdaff165de72ba763f4623af78 --- /dev/null +++ b/PRO-seq_trim_and_dedup.sh @@ -0,0 +1 @@ +PRO-seq/PRO-seq_trim_and_dedup.sh \ No newline at end of file diff --git a/PRO-seq_trim_only.sh b/PRO-seq_trim_only.sh new file mode 120000 index 0000000000000000000000000000000000000000..fbe67dccfdf082d316e441a83b5fed3d6995dbb6 --- /dev/null +++ b/PRO-seq_trim_only.sh @@ -0,0 +1 @@ +PRO-seq/PRO-seq_trim_only.sh \ No newline at end of file diff --git a/iCLIP_trim_and_dedup.sh b/iCLIP_trim_and_dedup.sh new file mode 120000 index 0000000000000000000000000000000000000000..a8f0c4390016ce12a4b514bde271725fb3e3e461 --- /dev/null +++ b/iCLIP_trim_and_dedup.sh @@ -0,0 +1 @@ +CLIP/iCLIP_trim_and_dedup.sh \ No newline at end of file diff --git a/small_RNA-seq/small_RNA-seq_trim_and_dedup.sh b/small_RNA-seq/small_RNA-seq_trim_and_dedup.sh new file mode 100755 index 0000000000000000000000000000000000000000..5a861a5286fe747849f8aaeada23c44a9a86dc48 --- /dev/null +++ b/small_RNA-seq/small_RNA-seq_trim_and_dedup.sh @@ -0,0 +1,57 @@ +#!/usr/bin/env bash +# Usage: PRO-seq_trim_and_dedup.sh <raw fastq> <ADAPTER> <trimmed fastq> <untrimmed fastq> + +# http://linuxcommand.org/wss0150.php +PROGNAME=$(basename $0) + +function error_exit +{ +# ---------------------------------------------------------------- +# Function for exit due to fatal program error +# Accepts 1 argument: +# string containing descriptive error message +# ---------------------------------------------------------------- + echo "${PROGNAME}: ${1:-"Unknown Error"}" 1>&2 + exit 1 +} + +# http://stackoverflow.com/questions/59895/can-a-bash-script-tell-what-directory-its-stored-in +#PACKAGEDIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd ) + +raw_in=${1} + +adapt=${2} + +trimmed_and_dedup_out=${3} + +log=${4} + +sort_by_seq() +{ + fastq-sort -s || error_exit "fastq-sort failed" +} + +dedup() +{ + #${PACKAGEDIR}/remove_duplicates_from_sorted_fastq/remove_duplicates_from_sorted_fastq || error_exit "remove_duplicates_from_sorted_fastq failed" + #remove_duplicates_from_sorted_fastq || error_exit "remove_duplicates_from_sorted_fastq failed" + remove-duplicates-from-sorted-fastq || error_exit "remove_duplicates_from_sorted_fastq failed" +} + +trim_random_nt() +{ + cutadapt -u -4 -u 4 - || error_exit "trim_random_nt failed" +} + +count_fastq_reads() +{ + wc -l | { read nblines; echo ${nblines} / 4 | bc > ${1}; } +} + +cmd="cutadapt -a ${adapt} -M 38 --discard-untrimmed ${raw_in} 2> ${log} | "'tee >(count_fastq_reads nb_trimmed.txt) | sort_by_seq | dedup | tee >(count_fastq_reads nb_deduped.txt) | trim_random_nt | gzip' +#cmd="cutadapt -a ${adapt} -M 38 --discard-untrimmed ${raw_in} 2> ${log} | "'tee >(wc -l | { read nblines; echo ${nblines} / 4 | bc > nb_trimmed.txt; }) | sort_by_seq | dedup | tee >(wc -l | { read nblines; echo ${nblines} / 4 | bc > nb_deduped.txt; }) | trim_random_nt | gzip' +echo ${cmd} +eval ${cmd} > ${trimmed_and_dedup_out} || error_exit "${cmd} failed" + + +exit 0 diff --git a/small_RNA-seq_trim_and_dedup.sh b/small_RNA-seq_trim_and_dedup.sh new file mode 120000 index 0000000000000000000000000000000000000000..059ba31e1eb1eade9d2c872321bf8346ca46895a --- /dev/null +++ b/small_RNA-seq_trim_and_dedup.sh @@ -0,0 +1 @@ +small_RNA-seq/small_RNA-seq_trim_and_dedup.sh \ No newline at end of file