diff --git a/CLIP/iCLIP_trim_and_dedup.sh b/CLIP/iCLIP_trim_and_dedup.sh
index dca315ad5b9d4c42749c89d6966eddf3ad9b89ce..9acbe2049ce4121164fcf5d837c519be310a2572 100755
--- a/CLIP/iCLIP_trim_and_dedup.sh
+++ b/CLIP/iCLIP_trim_and_dedup.sh
@@ -71,11 +71,6 @@ strip_low_qual_zones()
{
bioawk -c fastx '{print $name"\t"substr($seq, 1, 5)""substr($seq, 12, 3)""substr($seq, 18)"\t"substr($qual, 1, 5)""substr($qual, 12, 3)""substr($qual, 18)}' | mawk '{print "@"$1"\n"$2"\n+\n"$3}'
}
-# Don't forget to remove ${total_fiveprime} and not ${fiveprime_random} when no stripping is done:
-no_strip()
-{
- bioawk -c fastx '{print "@"$name"\n"$seq"\n+\n"$qual}'
-}
# This script performs 2 sorting and deduplicating operations, depending on the
# presence or absence of the adapter in the read.
@@ -162,8 +157,7 @@ dedup_trimmed()
{
# $1: file in which to write the number of fastq records after adapter trimming
# $2: file in which to write the number of fastq records after deduplication
- #cmd="${trim_cmd} | tee >(process_without_deduplication ${trimmed_nodedup_out}) | strip_low_qual_zones | tee >(count_fastq_reads ${1}) | dedup | trim_random_nt ${fiveprime_random} ${threeprime_random} | tee >(count_fastq_reads ${2}) | gzip"
- cmd="${trim_cmd} | tee >(process_without_deduplication ${trimmed_nodedup_out}) | no_strip | tee >(count_fastq_reads ${1}) | dedup | trim_random_nt ${total_fiveprime} ${threeprime_random} | tee >(count_fastq_reads ${2}) | gzip"
+ cmd="${trim_cmd} | tee >(process_without_deduplication ${trimmed_nodedup_out}) | strip_low_qual_zones | tee >(count_fastq_reads ${1}) | dedup | trim_random_nt ${fiveprime_random} ${threeprime_random} | tee >(count_fastq_reads ${2}) | gzip"
echo ${cmd}
eval ${cmd} > ${trimmed_and_dedup_out} || error_exit "${cmd} failed"
}
diff --git a/CLIP/iCLIP_trim_and_dedup_nostrip.sh b/CLIP/iCLIP_trim_and_dedup_nostrip.sh
new file mode 100755
index 0000000000000000000000000000000000000000..dca315ad5b9d4c42749c89d6966eddf3ad9b89ce
--- /dev/null
+++ b/CLIP/iCLIP_trim_and_dedup_nostrip.sh
@@ -0,0 +1,209 @@
+#!/usr/bin/env bash
+# Usage: iCLIP_trim_and_dedup.sh <trimmer> <raw fastq> <ADAPTER> <nb 5'> <nb 3'> <trimmed fastq> <untrimmed fastq> <trimmed_nodedup> <trimmer log> <nb_raw> <nb_adapt> <nb_adapt_deduped> <nb_noadapt> <nb_noadapt_deduped>
+
+# http://linuxcommand.org/wss0150.php
+PROGNAME=$(basename $0)
+
+function error_exit
+{
+# ----------------------------------------------------------------
+# Function for exit due to fatal program error
+# Accepts 1 argument:
+# string containing descriptive error message
+# ----------------------------------------------------------------
+ echo "${PROGNAME}: ${1:-"Unknown Error"}" 1>&2
+ exit 1
+}
+
+# http://stackoverflow.com/questions/59895/can-a-bash-script-tell-what-directory-its-stored-in
+#PACKAGEDIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )
+
+trimmer=${1}
+
+raw_in=${2}
+
+adapt=${3}
+
+# Sum of lengths of UMIs both left and right of barcode
+fiveprime_random=${4}
+# Adding lenghts of "low diversity zones"
+# (to process without deduplicating)
+total_fiveprime=$((${fiveprime_random}+9))
+# Length of the 3' UMI
+threeprime_random=${5}
+# For the untrimmed, the adaptor was not found,
+# but maybe its first 3 bases were there.
+threeprime_random_with_margin=$((${threeprime_random}+3))
+echo ${threeprime_random_with_margin}
+
+# fastq files
+trimmed_and_dedup_out=${6}
+untrimmed_out=${7}
+trimmed_nodedup_out=${8}
+
+log=${9}
+
+# count files
+nb_raw=${10}
+nb_adapt=${11}
+nb_adapt_deduped=${12}
+nb_noadapt=${13}
+nb_noadapt_deduped=${14}
+
+count_fastq_reads()
+{
+ # $1: file in which to write the number of fastq records
+ wc -l | { read nblines; echo ${nblines} / 4 | bc > ${1}; }
+}
+
+# Removing parts that have lower diversity,
+# and therefore more likely to have sequencing errors (6-11,15-17)
+# Reads should be:
+#
+# NNNNNGCACTANNNWWW[YYYY]NNNN
+# 1---5 : 5' UMI
+# 6--11: barcode (lower diversity)
+# 12-14: UMI
+# 15-17: AT(or GC?)-rich (low diversity)
+# [fragment]
+# -4 -> -1: 3' UMI
+strip_low_qual_zones()
+{
+ bioawk -c fastx '{print $name"\t"substr($seq, 1, 5)""substr($seq, 12, 3)""substr($seq, 18)"\t"substr($qual, 1, 5)""substr($qual, 12, 3)""substr($qual, 18)}' | mawk '{print "@"$1"\n"$2"\n+\n"$3}'
+}
+# Don't forget to remove ${total_fiveprime} and not ${fiveprime_random} when no stripping is done:
+no_strip()
+{
+ bioawk -c fastx '{print "@"$name"\n"$seq"\n+\n"$qual}'
+}
+
+# This script performs 2 sorting and deduplicating operations, depending on the
+# presence or absence of the adapter in the read.
+
+
+# The -s option of fastq-sort sorts the reads by their sequence.
+sort_by_seq()
+{
+ TMPDIR="/var/tmp" fastq-sort -s || error_exit "fastq-sort failed"
+}
+
+# Once the reads are sorted by sequence,
+# successive reads with the same sequence are merged,
+# keeping the best quality at each position.
+
+dedup () {
+ #${PACKAGEDIR}/remove_duplicates_from_sorted_fastq/remove_duplicates_from_sorted_fastq || error_exit "remove_duplicates_from_sorted_fastq failed"
+ #remove_duplicates_from_sorted_fastq || error_exit "remove_duplicates_from_sorted_fastq failed"
+ sort_by_seq | remove-duplicates-from-sorted-fastq || error_exit "remove_duplicates_from_sorted_fastq failed"
+}
+
+trim_random_nt()
+{
+ # $1: nb of bases to trim at 5' end
+ # $2: nb of bases to trim at 3' end
+ cutadapt -u ${1} -u -${2} - 2> /dev/null || error_exit "trim_random_nt failed"
+}
+
+process_without_deduplication () {
+ # $1: compressed fastq file in which to write the trimmed reads
+ trim_random_nt ${total_fiveprime} ${threeprime_random} | gzip > ${1}
+}
+
+# This named pipe is used to avoid writing the intermediate file to disk
+# It will transmit reads that did not seem to contain the adapter to the
+# second sorting and deduplicating.
+mkfifo ${untrimmed_out}.fifo
+
+minsize_trimmed=$(echo "${fiveprime_random} + 16 + ${threeprime_random}" | bc)
+
+case "${trimmer}" in
+ "cutadapt")
+ if [ ${MAX_ERROR_RATE} ]
+ then
+ #>&2 echo "Cutadapt multithreading not working fully yet. Ignoring THREADS."
+ error_opt="-e ${MAX_ERROR_RATE}"
+ else
+ error_opt=""
+ fi
+ if [ ${THREADS} ]
+ then
+ #thread_opt="-j ${THREADS}"
+ >&2 echo "Cutadapt multithreading not working fully yet. Ignoring THREADS."
+ thread_opt=""
+ else
+ thread_opt=""
+ fi
+ # -m ${minsize_random} is to discard reads that are shorter than this after trimming
+ trim_cmd="${trimmer} -a ${adapt} -m ${minsize_trimmed} ${error_opt} ${thread_opt} --untrimmed-output=${untrimmed_out}.fifo - 2> ${log}"
+ ;;
+ "fastx_clipper")
+ # -n is to keep reads with N
+ # -l is equivalent to -m in cutadapt (not sure it has effect with -C)
+ # -c is to keep only the trimmed reads
+ # -C is to keep only the non-trimmed reads
+ # -v is to have verbose things to put in the log
+ trim_cmd="tee >(${trimmer} -a ${adapt} -l ${minsize_trimmed} -C -M 3 -n > ${untrimmed_out}.fifo) | ${trimmer} -a ${adapt} -l ${minsize_trimmed} -c -M 3 -n -v 2>> ${log}"
+ ;;
+ *)
+ error_exit "Trimming adapter with ${trimmer} not implemented."
+ ;;
+esac
+
+# a second cutadapt step removes the random nucleotides that helped identify PCR duplicates.
+#dedup_trimmed()
+#{
+# # $1: file in which to write the number of fastq records after adapter trimming
+# # $2: file in which to write the number of fastq records after deduplication
+# cmd="${trim_cmd} | strip_low_qual_zones | tee >(count_fastq_reads ${1}) | dedup | trim_random_nt ${fiveprime_random} ${threeprime_random} | tee >(count_fastq_reads ${2}) | gzip"
+# echo ${cmd}
+# eval ${cmd} > ${trimmed_and_dedup_out} || error_exit "${cmd} failed"
+#}
+dedup_trimmed()
+{
+ # $1: file in which to write the number of fastq records after adapter trimming
+ # $2: file in which to write the number of fastq records after deduplication
+ #cmd="${trim_cmd} | tee >(process_without_deduplication ${trimmed_nodedup_out}) | strip_low_qual_zones | tee >(count_fastq_reads ${1}) | dedup | trim_random_nt ${fiveprime_random} ${threeprime_random} | tee >(count_fastq_reads ${2}) | gzip"
+ cmd="${trim_cmd} | tee >(process_without_deduplication ${trimmed_nodedup_out}) | no_strip | tee >(count_fastq_reads ${1}) | dedup | trim_random_nt ${total_fiveprime} ${threeprime_random} | tee >(count_fastq_reads ${2}) | gzip"
+ echo ${cmd}
+ eval ${cmd} > ${trimmed_and_dedup_out} || error_exit "${cmd} failed"
+}
+
+#dedup_untrimmed()
+#{
+# # $1: file in which to write the number of fastq records after deduplication
+# cmd="cat - | strip_low_qual_zones | dedup | trim_random_nt ${fiveprime_random} ${threeprime_random_with_margin} | tee >(count_fastq_reads ${1}) | gzip"
+# echo ${cmd}
+# eval ${cmd} > ${untrimmed_out} || error_exit "${cmd} failed"
+#}
+dedup_untrimmed()
+{
+ # $1: file in which to write the number of fastq records after deduplication
+ cmd="cat - | strip_low_qual_zones | dedup | trim_random_nt ${fiveprime_random} ${threeprime_random_with_margin} | tee >(count_fastq_reads ${1}) | gzip"
+ echo ${cmd}
+ eval ${cmd} > ${untrimmed_out} || error_exit "${cmd} failed"
+}
+
+filetype="$(file -L ${raw_in} | cut -d " " -f2)"
+case "${filetype}" in
+ "gzip")
+ cat_cmd="zcat"
+ ;;
+ "ASCII")
+ cat_cmd="cat"
+ ;;
+ *)
+ error_exit "Unexpected file type for ${raw_in}"
+ ;;
+esac
+
+${cat_cmd} ${raw_in} \
+ | tee >(count_fastq_reads ${nb_raw}) \
+ | dedup_trimmed ${nb_adapt} ${nb_adapt_deduped} &
+pid_to_wait=$!
+cat ${untrimmed_out}.fifo \
+ | tee >(count_fastq_reads ${nb_noadapt}) \
+ | dedup_untrimmed ${nb_noadapt_deduped} || rm -f ${untrimmed_out}.fifo
+wait ${pid_to_wait}
+rm -f ${untrimmed_out}.fifo
+
+exit 0
diff --git a/PRO-seq_trim_and_dedup.sh b/PRO-seq_trim_and_dedup.sh
new file mode 120000
index 0000000000000000000000000000000000000000..eebf79afbaa345bdaff165de72ba763f4623af78
--- /dev/null
+++ b/PRO-seq_trim_and_dedup.sh
@@ -0,0 +1 @@
+PRO-seq/PRO-seq_trim_and_dedup.sh
\ No newline at end of file
diff --git a/PRO-seq_trim_only.sh b/PRO-seq_trim_only.sh
new file mode 120000
index 0000000000000000000000000000000000000000..fbe67dccfdf082d316e441a83b5fed3d6995dbb6
--- /dev/null
+++ b/PRO-seq_trim_only.sh
@@ -0,0 +1 @@
+PRO-seq/PRO-seq_trim_only.sh
\ No newline at end of file
diff --git a/iCLIP_trim_and_dedup.sh b/iCLIP_trim_and_dedup.sh
new file mode 120000
index 0000000000000000000000000000000000000000..a8f0c4390016ce12a4b514bde271725fb3e3e461
--- /dev/null
+++ b/iCLIP_trim_and_dedup.sh
@@ -0,0 +1 @@
+CLIP/iCLIP_trim_and_dedup.sh
\ No newline at end of file
diff --git a/small_RNA-seq/small_RNA-seq_trim_and_dedup.sh b/small_RNA-seq/small_RNA-seq_trim_and_dedup.sh
new file mode 100755
index 0000000000000000000000000000000000000000..5a861a5286fe747849f8aaeada23c44a9a86dc48
--- /dev/null
+++ b/small_RNA-seq/small_RNA-seq_trim_and_dedup.sh
@@ -0,0 +1,57 @@
+#!/usr/bin/env bash
+# Usage: PRO-seq_trim_and_dedup.sh <raw fastq> <ADAPTER> <trimmed fastq> <untrimmed fastq>
+
+# http://linuxcommand.org/wss0150.php
+PROGNAME=$(basename $0)
+
+function error_exit
+{
+# ----------------------------------------------------------------
+# Function for exit due to fatal program error
+# Accepts 1 argument:
+# string containing descriptive error message
+# ----------------------------------------------------------------
+ echo "${PROGNAME}: ${1:-"Unknown Error"}" 1>&2
+ exit 1
+}
+
+# http://stackoverflow.com/questions/59895/can-a-bash-script-tell-what-directory-its-stored-in
+#PACKAGEDIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )
+
+raw_in=${1}
+
+adapt=${2}
+
+trimmed_and_dedup_out=${3}
+
+log=${4}
+
+sort_by_seq()
+{
+ fastq-sort -s || error_exit "fastq-sort failed"
+}
+
+dedup()
+{
+ #${PACKAGEDIR}/remove_duplicates_from_sorted_fastq/remove_duplicates_from_sorted_fastq || error_exit "remove_duplicates_from_sorted_fastq failed"
+ #remove_duplicates_from_sorted_fastq || error_exit "remove_duplicates_from_sorted_fastq failed"
+ remove-duplicates-from-sorted-fastq || error_exit "remove_duplicates_from_sorted_fastq failed"
+}
+
+trim_random_nt()
+{
+ cutadapt -u -4 -u 4 - || error_exit "trim_random_nt failed"
+}
+
+count_fastq_reads()
+{
+ wc -l | { read nblines; echo ${nblines} / 4 | bc > ${1}; }
+}
+
+cmd="cutadapt -a ${adapt} -M 38 --discard-untrimmed ${raw_in} 2> ${log} | "'tee >(count_fastq_reads nb_trimmed.txt) | sort_by_seq | dedup | tee >(count_fastq_reads nb_deduped.txt) | trim_random_nt | gzip'
+#cmd="cutadapt -a ${adapt} -M 38 --discard-untrimmed ${raw_in} 2> ${log} | "'tee >(wc -l | { read nblines; echo ${nblines} / 4 | bc > nb_trimmed.txt; }) | sort_by_seq | dedup | tee >(wc -l | { read nblines; echo ${nblines} / 4 | bc > nb_deduped.txt; }) | trim_random_nt | gzip'
+echo ${cmd}
+eval ${cmd} > ${trimmed_and_dedup_out} || error_exit "${cmd} failed"
+
+
+exit 0
diff --git a/small_RNA-seq_trim_and_dedup.sh b/small_RNA-seq_trim_and_dedup.sh
new file mode 120000
index 0000000000000000000000000000000000000000..059ba31e1eb1eade9d2c872321bf8346ca46895a
--- /dev/null
+++ b/small_RNA-seq_trim_and_dedup.sh
@@ -0,0 +1 @@
+small_RNA-seq/small_RNA-seq_trim_and_dedup.sh
\ No newline at end of file