Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Blaise LI
bioinfo_utils
Commits
964ae912
Commit
964ae912
authored
Nov 18, 2019
by
Blaise Li
Browse files
Added and reorganized trimming scripts.
parent
9a0d87a3
Changes
7
Hide whitespace changes
Inline
Side-by-side
CLIP/iCLIP_trim_and_dedup.sh
View file @
964ae912
...
...
@@ -71,11 +71,6 @@ strip_low_qual_zones()
{
bioawk
-c
fastx
'{print $name"\t"substr($seq, 1, 5)""substr($seq, 12, 3)""substr($seq, 18)"\t"substr($qual, 1, 5)""substr($qual, 12, 3)""substr($qual, 18)}'
| mawk
'{print "@"$1"\n"$2"\n+\n"$3}'
}
# Don't forget to remove ${total_fiveprime} and not ${fiveprime_random} when no stripping is done:
no_strip
()
{
bioawk
-c
fastx
'{print "@"$name"\n"$seq"\n+\n"$qual}'
}
# This script performs 2 sorting and deduplicating operations, depending on the
# presence or absence of the adapter in the read.
...
...
@@ -162,8 +157,7 @@ dedup_trimmed()
{
# $1: file in which to write the number of fastq records after adapter trimming
# $2: file in which to write the number of fastq records after deduplication
#cmd="${trim_cmd} | tee >(process_without_deduplication ${trimmed_nodedup_out}) | strip_low_qual_zones | tee >(count_fastq_reads ${1}) | dedup | trim_random_nt ${fiveprime_random} ${threeprime_random} | tee >(count_fastq_reads ${2}) | gzip"
cmd
=
"
${
trim_cmd
}
| tee >(process_without_deduplication
${
trimmed_nodedup_out
}
) | no_strip | tee >(count_fastq_reads
${
1
}
) | dedup | trim_random_nt
${
total_fiveprime
}
${
threeprime_random
}
| tee >(count_fastq_reads
${
2
}
) | gzip"
cmd
=
"
${
trim_cmd
}
| tee >(process_without_deduplication
${
trimmed_nodedup_out
}
) | strip_low_qual_zones | tee >(count_fastq_reads
${
1
}
) | dedup | trim_random_nt
${
fiveprime_random
}
${
threeprime_random
}
| tee >(count_fastq_reads
${
2
}
) | gzip"
echo
${
cmd
}
eval
${
cmd
}
>
${
trimmed_and_dedup_out
}
||
error_exit
"
${
cmd
}
failed"
}
...
...
CLIP/iCLIP_trim_and_dedup_nostrip.sh
0 → 100755
View file @
964ae912
#!/usr/bin/env bash
# Usage: iCLIP_trim_and_dedup.sh <trimmer> <raw fastq> <ADAPTER> <nb 5'> <nb 3'> <trimmed fastq> <untrimmed fastq> <trimmed_nodedup> <trimmer log> <nb_raw> <nb_adapt> <nb_adapt_deduped> <nb_noadapt> <nb_noadapt_deduped>
# http://linuxcommand.org/wss0150.php
PROGNAME
=
$(
basename
$0
)
function
error_exit
{
# ----------------------------------------------------------------
# Function for exit due to fatal program error
# Accepts 1 argument:
# string containing descriptive error message
# ----------------------------------------------------------------
echo
"
${
PROGNAME
}
:
${
1
:-
"Unknown Error"
}
"
1>&2
exit
1
}
# http://stackoverflow.com/questions/59895/can-a-bash-script-tell-what-directory-its-stored-in
#PACKAGEDIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )
trimmer
=
${
1
}
raw_in
=
${
2
}
adapt
=
${
3
}
# Sum of lengths of UMIs both left and right of barcode
fiveprime_random
=
${
4
}
# Adding lenghts of "low diversity zones"
# (to process without deduplicating)
total_fiveprime
=
$((${
fiveprime_random
}
+
9
))
# Length of the 3' UMI
threeprime_random
=
${
5
}
# For the untrimmed, the adaptor was not found,
# but maybe its first 3 bases were there.
threeprime_random_with_margin
=
$((${
threeprime_random
}
+
3
))
echo
${
threeprime_random_with_margin
}
# fastq files
trimmed_and_dedup_out
=
${
6
}
untrimmed_out
=
${
7
}
trimmed_nodedup_out
=
${
8
}
log
=
${
9
}
# count files
nb_raw
=
${
10
}
nb_adapt
=
${
11
}
nb_adapt_deduped
=
${
12
}
nb_noadapt
=
${
13
}
nb_noadapt_deduped
=
${
14
}
count_fastq_reads
()
{
# $1: file in which to write the number of fastq records
wc
-l
|
{
read
nblines
;
echo
${
nblines
}
/ 4 | bc
>
${
1
}
;
}
}
# Removing parts that have lower diversity,
# and therefore more likely to have sequencing errors (6-11,15-17)
# Reads should be:
#
# NNNNNGCACTANNNWWW[YYYY]NNNN
# 1---5 : 5' UMI
# 6--11: barcode (lower diversity)
# 12-14: UMI
# 15-17: AT(or GC?)-rich (low diversity)
# [fragment]
# -4 -> -1: 3' UMI
strip_low_qual_zones
()
{
bioawk
-c
fastx
'{print $name"\t"substr($seq, 1, 5)""substr($seq, 12, 3)""substr($seq, 18)"\t"substr($qual, 1, 5)""substr($qual, 12, 3)""substr($qual, 18)}'
| mawk
'{print "@"$1"\n"$2"\n+\n"$3}'
}
# Don't forget to remove ${total_fiveprime} and not ${fiveprime_random} when no stripping is done:
no_strip
()
{
bioawk
-c
fastx
'{print "@"$name"\n"$seq"\n+\n"$qual}'
}
# This script performs 2 sorting and deduplicating operations, depending on the
# presence or absence of the adapter in the read.
# The -s option of fastq-sort sorts the reads by their sequence.
sort_by_seq
()
{
TMPDIR
=
"/var/tmp"
fastq-sort
-s
||
error_exit
"fastq-sort failed"
}
# Once the reads are sorted by sequence,
# successive reads with the same sequence are merged,
# keeping the best quality at each position.
dedup
()
{
#${PACKAGEDIR}/remove_duplicates_from_sorted_fastq/remove_duplicates_from_sorted_fastq || error_exit "remove_duplicates_from_sorted_fastq failed"
#remove_duplicates_from_sorted_fastq || error_exit "remove_duplicates_from_sorted_fastq failed"
sort_by_seq | remove-duplicates-from-sorted-fastq
||
error_exit
"remove_duplicates_from_sorted_fastq failed"
}
trim_random_nt
()
{
# $1: nb of bases to trim at 5' end
# $2: nb of bases to trim at 3' end
cutadapt
-u
${
1
}
-u
-
${
2
}
- 2> /dev/null
||
error_exit
"trim_random_nt failed"
}
process_without_deduplication
()
{
# $1: compressed fastq file in which to write the trimmed reads
trim_random_nt
${
total_fiveprime
}
${
threeprime_random
}
|
gzip
>
${
1
}
}
# This named pipe is used to avoid writing the intermediate file to disk
# It will transmit reads that did not seem to contain the adapter to the
# second sorting and deduplicating.
mkfifo
${
untrimmed_out
}
.fifo
minsize_trimmed
=
$(
echo
"
${
fiveprime_random
}
+ 16 +
${
threeprime_random
}
"
| bc
)
case
"
${
trimmer
}
"
in
"cutadapt"
)
if
[
${
MAX_ERROR_RATE
}
]
then
#>&2 echo "Cutadapt multithreading not working fully yet. Ignoring THREADS."
error_opt
=
"-e
${
MAX_ERROR_RATE
}
"
else
error_opt
=
""
fi
if
[
${
THREADS
}
]
then
#thread_opt="-j ${THREADS}"
>
&2
echo
"Cutadapt multithreading not working fully yet. Ignoring THREADS."
thread_opt
=
""
else
thread_opt
=
""
fi
# -m ${minsize_random} is to discard reads that are shorter than this after trimming
trim_cmd
=
"
${
trimmer
}
-a
${
adapt
}
-m
${
minsize_trimmed
}
${
error_opt
}
${
thread_opt
}
--untrimmed-output=
${
untrimmed_out
}
.fifo - 2>
${
log
}
"
;;
"fastx_clipper"
)
# -n is to keep reads with N
# -l is equivalent to -m in cutadapt (not sure it has effect with -C)
# -c is to keep only the trimmed reads
# -C is to keep only the non-trimmed reads
# -v is to have verbose things to put in the log
trim_cmd
=
"tee >(
${
trimmer
}
-a
${
adapt
}
-l
${
minsize_trimmed
}
-C -M 3 -n >
${
untrimmed_out
}
.fifo) |
${
trimmer
}
-a
${
adapt
}
-l
${
minsize_trimmed
}
-c -M 3 -n -v 2>>
${
log
}
"
;;
*
)
error_exit
"Trimming adapter with
${
trimmer
}
not implemented."
;;
esac
# a second cutadapt step removes the random nucleotides that helped identify PCR duplicates.
#dedup_trimmed()
#{
# # $1: file in which to write the number of fastq records after adapter trimming
# # $2: file in which to write the number of fastq records after deduplication
# cmd="${trim_cmd} | strip_low_qual_zones | tee >(count_fastq_reads ${1}) | dedup | trim_random_nt ${fiveprime_random} ${threeprime_random} | tee >(count_fastq_reads ${2}) | gzip"
# echo ${cmd}
# eval ${cmd} > ${trimmed_and_dedup_out} || error_exit "${cmd} failed"
#}
dedup_trimmed
()
{
# $1: file in which to write the number of fastq records after adapter trimming
# $2: file in which to write the number of fastq records after deduplication
#cmd="${trim_cmd} | tee >(process_without_deduplication ${trimmed_nodedup_out}) | strip_low_qual_zones | tee >(count_fastq_reads ${1}) | dedup | trim_random_nt ${fiveprime_random} ${threeprime_random} | tee >(count_fastq_reads ${2}) | gzip"
cmd
=
"
${
trim_cmd
}
| tee >(process_without_deduplication
${
trimmed_nodedup_out
}
) | no_strip | tee >(count_fastq_reads
${
1
}
) | dedup | trim_random_nt
${
total_fiveprime
}
${
threeprime_random
}
| tee >(count_fastq_reads
${
2
}
) | gzip"
echo
${
cmd
}
eval
${
cmd
}
>
${
trimmed_and_dedup_out
}
||
error_exit
"
${
cmd
}
failed"
}
#dedup_untrimmed()
#{
# # $1: file in which to write the number of fastq records after deduplication
# cmd="cat - | strip_low_qual_zones | dedup | trim_random_nt ${fiveprime_random} ${threeprime_random_with_margin} | tee >(count_fastq_reads ${1}) | gzip"
# echo ${cmd}
# eval ${cmd} > ${untrimmed_out} || error_exit "${cmd} failed"
#}
dedup_untrimmed
()
{
# $1: file in which to write the number of fastq records after deduplication
cmd
=
"cat - | strip_low_qual_zones | dedup | trim_random_nt
${
fiveprime_random
}
${
threeprime_random_with_margin
}
| tee >(count_fastq_reads
${
1
}
) | gzip"
echo
${
cmd
}
eval
${
cmd
}
>
${
untrimmed_out
}
||
error_exit
"
${
cmd
}
failed"
}
filetype
=
"
$(
file
-L
${
raw_in
}
|
cut
-d
" "
-f2
)
"
case
"
${
filetype
}
"
in
"gzip"
)
cat_cmd
=
"zcat"
;;
"ASCII"
)
cat_cmd
=
"cat"
;;
*
)
error_exit
"Unexpected file type for
${
raw_in
}
"
;;
esac
${
cat_cmd
}
${
raw_in
}
\
|
tee
>(
count_fastq_reads
${
nb_raw
}
)
\
| dedup_trimmed
${
nb_adapt
}
${
nb_adapt_deduped
}
&
pid_to_wait
=
$!
cat
${
untrimmed_out
}
.fifo
\
|
tee
>(
count_fastq_reads
${
nb_noadapt
}
)
\
| dedup_untrimmed
${
nb_noadapt_deduped
}
||
rm
-f
${
untrimmed_out
}
.fifo
wait
${
pid_to_wait
}
rm
-f
${
untrimmed_out
}
.fifo
exit
0
PRO-seq_trim_and_dedup.sh
0 → 120000
View file @
964ae912
PRO-seq/PRO-seq_trim_and_dedup.sh
\ No newline at end of file
PRO-seq_trim_only.sh
0 → 120000
View file @
964ae912
PRO-seq/PRO-seq_trim_only.sh
\ No newline at end of file
iCLIP_trim_and_dedup.sh
0 → 120000
View file @
964ae912
CLIP/iCLIP_trim_and_dedup.sh
\ No newline at end of file
small_RNA-seq/small_RNA-seq_trim_and_dedup.sh
0 → 100755
View file @
964ae912
#!/usr/bin/env bash
# Usage: PRO-seq_trim_and_dedup.sh <raw fastq> <ADAPTER> <trimmed fastq> <untrimmed fastq>
# http://linuxcommand.org/wss0150.php
PROGNAME
=
$(
basename
$0
)
function
error_exit
{
# ----------------------------------------------------------------
# Function for exit due to fatal program error
# Accepts 1 argument:
# string containing descriptive error message
# ----------------------------------------------------------------
echo
"
${
PROGNAME
}
:
${
1
:-
"Unknown Error"
}
"
1>&2
exit
1
}
# http://stackoverflow.com/questions/59895/can-a-bash-script-tell-what-directory-its-stored-in
#PACKAGEDIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )
raw_in
=
${
1
}
adapt
=
${
2
}
trimmed_and_dedup_out
=
${
3
}
log
=
${
4
}
sort_by_seq
()
{
fastq-sort
-s
||
error_exit
"fastq-sort failed"
}
dedup
()
{
#${PACKAGEDIR}/remove_duplicates_from_sorted_fastq/remove_duplicates_from_sorted_fastq || error_exit "remove_duplicates_from_sorted_fastq failed"
#remove_duplicates_from_sorted_fastq || error_exit "remove_duplicates_from_sorted_fastq failed"
remove-duplicates-from-sorted-fastq
||
error_exit
"remove_duplicates_from_sorted_fastq failed"
}
trim_random_nt
()
{
cutadapt
-u
-4
-u
4 -
||
error_exit
"trim_random_nt failed"
}
count_fastq_reads
()
{
wc
-l
|
{
read
nblines
;
echo
${
nblines
}
/ 4 | bc
>
${
1
}
;
}
}
cmd
=
"cutadapt -a
${
adapt
}
-M 38 --discard-untrimmed
${
raw_in
}
2>
${
log
}
| "
'tee >(count_fastq_reads nb_trimmed.txt) | sort_by_seq | dedup | tee >(count_fastq_reads nb_deduped.txt) | trim_random_nt | gzip'
#cmd="cutadapt -a ${adapt} -M 38 --discard-untrimmed ${raw_in} 2> ${log} | "'tee >(wc -l | { read nblines; echo ${nblines} / 4 | bc > nb_trimmed.txt; }) | sort_by_seq | dedup | tee >(wc -l | { read nblines; echo ${nblines} / 4 | bc > nb_deduped.txt; }) | trim_random_nt | gzip'
echo
${
cmd
}
eval
${
cmd
}
>
${
trimmed_and_dedup_out
}
||
error_exit
"
${
cmd
}
failed"
exit
0
small_RNA-seq_trim_and_dedup.sh
0 → 120000
View file @
964ae912
small_RNA-seq/small_RNA-seq_trim_and_dedup.sh
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment