Commit c9217c51 authored by Blaise Li's avatar Blaise Li
Browse files

Homogenized GRO-seq trim_and_dedup with iCLIP.

parent efd0a96f
......@@ -270,12 +270,6 @@ rule trim_and_dedup:
PCR duplicates are removed at both ends"""
input:
rules.link_raw_data.output,
params:
adapter = lambda wildcards : lib2adapt[wildcards.lib],
trim5 = lambda wildcards : lib2UMI[wildcards.lib][0],
trim3 = lambda wildcards : lib2UMI[wildcards.lib][1],
minsize_trimmed = lambda wildcards : str(int(lib2UMI[wildcards.lib][0]) + 16 + int(lib2UMI[wildcards.lib][1])),
untrimmed_tmp = lambda wildcards : OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_noadapt_tmp.fifo".format(lib=wildcards.lib, rep=wildcards.rep)),
output:
noadapt = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_noadapt_deduped.fastq.gz"),
adapt = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_adapt_deduped.fastq.gz"),
......@@ -284,6 +278,11 @@ rule trim_and_dedup:
nb_adapt_deduped = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_nb_adapt_deduped.txt"),
nb_noadapt = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_nb_noadapt.txt"),
nb_noadapt_deduped = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_nb_noadapt_deduped.txt"),
params:
adapter = lambda wildcards : lib2adapt[wildcards.lib],
process_type = "PRO-seq",
trim5 = lambda wildcards : lib2UMI[wildcards.lib][0],
trim3 = lambda wildcards : lib2UMI[wildcards.lib][1],
threads: 4 # Actually, to avoid too much IO
message:
"Trimming adaptor from raw data using {wildcards.trimmer}, deduplicating reads, and removing 5' and 3' random n-mers for {wildcards.lib}_{wildcards.rep}."
......@@ -294,60 +293,13 @@ rule trim_and_dedup:
log = OPJ(log_dir, "{trimmer}", "trim_and_dedup", "{lib}_{rep}.log"),
err = OPJ(log_dir, "{trimmer}", "trim_and_dedup", "{lib}_{rep}.err"),
run:
if True:
#if wildcards.trimmer == "cutadapt":
shell_commands = """
PRO-seq_trim_and_dedup.sh {wildcards.trimmer} {input} {params.adapter} {params.trim5} {params.trim3} {output.adapt} {output.noadapt} {log.trim} {output.nb_raw} {output.nb_adapt} {output.nb_adapt_deduped} {output.nb_noadapt} {output.nb_noadapt_deduped} 1> {log.log} 2> {log.err}
# This named pipe is used to avoid writing the intermediate file to disk
# It will transmit reads that did not seem to contain the adapter to the
# second sorting and deduplicating.
#mkfifo {params.untrimmed_tmp}
# -m {params.minsize_trimmed} is to discard reads that are shorter than this after trimming
# a second cutadapt step removes the random nucleotides that helped identify PCR duplicates.
#zcat {input} \\
# | tee >(count_fastq_reads {output.nb_raw}) \\
# | cutadapt -a {params.adapter} -m {params.minsize_trimmed} \\
# --untrimmed-output={params.untrimmed_tmp} - 2> {log.trim} \\
# | tee >(count_fastq_reads {output.nb_adapt}) \\
# | dedup \\
# | trim_random_nt {params.trim5} {params.trim3} \\
# | tee >(count_fastq_reads {output.nb_adapt_deduped}) \\
# | gzip > {output.noadapt} &
#pid_to_wait=$!
#cat {params.untrimmed_tmp} \\
# | tee >(count_fastq_reads {output.nb_noadapt}) \\
# | dedup \\
# | trim_random_nt {params.trim5} {params.trim3} \\
# | tee >(count_fastq_reads {output.nb_noadapt_deduped}) \\
# | gzip > {output.adapt} \\
# || rm -f {params.untrimmed_tmp}
#wait ${{pid_to_wait}}
#rm -f {params.untrimmed_tmp}
shell_commands = """
THREADS="{threads}" {params.process_type}_trim_and_dedup.sh {wildcards.trimmer} {input} \\
{params.adapter} {params.trim5} {params.trim3} \\
{output.adapt} {output.noadapt} {log.trim} \\
{output.nb_raw} {output.nb_adapt} {output.nb_adapt_deduped} \\
{output.nb_noadapt} {output.nb_noadapt_deduped} 1> {log.log} 2> {log.err}
"""
# elif wildcards.trimmer == "fastx_clipper":
# shell_commands = """
#zcat {input} \\
# | tee >(count_fastq_reads {output.nb_raw}) \\
# | fastx_clipper -a {params.adapter} -l {params.minsize_trimmed} \\
# -c -M 3 -n -v 2> {log.trim} \\
# | tee >(count_fastq_reads {output.nb_adapt}) \\
# | dedup \\
# | tee >(count_fastq_reads {output.nb_adapt_deduped}) \\
# | trim_random_nt {params.trim5} {params.trim3} \\
# | gzip > {output.noadapt} &
#pid_to_wait=$!
#zcat {input} \\
# | fastx_clipper -a {params.adapter} -l {params.minsize_trimmed} \\
# -C -M 3 -n -v 2>> {log.trim} \\
# | tee >(count_fastq_reads {output.nb_noadapt}) \\
# | dedup \\
# | tee >(count_fastq_reads {output.nb_noadapt_deduped}) \\
# | trim_random_nt {params.trim5} {params.trim3} \\
# | gzip > {output.adapt}
#wait ${{pid_to_wait}}
#"""
# else:
# raise NotImplementedError("%s is not yet handled." % wildcards.trimmer)
shell(shell_commands)
......
......@@ -26,6 +26,9 @@ adapt=${3}
fiveprime_random=${4}
threeprime_random=${5}
# For the untrimmed, the adaptor was not found,
# but maybe its first 3 bases were there.
threeprime_random_with_margin=$((${threeprime_random}+3))
trimmed_and_dedup_out=${6}
......@@ -82,8 +85,23 @@ minsize_trimmed=$(echo "${fiveprime_random} + 16 + ${threeprime_random}" | bc)
case "${trimmer}" in
"cutadapt")
if [ ${MAX_ERROR_RATE} ]
then
#>&2 echo "Cutadapt multithreading not working fully yet. Ignoring THREADS."
error_opt="-e ${MAX_ERROR_RATE}"
else
error_opt=""
fi
if [ ${THREADS} ]
then
#thread_opt="-j ${THREADS}"
>&2 echo "Cutadapt multithreading not working fully yet. Ignoring THREADS."
thread_opt=""
else
thread_opt=""
fi
# -m ${minsize_random} is to discard reads that are shorter than this after trimming
trim_cmd="${trimmer} -a ${adapt} -m ${minsize_trimmed} --untrimmed-output=${untrimmed_out}.fifo - 2> ${log}"
trim_cmd="${trimmer} -a ${adapt} -m ${minsize_trimmed} ${error_opt} ${thread_opt} --untrimmed-output=${untrimmed_out}.fifo - 2> ${log}"
;;
"fastx_clipper")
# -n is to keep reads with N
......@@ -111,12 +129,25 @@ dedup_trimmed()
dedup_untrimmed()
{
# $1: file in which to write the number of fastq records after deduplication
cmd="cat - | dedup | trim_random_nt ${threeprime_random} ${fiveprime_random} | tee >(count_fastq_reads ${1}) | gzip"
cmd="cat - | dedup | trim_random_nt ${fiveprime_random} ${threeprime_random_with_margin} | tee >(count_fastq_reads ${1}) | gzip"
echo ${cmd}
eval ${cmd} > ${untrimmed_out} || error_exit "${cmd} failed"
}
zcat ${raw_in} \
filetype="$(file -L ${raw_in} | cut -d " " -f2)"
case "${filetype}" in
"gzip")
cat_cmd="zcat"
;;
"ASCII")
cat_cmd="cat"
;;
*)
error_exit "Unexpected file type for ${raw_in}"
;;
esac
${cat_cmd} ${raw_in} \
| tee >(count_fastq_reads ${nb_raw}) \
| dedup_trimmed ${nb_adapt} ${nb_adapt_deduped} &
pid_to_wait=$!
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment