Commit f241ee02 authored by Blaise Li's avatar Blaise Li
Browse files

Deduplication of GRO-seq data is now optional.

parent cff1d2d4
......@@ -352,11 +352,62 @@ THREADS="{threads}" {params.process_type}_trim_and_dedup.sh {wildcards.trimmer}
shell(shell_commands)
rule trim_only:
"""The adaptor is trimmed, then reads are treated in two groups depending
on whether the adapter was found or not. For each group, the extra k-mers
are removed at both ends."""
input:
rules.link_raw_data.output,
output:
noadapt = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_noadapt.fastq.gz"),
adapt = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_adapt.fastq.gz"),
nb_raw = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_nb_raw.txt"),
nb_adapt = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_nb_adapt.txt"),
nb_noadapt = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_nb_noadapt.txt"),
params:
adapter = lambda wildcards : lib2adapt[wildcards.lib],
process_type = "PRO-seq",
trim5 = lambda wildcards : lib2UMI[wildcards.lib][0],
trim3 = lambda wildcards : lib2UMI[wildcards.lib][1],
threads: 4 # Actually, to avoid too much IO
message:
"Trimming adaptor from raw data using {wildcards.trimmer} and removing 5' and 3' random n-mers for {wildcards.lib}_{wildcards.rep}."
benchmark:
OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_trim_benchmark.txt")
log:
trim = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_trim.log"),
log = OPJ(log_dir, "{trimmer}", "trim_and_dedup", "{lib}_{rep}.log"),
err = OPJ(log_dir, "{trimmer}", "trim_and_dedup", "{lib}_{rep}.err"),
run:
shell_commands = """
THREADS="{threads}" {params.process_type}_trim_only.sh {wildcards.trimmer} {input} \\
{params.adapter} {params.trim5} {params.trim3} \\
{output.adapt} {output.noadapt} {log.trim} \\
{output.nb_raw} {output.nb_adapt} {output.nb_noadapt} \\
1> {log.log} 2> {log.err}
"""
shell(shell_commands)
def source_fastq(wildcards):
"""
Determine the correct pre-processed fastq file depending on the pipeline
configuration and the current wildcards.
"""
if config.get("deduplicate", False):
return OPJ(
data_dir, f"trimmed_{wildcards.trimmer}",
f"{wildcards.lib}_{wildcards.rep}_{wildcards.type}_deduped.fastq.gz"),
else:
return OPJ(
data_dir, f"trimmed_{wildcards.trimmer}",
f"{wildcards.lib}_{wildcards.rep}_{wildcards.type}.fastq.gz"),
# TODO: Do not deduplicate, or at least do not use the noadapt_deduped: The 3' UMI is not present.
rule map_on_genome:
input:
fastq = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_{type}_deduped.fastq.gz"),
#rules.trim_and_dedup.output,
fastq = source_fastq,
output:
# sam files take a lot of space
sam = temp(OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", "{lib}_{rep}_{type}_on_C_elegans.sam")),
......
#!/usr/bin/env bash
# Usage: PRO-seq_trim_only.sh <trimmer> <raw fastq> <ADAPTER> <nb 5'> <nb 3'> <trimmed fastq> <untrimmed fastq> <trimmer log> <nb_raw> <nb_adapt> <nb_noadapt>
# http://linuxcommand.org/wss0150.php
PROGNAME=$(basename $0)
function error_exit
{
# ----------------------------------------------------------------
# Function for exit due to fatal program error
# Accepts 1 argument:
# string containing descriptive error message
# ----------------------------------------------------------------
echo "${PROGNAME}: ${1:-"Unknown Error"}" 1>&2
exit 1
}
# http://stackoverflow.com/questions/59895/can-a-bash-script-tell-what-directory-its-stored-in
#PACKAGEDIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )
trimmer=${1}
raw_in=${2}
adapt=${3}
fiveprime_random=${4}
threeprime_random=${5}
# For the untrimmed, the adaptor was not found,
# but maybe its first 3 bases were there.
threeprime_random_with_margin=$((${threeprime_random}+3))
trimmed_out=${6}
untrimmed_out=${7}
log=${8}
nb_raw=${9}
nb_adapt=${10}
nb_noadapt=${11}
count_fastq_reads()
{
# $1: file in which to write the number of fastq records
wc -l | { read nblines; echo ${nblines} / 4 | bc > ${1}; }
}
trim_random_nt()
{
# $1: nb of bases to trim at 5' end
# $2: nb of bases to trim at 3' end
cutadapt -u ${1} -u -${2} - 2> /dev/null || error_exit "trim_random_nt failed"
}
# This named pipe is used to avoid writing the intermediate file to disk
# It will transmit reads that did not seem to contain the adapter to the
# UMI trimming step.
mkfifo ${untrimmed_out}.fifo
minsize_trimmed=$(echo "${fiveprime_random} + 16 + ${threeprime_random}" | bc)
case "${trimmer}" in
"cutadapt")
if [ ${MAX_ERROR_RATE} ]
then
#>&2 echo "Cutadapt multithreading not working fully yet. Ignoring THREADS."
error_opt="-e ${MAX_ERROR_RATE}"
else
error_opt=""
fi
if [ ${THREADS} ]
then
#thread_opt="-j ${THREADS}"
>&2 echo "Cutadapt multithreading not working fully yet. Ignoring THREADS."
thread_opt=""
else
thread_opt=""
fi
# -m ${minsize_random} is to discard reads that are shorter than this after trimming
trim_cmd="${trimmer} -a ${adapt} -m ${minsize_trimmed} ${error_opt} ${thread_opt} --untrimmed-output=${untrimmed_out}.fifo - 2> ${log}"
;;
"fastx_clipper")
# -n is to keep reads with N
# -l is equivalent to -m in cutadapt (not sure it has effect with -C)
# -c is to keep only the trimmed reads
# -C is to keep only the non-trimmed reads
# -v is to have verbose things to put in the log
trim_cmd="tee >(${trimmer} -a ${adapt} -l ${minsize_trimmed} -C -M 3 -n > ${untrimmed_out}.fifo) | ${trimmer} -a ${adapt} -l ${minsize_trimmed} -c -M 3 -n -v 2>> ${log}"
;;
*)
error_exit "Trimming adapter with ${trimmer} not implemented."
;;
esac
# a second cutadapt step removes the random nucleotides that helped identify PCR duplicates.
trim_trimmed()
{
# $1: file in which to write the number of fastq records after adapter trimming
cmd="${trim_cmd} | tee >(count_fastq_reads ${1}) | trim_random_nt ${fiveprime_random} ${threeprime_random} | gzip"
echo ${cmd}
eval ${cmd} > ${trimmed_out} || error_exit "${cmd} failed"
}
trim_untrimmed()
{
cmd="trim_random_nt ${fiveprime_random} ${threeprime_random_with_margin} | gzip"
echo ${cmd}
eval ${cmd} > ${untrimmed_out} || error_exit "${cmd} failed"
}
filetype="$(file -L ${raw_in} | cut -d " " -f2)"
case "${filetype}" in
"gzip")
cat_cmd="zcat"
;;
"ASCII")
cat_cmd="cat"
;;
*)
error_exit "Unexpected file type for ${raw_in}"
;;
esac
${cat_cmd} ${raw_in} \
| tee >(count_fastq_reads ${nb_raw}) \
| trim_trimmed ${nb_adapt} ${nb_adapt_deduped} &
pid_to_wait=$!
cat ${untrimmed_out}.fifo \
| tee >(count_fastq_reads ${nb_noadapt}) \
| trim_untrimmed ${nb_noadapt_deduped} || rm -f ${untrimmed_out}.fifo
wait ${pid_to_wait}
rm -f ${untrimmed_out}.fifo
exit 0
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment