diff --git a/PRO-seq/PRO-seq.snakefile b/PRO-seq/PRO-seq.snakefile index cfbbfd16e2a8e6f2151d07bf0a25665bd8b229ed..a83e7a1adc6027bebe67aa2efcacc18e940e5553 100644 --- a/PRO-seq/PRO-seq.snakefile +++ b/PRO-seq/PRO-seq.snakefile @@ -352,11 +352,62 @@ THREADS="{threads}" {params.process_type}_trim_and_dedup.sh {wildcards.trimmer} shell(shell_commands) +rule trim_only: + """The adaptor is trimmed, then reads are treated in two groups depending + on whether the adapter was found or not. For each group, the extra k-mers + are removed at both ends.""" + input: + rules.link_raw_data.output, + output: + noadapt = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_noadapt.fastq.gz"), + adapt = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_adapt.fastq.gz"), + nb_raw = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_nb_raw.txt"), + nb_adapt = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_nb_adapt.txt"), + nb_noadapt = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_nb_noadapt.txt"), + params: + adapter = lambda wildcards : lib2adapt[wildcards.lib], + process_type = "PRO-seq", + trim5 = lambda wildcards : lib2UMI[wildcards.lib][0], + trim3 = lambda wildcards : lib2UMI[wildcards.lib][1], + threads: 4 # Actually, to avoid too much IO + message: + "Trimming adaptor from raw data using {wildcards.trimmer} and removing 5' and 3' random n-mers for {wildcards.lib}_{wildcards.rep}." + benchmark: + OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_trim_benchmark.txt") + log: + trim = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_trim.log"), + log = OPJ(log_dir, "{trimmer}", "trim_and_dedup", "{lib}_{rep}.log"), + err = OPJ(log_dir, "{trimmer}", "trim_and_dedup", "{lib}_{rep}.err"), + run: + shell_commands = """ +THREADS="{threads}" {params.process_type}_trim_only.sh {wildcards.trimmer} {input} \\ + {params.adapter} {params.trim5} {params.trim3} \\ + {output.adapt} {output.noadapt} {log.trim} \\ + {output.nb_raw} {output.nb_adapt} {output.nb_noadapt} \\ + 1> {log.log} 2> {log.err} +""" + shell(shell_commands) + + +def source_fastq(wildcards): + """ + Determine the correct pre-processed fastq file depending on the pipeline + configuration and the current wildcards. + """ + if config.get("deduplicate", False): + return OPJ( + data_dir, f"trimmed_{wildcards.trimmer}", + f"{wildcards.lib}_{wildcards.rep}_{wildcards.type}_deduped.fastq.gz"), + else: + return OPJ( + data_dir, f"trimmed_{wildcards.trimmer}", + f"{wildcards.lib}_{wildcards.rep}_{wildcards.type}.fastq.gz"), + + # TODO: Do not deduplicate, or at least do not use the noadapt_deduped: The 3' UMI is not present. rule map_on_genome: input: - fastq = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_{type}_deduped.fastq.gz"), - #rules.trim_and_dedup.output, + fastq = source_fastq, output: # sam files take a lot of space sam = temp(OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", "{lib}_{rep}_{type}_on_C_elegans.sam")), diff --git a/PRO-seq/PRO-seq_trim_only.sh b/PRO-seq/PRO-seq_trim_only.sh new file mode 100755 index 0000000000000000000000000000000000000000..d887ffef39fc7560d58e432e341df20a80fff6d2 --- /dev/null +++ b/PRO-seq/PRO-seq_trim_only.sh @@ -0,0 +1,137 @@ +#!/usr/bin/env bash +# Usage: PRO-seq_trim_only.sh <trimmer> <raw fastq> <ADAPTER> <nb 5'> <nb 3'> <trimmed fastq> <untrimmed fastq> <trimmer log> <nb_raw> <nb_adapt> <nb_noadapt> + +# http://linuxcommand.org/wss0150.php +PROGNAME=$(basename $0) + +function error_exit +{ +# ---------------------------------------------------------------- +# Function for exit due to fatal program error +# Accepts 1 argument: +# string containing descriptive error message +# ---------------------------------------------------------------- + echo "${PROGNAME}: ${1:-"Unknown Error"}" 1>&2 + exit 1 +} + +# http://stackoverflow.com/questions/59895/can-a-bash-script-tell-what-directory-its-stored-in +#PACKAGEDIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd ) + +trimmer=${1} + +raw_in=${2} + +adapt=${3} + +fiveprime_random=${4} +threeprime_random=${5} +# For the untrimmed, the adaptor was not found, +# but maybe its first 3 bases were there. +threeprime_random_with_margin=$((${threeprime_random}+3)) + +trimmed_out=${6} + +untrimmed_out=${7} + +log=${8} + +nb_raw=${9} +nb_adapt=${10} +nb_noadapt=${11} + +count_fastq_reads() +{ + # $1: file in which to write the number of fastq records + wc -l | { read nblines; echo ${nblines} / 4 | bc > ${1}; } +} + + +trim_random_nt() +{ + # $1: nb of bases to trim at 5' end + # $2: nb of bases to trim at 3' end + cutadapt -u ${1} -u -${2} - 2> /dev/null || error_exit "trim_random_nt failed" +} + + +# This named pipe is used to avoid writing the intermediate file to disk +# It will transmit reads that did not seem to contain the adapter to the +# UMI trimming step. +mkfifo ${untrimmed_out}.fifo + +minsize_trimmed=$(echo "${fiveprime_random} + 16 + ${threeprime_random}" | bc) + +case "${trimmer}" in + "cutadapt") + if [ ${MAX_ERROR_RATE} ] + then + #>&2 echo "Cutadapt multithreading not working fully yet. Ignoring THREADS." + error_opt="-e ${MAX_ERROR_RATE}" + else + error_opt="" + fi + if [ ${THREADS} ] + then + #thread_opt="-j ${THREADS}" + >&2 echo "Cutadapt multithreading not working fully yet. Ignoring THREADS." + thread_opt="" + else + thread_opt="" + fi + # -m ${minsize_random} is to discard reads that are shorter than this after trimming + trim_cmd="${trimmer} -a ${adapt} -m ${minsize_trimmed} ${error_opt} ${thread_opt} --untrimmed-output=${untrimmed_out}.fifo - 2> ${log}" + ;; + "fastx_clipper") + # -n is to keep reads with N + # -l is equivalent to -m in cutadapt (not sure it has effect with -C) + # -c is to keep only the trimmed reads + # -C is to keep only the non-trimmed reads + # -v is to have verbose things to put in the log + trim_cmd="tee >(${trimmer} -a ${adapt} -l ${minsize_trimmed} -C -M 3 -n > ${untrimmed_out}.fifo) | ${trimmer} -a ${adapt} -l ${minsize_trimmed} -c -M 3 -n -v 2>> ${log}" + ;; + *) + error_exit "Trimming adapter with ${trimmer} not implemented." + ;; +esac + +# a second cutadapt step removes the random nucleotides that helped identify PCR duplicates. +trim_trimmed() +{ + # $1: file in which to write the number of fastq records after adapter trimming + cmd="${trim_cmd} | tee >(count_fastq_reads ${1}) | trim_random_nt ${fiveprime_random} ${threeprime_random} | gzip" + echo ${cmd} + eval ${cmd} > ${trimmed_out} || error_exit "${cmd} failed" +} + +trim_untrimmed() +{ + cmd="trim_random_nt ${fiveprime_random} ${threeprime_random_with_margin} | gzip" + echo ${cmd} + eval ${cmd} > ${untrimmed_out} || error_exit "${cmd} failed" +} + +filetype="$(file -L ${raw_in} | cut -d " " -f2)" +case "${filetype}" in + "gzip") + cat_cmd="zcat" + ;; + "ASCII") + cat_cmd="cat" + ;; + *) + error_exit "Unexpected file type for ${raw_in}" + ;; +esac + +${cat_cmd} ${raw_in} \ + | tee >(count_fastq_reads ${nb_raw}) \ + | trim_trimmed ${nb_adapt} ${nb_adapt_deduped} & +pid_to_wait=$! +cat ${untrimmed_out}.fifo \ + | tee >(count_fastq_reads ${nb_noadapt}) \ + | trim_untrimmed ${nb_noadapt_deduped} || rm -f ${untrimmed_out}.fifo +wait ${pid_to_wait} +rm -f ${untrimmed_out}.fifo + +exit 0