diff --git a/PRO-seq/PRO-seq.snakefile b/PRO-seq/PRO-seq.snakefile
index 8155de603a54802996defe710fa64a2a56db07b3..1f166157e9dce97f537f2160b896197df6da88dd 100644
--- a/PRO-seq/PRO-seq.snakefile
+++ b/PRO-seq/PRO-seq.snakefile
@@ -270,12 +270,6 @@ rule trim_and_dedup:
     PCR duplicates are removed at both ends"""
     input:
         rules.link_raw_data.output,
-    params:
-        adapter = lambda wildcards : lib2adapt[wildcards.lib],
-        trim5 = lambda wildcards : lib2UMI[wildcards.lib][0],
-        trim3 = lambda wildcards : lib2UMI[wildcards.lib][1],
-        minsize_trimmed = lambda wildcards : str(int(lib2UMI[wildcards.lib][0]) + 16 + int(lib2UMI[wildcards.lib][1])),
-        untrimmed_tmp = lambda wildcards : OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_noadapt_tmp.fifo".format(lib=wildcards.lib, rep=wildcards.rep)),
     output:
         noadapt = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_noadapt_deduped.fastq.gz"),
         adapt = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_adapt_deduped.fastq.gz"),
@@ -284,6 +278,11 @@ rule trim_and_dedup:
         nb_adapt_deduped = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_nb_adapt_deduped.txt"),
         nb_noadapt = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_nb_noadapt.txt"),
         nb_noadapt_deduped = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_nb_noadapt_deduped.txt"),
+    params:
+        adapter = lambda wildcards : lib2adapt[wildcards.lib],
+        process_type = "PRO-seq",
+        trim5 = lambda wildcards : lib2UMI[wildcards.lib][0],
+        trim3 = lambda wildcards : lib2UMI[wildcards.lib][1],
     threads: 4  # Actually, to avoid too much IO
     message: "Trimming adaptor from raw data using {wildcards.trimmer}, deduplicating reads, and removing 5' and 3' random n-mers for {wildcards.lib}_{wildcards.rep}."
@@ -294,60 +293,13 @@
         log = OPJ(log_dir, "{trimmer}", "trim_and_dedup", "{lib}_{rep}.log"),
         err = OPJ(log_dir, "{trimmer}", "trim_and_dedup", "{lib}_{rep}.err"),
     run:
-        if True:
-        #if wildcards.trimmer == "cutadapt":
-            shell_commands = """
-PRO-seq_trim_and_dedup.sh {wildcards.trimmer} {input} {params.adapter} {params.trim5} {params.trim3} {output.adapt} {output.noadapt} {log.trim} {output.nb_raw} {output.nb_adapt} {output.nb_adapt_deduped} {output.nb_noadapt} {output.nb_noadapt_deduped} 1> {log.log} 2> {log.err}
-# This named pipe is used to avoid writing the intermediate file to disk
-# It will transmit reads that did not seem to contain the adapter to the
-# second sorting and deduplicating.
-#mkfifo {params.untrimmed_tmp}
-# -m {params.minsize_trimmed} is to discard reads that are shorter than this after trimming
-# a second cutadapt step removes the random nucleotides that helped identify PCR duplicates.
-#zcat {input} \\
-#    | tee >(count_fastq_reads {output.nb_raw}) \\
-#    | cutadapt -a {params.adapter} -m {params.minsize_trimmed} \\
-#        --untrimmed-output={params.untrimmed_tmp} - 2> {log.trim} \\
-#    | tee >(count_fastq_reads {output.nb_adapt}) \\
-#    | dedup \\
-#    | trim_random_nt {params.trim5} {params.trim3} \\
-#    | tee >(count_fastq_reads {output.nb_adapt_deduped}) \\
-#    | gzip > {output.noadapt} &
-#pid_to_wait=$!
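The run block above now delegates the whole trim/dedup pipeline to a single helper script, selected via params.process_type, and passes the thread count through the THREADS environment variable rather than a command-line flag. As a rough sketch (not part of the patch), the equivalent call outside Snakemake would look like the following Python; every file name and the adapter sequence are placeholders:

    import os
    import subprocess

    # THREADS travels via the environment; the script's cutadapt branch
    # currently detects it but deliberately ignores it (see below).
    env = dict(os.environ, THREADS="4")

    subprocess.run(
        [
            "PRO-seq_trim_and_dedup.sh",
            "cutadapt",                            # trimmer            ($1)
            "lib1_rep1.fastq.gz",                  # raw reads          ($2)
            "TGGAATTCTCGGGTGCCAAGG",               # adapter, placeholder ($3)
            "4", "4",                              # 5'/3' random n-mer lengths ($4, $5)
            "lib1_rep1_adapt_deduped.fastq.gz",    # output, adapter found   ($6)
            "lib1_rep1_noadapt_deduped.fastq.gz",  # output, adapter not found ($7)
            "lib1_rep1_trim.log",                  # trimmer log        ($8)
            "nb_raw.txt",                          # read counts        ($9-$13)
            "nb_adapt.txt",
            "nb_adapt_deduped.txt",
            "nb_noadapt.txt",
            "nb_noadapt_deduped.txt",
        ],
        env=env,
        check=True,
    )

The next diff shows the receiving end of this interface.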
-#cat {params.untrimmed_tmp} \\
-#    | tee >(count_fastq_reads {output.nb_noadapt}) \\
-#    | dedup \\
-#    | trim_random_nt {params.trim5} {params.trim3} \\
-#    | tee >(count_fastq_reads {output.nb_noadapt_deduped}) \\
-#    | gzip > {output.adapt} \\
-#    || rm -f {params.untrimmed_tmp}
-#wait ${{pid_to_wait}}
-#rm -f {params.untrimmed_tmp}
+        shell_commands = """
+THREADS="{threads}" {params.process_type}_trim_and_dedup.sh {wildcards.trimmer} {input} \\
+    {params.adapter} {params.trim5} {params.trim3} \\
+    {output.adapt} {output.noadapt} {log.trim} \\
+    {output.nb_raw} {output.nb_adapt} {output.nb_adapt_deduped} \\
+    {output.nb_noadapt} {output.nb_noadapt_deduped} 1> {log.log} 2> {log.err}
 """
-#        elif wildcards.trimmer == "fastx_clipper":
-#            shell_commands = """
-#zcat {input} \\
-#    | tee >(count_fastq_reads {output.nb_raw}) \\
-#    | fastx_clipper -a {params.adapter} -l {params.minsize_trimmed} \\
-#        -c -M 3 -n -v 2> {log.trim} \\
-#    | tee >(count_fastq_reads {output.nb_adapt}) \\
-#    | dedup \\
-#    | tee >(count_fastq_reads {output.nb_adapt_deduped}) \\
-#    | trim_random_nt {params.trim5} {params.trim3} \\
-#    | gzip > {output.noadapt} &
-#pid_to_wait=$!
-#zcat {input} \\
-#    | fastx_clipper -a {params.adapter} -l {params.minsize_trimmed} \\
-#        -C -M 3 -n -v 2>> {log.trim} \\
-#    | tee >(count_fastq_reads {output.nb_noadapt}) \\
-#    | dedup \\
-#    | tee >(count_fastq_reads {output.nb_noadapt_deduped}) \\
-#    | trim_random_nt {params.trim5} {params.trim3} \\
-#    | gzip > {output.adapt}
-#wait ${{pid_to_wait}}
-#"""
-#        else:
-#            raise NotImplementedError("%s is not yet handled." % wildcards.trimmer)
 
         shell(shell_commands)
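diff --git a/PRO-seq/PRO-seq_trim_and_dedup.sh b/PRO-seq/PRO-seq_trim_and_dedup.sh
index 71bc23a2ae7fa68c18b1de6ff19b7f04ff00049f..2721816f7e8feb5f690cfc99b5c9ffd772cf0bb6 100755
--- a/PRO-seq/PRO-seq_trim_and_dedup.sh
+++ b/PRO-seq/PRO-seq_trim_and_dedup.sh
@@ -26,6 +26,9 @@
 adapt=${3}
 fiveprime_random=${4}
 threeprime_random=${5}
+# For the untrimmed, the adaptor was not found,
+# but maybe its first 3 bases were there.
+threeprime_random_with_margin=$((${threeprime_random}+3))
 
 trimmed_and_dedup_out=${6}

The +3 margin added here (and used further down in dedup_untrimmed) covers reads where cutadapt reported no adapter even though the first few adapter bases sat at the very end of the read: trimming three extra 3' bases sacrifices a little insert sequence in exchange for not leaving adapter fragments in. A toy Python illustration of the arithmetic (not the actual trim_random_nt helper; the UMI lengths are hypothetical):

    fiveprime_random = 4        # 5' random n-mer (UMI) length
    threeprime_random = 4       # 3' random n-mer (UMI) length
    margin = 3                  # possible unrecognized start of the adapter
    threeprime_random_with_margin = threeprime_random + margin

    # 5' UMI + 16 nt insert + 3' UMI + first 3 adapter bases left untrimmed:
    read = "NNNN" + "ACGTACGTACGTACGT" + "NNNN" + "TGA"
    insert = read[fiveprime_random : len(read) - threeprime_random_with_margin]
    assert insert == "ACGTACGTACGTACGT"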
+ thread_opt="" + else + thread_opt="" + fi # -m ${minsize_random} is to discard reads that are shorter than this after trimming - trim_cmd="${trimmer} -a ${adapt} -m ${minsize_trimmed} --untrimmed-output=${untrimmed_out}.fifo - 2> ${log}" + trim_cmd="${trimmer} -a ${adapt} -m ${minsize_trimmed} ${error_opt} ${thread_opt} --untrimmed-output=${untrimmed_out}.fifo - 2> ${log}" ;; "fastx_clipper") # -n is to keep reads with N @@ -111,12 +129,25 @@ dedup_trimmed() dedup_untrimmed() { # $1: file in which to write the number of fastq records after deduplication - cmd="cat - | dedup | trim_random_nt ${threeprime_random} ${fiveprime_random} | tee >(count_fastq_reads ${1}) | gzip" + cmd="cat - | dedup | trim_random_nt ${fiveprime_random} ${threeprime_random_with_margin} | tee >(count_fastq_reads ${1}) | gzip" echo ${cmd} eval ${cmd} > ${untrimmed_out} || error_exit "${cmd} failed" } -zcat ${raw_in} \ +filetype="$(file -L ${raw_in} | cut -d " " -f2)" +case "${filetype}" in + "gzip") + cat_cmd="zcat" + ;; + "ASCII") + cat_cmd="cat" + ;; + *) + error_exit "Unexpected file type for ${raw_in}" + ;; +esac + +${cat_cmd} ${raw_in} \ | tee >(count_fastq_reads ${nb_raw}) \ | dedup_trimmed ${nb_adapt} ${nb_adapt_deduped} & pid_to_wait=$!
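The final hunk makes the script accept plain-text FASTQ as well as gzipped input, dispatching on the output of file -L (the -L flag dereferences symbolic links, which matters here since the raw data is staged by a link_raw_data rule, presumably as symlinks). A rough Python equivalent of that dispatch, not part of the patch, checking the gzip magic bytes instead of calling file:

    import gzip

    def open_reads(path):
        """Open a FASTQ file transparently, gzip-compressed or plain text."""
        with open(path, "rb") as fh:
            magic = fh.read(2)
        if magic == b"\x1f\x8b":       # gzip magic number
            return gzip.open(path, "rt")
        return open(path, "rt")        # assume plain ASCII FASTQ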