From 5390984614bddd589debfe7011b115380435c428 Mon Sep 17 00:00:00 2001
From: Blaise Li <blaise.li__git@nsup.org>
Date: Fri, 2 Feb 2018 19:04:20 +0100
Subject: [PATCH] Pipeline to process iCLIP data.

Currently goes from demultiplexing to mapping, via trimming and
deduplicating.

The mapping is performed on 3 read types:

- adapt_nodedup (the adaptor was found, and the reads were trimmed but not
  deduplicated)
- adapt_deduped (the adaptor was found, and the reads were trimmed and
  deduplicated)
- noadapt_deduped (the adaptor was not found, and the reads were trimmed and
  deduplicated)

The trim_and_dedup script currently assumes that two low-diversity zones are
present, and ignores them for deduplication:

NNNNNGCACTANNNWWW[YYYY]NNNN
1---5 : 5' UMI
6--11: barcode (lower diversity)
12-14: UMI
15-17: AT(or GC?)-rich (low diversity)
[fragment]
-4 -> -1: 3' UMI

It may be a problem to deduplicate taking into account the end of the reads,
which tends to be of lower quality. The reads with errors will be
over-represented. That is why we decided to also look at the
non-deduplicated reads.
---
 CLIP/iCLIP.snakefile         | 339 +++++++++++++++++++++++++++++++++++
 CLIP/iCLIP_trim_and_dedup.sh | 203 +++++++++++++++++++++
 2 files changed, 542 insertions(+)
 create mode 100644 CLIP/iCLIP.snakefile
 create mode 100755 CLIP/iCLIP_trim_and_dedup.sh

diff --git a/CLIP/iCLIP.snakefile b/CLIP/iCLIP.snakefile
new file mode 100644
index 0000000..44505bd
--- /dev/null
+++ b/CLIP/iCLIP.snakefile
@@ -0,0 +1,339 @@
+"""
+Snakefile to process iCLIP data.
+"""
+import sys
+major, minor = sys.version_info[:2]
+if major < 3 or (major == 3 and minor < 6):
+    sys.exit("Need at least python 3.6\n")
+
+
+import os
+OPJ = os.path.join
+from glob import glob
+
+from collections import defaultdict
+
+import matplotlib as mpl
+# To be able to run the script without a defined $DISPLAY
+mpl.use("PDF")
+#mpl.rcParams["figure.figsize"] = 2, 4
+mpl.rcParams["font.sans-serif"] = [
+    "Arial", "Liberation Sans", "Bitstream Vera Sans"]
+mpl.rcParams["font.family"] = "sans-serif"
+#mpl.rcParams["figure.figsize"] = [16, 30]
+import pandas as pd
+import matplotlib.pyplot as plt
+
+from libworkflows import get_chrom_sizes
+from libworkflows import ensure_relative, SHELL_FUNCTIONS, warn_context
+from smincludes import rules as irules
+
+# recommended k-mer length for D. melanogaster is 20
+# However, reads shorter than the k-mer length will be ignored.
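+# (note: with trim5=8 and trim3=4 as passed from the trim_and_dedup rule, reads
+# kept after trimming can carry as little as ~7 nt of insert once the UMIs and
+# low-diversity zones are removed, i.e. well below this k-mer length)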
+# http://crac.gforge.inria.fr/documentation/crac/#sec-2
+alignment_settings = {
+    "bowtie2": "--local -L 6 -i S,1,0.8 -N 0",
+    "crac": "-k 20 --stranded --use-x-in-cigar"}
+
+# Define functions to be used in shell portions
+shell.prefix(SHELL_FUNCTIONS)
+
+aligner = config["aligner"]
+genome_dict = config["genome_dict"]
+genome = genome_dict["name"]
+chrom_sizes = get_chrom_sizes(genome_dict["size"])
+genomelen = sum(chrom_sizes.values())
+genome_db = genome_dict["db"][aligner]
+# bed file binning the genome in 10nt bins
+genome_binned = genome_dict["binned"]
+annot_dir = genome_dict["annot_dir"]
+# TODO: figure out the difference between OPJ(convert_dir, "wormid2name.pickle") and genome_dict["converter"]
+convert_dir = genome_dict["convert_dir"]
+gene_lists_dir = genome_dict["gene_lists_dir"]
+avail_id_lists = set(glob(OPJ(gene_lists_dir, "*_ids.txt")))
+
+merged_fastq = config["merged_fastq"]
+barcode_dict = config["barcode_dict"]
+BARCODES = list(barcode_dict.keys())
+MAX_DIFF = config["max_diff"]
+output_dir = config["output_dir"]
+log_dir = OPJ(output_dir, "logs")
+data_dir = OPJ(output_dir, "data")
+demux_dir = OPJ(data_dir, f"demultiplexed_{MAX_DIFF}")
+lib2raw = defaultdict(dict)
+REPS = set()
+for (barcode, lib_info) in barcode_dict.items():
+    REPS.add(lib_info["rep"])
+    lib2raw[lib_info["lib"]][lib_info["rep"]] = OPJ(demux_dir, f"{barcode}.fastq.gz")
+LIBS = list(lib2raw.keys())
+REPS = sorted(REPS)
+
+TRIMMERS = ["fastx_clipper", "cutadapt"]
+READ_TYPES = ["noadapt_deduped", "adapt_deduped", "adapt_nodedup"]
+#TRIMMERS = ["fastx_clipper"]
+# For compatibility with trim_and_dedup as used in PRO-seq pipeline
+lib2adapt = defaultdict(lambda: config["adapter"])
+MAX_ADAPT_ERROR_RATE = config["max_adapt_error_rate"]
+
+wildcard_constraints:
+    lib="|".join(LIBS),
+    rep="\d+"
+
+preprocessing = [
+    expand(OPJ(demux_dir, "{barcode}.fastq.gz"), barcode=BARCODES),
+    expand(OPJ(data_dir, "{lib}_{rep}.fastq.gz"), lib=LIBS, rep=REPS),
+    expand(OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_{read_type}.fastq.gz"), trimmer=TRIMMERS, lib=LIBS, rep=REPS, read_type=READ_TYPES),
+    expand(OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_{read_type}_fastqc.html"), trimmer=TRIMMERS, lib=LIBS, rep=REPS, read_type=READ_TYPES),
+    expand(OPJ(data_dir, "trimmed_{trimmer}", "read_stats", "{lib}_{rep}", "{read_type}_size_distribution.pdf"), trimmer=TRIMMERS, lib=LIBS, rep=REPS, read_type=READ_TYPES),
+]
+
+mapping = [
+    expand(OPJ(output_dir, "{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_{read_type}_on_%s_sorted.bam" % genome), trimmer=TRIMMERS, lib=LIBS, rep=REPS, read_type=READ_TYPES),
+    expand(OPJ(output_dir, "{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_{read_type}_on_%s_samtools_stats.txt" % genome), trimmer=TRIMMERS, lib=LIBS, rep=REPS, read_type=READ_TYPES),
+]
+
+#TODO:
+# - map with CRAC, detect chimeras and crosslink-induced sequencing errors
+# - find cross-link sites on genes: should be 5' of antisense reads
+rule all:
+    """This top rule is used to drive the whole workflow by taking as input its final products."""
+    input:
+        preprocessing,
+        mapping,
+
+
+#################
+# Preprocessing #
+#################
+rule demultiplex:
+    input:
+        fq_in = merged_fastq,
+    output:
+        expand(OPJ(demux_dir, "{barcode}.fastq.gz"), barcode=BARCODES),
+    params:
+        demux_dir = demux_dir,
+        bc_start = config["bc_start"],
+        barcodes = " ".join(BARCODES),
+        max_diff = MAX_DIFF
+    log:
+        err = OPJ(log_dir, "demultiplex.err")
+    benchmark:
+        OPJ(log_dir, "demultiplex_benchmark.txt")
+    shell:
+        """
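+        # qaf-demux splits the merged fastq by barcode; judging from the params
+        # above, -s is the barcode start position, -b the expected barcode
+        # sequences and -m the maximum number of mismatches tolerated.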
+        qaf-demux -i {input.fq_in} -o {params.demux_dir} \\
+            -s {params.bc_start} -b {params.barcodes} -m {params.max_diff} \\
+            2> {log.err} || error_exit "qaf_demux failed"
+        """


include: ensure_relative(irules["link_raw_data"], workflow.basedir)


+rule trim_and_dedup:
+    """The adapter is trimmed, then reads are treated in two groups depending
+    on whether the adapter was found or not. For each group the reads are
+    sorted, deduplicated, and the random k-mers that helped identify
+    PCR duplicates are removed at both ends."""
+    input:
+        rules.link_raw_data.output,
+    output:
+        noadapt = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_noadapt_deduped.fastq.gz"),
+        adapt = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_adapt_deduped.fastq.gz"),
+        adapt_nodedup = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_adapt_nodedup.fastq.gz"),
+        nb_raw = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_nb_raw.txt"),
+        nb_adapt = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_nb_adapt.txt"),
+        nb_adapt_deduped = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_nb_adapt_deduped.txt"),
+        nb_noadapt = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_nb_noadapt.txt"),
+        nb_noadapt_deduped = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_nb_noadapt_deduped.txt"),
+    params:
+        adapter = lambda wildcards: lib2adapt[wildcards.lib],
+        max_adapt_error_rate = MAX_ADAPT_ERROR_RATE,
+        process_type = "iCLIP",
+        trim5 = 8,
+        trim3 = 4,
+    threads: 4  # Actually, to avoid too much IO
+    message:
+        "Trimming adaptor from raw data using {wildcards.trimmer}, deduplicating reads, and removing 5' and 3' random n-mers for {wildcards.lib}_{wildcards.rep}."
+    benchmark:
+        OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_trim_benchmark.txt")
+    log:
+        trim = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_trim.log"),
+        log = OPJ(log_dir, "{trimmer}", "trim_and_dedup", "{lib}_{rep}.log"),
+        err = OPJ(log_dir, "{trimmer}", "trim_and_dedup", "{lib}_{rep}.err"),
+    run:
+        shell_commands = """
+MAX_ERROR_RATE="{params.max_adapt_error_rate}" THREADS="{threads}" {params.process_type}_trim_and_dedup.sh {wildcards.trimmer} {input} \\
+    {params.adapter} {params.trim5} {params.trim3} \\
+    {output.adapt} {output.noadapt} {output.adapt_nodedup} {log.trim} \\
+    {output.nb_raw} {output.nb_adapt} {output.nb_adapt_deduped} \\
+    {output.nb_noadapt} {output.nb_noadapt_deduped} 1> {log.log} 2> {log.err}
+"""
+        shell(shell_commands)


+def source_fastq(wildcards):
+    """Determine the fastq file corresponding to a given read type."""
+    read_type = wildcards.read_type
+    if read_type == "raw":
+        return rules.link_raw_data.output
+    elif read_type == "adapt_deduped":
+        return rules.trim_and_dedup.output.adapt
+    elif read_type == "noadapt_deduped":
+        return rules.trim_and_dedup.output.noadapt
+    elif read_type == "adapt_nodedup":
+        return rules.trim_and_dedup.output.adapt_nodedup
+    else:
+        raise NotImplementedError("Unknown read type: %s" % read_type)


+rule do_fastqc:
+    input:
+        fastq = source_fastq
+    output:
+        fastqc_out = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_{read_type}_fastqc.html")
+    shell:
+        """
+        fastqc {input.fastq}
+        """


+rule compute_size_distribution:
+    input:
+        source_fastq
+    output:
+        OPJ(data_dir, "trimmed_{trimmer}", "read_stats", "{lib}_{rep}", "{read_type}_size_distribution.txt"),
+    message:
+        "Computing read size distribution for {wildcards.lib}_{wildcards.rep}_{wildcards.read_type}."
+    shell:
+        """
+        zcat {input} | compute_size_distribution {output}
+        """


+def plot_histo(outfile, data, title=None):
+    fig = plt.figure(figsize=(15, 7))
+    ax = fig.add_subplot(111)
+    ax.set_xlim([data.index[0] - 0.5, data.index[-1] + 0.5])
+    #ax.set_ylim([0, 100])
+    bar_width = 0.8
+    #letter2legend = dict(zip("ACGT", "ACGT"))
+    for (read_len, count) in data.iterrows():
+        plt.bar(
+            read_len,
+            count,
+            align="center",
+            width=bar_width)
+            #color=letter2colour[letter],
+            #label=letter2legend[letter])
+    ax.legend()
+    ax.set_xticks(data.index)
+    ax.set_xticklabels(data.index)
+    ax.set_xlabel("read length")
+    ax.set_ylabel("number of reads")
+    plt.setp(ax.get_xticklabels(), rotation=90)
+    if title is not None:
+        plt.title(title)
+    plt.savefig(outfile)


+rule plot_size_distribution:
+    input:
+        rules.compute_size_distribution.output
+    output:
+        OPJ(data_dir, "trimmed_{trimmer}", "read_stats", "{lib}_{rep}", "{read_type}_size_distribution.{fig_format}")
+    message:
+        "Plotting size distribution for trimmed {wildcards.lib}_{wildcards.rep}_{wildcards.read_type}."
+    run:
+        data = pd.read_table(input[0], header=None, names=("size", "count"), index_col=0)
+        title = f"read size distribution for {wildcards.lib}_{wildcards.rep}_{wildcards.read_type}"
+        plot_histo(output[0], data, title)


+###########
+# Mapping #
+###########
+rule map_on_genome:
+    input:
+        # fastq = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_{read_type}.fastq.gz"),
+        fastq = source_fastq,
+    output:
+        # sam files take a lot of space
+        sam = temp(OPJ(output_dir, "{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_{read_type}_on_%s.sam" % genome)),
+        nomap_fastq = OPJ(output_dir, "{trimmer}", aligner, "info_mapping_%s" % genome, "{lib}_{rep}_{read_type}_unmapped_on_%s.fastq.gz" % genome),
+    params:
+        aligner = aligner,
+        index = genome_db,
+        settings = alignment_settings[aligner],
+    message:
+        "Mapping {wildcards.lib}_{wildcards.rep}_{wildcards.read_type} on C. elegans genome."
+    log:
+        log = OPJ(log_dir, "{trimmer}", aligner, "map_{read_type}_on_genome", "{lib}_{rep}.log"),
+        err = OPJ(log_dir, "{trimmer}", aligner, "map_{read_type}_on_genome", "{lib}_{rep}.err"),
+    threads: 12
+    wrapper:
+        "file:///pasteur/homes/bli/src/bioinfo_utils/snakemake_wrappers/map_on_genome"


+rule sam2indexedbam:
+    input:
+        sam = OPJ(output_dir, "{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_{read_type}_on_%s.sam" % genome),
+    output:
+        sorted_bam = OPJ(output_dir, "{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_{read_type}_on_%s_sorted.bam" % genome),
+        index = OPJ(output_dir, "{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_{read_type}_on_%s_sorted.bam.bai" % genome),
+    message:
+        "Sorting and indexing sam file for {wildcards.lib}_{wildcards.rep}_{wildcards.read_type}."
+    log:
+        log = OPJ(log_dir, "{trimmer}", "sam2indexedbam", "{lib}_{rep}_{read_type}.log"),
+        err = OPJ(log_dir, "{trimmer}", "sam2indexedbam", "{lib}_{rep}_{read_type}.err"),
+    threads:
+        4
+    wrapper:
+        "file:///pasteur/homes/bli/src/bioinfo_utils/snakemake_wrappers/sam2indexedbam"


+rule compute_mapping_stats:
+    input:
+        sorted_bam = rules.sam2indexedbam.output.sorted_bam,
+    output:
+        stats = OPJ(output_dir, "{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_{read_type}_on_%s_samtools_stats.txt" % genome),
+    shell:
+        """samtools stats {input.sorted_bam} > {output.stats}"""


+rule fuse_bams:
+    """This rule fuses the two sorted bam files corresponding to the mapping
+    of the reads containing the adaptor or not."""
+    input:
+        noadapt_sorted_bam = OPJ(output_dir, "{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_noadapt_on_%s_sorted.bam" % genome),
+        adapt_sorted_bam = OPJ(output_dir, "{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_adapt_on_%s_sorted.bam" % genome),
+    output:
+        sorted_bam = OPJ(output_dir, "{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_on_%s_sorted.bam" % genome),
+        bai = OPJ(output_dir, "{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_on_%s_sorted.bam.bai" % genome),
+    message:
+        "Fusing sorted bam files for {wildcards.lib}_{wildcards.rep}"
+    log:
+        log = OPJ(log_dir, "{trimmer}", "fuse_bams", "{lib}_{rep}.log"),
+        err = OPJ(log_dir, "{trimmer}", "fuse_bams", "{lib}_{rep}.err"),
+    shell:
+        """
+        samtools merge -c {output.sorted_bam} {input.noadapt_sorted_bam} {input.adapt_sorted_bam} 1> {log.log} 2> {log.err}
+        indexed=""
+        while [ ! ${{indexed}} ]
+        do
+            samtools index {output.sorted_bam} && indexed="OK"
+            if [ ! ${{indexed}} ]
+            then
+                rm -f {output.bai}
+                echo "Indexing failed. Retrying" 1>&2
+            fi
+        done 1>> {log.log} 2>> {log.err}
+        """


+onsuccess:
+    print("iCLIP data pre-processing finished.")

+onerror:
+    print("iCLIP data pre-processing failed.")
+
diff --git a/CLIP/iCLIP_trim_and_dedup.sh b/CLIP/iCLIP_trim_and_dedup.sh
new file mode 100755
index 0000000..9acbe20
--- /dev/null
+++ b/CLIP/iCLIP_trim_and_dedup.sh
@@ -0,0 +1,203 @@
+#!/usr/bin/env bash
+# Usage: iCLIP_trim_and_dedup.sh <trimmer> <raw fastq> <ADAPTER> <nb 5'> <nb 3'> <trimmed fastq> <untrimmed fastq> <trimmed_nodedup> <trimmer log> <nb_raw> <nb_adapt> <nb_adapt_deduped> <nb_noadapt> <nb_noadapt_deduped>

+# http://linuxcommand.org/wss0150.php
+PROGNAME=$(basename $0)

+function error_exit
+{
+#   ----------------------------------------------------------------
+#   Function for exit due to fatal program error
+#       Accepts 1 argument:
+#           string containing descriptive error message
+#   ----------------------------------------------------------------
+    echo "${PROGNAME}: ${1:-"Unknown Error"}" 1>&2
+    exit 1
+}

+# http://stackoverflow.com/questions/59895/can-a-bash-script-tell-what-directory-its-stored-in
+#PACKAGEDIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )

+trimmer=${1}

+raw_in=${2}

+adapt=${3}

+# Sum of lengths of UMIs both left and right of barcode
+fiveprime_random=${4}
+# Adding lengths of "low diversity zones"
+# (to process without deduplicating)
+total_fiveprime=$((${fiveprime_random}+9))
+# Length of the 3' UMI
+threeprime_random=${5}
+# For the untrimmed, the adaptor was not found,
+# but maybe its first 3 bases were there.
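+# With the values passed from the Snakefile (trim3=4), this gives a margin of
+# 4 + 3 = 7 nt to clip from the 3' end of the untrimmed reads.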
+threeprime_random_with_margin=$((${threeprime_random}+3))
+echo ${threeprime_random_with_margin}

+# fastq files
+trimmed_and_dedup_out=${6}
+untrimmed_out=${7}
+trimmed_nodedup_out=${8}

+log=${9}

+# count files
+nb_raw=${10}
+nb_adapt=${11}
+nb_adapt_deduped=${12}
+nb_noadapt=${13}
+nb_noadapt_deduped=${14}

+count_fastq_reads()
+{
+    # $1: file in which to write the number of fastq records
+    wc -l | { read nblines; echo ${nblines} / 4 | bc > ${1}; }
+}

+# Removing parts that have lower diversity,
+# and are therefore more likely to have sequencing errors (6-11, 15-17)
+# Reads should be:
+#
+# NNNNNGCACTANNNWWW[YYYY]NNNN
+# 1---5 : 5' UMI
+# 6--11: barcode (lower diversity)
+# 12-14: UMI
+# 15-17: AT(or GC?)-rich (low diversity)
+# [fragment]
+# -4 -> -1: 3' UMI
+strip_low_qual_zones()
+{
+    bioawk -c fastx '{print $name"\t"substr($seq, 1, 5)""substr($seq, 12, 3)""substr($seq, 18)"\t"substr($qual, 1, 5)""substr($qual, 12, 3)""substr($qual, 18)}' | mawk '{print "@"$1"\n"$2"\n+\n"$3}'
+}

+# This script performs 2 sorting and deduplicating operations, depending on the
+# presence or absence of the adapter in the read.


+# The -s option of fastq-sort sorts the reads by their sequence.
+sort_by_seq()
+{
+    TMPDIR="/var/tmp" fastq-sort -s || error_exit "fastq-sort failed"
+}

+# Once the reads are sorted by sequence,
+# successive reads with the same sequence are merged,
+# keeping the best quality at each position.

+dedup () {
+    #${PACKAGEDIR}/remove_duplicates_from_sorted_fastq/remove_duplicates_from_sorted_fastq || error_exit "remove_duplicates_from_sorted_fastq failed"
+    #remove_duplicates_from_sorted_fastq || error_exit "remove_duplicates_from_sorted_fastq failed"
+    sort_by_seq | remove-duplicates-from-sorted-fastq || error_exit "remove_duplicates_from_sorted_fastq failed"
+}

+trim_random_nt()
+{
+    # $1: nb of bases to trim at 5' end
+    # $2: nb of bases to trim at 3' end
+    cutadapt -u ${1} -u -${2} - 2> /dev/null || error_exit "trim_random_nt failed"
+}

+process_without_deduplication () {
+    # $1: compressed fastq file in which to write the trimmed reads
+    trim_random_nt ${total_fiveprime} ${threeprime_random} | gzip > ${1}
+}

+# This named pipe is used to avoid writing the intermediate file to disk.
+# It will transmit reads that did not seem to contain the adapter to the
+# second sorting and deduplicating.
+mkfifo ${untrimmed_out}.fifo

+minsize_trimmed=$(echo "${fiveprime_random} + 16 + ${threeprime_random}" | bc)

+case "${trimmer}" in
+    "cutadapt")
+        if [ ${MAX_ERROR_RATE} ]
+        then
+            #>&2 echo "Cutadapt multithreading not working fully yet. Ignoring THREADS."
+            error_opt="-e ${MAX_ERROR_RATE}"
+        else
+            error_opt=""
+        fi
+        if [ ${THREADS} ]
+        then
+            #thread_opt="-j ${THREADS}"
+            >&2 echo "Cutadapt multithreading not working fully yet. Ignoring THREADS."
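+            # THREADS is still accepted so the calling interface stays the same,
+            # but -j is deliberately not passed to cutadapt here (presumably its
+            # multicore mode did not yet support all the options used below,
+            # such as --untrimmed-output).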
+ thread_opt="" + else + thread_opt="" + fi + # -m ${minsize_random} is to discard reads that are shorter than this after trimming + trim_cmd="${trimmer} -a ${adapt} -m ${minsize_trimmed} ${error_opt} ${thread_opt} --untrimmed-output=${untrimmed_out}.fifo - 2> ${log}" + ;; + "fastx_clipper") + # -n is to keep reads with N + # -l is equivalent to -m in cutadapt (not sure it has effect with -C) + # -c is to keep only the trimmed reads + # -C is to keep only the non-trimmed reads + # -v is to have verbose things to put in the log + trim_cmd="tee >(${trimmer} -a ${adapt} -l ${minsize_trimmed} -C -M 3 -n > ${untrimmed_out}.fifo) | ${trimmer} -a ${adapt} -l ${minsize_trimmed} -c -M 3 -n -v 2>> ${log}" + ;; + *) + error_exit "Trimming adapter with ${trimmer} not implemented." + ;; +esac + +# a second cutadapt step removes the random nucleotides that helped identify PCR duplicates. +#dedup_trimmed() +#{ +# # $1: file in which to write the number of fastq records after adapter trimming +# # $2: file in which to write the number of fastq records after deduplication +# cmd="${trim_cmd} | strip_low_qual_zones | tee >(count_fastq_reads ${1}) | dedup | trim_random_nt ${fiveprime_random} ${threeprime_random} | tee >(count_fastq_reads ${2}) | gzip" +# echo ${cmd} +# eval ${cmd} > ${trimmed_and_dedup_out} || error_exit "${cmd} failed" +#} +dedup_trimmed() +{ + # $1: file in which to write the number of fastq records after adapter trimming + # $2: file in which to write the number of fastq records after deduplication + cmd="${trim_cmd} | tee >(process_without_deduplication ${trimmed_nodedup_out}) | strip_low_qual_zones | tee >(count_fastq_reads ${1}) | dedup | trim_random_nt ${fiveprime_random} ${threeprime_random} | tee >(count_fastq_reads ${2}) | gzip" + echo ${cmd} + eval ${cmd} > ${trimmed_and_dedup_out} || error_exit "${cmd} failed" +} + +#dedup_untrimmed() +#{ +# # $1: file in which to write the number of fastq records after deduplication +# cmd="cat - | strip_low_qual_zones | dedup | trim_random_nt ${fiveprime_random} ${threeprime_random_with_margin} | tee >(count_fastq_reads ${1}) | gzip" +# echo ${cmd} +# eval ${cmd} > ${untrimmed_out} || error_exit "${cmd} failed" +#} +dedup_untrimmed() +{ + # $1: file in which to write the number of fastq records after deduplication + cmd="cat - | strip_low_qual_zones | dedup | trim_random_nt ${fiveprime_random} ${threeprime_random_with_margin} | tee >(count_fastq_reads ${1}) | gzip" + echo ${cmd} + eval ${cmd} > ${untrimmed_out} || error_exit "${cmd} failed" +} + +filetype="$(file -L ${raw_in} | cut -d " " -f2)" +case "${filetype}" in + "gzip") + cat_cmd="zcat" + ;; + "ASCII") + cat_cmd="cat" + ;; + *) + error_exit "Unexpected file type for ${raw_in}" + ;; +esac + +${cat_cmd} ${raw_in} \ + | tee >(count_fastq_reads ${nb_raw}) \ + | dedup_trimmed ${nb_adapt} ${nb_adapt_deduped} & +pid_to_wait=$! +cat ${untrimmed_out}.fifo \ + | tee >(count_fastq_reads ${nb_noadapt}) \ + | dedup_untrimmed ${nb_noadapt_deduped} || rm -f ${untrimmed_out}.fifo +wait ${pid_to_wait} +rm -f ${untrimmed_out}.fifo + +exit 0 -- GitLab