From e846996cadcd3ee63744dfaa628a980d360b468f Mon Sep 17 00:00:00 2001
From: Veronique Legrand <vlegrand@pasteur.fr>
Date: Tue, 31 Aug 2021 16:06:53 +0200
Subject: [PATCH] added paper for publication in JOSS

---
 paper/paper.bib | 101 ++++++++++++++++++++++++++++++++++
 paper/paper.md  | 143 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 244 insertions(+)
 create mode 100644 paper/paper.bib
 create mode 100644 paper/paper.md

diff --git a/paper/paper.bib b/paper/paper.bib
new file mode 100644
index 0000000..32d1bb3
--- /dev/null
+++ b/paper/paper.bib
@@ -0,0 +1,101 @@
+@article{Brown:2012,
+ author = {{Brown}, C.~T. and {Howe}, A. and {Zhang}, Q. and {Pyrkosz}, A.~B. and {Brom}, T.~H.},
+ title = "{A Reference-Free Algorithm for Computational Normalization of Shotgun Sequencing Data}",
+ journal = {arXiv},
+ pages = {1203.4802v2},
+ year = 2012,
+ url = {https://arxiv.org/abs/1203.4802v2}
+}
+
+@misc{Bushnell:2014,
+ author = {{Bushnell}, B.},
+ title = {BBnorm: Kmer-based error-correction and normalization tool (from the BBTools package)},
+ year = {2014},
+ journal = {SourceForge repository},
+ url = {https://sourceforge.net/projects/bbmap/}
+}
+
+@article{Criscuolo:2013,
+ author = {{Criscuolo}, A. and {Brisse}, S.},
+ title = "{AlienTrimmer: a tool to quickly and accurately trim off multiple short contaminant sequences from high-throughput sequencing reads}",
+ journal = {Genomics},
+ volume = 102,
+ number = {5--6},
+ pages = {500--506},
+ year = 2013,
+ doi = {10.1016/j.ygeno.2013.07.011}
+}
+
+@article{Crusoe:2015,
+ author = {{Crusoe}, M.~R. and {Alameldin}, H.~F. and {Awad}, S. and {Boucher}, E. and {Caldwell}, A. and {Cartwright}, R. and {Charbonneau}, A. and {Constantinides}, B. and {Edvenson}, G. and {Fay}, S. and {Fenton}, J. and {Fenzl}, T. and {Fish}, J. and {Garcia-Gutierrez}, L. and {Garland}, P. and {Gluck}, J. and {González}, I. and {Guermond}, S. and {Guo}, J. and {Gupta}, A. and {Herr}, J.~R. and {Howe}, A. and {Hyer}, A. and {Härpfer}, A. and {Irber}, L. and {Kidd}, R. and {Lin}, D. and {Lippi}, J. and {Mansour}, T. and {McA'Nulty}, P. and {McDonald}, E. and {Mizzi}, J. and {Murray}, K.~D. and {Nahum}, J.~R. and {Nanlohy}, K. and {Nederbragt}, A.~J. and {Ortiz-Zuazaga}, H. and {Ory}, J. and {Pell}, J. and {Pepe-Ranney}, C. and {Russ}, Z.~N. and {Schwarz}, E. and {Scott}, C. and {Seaman}, J. and {Sievert}, S. and {Simpson}, J. and {Skennerton}, D.~T. and {Spencer}, J. and {Srinivasan}, R. and {Standage}, D. and {Stapleton}, J.~A. and {Steinman}, S.~R. and {Stein}, J. and {Taylor}, B. and {Trimble}, W. and {Wiencko}, H.~L. and {Wright}, M. and {Wyss}, B. and {Zhang}, Q. and {zyme}, e. and {Brown}, C.~T.},
+ title = "{The khmer software package: enabling efficient nucleotide sequence analysis [version 1; peer review: 2 approved, 1 approved with reservations]}",
+ journal = {F1000Research},
+ year = 2015,
+ volume = 4,
+ pages = {900},
+ doi = {10.12688/f1000research.6924.1}
+}
+
+@article{Cormode:2005,
+ author = {{Cormode}, G. and {Muthukrishnan}, S.},
+ title = "{An Improved Data Stream Summary: The Count-Min Sketch and its Applications}",
+ journal = {Journal of Algorithms},
+ year = 2005,
+ volume = 55,
+ pages = {29--38},
+ doi = {10.1016/j.jalgor.2003.12.001}
+}
+
+@article{Durai:2019,
+ author = {{Durai}, D.~A. and {Schulz}, M.~H.},
+ title = "{Improving in-silico normalization using read weights}",
+ journal = {Scientific Reports},
+ year = 2019,
+ volume = 9,
+ pages = {5133},
+ doi = {10.1038/s41598-019-41502-9}
+}
+
+@article{Kim:2019,
+ author = {{Kim}, K. and {Jeong}, Y. and {Lee}, Y. and {Lee}, S.},
+ title = "{Analysis of Counting Bloom Filters Used for Count Thresholding}",
+ journal = {Electronics},
+ year = 2019,
+ volume = 8,
+ number = 7,
+ pages = {779},
+ doi = {10.3390/electronics8070779}
+}
+
+@article{Mohamadi:2017,
+ author = {{Mohamadi}, H. and {Khan}, H. and {Birol}, I.},
+ title = "{ntCard: a streaming algorithm for cardinality estimation in genomics data}",
+ journal = {Bioinformatics},
+ year = 2017,
+ volume = 33,
+ number = 9,
+ pages = {1324--1330},
+ doi = {10.1093/bioinformatics/btw832}
+}
+
+@article{Wedemeyer:2017,
+ author = {{Wedemeyer}, A. and {Kliemann}, L. and {Srivastav}, A. and {Schielke}, C. and {Reusch}, T.~B. and {Rosenstiel}, P.},
+ title = "{An improved filtering algorithm for big read datasets and its application to single-cell assembly}",
+ journal = {BMC Bioinformatics},
+ year = 2017,
+ volume = 18,
+ pages = {324},
+ doi = {10.1186/s12859-017-1724-7}
+}
+
+@article{Zhang:2014,
+ author = {{Zhang}, Q. and {Pell}, J. and {Canino-Koning}, R. and {Howe}, A.~C. and {Brown}, C.~T.},
+ title = "{These Are Not the K-mers You Are Looking For: Efficient Online K-mer Counting Using a Probabilistic Data Structure}",
+ journal = {PLoS ONE},
+ year = 2014,
+ volume = 9,
+ number = 7,
+ pages = {e101271},
+ doi = {10.1371/journal.pone.0101271}
+}
+
diff --git a/paper/paper.md b/paper/paper.md
new file mode 100644
index 0000000..9bd8ce9
--- /dev/null
+++ b/paper/paper.md
@@ -0,0 +1,143 @@
+---
+title: 'ROCK: digital normalization of whole genome sequencing data'
+tags:
+  - C++
+  - high-throughput sequencing
+  - digital normalization
+  - _k_-mer
+authors:
+  - affiliation: 1
+    name: Véronique Legrand
+  - affiliation: 2
+    name: Thomas Kergrohen
+  - affiliation: 1
+    name: Nicolas Joly^[deceased]
+  - affiliation: "3, 4"
+    name: Alexis Criscuolo^[corresponding author]
+    orcid: 0000-0002-8212-5215
+affiliations:
+  - index: 1
+    name: Plateforme HPC, Institut Pasteur, Paris, France
+  - index: 2
+    name: Department of Child and Adolescent Oncology, Gustave Roussy, Villejuif, France
+  - index: 3
+    name: Hub de Bioinformatique et Biostatistique - Département Biologie Computationnelle, Institut Pasteur, Paris, France
+  - index: 4
+    name: Plateforme de Microbiologie Mutualisée (P2M), Institut Pasteur, Paris, France
+date: "18 August 2021"
+bibliography: paper.bib
+---
+
+
+
+# Summary
+
+Due to advances in high-throughput sequencing technologies, generating whole genome sequencing (WGS) data with high coverage depth (e.g. $\geq\!$ 500$\times$) is now becoming common, especially when dealing with non-eukaryotic genomes.
+Such high coverage WGS data often fulfills the expectation that most nucleotide positions of the genome are sequenced a sufficient number of times without error.
+However, performing bioinformatic analyses (e.g. sequencing error correction, whole genome _de novo_ assembly) on such highly redundant data requires substantial running times and a large memory footprint.
+
+To reduce redundancy within a WGS dataset, a trivial solution is to randomly downsample the high-throughput sequencing reads (HTSR).
+Nevertheless, this naive strategy is not satisfactory, as it does not minimize the variation in sequencing depth, thereby eroding the coverage depth of genome regions that are under-covered (if any).
+To cope with this problem, a simple greedy algorithm, named digital normalization, was designed to efficiently downsample HTSRs over genome regions that are over-covered [@Brown:2012].
+Given an upper-bound threshold $\kappa\! >\!$ 1, it returns a subset $S_\kappa$ such that the coverage depth induced by the HTSRs in $S_\kappa$ is expected to be at most $\varepsilon \kappa$ across the genome (where $\varepsilon\! >\!$ 1 is a constant).
+By discarding highly redundant HTSRs while retaining sufficient and homogeneous coverage depth ($\approx\! \varepsilon \kappa$), this algorithm strongly decreases both the running time and the memory required to subsequently analyze WGS data, with often little impact on the expected results [@Crusoe:2015].
+
+Interestingly, the digital normalization algorithm can be easily enhanced in several ways, so that the final subset contains fewer but higher-quality HTSRs.
+Unfortunately, these different improvements are scattered across distinct programs.
+`ROCK` (_Reducing Over-Covering K-mers_) was therefore developed with the key purpose of implementing a fast, accurate and easy-to-use digital normalization procedure.
+The C++ source code is available under the GNU Affero General Public License v3.0 at [https://gitlab.pasteur.fr/vlegrand/ROCK](https://gitlab.pasteur.fr/vlegrand/ROCK).
+
+# Statement of need
+
+The digital normalization algorithm is based on a count-min sketch (CMS), a probabilistic data stream structure able to store the number of occurrences of the canonical _k_-mers (i.e. short oligonucleotides of fixed length $k$, invariant to reverse-complement) that derive from the selected HTSRs in $S_\kappa$ [@Cormode:2005; @Zhang:2014].
+By means of this CMS, the _k_-mer coverage $c_k(r)$ of any new HTSR $r$ (i.e. the _k_-mer coverage depth of the corresponding genome region induced by the selected HTSRs already in $S_\kappa$) can be approximated by the median number of occurrences of the _k_-mers derived from $r$.
+The algorithm iterates over all HTSRs $r$ to progressively fill $S_\kappa$: when $c_k(r)\!\leq\!\kappa$, $r$ is added into $S_\kappa$ and the CMS is updated with every _k_-mer of $r$.
+At the end, the coverage depth induced by the selected HTSRs in $S_\kappa$ is therefore expected to be upper-bounded by $\varepsilon \kappa$, where $\varepsilon\! =\! \ell\! /\! (\ell\! -\! k\! +\! 1)$ and $\ell$ is the average HTSR length.
+This initial version of the algorithm can be run using either the Python program `normalize-by-median` from the package _khmer_ [@Crusoe:2015], the Java program `BBnorm` from the package _BBTools_ [@Bushnell:2014] or the C++ program `BigNorm` [@Wedemeyer:2017].
+
+Of note, since the HTSRs are selected stepwise, the digital normalization algorithm may be improved by first considering the longest HTSRs, therefore yielding a subset $S_\kappa$ containing considerably fewer HTSRs.
+In a similar fashion, the input HTSRs may also be sorted beforehand by decreasing quality, estimated as the sum of their Phred scores.
+Such a preliminary sorting results in an optimized subset $S_\kappa$ containing long and high-quality HTSRs.
+This quality-based sorting step can be enabled when running the particular normalization procedure implemented in the program `ORNA` [@Durai:2019].
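+
+As an illustration, the core selection rule described above can be sketched as follows; this is a minimal, self-contained C++ example that is independent of the actual `ROCK` sources (an exact hash map stands in for the CMS, and all names are illustrative):
+
+```c++
+#include <algorithm>
+#include <cstdint>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+// 2-bit encoding of a k-mer and of its reverse complement;
+// the canonical form is the smaller of the two values.
+static uint64_t canonical_kmer(const std::string &s, std::size_t pos, std::size_t k) {
+    uint64_t fwd = 0, rev = 0;
+    for (std::size_t i = 0; i < k; ++i) {
+        uint64_t c = (s[pos + i] >> 1) & 3;             // A=0, C=1, T=2, G=3 (upper-case ACGT)
+        fwd = (fwd << 2) | c;
+        rev = (rev >> 2) | ((c ^ 2) << (2 * (k - 1)));  // XOR with 2 swaps A<->T and C<->G
+    }
+    return std::min(fwd, rev);
+}
+
+// Returns true (and updates the k-mer counts) when the read has to be added to S_kappa,
+// i.e. when the median count of its canonical k-mers does not exceed kappa.
+static bool keep_read(const std::string &seq, std::size_t k, unsigned kappa,
+                      std::unordered_map<uint64_t, unsigned> &counts) {
+    if (seq.size() < k) return false;
+    std::vector<unsigned> occ;
+    for (std::size_t p = 0; p + k <= seq.size(); ++p)
+        occ.push_back(counts[canonical_kmer(seq, p, k)]);
+    std::nth_element(occ.begin(), occ.begin() + occ.size() / 2, occ.end());
+    if (occ[occ.size() / 2] > kappa) return false;      // median k-mer coverage already above kappa
+    for (std::size_t p = 0; p + k <= seq.size(); ++p)
+        ++counts[canonical_kmer(seq, p, k)];
+    return true;
+}
+
+int main() {
+    std::unordered_map<uint64_t, unsigned> counts;
+    // Reads are assumed to be pre-sorted by decreasing length and/or quality (sum of Phred scores).
+    std::vector<std::string> reads = {"ACGTACGTACGTACGT", "ACGTACGTACGTACGT", "TTGCAGGCTAAACGTA"};
+    for (const std::string &r : reads)
+        keep_read(r, 5, 1, counts);                     // k = 5, kappa = 1 (toy values)
+    return 0;
+}
+```
+
+In `ROCK` itself, the exact map used in this sketch is replaced by the CMS described in the Implementation section below.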
+
+Finally, given a small lower-bound threshold $\kappa'$ ($<\! \kappa$), the digital normalization approach can be easily extended to return a subset $S_{\kappa'\!,\kappa}\! \subset\! S_\kappa$ that does not contain the HTSRs with _k_-mer coverage lower than $\kappa'$.
+By means of the CMS that stores the number of occurrences of all canonical _k_-mers deriving from $S_\kappa$, every HTSR $r$ such that $c_k(r)\!\leq\!\kappa'$ is simply discarded, therefore yielding a final subset $S_{\kappa'\!,\kappa}$ whose overall coverage depth lies between $\varepsilon \kappa'$ and $\varepsilon \kappa$.
+This second-pass strategy thus discards HTSRs containing many infrequent _k_-mers, e.g. incorrectly sequenced, artefactual or contaminating HTSRs.
+Unfortunately, this useful approach is implemented in only a few tools, e.g. `BBnorm`.
+
+The main purpose of `ROCK` is to pool these complementary strategies to obtain a complete and efficient digital normalization procedure.
+It was designed to be used as a preprocessing step prior to performing fast genome _de novo_ assembly.
+`ROCK` is a command-line program that may consider up to 15 input FASTQ files containing HTSRs from different single-end (SE) or paired-end (PE) sequencing runs.
+Both the lower and upper thresholds $\kappa'$ and $\kappa$ can be set by the user, as well as the _k_-mer length $k \! \leq\!$ 31.
+As the CMS size is a key parameter for obtaining fast running times while keeping accurate results (see Implementation), `ROCK` is also able to automatically calculate it, provided that the number of distinct canonical _k_-mers is specified.
+
+# Implementation
+
+`ROCK` first reads the input FASTQ file(s) to store the location of each HTSR into a container $\textbf{M}[x][y]$ based on `std::map`, where $\textbf{M}[x][y]$ corresponds to the $y$th SE/PE HTSR(s) with the quality score $x$ (i.e. the sum of Phred scores).
+Each entry $\textbf{M}[x][y]$ contains the minimum information necessary to quickly retrieve the associated SE/PE HTSR(s), i.e. the index of the corresponding file(s) and the offset of the SE/PE HTSR(s) from the start of each file.
+Using such a structure makes it possible to quickly traverse all HTSRs in order of decreasing quality.
+However, as each traversed HTSR is read using the offset values stored in $\textbf{M}[x][y]$, input FASTQ files should be uncompressed.
+
+The CMS is a two-dimensional $\lambda\! \times\! m$ array $\textbf{C}[i][j]$, for which each of the $\lambda$ rows is associated to a _mod prime_ hashing function $h_i$ on $[0,m]$, with $m\! =$ `UINT_MAX` (i.e. $2^{32}\! -\! 1$).
+Every _k_-mer is associated with one entry $\textbf{C}[i][j]$ for each row $i$, where $j$ is the hash value returned by $h_i$ for that _k_-mer.
+To add a canonical _k_-mer occurrence into the CMS, each of its $\lambda$ associated entries is incremented by one.
+As a consequence, when $\kappa\! \leq\!$ 255, the CMS is a two-dimensional array of `unsigned char` with a memory footprint of $4 \lambda\!$ GB; when $\kappa\! >\!$ 255, the `unsigned short` type is used, with twice the RAM usage.
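+
+A minimal sketch of such a structure is given below (illustrative C++ code, not the actual `ROCK` implementation; the number of columns and the per-row hashing functions are deliberately simplified here, whereas `ROCK` uses $m\! =$ `UINT_MAX` and _mod prime_ hashing functions):
+
+```c++
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <iostream>
+#include <random>
+#include <utility>
+#include <vector>
+
+// Count-min sketch with lambda rows of m 8-bit counters (suitable when kappa <= 255).
+class CountMinSketch {
+public:
+    CountMinSketch(std::size_t lambda, std::size_t m) : m_(m), rows_(lambda, std::vector<uint8_t>(m, 0)) {
+        std::mt19937_64 rng(42);                        // one multiplier/offset pair per row
+        for (std::size_t i = 0; i < lambda; ++i)
+            ab_.emplace_back(rng() | 1, rng());
+    }
+    // Adding one canonical k-mer occurrence increments every associated entry (saturating at 255).
+    void add(uint64_t kmer) {
+        for (std::size_t i = 0; i < rows_.size(); ++i) {
+            uint8_t &c = rows_[i][hash(i, kmer)];
+            if (c < 255) ++c;
+        }
+    }
+    // The estimated number of occurrences is the minimum value among the associated entries.
+    unsigned count(uint64_t kmer) const {
+        unsigned est = 255;
+        for (std::size_t i = 0; i < rows_.size(); ++i)
+            est = std::min<unsigned>(est, rows_[i][hash(i, kmer)]);
+        return est;
+    }
+private:
+    std::size_t hash(std::size_t i, uint64_t x) const { // simplified stand-in for the mod prime hashing
+        return (std::size_t)((ab_[i].first * x + ab_[i].second) % m_);
+    }
+    std::size_t m_;
+    std::vector<std::vector<uint8_t>> rows_;
+    std::vector<std::pair<uint64_t, uint64_t>> ab_;
+};
+
+int main() {
+    CountMinSketch cms(4, 1 << 20);                     // lambda = 4 rows, 2^20 columns for this toy example
+    cms.add(12345); cms.add(12345); cms.add(67890);
+    std::cout << cms.count(12345) << " " << cms.count(67890) << "\n";   // expected: 2 1 (barring collisions)
+    return 0;
+}
+```
+
+With $m\! =$ `UINT_MAX`, each such row of `unsigned char` counters occupies the 4 GB mentioned above.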
+
+By definition, the number $\eta$ of occurrences of any given canonical _k_-mer is estimated from the CMS as the minimum value $\tilde{\eta}$ among its $\lambda$ associated entries.
+However, $\tilde{\eta}$ can be strictly higher than the true number $\eta$ of occurrences with a probability $p_\textrm{\tiny FP}$ [@Cormode:2005], henceforth a false positive (FP) probability.
+Following @Kim:2019, when $\lambda\!=\!$ 1, the probability that an arbitrary entry of $\textbf{C}$ is strictly higher than a threshold $\theta$ after inserting $n$ distinct _k_-mers is $\pi_{\theta,n}\! =\! 1\! -\! \sum_{z \leq \theta} b_{n,1/m}(z)$, where $b_{n,p}$ is the probability mass function of the binomial distribution $B(n,p)$.
+Therefore, one gets $p_\textrm{\tiny FP}\! = \! \pi_{\theta,n}^\lambda$ with $\theta\! =\! \kappa$ (or $\theta\! =\! \kappa'$ if $\kappa'\! \ne\!$ 0).
+By default, `ROCK` uses $\lambda\! =\!$ 4 and an FP probability cutoff $\tau\! =\!$ 5%, which can cope with up to $n =\!$ 10 billion distinct canonical _k_-mers while verifying $p_\textrm{\tiny FP}\! <\! \tau$ for any $\kappa\! >\! \kappa'\! >\!$ 1.
+However, when the number $n$ of distinct canonical _k_-mers is provided by the user, `ROCK` automatically computes the smallest integer $\lambda$ such that $\pi_{\theta,n}^\lambda\! <\! \tau$, which often results in $\lambda\! =\!$ 1 with WGS data from small genomes (see Example).
+
+After instantiating a CMS with all entries set to 0, `ROCK` performs the digital normalization procedure by traversing $\textbf{M}[x][y]$ from the highest to the lowest quality $x$.
+For every SE/PE HTSR(s) $r$, the number $\alpha$ of canonical _k_-mer occurrence values $\tilde{\eta}\! >\!\kappa$ is obtained by querying the CMS, and then compared to the total number $\beta$ of canonical _k_-mers.
+If $2 \alpha\! >\! \beta$, then the median of the _k_-mer occurrence values is strictly higher than $\kappa$, and $r$ is/are not selected; otherwise, $r$ is/are selected and the corresponding _k_-mers are added to the CMS.
+When $\kappa'\! \ne\!$ 0, the second pass is next performed on the selected HTSRs following the same approach.
+`ROCK` runs the entire procedure using a single thread.
+At the end, all selected HTSRs are written into FASTQ-formatted output file(s).
+Of note, to avoid mixing up HTSRs that derive from different WGS strategies (e.g. different insert sizes), each input file corresponds to its own output file.
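+
+Before turning to a real dataset, the automatic choice of $\lambda$ described above can be illustrated with the following short sketch (illustrative code, independent of the `ROCK` sources), where the binomial tail $\pi_{\theta,n}$ is computed by recurrence and the numerical values are those of the Example section below:
+
+```c++
+#include <cmath>
+#include <cstdio>
+
+// pi_{theta,n} = 1 - sum_{z<=theta} b_{n,1/m}(z), with b_{n,p} the binomial pmf,
+// computed with the recurrence b(z+1) = b(z) * (n - z)/(z + 1) * p/(1 - p).
+static double pi_theta_n(double n, double m, unsigned theta) {
+    const double p = 1.0 / m;
+    double term = std::exp(n * std::log1p(-p));          // b_{n,p}(0) = (1 - p)^n
+    double cdf = term;
+    for (unsigned z = 0; z < theta; ++z) {
+        term *= (n - z) / (z + 1.0) * p / (1.0 - p);
+        cdf += term;
+    }
+    return 1.0 - cdf;
+}
+
+int main() {
+    const double m = 4294967295.0;                       // number of columns per CMS row (UINT_MAX)
+    const double n = 105584331.0;                        // distinct canonical k-mers (see the Example section)
+    const unsigned theta = 45;                           // theta = kappa, since kappa' = 0 here
+    const double tau = 0.05;                             // FP probability cutoff
+    unsigned lambda = 1;                                 // smallest lambda such that pi^lambda < tau
+    while (std::pow(pi_theta_n(n, m, theta), lambda) >= tau)
+        ++lambda;
+    std::printf("lambda = %u\n", lambda);                // prints lambda = 1 for these values
+    return 0;
+}
+```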
+
+
+# Example
+
+To illustrate the performance of `ROCK` on a real-case high coverage WGS dataset, we considered the Illumina PE sequencing (run accession [SRR2079909](https://www.ebi.ac.uk/ena/browser/view/SRR2079909)) of the _Clostridium botulinum_ ATCC 9564 genome (assembly accession [GCA_001273225](https://www.ebi.ac.uk/ena/browser/view/GCA_001273225.1)), of length $l \! =\!$ 3,813,606 base pairs (bps).
+The two FASTQ files were first processed using `AlienTrimmer` [@Criscuolo:2013] to trim off 5'/3' regions containing many sequencing errors (i.e. Phred score cutoff $Q \! =\!$ 20) or stretches of identical nucleotides.
+After discarding too short (i.e. $<\!$ 100 bps) and unpaired HTSRs, a total of 4,973,401 PE HTSRs remained (2,756,008,939 bps, average length $\ell\! =\!$ 277 bps), corresponding to a coverage depth of ~722$\times$.
+
+To reduce this high coverage depth to $c \! = \!$ 50$\times$, `ROCK` was run with $k \! =\!$ 25 and $\kappa\! =\! c / \varepsilon\! \approx\!$ 45.
+To assess the optimal CMS size $\lambda$, the total number $n \! =\!$ 105,584,331 of distinct canonical _k_-mers was estimated using `ntCard` [@Mohamadi:2017], and then specified to `ROCK`, leading to $\lambda\! =\!$ 1.
+In theory, using $\kappa\! =\!$ 45 is expected to yield a subset of $cl/(2\ell)\! \approx\!$ 344,000 PE HTSRs totaling $cl \! \approx\!$ 191 Mbps.
+The results obtained are summarized in Table \ref{Table 1}.
+
+Table: Running times (min:sec), numbers of PE HTSRs, and corresponding numbers of base pairs (bps) returned by `ROCK` (on 1 thread) with $\kappa\! =\!$ 45 and varying $\kappa'$ values. \label{Table 1}
+
+---------------------------------------------------------------------
+ $~$                  **run. time**   **no. PE HTSRs**   **no. bps**
+-------------------- --------------- ------------------ -------------
+ $\kappa'\! =\!$ 0    3:09            425,999            241,820,129
+
+ $\kappa'\! =\!$ 2    3:23            388,162            224,479,872
+
+ $\kappa'\! =\!$ 4    3:23            376,746            217,850,311
+
+ $\kappa'\! =\!$ 6    3:24            372,315            217,251,626
+
+ $\kappa'\! =\!$ 8    3:24            371,973            217,109,407
+---------------------------------------------------------------------
+
+For the sake of comparison, standard digital normalizations with the same parameters ($\kappa\! =\!$ 45 and $k \! =\!$ 25) were also carried out on the same computer (AMD Epyc 2.2 GHz processor, 128 GB RAM) using the other dedicated tools (see Statement of need).
+They returned slightly larger subsets, containing from 498,899 (275 Mbps; `BBnorm`) to 693,978 (367 Mbps; `normalize-by-median`) PE HTSRs, with slower running times than `ROCK`, varying from 5:17 (`BBnorm` on 12 threads) to 21:29 (`normalize-by-median`).
+
+
+# Authors' contribution
+
+AC devised the project, the main conceptual ideas and the algorithmic improvements.
+VL coded most parts of `ROCK`.
+NJ implemented the data structures with VL, and participated in the coding.
+TK and VL ran benchmarks to assess and improve the overall performance of `ROCK`.
+All authors provided critical feedback to optimize the program.
+AC wrote the manuscript in consultation with VL and TK.
+
+
+# References
-- 
GitLab