diff --git a/Makefile.am b/Makefile.am index 1bfdcf48680594bc8757ad43f3144bbdca65cd65..1bfa9994703888de7c7b26712bd9ebee13225ee2 100644 --- a/Makefile.am +++ b/Makefile.am @@ -1 +1 @@ -SUBDIRS=src +SUBDIRS=. src test doc diff --git a/NEWS b/NEWS index 54926a5205a4dce9394257b7b648bb193d014953..4b9d37e109d62efaf2adea17c8d99cab6ad9c0d9 100644 --- a/NEWS +++ b/NEWS @@ -1 +1,14 @@ -Sample NEWS file for rock project. +New in version 1.3 + * -q option for nucleotique quality score threshold. + +New in version 1.2 + * -v option for verbose mode. + +New in version 1.1 + * input file names can be provided as arguments in command-line. -i option is no more mandatory. + * output filenames are generated by ROCK if -o option is not used. -o option is no more mandatory. + * fixed compilation warning. + * bug fix: output files are now sorted in decreasing order of quality score (versus increasing order in version 1.0). + * bug fix: when computig lambda (in the case where nothing is not provided by the user). + +Version 1.0 of ROCK diff --git a/configure.ac b/configure.ac index f810bdd7b22c2081017c2bb3038b5702f6fd656e..cb0ff451e22f57c5074c502c22f75bf5bea26029 100644 --- a/configure.ac +++ b/configure.ac @@ -1,7 +1,7 @@ dnl Process this file with autoconf to produce a configure script. AC_PREREQ(2.59) -AC_INIT(rock, 1.0) +AC_INIT(rock, 1.3) AC_CANONICAL_SYSTEM @@ -10,9 +10,9 @@ AM_INIT_AUTOMAKE() # Checks for programs. AC_PROG_CXX AC_PROG_RANLIB -# AC_CHECK_PROG(POD2MAN, pod2man, pod2man, :) +AC_CHECK_PROG(POD2MAN, pod2man, pod2man, :) -AC_CONFIG_FILES(Makefile src/Makefile) +AC_CONFIG_FILES(Makefile src/Makefile test/Makefile doc/Makefile) AC_OUTPUT diff --git a/src/CountMinSketch.cpp.old b/src/CountMinSketch.cpp.old new file mode 100644 index 0000000000000000000000000000000000000000..017079ab7b207078b08e6d228cc1ed54ea3343e4 --- /dev/null +++ b/src/CountMinSketch.cpp.old @@ -0,0 +1,111 @@ +/* + * CountMinSketch.cpp + * + * Created on: Feb 10, 2016 + * Author: vlegrand + * 15/02/16 According to benchmark results, will use the hash function implementation that uses the modulo operator. It appears to be a little bit faster. + */ + +#include "CountMinSketch.h" + +const int max_pow=30; + +/* This method is used to determine if a number is a prime number or not. + * It is incomplete. TODO find an effective method to get lamdba prime numbers when we'll be sure whether we use the + * "prime number" version of the hash functions. */ +/*int CountMinSketch::isPrime(unsigned int num) { + if ((num % 2 ==0) || (num==2)) return 0; + if ((num % 3 ==0) || (num==3)) return 0; + if ((num % 5 ==0) || (num==5)) return 0; + if ((num % 7 ==0) || (num==7)) return 0; + return 1; +} + +int CountMinSketch::isMersenne(unsigned int num) { + int cur_pow=max_pow; + unsigned int mers_nbr=pow(2,cur_pow)-1; + while (num!=mers_nbr && cur_pow>=1) { + cur_pow-=1; + mers_nbr=pow(2,cur_pow)-1; + } + if (cur_pow==0) return 0; + else return 1; +} + +void CountMinSketch::findNonMersPrime() { + int i; + unsigned int num=pi_j_max; + for (i=0;i<lambda;i++) { + num-=1; + while (!isPrime(num)) num-=1; + + } +}*/ + +std::map<int,int> CountMinSketch::getIthArray(int i) { + std::map<int,int> tmp; + return tmp; +} + + +void CountMinSketch::addKMer(unsigned long val) { + int h,j; + short cnt; + j=1; + std::vector<internal_array>::iterator it_lambda_array; + for (it_lambda_array=cms_lambda_array.begin();it_lambda_array!=cms_lambda_array.end();it_lambda_array++) { + h=hash64to32(val,j); + cnt=(*it_lambda_array)[h]; + (*it_lambda_array)[h]=(cnt++ & ushortmask); + j++; + } + +} + +int CountMinSketch::getEstimatedNbOcc(unsigned long val) { + int j,h; + int min=0; + std::vector<internal_array>::iterator it; + for (it=cms_lambda_array.begin();it!=cms_lambda_array.end();it++) { + h=hash64to32(val,j); + if ((*it)[h]<min) min=(*it)[h]; + j++; + } + return min; +} + +/* + * Used to determine if a read must be kept or not. + * Depending on the case, threshold is kappa or kappa_prime + * Computes the number of k-mers for which the number of occurrences is below threshold. + * If more than half of the k-mers have a number of occurrences that is less than threshold, + * return true; else, return false. + */ +int CountMinSketch::isRCovBelowThres(const readNumericValues& read_val,int threshold) { + int a=0; + int b=0; + readNumericValues::const_iterator it; + for (it=read_val.begin();it!=read_val.end();it++) { + short nb_occ=getEstimatedNbOcc(*it); + if (nb_occ<threshold) a+=1; + ++b; + } + return (2*a>b); +} + +int CountMinSketch::addRead(const readNumericValues& read_val) { + int keep_r=isRCovBelowThres(read_val,kappa); + if (keep_r) { + readNumericValues::const_iterator it; + for (it=read_val.begin();it!=read_val.end();it++) { + addKMer(*it); + } + } + return keep_r; +} + + +int CountMinSketch::isBeneathMinKappa(const readNumericValues& read_val) { + return(isRCovBelowThres(read_val,kappa_prime)); +} + diff --git a/src/CountMinSketch.h.old b/src/CountMinSketch.h.old new file mode 100644 index 0000000000000000000000000000000000000000..e5fbcdc31c5bedc1d1e6dbd970aa20e042f84eb9 --- /dev/null +++ b/src/CountMinSketch.h.old @@ -0,0 +1,87 @@ +/* + * CountMinSketch.h + * + * Created on: Feb 10, 2016 + * Author: vlegrand + */ + +#ifndef COUNTMINSKETCH_H_ +#define COUNTMINSKETCH_H_ + +#include <vector> +#include <map> + +typedef std::vector<unsigned long> readNumericValues; // TODO move this definition to a common include file between ReadProcessor and CountMinSketch. + +class CountMinSketch { + static const unsigned int pi_j_max=2147483647; + static const unsigned long mask1=1; + static const unsigned long mask2=2095103; + static const unsigned long mask3=1023; + + static const unsigned short ushortmask=32767; + + + int lambda; + int kappa; + int kappa_prime; + + typedef std::map<int,short> internal_array; + std::vector<internal_array> cms_lambda_array; + + std::vector<int> pi_j_array; + + // void findNonMersPrime(); // fills pi_j_array with lambda non mersenne prime numbers. + // int hash64to32(unsigned long,int); + + int hash64to32(unsigned long w,int j) { // bit shift version of hash function to start. + unsigned long h_tmp; + unsigned long h=~w; + h+=w<<18; + h_tmp=h>>31; + h_tmp&=mask1; + h=h^h_tmp; + h*=j; + h_tmp=h>>11; + h_tmp&=mask2; + h ^=h_tmp; + h+=h<<6; + h_tmp=h>>22; + h_tmp&=mask3; + h^=h_tmp; + return (int) (h& 2147483647); + } + + + std::map<int,int> getIthArray(int); // utility method provided for testing only. + void addKMer(unsigned long); // inline? TODO: see later if it can help us gain time. + int isRCovBelowThres(const readNumericValues& read_val,int threshold) ; + + // for unit tests. + friend void test_CMS(int lambda,int kappa,int kappa_prime); + /*friend void test_findNonMersPrime(int lambda,int kappa,int kappa_prime); + friend void test_hash();*/ + +public: + + CountMinSketch(int glambda,int gkappa,int gkappa_prime) { + lambda=glambda; + kappa=gkappa; + kappa_prime=gkappa_prime; + cms_lambda_array.reserve(lambda); + pi_j_array.reserve(lambda); + // findNonMersPrime(); + } + + + + int getEstimatedNbOcc(unsigned long); + int addRead(const readNumericValues&); + int isBeneathMinKappa(const readNumericValues&); +}; + + + + + +#endif /* COUNTMINSKETCH_H_ */ diff --git a/src/CountMinSketch.hpp b/src/CountMinSketch.hpp new file mode 100644 index 0000000000000000000000000000000000000000..c913ffc99a40de2e11be8ffa99e17c54bb40ab87 --- /dev/null +++ b/src/CountMinSketch.hpp @@ -0,0 +1,269 @@ +/* + * CountMinSketch.h + * + * Created on: Feb 10, 2016 + * Author: vlegrand + */ + +#ifndef COUNTMINSKETCH_HPP_ +#define COUNTMINSKETCH_HPP_ + + +#include <stdlib.h> +#include <string.h> +#include "rock_commons.h" + +#define BAD_TYPE -1 + +// Store the non mersenne prime numbers for modulo hashing in this array. + const int Pi_js[500]={2147469629, 2147469637, 2147469659, 2147469679, 2147469703, 2147469781, 2147469817, 2147469823, 2147469829, 2147469881,\ + 2147469917, 2147469943, 2147469949, 2147469983, 2147470007, 2147470019, 2147470027, 2147470043, 2147470057, 2147470067,\ + 2147470081, 2147470111, 2147470123, 2147470139, 2147470147, 2147470177, 2147470183, 2147470211, 2147470229, 2147470249,\ + 2147470313, 2147470327, 2147470333, 2147470361, 2147470427, 2147470453, 2147470511, 2147470513, 2147470529, 2147470531,\ + 2147470553, 2147470579, 2147470597, 2147470603, 2147470627, 2147470643, 2147470673, 2147470679, 2147470723, 2147470727,\ + 2147470733, 2147470751, 2147470769, 2147470771,2147483059, 2147483069, 2147483077, 2147483123, 2147483137, 2147483171,\ + 2147473897, 2147473921, 2147473963, 2147474009, 2147474027, 2147474029, 2147474071, 2147474093, 2147474113, 2147474123,\ + 2147474149, 2147474159, 2147474201, 2147474213, 2147474239, 2147474279, 2147474359, 2147474383, 2147474393, 2147474477,\ + 2147474479, 2147474491, 2147474513, 2147474519, 2147474531, 2147474551, 2147474597, 2147474627, 2147474657, 2147474711,\ + 2147474717, 2147474789, 2147474803, 2147474807, 2147474809, 2147474831, 2147474837, 2147474843, 2147474851, 2147474881,\ + 2147474887, 2147474891, 2147474921, 2147474929, 2147474947, 2147474951, 2147474963, 2147475047, 2147475061, 2147475103,\ + 2147475107, 2147475149, 2147475179, 2147475181, 2147475193, 2147475203, 2147475221, 2147475229, 2147475233, 2147475251,\ + 2147475257, 2147475269, 2147475277, 2147475331, 2147475347, 2147475349, 2147475367, 2147475373, 2147475397, 2147475401,\ + 2147475413, 2147475439, 2147475481, 2147475487, 2147475497, 2147475503, 2147475509, 2147475521, 2147475541, 2147475553,\ + 2147475559, 2147475563, 2147475587, 2147475593, 2147475601, 2147475641, 2147475653, 2147475691, 2147475713, 2147475721,\ + 2147475739, 2147475787, 2147475791, 2147475797, 2147475829, 2147475851, 2147475859, 2147475871, 2147475899, 2147475929,\ + 2147475971, 2147475973, 2147475977, 2147475997, 2147476031, 2147476073, 2147476087, 2147476109, 2147476127, 2147476139,\ + 2147476141, 2147476169, 2147476183, 2147476211, 2147476249, 2147476291, 2147476321, 2147476327, 2147476367, 2147476381,\ + 2147476399, 2147476417, 2147476517, 2147476519, 2147476543, 2147476607, 2147476619, 2147476649, 2147476663, 2147476687,\ + 2147476693, 2147476699, 2147476739, 2147476741, 2147476763, 2147476769, 2147476777, 2147476789, 2147476819, 2147476823,\ + 2147476841, 2147476871, 2147476897, 2147476927, 2147476931, 2147476937, 2147476943, 2147476951, 2147476963, 2147476979,\ + 2147477021, 2147477029, 2147477063, 2147477093, 2147477107, 2147477113, 2147477159, 2147477191, 2147477201, 2147477203,\ + 2147477207, 2147477209, 2147477237, 2147477249, 2147477273, 2147477323, 2147477393, 2147477399, 2147477419, 2147477443,\ + 2147477467, 2147477473, 2147477503, 2147477513, 2147477531, 2147477533, 2147477599, 2147477627, 2147477681, 2147477687,\ + 2147477699, 2147477701, 2147477737, 2147477807, 2147477809, 2147477833, 2147477851, 2147477861, 2147477873, 2147477879,\ + 2147477881, 2147477933, 2147477953, 2147477989, 2147478013, 2147478017, 2147478049, 2147478079, 2147478083, 2147483179,\ + 2147478089, 2147478127, 2147478133, 2147478149, 2147478253, 2147478259, 2147478293, 2147478299, 2147478331, 2147478349,\ + 2147478373, 2147478461, 2147478481, 2147478491, 2147478497, 2147478503, 2147478517, 2147478521, 2147478563, 2147478569,\ + 2147478581, 2147478601, 2147478611, 2147478647, 2147478649, 2147478653, 2147478659, 2147478661, 2147478673, 2147478701,\ + 2147478703, 2147478719, 2147478721, 2147478727, 2147478731, 2147478733, 2147478763, 2147478791, 2147478821, 2147478859,\ + 2147478863, 2147478889, 2147478899, 2147478911, 2147478919, 2147478937, 2147478959, 2147478961, 2147478967, 2147478997,\ + 2147479013, 2147479031, 2147479057, 2147479063, 2147479079, 2147479091, 2147479097, 2147479121, 2147479129, 2147479133,\ + 2147479171, 2147479189, 2147479231, 2147479259, 2147479273, 2147479307, 2147479339, 2147479349, 2147479361, 2147479381,\ + 2147479403, 2147479421, 2147479447, 2147479489, 2147479507, 2147479513, 2147479517, 2147479531, 2147479547, 2147479549,\ + 2147479573, 2147479589, 2147479601, 2147479619, 2147479637, 2147479643, 2147479657, 2147479681, 2147479751, 2147479753,\ + 2147479757, 2147479781, 2147479787, 2147479819, 2147479823, 2147479879, 2147479891, 2147479897, 2147479907, 2147479937,\ + 2147479991, 2147480009, 2147480011, 2147480039, 2147480161, 2147480197, 2147480207, 2147480219, 2147480227, 2147480297,\ + 2147480299, 2147480311, 2147480327, 2147480369, 2147480429, 2147480437, 2147480459, 2147480471, 2147480507, 2147480519,\ + 2147480527, 2147480551, 2147480591, 2147480611, 2147480623, 2147480641, 2147480651, 2147480677, 2147480683, 2147480707,\ + 2147480723, 2147480743, 2147480747, 2147480791, 2147480837, 2147480843, 2147480849, 2147480893, 2147480897, 2147480899,\ + 2147480921, 2147480927, 2147480941, 2147480957, 2147480969, 2147480971, 2147480989, 2147481019, 2147481031, 2147481053,\ + 2147481071, 2147481139, 2147481143, 2147481151, 2147481173, 2147481179, 2147481199, 2147481209, 2147481247, 2147481263,\ + 2147481269, 2147481283, 2147481311, 2147481317, 2147481337, 2147481353, 2147481359, 2147481367, 2147481373, 2147481487,\ + 2147481491, 2147481499, 2147481509, 2147481529, 2147481563, 2147481571, 2147481629, 2147481673, 2147481793, 2147481797,\ + 2147481811, 2147481827, 2147481863, 2147481883, 2147481893, 2147481899, 2147481901, 2147481907, 2147481937, 2147481949,\ + 2147481967, 2147481997, 2147482021, 2147482063, 2147482081, 2147482091, 2147482093, 2147482121, 2147482223, 2147482231,\ + 2147482237, 2147482273, 2147482291, 2147482327, 2147482343, 2147482349, 2147482361, 2147482367, 2147482409, 2147482417,\ + 2147482481, 2147482501, 2147482507, 2147482577, 2147482583, 2147482591, 2147482621, 2147482661, 2147482663, 2147482681,\ + 2147482693, 2147482697, 2147482739, 2147482763, 2147482801, 2147482811, 2147482817, 2147482819, 2147482859, 2147482867,\ + 2147482873, 2147482877, 2147482921, 2147482937, 2147482943, 2147482949, 2147482951, 2147483029, 2147483033, 2147483053}; + + +typedef struct { + int lambda; + int kappa; + int kappa_prime; + int filter_size; // max amount of RAM wanted for the CMS. + int nucl_score_threshold; // minimum quality score below which nucleotides are considered in error. 0 by default. +} CMSparams; + +template <typename T> +struct get_mask { + static const int value=BAD_TYPE; +}; + +template<> +struct get_mask<unsigned char> { + static const unsigned char value=255; +}; + +template<> +struct get_mask<unsigned short> { + static const unsigned short value=65535; +}; + + +template<typename T> class CountMinSketch { +public: + int nucl_score_threshold; + +private: + static const unsigned long mask1=1; // used only for hash64to32bs + static const unsigned long mask2=2095103; + static const unsigned long mask3=1023; + + int lambda; + int kappa; + int kappa_prime; + + T ** cms_lambda_array; + T mask; + + inline int hash64to32(const unsigned long& w ,const int& j) { + return w % Pi_js[j]; + } + + // bit shift version of hash function to start. Keep it just in case. Not used because it is very slow. + /* int hash64to32bs(unsigned long w,int j) { + unsigned long h_tmp; + unsigned long h=~w; + h+=w<<18; + h_tmp=h>>31; + h_tmp&=mask1; + h=h^h_tmp; + h*=j; + h_tmp=h>>11; + h_tmp&=mask2; + h ^=h_tmp; + h+=h<<6; + h_tmp=h>>22; + h_tmp&=mask3; + h^=h_tmp; + return (int) (h& 2147483647); + }*/ + + + + inline int isRCovBelowThres(const readNumericValues& read_val, const int& threshold); + + void init(int glambda,int gkappa,int gkappa_prime,int gnucl_score_threshold); + + // for unit tests. + friend void test_CMS(int lambda,int kappa,int kappa_prime); + + +public: + + CountMinSketch(int glambda,int gkappa,int gkappa_prime,int gnucl_score_threshold=0) { + init(glambda,gkappa,gkappa_prime,gnucl_score_threshold); + } + + CountMinSketch(CMSparams parms) { + init(parms.lambda,parms.kappa,parms.kappa_prime,parms.nucl_score_threshold); + } + + ~CountMinSketch() { + int j; + for (j=0;j<lambda;j++) { + free(cms_lambda_array[j]); + } + free(cms_lambda_array); + } + + + inline void addKMer(const unsigned long& val1) { + int h,j; + T cnt; + j=lambda; + while(--j>=0) { + h=hash64to32(val1,j); + cnt=cms_lambda_array[j] [h]; + cnt++; + cms_lambda_array[j] [h]=(cnt & mask); + } + } + + // keep that just for unit testing purpose + T getEstimatedNbOcc(const unsigned long& val); + + int addRead(const readNumericValues& read_val); + + int isBeneathMinKappa(const readNumericValues&read_val); + + unsigned long getNbDistinctKMers(); +}; + + +template<typename T> inline int CountMinSketch<T>::isRCovBelowThres(const readNumericValues& read_val, const int& threshold) { + int a=0,b=0; + int j,h; + T min; + readNumericValues::const_iterator it; + for (it=read_val.begin();it!=read_val.end();++it) { + j=lambda; + min=mask; + while (--j>=0 && min>threshold) { + h=hash64to32(*it,j); + min=cms_lambda_array[j] [h]; + } + (min<threshold)? a+=1:a; + ++b; + } + return (2*a>b); +} + +template<typename T> void CountMinSketch<T>::init(int glambda,int gkappa,int gkappa_prime,int gnucl_score_threshold) { + lambda=glambda; + kappa=gkappa; + kappa_prime=gkappa_prime; + nucl_score_threshold=gnucl_score_threshold; + int j; + mask=get_mask<T>::value; + /*if (lambda<ubytemask) mask=ubytemask; // byte implementation + else mask=ushortmask; // short implementation*/ + cms_lambda_array= (T**) malloc(lambda*sizeof(T*)); + for (j=0;j<lambda;j++) { + cms_lambda_array[j]=(T *) malloc(sizeof(T)*INT_MAX); + memset(cms_lambda_array[j],0,INT_MAX); + } +} + +// keep that just for unit testing purpose +template<typename T> T CountMinSketch<T>::getEstimatedNbOcc(const unsigned long& val) { + int j,h; + T min=mask; + // unsigned char min=ubytemask; + j=lambda; + while(--j>=0 && min>kappa) { + h=hash64to32(val,j); + min=cms_lambda_array[j] [h]; + } + return min; +} + +template<typename T> int CountMinSketch<T>::addRead(const readNumericValues& read_val) { + int keep_r=isRCovBelowThres(read_val,kappa); + if (keep_r) { + readNumericValues::const_iterator it; + for (it=read_val.begin();it!=read_val.end();it++) { + this->addKMer(*it); + } + } + return keep_r; +} + +template<typename T> int CountMinSketch<T>::isBeneathMinKappa(const readNumericValues&read_val) { + int res=isRCovBelowThres(read_val,kappa_prime); + return res; +} + +/* + * Go through the arrays of the CMS and returns the number of distinct k_mers (biggest nbr of non-zero counters amongst all arrays) + */ + +template<typename T> unsigned long CountMinSketch<T>::getNbDistinctKMers() { + int j,h; + unsigned long min=INT_MAX; + + for (j=lambda-1;j>=0;--j) { + unsigned long nb_k_mers=0; + for (h=INT_MAX-1;h>=0;--h) { + //(min<threshold)? a+=1: NULL; + (cms_lambda_array[j] [h]>0)? nb_k_mers+=1: nb_k_mers; + } + (nb_k_mers<min) ?min=nb_k_mers: min; + } + return min; +} + + +#endif /* COUNTMINSKETCH_HPP_ */ diff --git a/src/Filter.hpp b/src/Filter.hpp new file mode 100644 index 0000000000000000000000000000000000000000..05e80df5c23f61eb794380f90b5cf66994a0deb6 --- /dev/null +++ b/src/Filter.hpp @@ -0,0 +1,175 @@ +/* + * filter.h + * + * Created on: Apr 26, 2016 + * Author: vlegrand + */ +#ifndef FILTER_HPP_ +#define FILTER_HPP_ +#include <errno.h> +#include <sys/time.h> +#include <sys/resource.h> + +#include "CountMinSketch.hpp" +#include "ReadProcessor.h" +#include "read_utils.h" + + +static const unsigned short ushortmask=65535; +static const unsigned char ubytemask=255; + +typedef CountMinSketch<unsigned char> ByteCountMinSketch; +typedef CountMinSketch<unsigned short> ShortCountMinSketch; + + + +// created this class to hide implementation detail (byte versys short) from main program. TODO: refactor it later if I have time. +class Filter { + ByteCountMinSketch * pByteCMS; + ShortCountMinSketch * pShortCMS; + +#// long CMS_size_in_Bytes; + long nb_bytes_before_fill_CMS,nb_bytes_after_fill_CMS; + + void getRSS(int before_fill=0); + + + template <typename T> void underlyingfillCMS(FqBaseBackend* map_id_backend[],int nb_be, int k, srp* io_sr, CountMinSketch<T>* cms); + template <typename T> void underlyinglowFilterCMS(FqBaseBackend* map_id_backend[],int nb_be,int k,srp* io_sr, CountMinSketch<T>* pcms); + +public: + + Filter(const CMSparams& parms) { + pByteCMS=NULL; + pShortCMS=NULL; + getRSS(); + if (parms.kappa<ubytemask) pByteCMS=new ByteCountMinSketch(parms); + else pShortCMS=new ShortCountMinSketch(parms); + //CMS_size_in_Bytes=0; + + nb_bytes_after_fill_CMS=0; + } + + void fillCMS(FqBaseBackend* map_id_backend[],int nb_be,int k,srp* io_sr); + void lowFilterCMS(FqBaseBackend* map_id_backend[],int nb_be,int k, srp* io_sr); + unsigned long getApproxNbDistinctKMers(); + unsigned long getSize(); +}; + +template <typename T> void Filter::underlyingfillCMS(FqBaseBackend* map_id_backend[],int nb_be, int k, srp* io_sr, CountMinSketch<T>* cms) { + int ret; + srp::reverse_iterator rit; + i_dim::iterator it_offs; + k_dim::iterator it_struct; + // const unsigned char masque=0x0F; // to retrieve 2nd file identifier. + ReadProcessor read_p(k); + readNumericValues nbrKmerDecompo; + + // 1rst step, open files before fetching reads in them + int i; + for (i=0;i<nb_be;i++){ + map_id_backend[i]->openInputFile(); + } + + for (rit=io_sr->rbegin(); rit!=io_sr->rend(); ++rit) { //process map in reverse order (by decreasing scores). + //cout << "score="<<rit->first<<endl; + // unsigned long score=rit->first; + for (it_offs=rit->second.begin();it_offs!=rit->second.end();it_offs++) { + unsigned long j=it_offs->first; + for (it_struct=it_offs->second.begin();it_struct!=it_offs->second.end();it_struct++) { + // read dna string corresponding to fast qrecord + DnaSeqStr a_seqs[2]; + init_DnaSeqStr(&a_seqs[0]); + init_DnaSeqStr(&a_seqs[1]); + getDNASeqstr(map_id_backend,*it_struct,j,a_seqs,cms->nucl_score_threshold); + // debug stuff only. + /*if (strstr(a_seqs->fq_record_buf,"@SRR1222430.37")!=NULL) { + printf("coucou"); + }*/ + // decompose dna string into k-mers and turn k_mers into numbers. + decomposeReadInKMerNums(read_p, nbrKmerDecompo,k,a_seqs); + // insert k-mers into CMS if overall read satisfies kappa condition. + ret=cms->addRead(nbrKmerDecompo); + if (!ret) { + // read is rejected, update rpos structure accordingly. + it_struct->fileid=0; // TODO Benchmark: Wouldn't rit->second.erase(++it_struct) be more efficient (reduce size of CMS in memory then 2nd parsing for kappa_prime filtering and third parsing for writing output files would be faster? + } + nbrKmerDecompo.clear(); + } + } + } + // last step, close fq files. TODO this step may be moved somewhere else for optimization. + for (i=0;i<nb_be;i++){ + map_id_backend[i]->closeInputFile(); + } +} + +template <typename T> void Filter::underlyinglowFilterCMS(FqBaseBackend* map_id_backend[],int nb_be,int k,srp* io_sr, CountMinSketch<T>* pcms) { // TODO refactor get k from CMS. + srp::iterator it; + i_dim::iterator it_offs; + k_dim::iterator it_struct; + int ret; + ReadProcessor read_p(k); + readNumericValues nbrKmerDecompo; + + // 1rst step, open files before fetching reads in them + int i; + for (i=0;i<nb_be;i++){ + map_id_backend[i]->openInputFile(); + } + + for (it=io_sr->begin(); it!=io_sr->end(); ++it) { + for (it_offs=it->second.begin();it_offs!=it->second.end();it_offs++) { + unsigned int j=it_offs->first; + for (it_struct=it_offs->second.begin();it_struct!=it_offs->second.end();it_struct++) { + // read dna string corresponding to fastq record + if (it_struct->fileid) { + DnaSeqStr a_seqs[2]; + init_DnaSeqStr(&a_seqs[0]); + init_DnaSeqStr(&a_seqs[1]); + getDNASeqstr(map_id_backend,*it_struct,j,a_seqs); + // decompose dna string into k-mers and turn k_mers into numbers. + decomposeReadInKMerNums(read_p, nbrKmerDecompo,k,a_seqs); + ret=pcms->isBeneathMinKappa(nbrKmerDecompo); + if (ret) it_struct->fileid=0; + nbrKmerDecompo.clear(); + } + } + } + } + for (i=0;i<nb_be;i++){ + map_id_backend[i]->closeInputFile(); + } +} + +void Filter::fillCMS(FqBaseBackend* map_id_backend[],int nb_be,int k,srp* io_sr) { + if (pByteCMS!=NULL) underlyingfillCMS<unsigned char>(map_id_backend,nb_be, k, io_sr,pByteCMS); + else underlyingfillCMS<unsigned short>(map_id_backend,nb_be, k, io_sr,pShortCMS); + getRSS(1); +} + +void Filter::lowFilterCMS(FqBaseBackend* map_id_backend[],int nb_be,int k, srp* io_sr) { + if (pByteCMS!=NULL) underlyinglowFilterCMS<unsigned char>(map_id_backend,nb_be, k, io_sr,pByteCMS); + else underlyinglowFilterCMS<unsigned short>(map_id_backend,nb_be, k, io_sr,pShortCMS); +} + +unsigned long Filter::getApproxNbDistinctKMers() { + if (pByteCMS!=NULL) return pByteCMS->getNbDistinctKMers(); + else return pShortCMS->getNbDistinctKMers(); +} + +void Filter::getRSS(int before_fill) { + struct rusage usage; + int res2=getrusage(RUSAGE_SELF,&usage); + if (res2==-1) err(errno,"cannot get resource usage."); + if (before_fill==0) nb_bytes_before_fill_CMS=usage.ru_maxrss; + else nb_bytes_after_fill_CMS=usage.ru_maxrss; +} + +unsigned long Filter::getSize() { + unsigned long cms_size=nb_bytes_after_fill_CMS-nb_bytes_before_fill_CMS; + // cms_size/=8; ru_maxrss seems to be already in Bytes. + return cms_size; +} + +#endif diff --git a/src/FqAuxBackend.cpp b/src/FqAuxBackend.cpp index ef58d76807b11edfefc8444d77a6a54e64538623..c237db37941aca75d728d6bcaa282ee2e95048f7 100644 --- a/src/FqAuxBackend.cpp +++ b/src/FqAuxBackend.cpp @@ -20,7 +20,7 @@ #include <iostream> using namespace std; -//#define DEBUG +// #define DEBUG int FqAuxBackend::getNextRead(rinfo * p_nr) { rinfo nr; @@ -95,7 +95,7 @@ int FqAuxBackend::processBuffer(rinfo * p_nr) { #ifdef DEBUG store_id: { if (is_id==1) { - if (*pchar!='\/') current_id[idx_id]=*pchar; + if (*pchar!='/' and *pchar!=' ') current_id[idx_id]=*pchar; else current_id[idx_id]='\0'; idx_id+=1; } @@ -126,8 +126,9 @@ int FqAuxBackend::processBuffer(rinfo * p_nr) { void FqAuxBackend::readBuffer() { - if ((nread=read(f_desc,buf,bufsize))!=0) { - cur_offset=ftell(f_stream); + if ((nread=read(i_f_desc,buf,bufsize))!=0) { + cur_offset=lseek(i_f_desc,0,SEEK_CUR); +//ftell(f_stream); pos_in_buf=0; } } @@ -135,7 +136,7 @@ void FqAuxBackend::readBuffer() { * Opens file and performs the fist read operation. */ void FqAuxBackend::openFile(char * ficname, unsigned char id) { - FqBaseBackend::openFile(ficname,id); + FqBaseBackend::openInputFile(ficname,id); buf=(char *) malloc(bufsize*sizeof(char)); if (buf==NULL) { err(errno,"cannot allocate memory: %lu bytes.",bufsize); @@ -144,8 +145,8 @@ void FqAuxBackend::openFile(char * ficname, unsigned char id) { } void FqAuxBackend::closeFile() { - if (filename!=NULL) { - FqBaseBackend::closeFile(); + if (i_f_desc!=-1) { + FqBaseBackend::closeInputFile(); free(buf); buf=NULL; } diff --git a/src/FqAuxBackend.h b/src/FqAuxBackend.h index b52805d9e7b7ef02108106fb4b792cdb06e5b7c0..13178722c2a069e4c376dca2f4cfb44d184eb0f7 100644 --- a/src/FqAuxBackend.h +++ b/src/FqAuxBackend.h @@ -11,7 +11,7 @@ #include "srp.h" #include "FqBaseBackend.h" -const size_t bufsize=6048000; // It seems that I can't have a much bigger value than that or else my objects construction throws exception. Strange. + // const size_t bufsize=500000; //try that in valgrind class FqAuxBackend:public FqBaseBackend { diff --git a/src/FqBaseBackend.cpp b/src/FqBaseBackend.cpp index 73d4ba9c6482c474b411fb90e24a21399627f112..473b0bc2ddc4866ecc67f826ad12854a130872b7 100644 --- a/src/FqBaseBackend.cpp +++ b/src/FqBaseBackend.cpp @@ -4,72 +4,170 @@ * Created on: Jan 20, 2016 * Author: vlegrand */ -//#include <stdio.h> +#include <string> +#include <stdexcept> #include <fcntl.h> //#include <stdlib.h> #include <unistd.h> #include <errno.h> #include <err.h> + + #include "FqBaseBackend.h" +//#define DEBUG +#ifdef DEBUG +#include <cassert> +#include <iostream> +using namespace std; +#endif + -void FqBaseBackend::openFile(char * ficname, unsigned char id) { +void FqBaseBackend::openInputFile(char * ficname, unsigned char id) { int st,s; // unsigned long cur_offset; mode_t mode = S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH; - closeFile(); - filename=ficname; + closeInputFile(); + i_filename=ficname; f_id=id; - f_desc=open(filename,O_RDONLY,mode); - if (f_desc==-1) { - err(errno,"cannot open file: %s.",filename); + i_f_desc=open(i_filename,O_RDONLY,mode); + if (i_f_desc==-1) { + err(errno,"cannot open file: %s.",i_filename); } - f_stream=fdopen(f_desc,"r"); + f_stream=fdopen(i_f_desc,"r"); if (f_stream==NULL) { - err(errno,"cannot open file: %s.",filename); + err(errno,"cannot open file: %s.",i_filename); } } -void FqBaseBackend::closeFile() { - if (filename!=NULL) { - close(f_desc); - filename=NULL; - f_desc=-1; +void FqBaseBackend::closeInputFile() { + if (i_f_desc!=-1) { + close(i_f_desc); + // filename=NULL; + i_f_desc=-1; f_stream=NULL; - /*free(buf); - buf=NULL;*/ } } +void FqBaseBackend::setOutputFile(char * ofilename) { + o_filename=ofilename; +} + + +void FqBaseBackend::openInputFile() { + if (i_filename==NULL || f_id==0) throw std::runtime_error("No file currently associated to this backend"); + if (i_f_desc!=-1) { + std::string err_msg("file: "); + err_msg+=i_filename; + err_msg+=" is already open!"; + throw std::runtime_error(err_msg); + } + openInputFile(i_filename,f_id); +} + + +void FqBaseBackend::openOutputFile(){ + mode_t mode = S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH; + if (o_filename==NULL) throw std::runtime_error("No output file currently associated to this backend"); + if (o_f_desc!=-1) { + std::string err_msg("file: "); + err_msg+=o_filename; + err_msg+=" is already open!"; + throw std::runtime_error(err_msg); + } + o_f_desc=open(o_filename,O_WRONLY|O_CREAT|O_TRUNC,mode); + if (o_f_desc==-1) { + err(errno,"cannot open file: %s.",o_filename); + } +} + + +void FqBaseBackend::closeOutputFile() { + if (pos_in_w_buf!=o_buf) { // check if some fq records remains in buffer before closing file, if so, flush. + if (write(o_f_desc, o_buf, pos_in_w_buf-o_buf)==-1) err(errno,"cannot write in file: %s.",o_filename); + pos_in_w_buf=o_buf; + } + if (o_f_desc!=-1) { + if (close(o_f_desc)==-1) err(errno,"cannot close file: %s.",o_filename); + o_f_desc=-1; + } +} + +void FqBaseBackend::writeToOutput(const unsigned long& offset) { + // static char * pos_in_w_buf; + if (o_buf==NULL) { + o_buf=(char *) malloc(FqBaseBackend::bufsize*sizeof(char)); + pos_in_w_buf=o_buf; + } + if (o_buf+FqBaseBackend::bufsize-pos_in_w_buf<MAX_FQ_RECORD_LENGTH) { + // may not have enough place in buffer to store another fq record. + // => flush buffer into output file. + if (write(o_f_desc, o_buf, pos_in_w_buf-o_buf)==-1) err(errno,"cannot write in file: %s.",o_filename); + pos_in_w_buf=o_buf; + } + + int fq_rec_size=getRead(offset,pos_in_w_buf); // here aim is to avoid multiple strcpy into buffer: read directly at the right place in the buffer. +#ifdef DEBUG + cout<<"read "<<fq_rec_size<<" char from offset: "<<offset<<" and stored them in o_buf at position: "<<pos_in_w_buf<<endl; + cout<<"last char read: "<<*(pos_in_w_buf+fq_rec_size-1)<<endl; + assert(*(pos_in_w_buf+fq_rec_size-1)=='\n'); + assert(*(pos_in_w_buf)=='@'); +#endif + pos_in_w_buf+=fq_rec_size; + *pos_in_w_buf='\0'; +#ifdef DEBUG + cout<<o_buf; +#endif +} + /* * use C style char* rather than std::string for a matter of performance. * use read instead of fread for the same reason. * * Note: It is up to the caller to determine where the end of the record is (5th \n character in the fq_record string). */ -void FqBaseBackend::getRead(const long& offset, char * fq_record) { +int FqBaseBackend::getRead(const unsigned long& offset, char * fq_record) { - - //char fq_record[MAX_FQ_RECORD_LENGTH]; - - int nread,i,nb_lines; + int nread,i,nb_lines,fq_rec_size; char * pchar; - if (f_desc==-1) err(0,"No open file currently associated to this backend"); - // if (f_stream==NULL) err(0,"No open file currently associated to this backend"); - - int res=lseek(f_desc,offset,SEEK_SET); - if (res==-1) err(errno,"fseek problem when trying to retrieve dna string."); - nread=read(f_desc,fq_record,MAX_FQ_RECORD_LENGTH); -/* - i=0; + if (i_f_desc==-1) throw std::runtime_error("No open file currently associated to this backend"); +#ifdef DEBUG + cout<<"going to read record at offset: "<<offset<<endl; +#endif + int res=lseek(i_f_desc,offset,SEEK_SET); + if (res==-1) { +#ifdef DEBUG + cout<<"Couldn't get read at offset: "<<offset<<endl; +#endif + throw errno; + } + nread=read(i_f_desc,fq_record,MAX_FQ_RECORD_LENGTH); +#ifdef DEBUG + assert(nread<=MAX_FQ_RECORD_LENGTH); + assert(*(fq_record)=='@'); + cout<<"read: "<<nread<<" char from input file"<<endl; +#endif + nb_lines=0; + i=1; pchar=fq_record; - while (i<=nread-1) { - - }*/ - + while(i<=nread) { + if (*pchar=='\n') { + nb_lines++; + if (nb_lines==4) { + fq_rec_size=i; + break; + } + } + pchar++; + i++; + } +#ifdef DEBUG + cout<<"found that fq record size is : "<<fq_rec_size<<endl; +#endif + return fq_rec_size; } diff --git a/src/FqBaseBackend.h b/src/FqBaseBackend.h index a5bec4a0c9fd75da9b67ea691f7f788debde969d..29f7b2ab4336508347fa1f349e07712fde7be5c9 100644 --- a/src/FqBaseBackend.h +++ b/src/FqBaseBackend.h @@ -8,6 +8,9 @@ #ifndef FQBASEBACKEND_H_ #define FQBASEBACKEND_H_ +#include <cstdio> +#include <stdlib.h> + #include "FqConstants.h" #include "srp.h" @@ -17,25 +20,53 @@ class FqBaseBackend { // TODO: this class will also be associated to writing output "filtered" fastq files. + // TODO Refactor FqBackends. Surely there is no need to keep f_stream member since I don't call ftell anymore + should even be able to get rid of lseek system call to get position in file. + // I could compute offset myself since I am reading caracters from a text file... protected: - char * filename; + static const size_t bufsize=6048000; // It seems that I can't have a much bigger value than that or else my objects construction throws exception. Strange. + char * i_filename; unsigned char f_id; - int f_desc; // for calling read + int i_f_desc; // for calling read FILE * f_stream; // for calling ftell + // for writing output (filtered reads) + char * o_filename; + int o_f_desc; + char * o_buf; + char * pos_in_w_buf; + + friend void test_processInputFiles(); + friend void test_write_PE(); + public: FqBaseBackend() { - filename=NULL; - f_desc=-1; + i_filename=NULL; + i_f_desc=-1; f_stream=NULL; f_id=0; + o_f_desc=-1; + o_filename=NULL; + o_buf=NULL; + pos_in_w_buf=NULL; + } + + ~FqBaseBackend() { + if (o_buf!=NULL) { + free(o_buf); + o_buf=NULL; + } } - void openFile(char * ficname, unsigned char id); - void closeFile(); - void getRead(const long&,char *); + void openInputFile(char * ficname, unsigned char id); + void openInputFile(); + void closeInputFile(); + int getRead(const unsigned long&,char *); + void setOutputFile(char * ofilename); + void openOutputFile(); + void writeToOutput(const unsigned long&); + void closeOutputFile(); }; diff --git a/src/FqMainBackend.cpp b/src/FqMainBackend.cpp index d79f22de1064695cf50dd021abf238f49cf0e8f6..af0faa9175bfa970e7d1b8389e577cfd350d231d 100644 --- a/src/FqMainBackend.cpp +++ b/src/FqMainBackend.cpp @@ -14,10 +14,17 @@ #include <limits.h> #include <assert.h> #include <stdlib.h> +#include <stdexcept> +#include <sstream> +#include <string> +#include <iostream> + +#define _FILE_OFFSET_BITS 64 // for portability reasons on 32bits systems. //#define DEBUG #ifdef DEBUG +#include <string.h> #include <iostream> using namespace std; #endif @@ -26,7 +33,7 @@ using namespace std; #include "srp.h" -FqMainBackend::FqMainBackend(srp * io_sr) { +FqMainBackend::FqMainBackend(srp * io_sr):FqBaseBackend() { p_auxFqProcessor=NULL; p_scoreReadStruct=io_sr; } @@ -37,10 +44,12 @@ void FqMainBackend::processFile(char * filename,unsigned char f_id) { int st,s; int cnt,nread; long cur_offset; - int nb_read_oper=0; + // int nb_read_oper=0; mode_t mode = S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH; //cout<<"INT_MAX="<<INT_MAX<<endl; // cout<<filename<<endl; + this->i_filename=filename; + this->f_id=f_id; int f_single=open(filename,O_RDONLY,mode); if (f_single==-1) { @@ -51,17 +60,20 @@ void FqMainBackend::processFile(char * filename,unsigned char f_id) { if (fp==NULL) { err(errno,"cannot open file: %s.",filename); } - cur_offset=ftell(fp); - char* buf=(char *) malloc(bufsize*sizeof(char)); + //cur_offset=ftell(fp); + //cout<<"cur_offset="<<cur_offset<<endl; + char* buf=(char *) malloc(FqBaseBackend::bufsize*sizeof(char)); if (buf==NULL) { err(errno,"cannot allocate memory: %lu bytes.",bufsize); } /* for each read, we want : offset, filenb, total quality score. */ - while ((nread=read(f_single,buf,bufsize))!=0) { - nb_read_oper++; + while ((nread=read(f_single,buf,FqBaseBackend::bufsize))!=0) { + // nb_read_oper++; //cout<<"BE1: read operation nbr: "<<nb_read_oper<<endl; //printf("going to call ftell on : %p \n",fp); - cur_offset=ftell(fp); + //cur_offset=ftell(fp); // don't know why but it seems that this doesn't work under linux; always returns 0... + cur_offset = lseek(f_single, 0, SEEK_CUR); + //cout<<"cur_offset="<<cur_offset<<endl; processBuf((char *)buf,nread,f_id,cur_offset); } close(f_single); @@ -69,16 +81,18 @@ void FqMainBackend::processFile(char * filename,unsigned char f_id) { } - -void FqMainBackend::processBuf(char * buf,int nread,unsigned char f_id,unsigned long cur_offset) { - int cnt=0; - unsigned int s; - static unsigned int st; - static int num_l_in_rec; /* counter to know on which line inside the fastq record we are */ - static int qual_score=0; - static unsigned long rstart_offset; #ifdef DEBUG - static int nb_start; + +#define init_debug 1 +#define on_record_new 2 +#define on_record_end 3 +#define on_buf_end 4 +#define on_line_end 5 +#define on_store_read_id 6 +#define on_record_end_pe 7 + +void FqMainBackend::debug_processBuf(int evt,int& cnt,int& nread,char * pchar,unsigned long& rstart_offset) { // put all the bebug stuff in a separate method so that processBuf is shorter and more "lisible". + static int nb_start; // number of fq record start and stop found in current buffer. static int nb_stop; static int tot_nb_stop=0; static int tot_nb_start=0; @@ -88,6 +102,58 @@ void FqMainBackend::processBuf(char * buf,int nread,unsigned char f_id,unsigned nb_start=0; nb_stop=0; } + + switch(evt) { + case on_record_new: + cout<<"nread="<<nread<<endl; + cout<<"cnt="<<cnt<<endl; + cout<<"rstart_offset="<<rstart_offset<<endl; + is_id=1; + idx_id=0; + nb_start++; + break; + case on_line_end: + is_id=0; + break; + case on_record_end: + nb_stop++; + cout<<"just finished processing : "<<endl; + cout<<current_id<<endl; + cout<<"j obttained: rstart_offset/UINT_MAX="<<rstart_offset/UINT_MAX<<endl; + break; + case on_store_read_id: + if (is_id==1) { + if (*pchar!='/' and *pchar !=' ') current_id[idx_id]=*pchar; + else current_id[idx_id]='\0'; + idx_id+=1; + } + break; + case on_buf_end: + std::cout<<" Main BE position in buffer "<<cnt<<endl; + assert(cnt==nread); + cout<<"Main BE Found "<<nb_start<<" and "<<nb_stop<<" start/stop of record in this buffer."<<endl; + tot_nb_start+=nb_start; + tot_nb_stop+=nb_stop; + cout<<"Main BE Total cumulated number of start: "<<tot_nb_start<<" Total number of stop: "<<tot_nb_stop<<endl; + break; + case on_record_end_pe: + cout<<p_auxFqProcessor->current_id<<endl; + assert(strcmp(current_id,p_auxFqProcessor->current_id)==0); + break; + } + } +#endif + + +void FqMainBackend::processBuf(char * buf,int nread,unsigned char f_id,unsigned long cur_offset) { + int cnt=0; + unsigned int s; + static unsigned int st; + static int num_l_in_rec; /* counter to know on which line inside the fastq record we are */ + static int qual_score=0; + static unsigned long rstart_offset; +#ifdef DEBUG + debug_processBuf(init_debug,cnt,nread,NULL,rstart_offset); #endif char * pchar=buf; while (cnt<=nread-1) { @@ -95,60 +161,46 @@ void FqMainBackend::processBuf(char * buf,int nread,unsigned char f_id,unsigned case k_read_id_start: if (qual_score) goto inc_score; else { + rstart_offset=cur_offset-nread+cnt; #ifdef DEBUG - is_id=1; - idx_id=0; - nb_start++; + debug_processBuf(on_record_new,cnt,nread,NULL,rstart_offset); #endif - rstart_offset=cur_offset-nread+cnt; - num_l_in_rec=1; } - break; + num_l_in_rec=1; } + break; case k_read_qual_start: qual_score=1; st=0; break; case '\n': #ifdef DEBUG - is_id=0; + debug_processBuf(on_line_end,cnt,nread,pchar,rstart_offset); #endif num_l_in_rec+=1; if (num_l_in_rec==5) { - // std::cout<<"BE1 position in buffer "<<cnt+1<<endl; //add 1 to compare value with BE2. qual_score=0;/* end of fastq record */ rpos rp=init_rpos(f_id,rstart_offset); if (p_auxFqProcessor!=NULL) { rinfo pe2info; int eof=p_auxFqProcessor->getNextRead(&pe2info); assert(!eof); // There should be the same number of reads in both PE files. - - // if (eof) cout<<"warning, reached eof in file PE2 after: "<<nb_read_PE1<<" reads in PE1"<<endl; st+=pe2info.score; - unsigned long j=rstart_offset/INT_MAX; + unsigned long j=rstart_offset/UINT_MAX; update_rpos(pe2info.f_id,pe2info.rstart_offset,j,&rp); #ifdef DEBUG - nb_stop++; - cout<<"Just processed : "<<endl; - if (strcmp(current_id,"NS500443:42:H3MH2AFXX:1:11208:4026:13258")==0) { - cout<<"put breakpoint here"<<endl; - } - cout<<current_id<<endl; - cout<<p_auxFqProcessor->current_id<<endl; - assert(strcmp(current_id,p_auxFqProcessor->current_id)==0); + debug_processBuf(on_record_end_pe,cnt,nread,pchar,rstart_offset); #endif } i_dim& ref_i_dim=(*p_scoreReadStruct)[st/K_SCORE_NORM_FACTOR]; - k_dim& ref_k_dim=ref_i_dim[rstart_offset/INT_MAX]; - ref_k_dim.push_back(rp); } + k_dim& ref_k_dim=ref_i_dim[rstart_offset/UINT_MAX]; +#ifdef DEBUG + debug_processBuf(on_record_end,cnt,nread,pchar,rstart_offset); +#endif + ref_k_dim.push_back(rp); + } break; default: #ifdef DEBUG - store_id: - { if (is_id==1) { - if (*pchar!='\/') current_id[idx_id]=*pchar; - else current_id[idx_id]='\0'; - idx_id+=1; - } - } + store_id: debug_processBuf(on_store_read_id,cnt,nread,pchar,rstart_offset); #endif inc_score: { if (qual_score==1) { @@ -163,13 +215,7 @@ void FqMainBackend::processBuf(char * buf,int nread,unsigned char f_id,unsigned cnt++; } #ifdef DEBUG - std::cout<<"BE1 position in buffer "<<cnt<<endl; - if (cnt==nread) { - cout<<"BE1 Found "<<nb_start<<" and "<<nb_stop<<" start/stop of record in this buffer."<<endl; - tot_nb_start+=nb_start; - tot_nb_stop+=nb_stop; - cout<<"BE1 Total cumulated number of start: "<<tot_nb_start<<" Total number of stop: "<<tot_nb_stop<<endl; - } + debug_processBuf(on_buf_end,cnt,nread,pchar,rstart_offset); #endif } diff --git a/src/FqMainBackend.h b/src/FqMainBackend.h index c8b2b1aab351014426b87c256cf3119207987693..db7414e12ff2262db36f3d37bef0152cd4cd3cbe 100644 --- a/src/FqMainBackend.h +++ b/src/FqMainBackend.h @@ -21,6 +21,9 @@ class FqMainBackend : public FqBaseBackend { srp * p_scoreReadStruct; /* Where we store information about the reads. */ char current_id[50]; + void debug_processBuf(int evt,int& cnt,int& nread,char * pchar, unsigned long &); + + friend void test_processInputFiles(); public: FqMainBackend(srp*); diff --git a/src/ROCKparams.cpp b/src/ROCKparams.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a863b2ee2481f73ee0ac5396913f9fac0e08b092 --- /dev/null +++ b/src/ROCKparams.cpp @@ -0,0 +1,287 @@ +/* + * ROCKparams.cpp + * + * Created on: Jul 25, 2016 + * Author: vlegrand + */ +#include <iostream> +#include <fstream> +#include <string> +#include <vector> +#include <cmath> +#include "ROCKparams.h" +using namespace std; + + +void ROCKparams::computeLambda() { + unsigned long tmp=parms.filter_size; + tmp*=1073741824; // this is in fact 1024*1024*1024. + parms.lambda=tmp/INT_MAX; + if (parms.kappa>get_mask<unsigned char>::value) parms.lambda=parms.lambda/sizeof(unsigned short); +} + + +void ROCKparams::processMainArgs(int optind, const int argc, char ** argv,std::vector<string>& v_input_lines) { + if (optind==argc) return; // no arguments + while (optind<argc) { // otherwise, only arguments expected are names of files to filter. + string iline=argv[optind]; + v_input_lines.push_back(iline); + optind++; + } + } + + +void ROCKparams::optArgConsistency(const string& input_file,const string& output_file, + const int& g,CMSparams& parms,const int& nb_k_mers, + const std::vector<string>& v_input_lines) { + if (input_file.empty() && v_input_lines.empty()) { + std::cout<<"You must provide filename via -i or arguments to indicate ROCK what to filter." <<std::endl; + usage(EXIT_FAILURE); + } + if (!input_file.empty() && !v_input_lines.empty()) { + std::cout<<"files to filter are provided via the -i option or as arguments. It cannot be both." <<std::endl; + usage(EXIT_FAILURE); + } + if (g!=0 and parms.lambda!=0) {// user set both lambda and cms size=> inconsistency. + cout<<"-l and -g options are mutually exclusive."<<endl; + usage(EXIT_FAILURE); + } + if (nb_k_mers!=0 and parms.lambda!=0) {// user set both lambda and number of k-mers=> inconsistency. + cout<<"-l and -n options are mutually exclusive."<<endl; + usage(EXIT_FAILURE); + } + float avail_mem=getNodePhysMemory()-defaultGRPMAXSize; + // cout<<avail_mem; + if (g>avail_mem) { + std::cout<<"This machine only has "<<getNodePhysMemory()<<" Gigabytes of RAM. This is not enough to run ROCK with a CMS of size: "<<g<<endl; + exit(EXIT_FAILURE); + } else if (g!=0) { + parms.filter_size=g; + computeLambda(); + } + if (nb_k_mers) { // user indicated number of k-mers; use it to minimize parms.lambda + parms.lambda=getBestLambdaForN(nb_k_mers,parms.lambda); + } + if (parms.kappa_prime>=parms.kappa) { + cout<<"ERROR lower filter is higher than high filter!"<<endl; + exit(EXIT_FAILURE); + } + + if (parms.lambda==0) { + // This happens when the user doesn't specify lambda nor g nor nb_k_mers. + computeLambda(); + parms.lambda=min(parms.lambda,proposedLambda); + if (parms.lambda==0) { + cout<<"Not enough RAM on the machine. TotalRAM -1GB must be bigger than INT_MAX to allow at least 1 array in the CMS."<<endl; + exit(EXIT_FAILURE); + } + } +#ifdef DEBUG + cout<<"parms.lambda="<<parms.lambda<<" INT_MAX="<<INT_MAX<<endl; +#endif + unsigned long cms_size=INT_MAX; + cms_size*=parms.lambda; + //cout<<"minimum cms size:"<<cms_size<<endl; + if (parms.kappa>get_mask<unsigned char>::value) cms_size=cms_size*sizeof(unsigned short); + //cout<<"cms_size in B="<<cms_size<<endl; + cms_size=ceil(cms_size/1024.0/1024/1024); // convert to gigabytes. + //cout<<"cms_size in GB="<<cms_size<<endl; + if (cms_size>getNodePhysMemory()-defaultGRPMAXSize) { + cout<<"Not enough RAM on the machine to run rock with a CMS of size:"<<cms_size<<" GB."<<endl; + exit(EXIT_FAILURE); + } +} + + +/* + * Loads the content of a text file (containing input fastq file names to be filtered or names of files that ROCK must generate). + */ +int ROCKparams::loadFileArgs(const std::string& afile,std::vector<string>& v_lines) { + ifstream infiles_names(afile.c_str()); + if (!infiles_names) cout<<"couldn't open file: "<<afile<<endl; + while (infiles_names) { + string iline; + if (!getline(infiles_names,iline)) break; + v_lines.push_back(iline); + } + if (!infiles_names.eof()) + { + std::cout<<"error while reading input or output file"<<std::endl; + return EXIT_FAILURE; + } + return EXIT_SUCCESS; +} + +/* + * Reads the names of input and output fastq files that ROCK must process from text files whose names are passed as + * argument of the -i/-o options. Fills the appropriate structures. + * orresponding output files are supposed to be in the same order as input files. No checking here. It is ut to the user to give something correct in input. + */ +int ROCKparams::loadInOutFileArgs(const std::string& input_file,const std::string& output_file,std::vector<string>& v_input_lines,std::vector<string>& v_output_lines) { + int reti=EXIT_SUCCESS; + int reto=EXIT_SUCCESS; + if (!input_file.empty()) { + reti=loadFileArgs(input_file,v_input_lines); + } + if (!output_file.empty()) { + reto=loadFileArgs(output_file,v_output_lines); + } + if (reti==EXIT_SUCCESS && reto==EXIT_SUCCESS) return EXIT_SUCCESS; + else return EXIT_FAILURE; +} + +void ROCKparams::removePathfromFName(string& FName) { + std::size_t i_found2 =FName.find_last_of(path_sep); // remove path from filename. + if (i_found2!=std::string::npos) { + FName=FName.substr(i_found2+1); + } +} + +void ROCKparams::changeExtension(string& FName) { // changes .fq into .rock.fq or adds .rock.fq + std::size_t o_found = FName.find_last_of(k_ext); + if (o_found!=std::string::npos) FName.replace(o_found,1,".rock."); + else FName.append(".rock.fq"); +} + +void ROCKparams::genOutFilenames(const std::vector<string>& v_input_lines,std::vector<string>& v_output_lines) { + std::vector<string>::const_iterator it_in; + string o_line; + for (it_in=v_input_lines.begin();it_in!=v_input_lines.end();++it_in) { + std::size_t i_found = (*it_in).find_first_of(k_sep_pair_end); + if (i_found!=std::string::npos) {// PE files are separared by a ',' + string i_name_PE1=(*it_in).substr(0,i_found); + removePathfromFName(i_name_PE1); + string i_name_PE2=(*it_in).substr(i_found+1); + removePathfromFName(i_name_PE2); + changeExtension(i_name_PE1); + changeExtension(i_name_PE2); + o_line=i_name_PE1; + o_line+=k_sep_pair_end; + o_line.append(i_name_PE2); + + } else { + o_line=*it_in; + removePathfromFName(o_line); + changeExtension(o_line); + } + v_output_lines.push_back(o_line); + } +} + +int ROCKparams::processInOutFileArgs(const std::vector<string>& v_input_lines,std::vector<string>& v_output_lines,std::vector<IO_fq_files>& single_files,vector<PE_files>& v_PE_files,int& f_id) { + if (v_output_lines.empty()) { + // in that case, generate output filenames from input filenames + genOutFilenames(v_input_lines,v_output_lines); + } + if (v_input_lines.size()!=v_output_lines.size()) { + cout<< "Inconsistency between input and output files lists!"<<endl; + return EXIT_FAILURE; + } else { + std::vector<string>::const_iterator it_in; + std::vector<string>::const_iterator it_out; + it_in=v_input_lines.begin(); + it_out=v_output_lines.begin(); + while (it_in!=v_input_lines.end()) { + std::size_t i_found = (*it_in).find_first_of(k_sep_pair_end); + if (i_found!=std::string::npos) { + // this is PE + f_id+=2; + string i_name_PE1=(*it_in).substr(0,i_found); + string i_name_PE2=(*it_in).substr(i_found+1); + std::size_t o_found = (*it_out).find_first_of(k_sep_pair_end); + if (o_found==std::string::npos) { + cout<< "Inconsistency between input and output files lists!"<<endl; + return EXIT_FAILURE; + } + string o_name_PE1=(*it_out).substr(0,o_found); + checkDirExists(o_name_PE1); + string o_name_PE2=(*it_out).substr(o_found+1); + //cout<<o_name_PE2<<endl; + checkDirExists(o_name_PE2); + PE_files pe; + pe.PE1.in_fq_file=i_name_PE1; + pe.PE1.out_fq_file=o_name_PE1; + pe.PE2.in_fq_file=i_name_PE2; + pe.PE2.out_fq_file=o_name_PE2; + v_PE_files.push_back(pe); + + } else { + // this is single. + f_id+=1; + IO_fq_files p; + p.in_fq_file=*it_in; + checkDirExists(*it_out); + p.out_fq_file=*it_out; + single_files.push_back(p); + } + ++it_in; + ++it_out; + } + if (f_id>k_max_input_files) { + cout<<"ROCK cannot handle more than "<<k_max_input_files<<" input files."<<endl; + return EXIT_FAILURE; + } + } + return EXIT_SUCCESS; +} + +void ROCKparams::initFromMainOptsArgs(int argc,char ** argv) { + int i; + std::vector<string> v_input_lines; + std::vector<string> v_output_lines; + + while((i = getopt(argc, argv, "i:o:l:k:c:C:g:n:vq:")) != -1) { + //printf("found option: %c\n",i); + switch(i) { + case 'i': + input_file=optarg;break; + case 'c': + parms.kappa_prime=atoi(optarg);break; + case 'h': + usage(EXIT_SUCCESS); break; + case 'o': + output_file=optarg; break; + case 'C': + parms.kappa=atoi(optarg); + if (parms.kappa<=0 || parms.kappa>get_mask<unsigned short>::value) { + cout<<"Bad value for kappa. Must choose kappa so that 0<kappa<="<<get_mask<unsigned short>::value<<endl; + usage(EXIT_FAILURE); + } + break; + case 'l': + parms.lambda = atoi(optarg); + if (parms.lambda<=0) { + cout<<"Bad value for lambda. Choose a value that is >0 or let ROCK choose for you."<<endl; + usage(EXIT_FAILURE); + } + break; + case 'k': + k=atoi(optarg); + if (k<=0 || k>32) { + cout<<"Bad value for k. Must choose k so that 0<k<=32."<<endl; + usage(EXIT_FAILURE); + } + break; + case 'n': + nb_k_mers=atoi(optarg);break; // number of distinct k-mers + case 'g': + // cout<<optarg<<endl; + g=atoi(optarg); + break; + case 'v': + verbose_mode=1; + break; + case 'q': + parms.nucl_score_threshold=atoi(optarg); + break; + default: + usage(EXIT_FAILURE); break; } + } + processMainArgs(optind, argc, argv,v_input_lines); + optArgConsistency(input_file,output_file,g,parms,nb_k_mers,v_input_lines); + if (nb_k_mers) { + expected_collision_proba=getCollisionProba(nb_k_mers,parms.lambda); + } + if ((v_input_lines.empty() || v_output_lines.empty()) && (loadInOutFileArgs(input_file,output_file,v_input_lines,v_output_lines)==EXIT_FAILURE)) throw EXIT_FAILURE; + if (processInOutFileArgs(v_input_lines,v_output_lines,single_files,v_PE_files,f_id)!=EXIT_SUCCESS) throw EXIT_FAILURE; + } diff --git a/src/ROCKparams.h b/src/ROCKparams.h new file mode 100644 index 0000000000000000000000000000000000000000..5aea5c9ac1a6f940a814bc11a1a4684415c3d5fa --- /dev/null +++ b/src/ROCKparams.h @@ -0,0 +1,139 @@ +/* + * ROCKparams.h + * + * Created on: Jul 25, 2016 + * Author: vlegrand + */ + +#ifndef ROCKPARAMS_H_ +#define ROCKPARAMS_H_ + + +#include <cstdlib> +#include <string> +#include <vector> +#include <getopt.h> +#include "CountMinSketch.hpp" +#include "main_utils.h" + +#define defaultGRPMAXSize 1 //(in gigabytes) Default maximum size of srp structure but of course this size depends of the number of reads we are processing. + // 1 GB is enough to store approx 82 millions of reads (must take into account memory needed for stl containers). + // This constant is used only if you use rock's default behavior (use 90% of the machine's RAM). + // If you specify a lambda, it is possible to use more memory for storing more reads and less memory for the CMS. + +#define proposedLambda 4 + +using namespace std; + + +class ROCKparams{ + CMSparams parms; + int g; // max RAM to use for the CMS if specified by the user. + int nb_k_mers; // expected number of k-mers in input data if specified by the user. + int k; // size of the k-mers + int verbose_mode; + std::string input_file,output_file; + //int nucl_score_threshold; + + std::vector<IO_fq_files> single_files; + vector<PE_files> v_PE_files; + int f_id; + unsigned long cms_size; + float expected_collision_proba; + + + void computeLambda(); + void processMainArgs(int optind, int argc, char ** argv,std::vector<string>& v_input_lines); + int loadInOutFileArgs(const std::string& input_file,const std::string& output_file,std::vector<string>& v_input_lines,std::vector<string>& v_output_lines); + int loadFileArgs(const std::string& afile,std::vector<string>& v_lines); + void removePathfromFName(string& FName); + void changeExtension(string& FName); + void genOutFilenames(const std::vector<string>& v_input_lines,std::vector<string>& v_output_lines); + int processInOutFileArgs(const std::vector<string>& v_input_lines,std::vector<string>& v_output_lines,std::vector<IO_fq_files>& single_files,vector<PE_files>& v_PE_files,int& f_id); + + /* + * Checks options and arguments consistency (ex kappa_prime not superior to kappa; output file directory exists and so on...) + * Also set some CMS parameters depending on value of other options. + */ + void optArgConsistency(const string& input_file,const string& output_file, + const int& g,CMSparams& parms,const int& nb_k_mers, + const std::vector<string>& v_input_lines); + + friend void test_processIOFileArgs(); + +public: + + ROCKparams() { + f_id=0; // to generate id of input/output fastq files. +1=total number of files. + g=0; + k=30; + nb_k_mers=0; + parms.kappa=50; + parms.kappa_prime=0; + parms.lambda=0; + verbose_mode=0; + cms_size=0; + expected_collision_proba=0.0; + //nucl_score_threshold=0; + parms.filter_size=getNodePhysMemory()/100.0*90-defaultGRPMAXSize; // Prefer not to use all the machine's memory + if (parms.filter_size==0) throw EXIT_FAILURE; + } + + + + void initFromMainOptsArgs(int argc,char ** argv); + + CMSparams getCMSparams() { + return parms; + } + + int get_f_id() { + return f_id; + } + + std::vector<IO_fq_files> get_single_files() { + return single_files; + } + + vector<PE_files> get_PE_files() { + return v_PE_files; + } + + int get_k() { + return k; + } + + void printVerboseInfo() { + std::vector<IO_fq_files>::iterator it_s; + vector<PE_files>::iterator it_pe; + cout<<"Input files: "<<endl; + cout<<"single:"<<endl; + for (it_s=single_files.begin();it_s!=single_files.end();it_s++) { + cout<<" "<<it_s->in_fq_file<<endl; + } + cout<<"pair-end:"<<endl; + for (it_pe=v_PE_files.begin();it_pe!=v_PE_files.end();it_pe++) { + cout<<" "<<it_pe->PE1.in_fq_file<<" "<<it_pe->PE2.in_fq_file<<endl; + } + cout<<endl; + cout<<"k="<<k<<endl; + cout<<"kappa="<<parms.kappa<<endl; + cout<<"kappa prime="<<parms.kappa_prime; + cout<<endl; + cout<<"CMS size="<<cms_size<<" Bytes for "<<parms.lambda<<" arrays."<<endl; + cout<<endl; + if (nb_k_mers!=0) cout<<"expected probability of collision was: "<<expected_collision_proba<<endl; + } + + void setFilterSize(unsigned long fsize) { + cms_size=fsize; + } + + int is_verbose() { + return verbose_mode; + } + +}; + + +#endif /* ROCKPARAMS_H_ */ diff --git a/src/ReadProcessor.h b/src/ReadProcessor.h index d5a26c5b7d6119ba9b4c338c01c0b62dc6a0ae42..be4d4def03ee41cbdf7d23af05bdc9284b53be6a 100644 --- a/src/ReadProcessor.h +++ b/src/ReadProcessor.h @@ -8,6 +8,11 @@ #include <vector> #include <math.h> +//#define DEBUG +#ifdef DEBUG +#include <iostream> +using namespace std; +#endif class ReadProcessor { @@ -33,7 +38,7 @@ class ReadProcessor { friend void testDNAToNumberSimple(); - inline unsigned long nucleoToNumber(char s) { + unsigned long nucleoToNumber(char s) { unsigned long nbr; switch(s) { @@ -53,13 +58,16 @@ class ReadProcessor { nbr=0; break; default: +#ifdef DEBUG +cout<<"found unexpected character in DNA string: "<<s<<endl; +#endif throw -1; // TODO Benchmark this inside try catch statement to see if try catch+exception really costs so long. // throw an integer for the moment. An exception object may not be the most optimal choice in terms of performance. } return nbr; } - inline unsigned long nucleoToNumberReverse(char s,int i) { + unsigned long nucleoToNumberReverse(char s,int i) { unsigned long nbr=0; char cpl=nucleo_rev_cpl[s]; nbr=nucleoToNumber(cpl); @@ -82,8 +90,6 @@ public: unsigned long kMerToNumberReverse(char * k_m,unsigned long * p_prev); - - unsigned long kMerToNumber(char * k_m,unsigned long * p_prev); diff --git a/src/fqwriter.cpp b/src/fqwriter.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6ca2278364de6327eb68a5dc15e213a458802ced --- /dev/null +++ b/src/fqwriter.cpp @@ -0,0 +1,77 @@ +/* + * fqwriter.cpp + * + * Created on: Mar 15, 2016 + * Author: vlegrand + * Here gather functions to write filtered fastq records to output files + */ + +#include <map> +#include "srp.h" +#include "FqBaseBackend.h" + +//#define DEBUG +#ifdef DEBUG +#include <iostream> +using namespace std; +#endif + + +// TODO this looks a little bit like getDnaStr in read_utils.cpp. Some refactoring could be useful. Think about +// classes to encapsulate that without decreasing perfs. +void writeFilteredFastq(FqBaseBackend* map_id_backend[],int nb_be, const srp& io_sr){ + srp::const_reverse_iterator it; + i_dim::const_iterator it_offs; + k_dim::const_iterator it_struct; + + unsigned char f_id1; + unsigned char f_id2; + unsigned char fid_stored; + + // 1rst step, open files before fetching/writing reads from/to them + int i; + for (i=0;i<nb_be;i++){ + map_id_backend[i]->openInputFile(); + map_id_backend[i]->openOutputFile(); + } + + for (it=io_sr.rbegin(); it!=io_sr.rend(); ++it) { + for (it_offs=it->second.begin();it_offs!=it->second.end();it_offs++) { + unsigned long j=it_offs->first; + for (it_struct=it_offs->second.begin();it_struct!=it_offs->second.end();it_struct++) { + if (it_struct->fileid) { + fid_stored=it_struct->fileid; + f_id1=fid_stored >>4; + f_id2=fid_stored &mask_fid; + unsigned long offset1=j*UINT_MAX+it_struct->read_a1; +#ifdef DEBUG + cout<<"going to output record with: j="<<j<<"read_a1="<<it_struct->read_a1<<endl; + cout<<"j*UINT_MAX+it_struct->read_a1 = "<<j*UINT_MAX+it_struct->read_a1<<endl; + cout<<"offset1="<<offset1<<endl; + cout<<"f_id1-1="<<f_id1-1<<endl; +#endif + FqBaseBackend * be=map_id_backend[f_id1-1]; + be->writeToOutput(offset1); + if (f_id2!=0) { // case of PE reads. + unsigned long offset2=it_struct->read_a2+offset1; +#ifdef DEBUG + cout<<"f_id2-1="<<f_id2-1<<endl; + cout<<"offset2="<<offset2<<endl; +#endif + map_id_backend[f_id2-1]->writeToOutput(offset2); + } + } + } + } + } + + // last step, close files. + for (i=0;i<nb_be;i++){ + map_id_backend[i]->closeInputFile(); + map_id_backend[i]->closeOutputFile(); + } +} + + + + diff --git a/src/fqwriter.h b/src/fqwriter.h new file mode 100644 index 0000000000000000000000000000000000000000..97d90b44bc98e75818d57d7052b588a492de7cc4 --- /dev/null +++ b/src/fqwriter.h @@ -0,0 +1,15 @@ +/* + * fqwriter.h + * + * Created on: Mar 15, 2016 + * Author: vlegrand + */ + +#ifndef FQWRITER_H_ +#define FQWRITER_H_ + +void writeFilteredFastq(FqBaseBackend* map_id_backend[],int nb_be, const srp& io_sr); + + + +#endif /* FQWRITER_H_ */ diff --git a/src/main_utils.cpp b/src/main_utils.cpp new file mode 100644 index 0000000000000000000000000000000000000000..dc1ba44e4f854658141f2ed8bf5dd009590cfdb7 --- /dev/null +++ b/src/main_utils.cpp @@ -0,0 +1,118 @@ +/* + * main_utils.cpp + * + * Created on: May 9, 2016 + * Author: vlegrand + */ +#include <stdint.h> +#include <unistd.h> +#include <sys/types.h> +#include <sys/sysctl.h> +#include <sys/stat.h> +#include <string.h> +#include <iostream> +#include <fstream> +#include <cmath> +#include "main_utils.h" + +using namespace std; + +/* + * Use this function to get the amount of RAM in GB on the machine (used or not). + * Aim is to make sure that ROCK can run on the current machine with the requirements that the user gave. + */ +unsigned long getNodePhysMemory() { + uint64_t mem = 0; + +#if defined(_SC_PHYS_PAGES) && defined(_SC_PAGESIZE) + mem = sysconf(_SC_PHYS_PAGES); + mem *= sysconf(_SC_PAGESIZE); + mem /= 1 * 1024 * 1024 * 1024; +#elif __APPLE__ + size_t len = sizeof(mem); + int ret=sysctlbyname("hw.memsize", (void*) &mem, &len, NULL,0); + if (ret==-1) { + cout<<"Couldn't determine how much RAM is usable on your system."<<errno<<endl; + mem=0; + } + mem /= 1 * 1024 * 1024 * 1024; +#else + cout<<"ROCK has not been tested on this operating system, please contact author."<<endl; +#endif + + return (unsigned long)mem; +} + + +/* + * Given a filename (including full path), determine if parent directory exists. + * If is doesn't, display error message and exit. + */ +void checkDirExists(const string o_file) { + std::size_t o_found = o_file.find_last_of("/"); + if (o_found!=string::npos) { + string parent_dir=o_file.substr(0,o_found); + // cout<<parent_dir<<endl; + struct stat info; + if (stat(parent_dir.c_str(),&info)!=0) { + cout<<"parent directory for output files: "<<parent_dir<<" doesn't exist."<<endl; + exit(EXIT_FAILURE); + } + } +} + + +/* + * Minimize lambda for a given number of k-mers. + * We want p<=0.01, + * so choose smallest lambda so that [1-(1-1/m)exp n] exp lambda<=0.01 + */ +int getBestLambdaForN(const unsigned long& nb_k_mers,int lambda_max) { + int lambda=2; + int min_lambda=lambda; + if (lambda_max==0) { // no upper bound specified by the user via the -g option + lambda_max=500; + } + long double tmp=1.0/INT_MAX; + tmp=1.0-tmp; + tmp=pow(tmp,nb_k_mers); + tmp=1.0-tmp; + while (lambda<=lambda_max) { + min_lambda=lambda; + long double p=pow(tmp,lambda); + if (p<=0.01) break; + lambda+=1; + } + return min_lambda; +} + +/* + * Compute collision probability + */ +float getCollisionProba(const unsigned long& nb_k_mers,const int& lambda) { + long double tmp=1.0/INT_MAX; + tmp=1.0-tmp; + tmp=pow(tmp,nb_k_mers); + tmp=1.0-tmp; + long double p=pow(tmp,lambda); + if (p<0.0001) p=0; + // rounding + p=p*1000+0.5; + return floor(p)/1000; + //return p; +} + +void usage(int status) { + cout<<"usage: rock [options] [args]"<<endl<<endl; + cout<<" -i <file> .... file name containing the list of the FASTQ files to process."<<endl; + cout<<" -o <file> .... file name containing the list of the output FASTQ file names."<<endl; + cout<<" -h .... Print this message and exit."<<endl; + cout<<" -k <int> .... k-mer size. (default 30)."<<endl; + cout<<" -c <int> .... lower coverage threshold (default: 0)."<<endl; + cout<<" -C <int> .... upper coverage threshold (default: 50; max: 65535)."<<endl; + cout<<" -l <int> .... size of the count min sketch (default: at most 4, depending on the available RAM)"<<endl; + cout<<" -n <int> .... (expected) number of distinct k-mers within the processed reads."<<endl; + cout<<" -g <int> .... maximum RAM to use (in Gb) for the CMS."<<endl; + cout<<" -v .... verbose"<<endl; + exit(status); } + diff --git a/src/main_utils.h b/src/main_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..ed50a9cea471b62d7e0f7cc1303248a54778bc8a --- /dev/null +++ b/src/main_utils.h @@ -0,0 +1,29 @@ +/* + * main_utils.h + * + * Created on: May 9, 2016 + * Author: vlegrand + */ + +#ifndef MAIN_UTILS_H_ +#define MAIN_UTILS_H_ + +#define k_max_input_files 15 +#define k_sep_pair_end ',' +#define k_ext '.' +#define path_sep '/' + +#include <string> +#include <vector> +#include "rock_commons.h" + +unsigned long getNodePhysMemory(); +void checkDirExists(const std::string o_file); +/*int loadInOutFileArgs(const std::string& input_file,const std::string& output_file,std::vector<std::string>& v_input_lines,std::vector<std::string>& v_output_lines); +int processInOutFileArgs(const std::vector<std::string>& v_input_lines,std::vector<std::string>& v_output_lines,std::vector<IO_fq_files>& single_files,std::vector<PE_files>& v_PE_files,int& f_id);*/ +int getBestLambdaForN(const unsigned long& nb_k_mers,int lambda_max); +float getCollisionProba(const unsigned long& nb_k_mers,const int& lambda); +void usage(int status); + + +#endif /* MAIN_UTILS_H_ */ diff --git a/src/rock_commons.h b/src/rock_commons.h new file mode 100644 index 0000000000000000000000000000000000000000..e27fa0587fc459bcde9531e2f28031f357d62ea0 --- /dev/null +++ b/src/rock_commons.h @@ -0,0 +1,46 @@ +/* + * rock_types2.h + * + * Created on: Jan 20, 2016 + * Author: vlegrand + * Gather here typedefs and structures useful for the main program and "inter-modules" data exchange. + * I divided rock into 4 "modules": + * 1- fqreader (parses fastq and fills 3D array of structures representing all reads). + * 2- read-utils (utility functions for DNA reads processing) + * 3- CMS (fill and use). + * 4- fqwriter (writes fq filtered by CMS). + */ + +#ifndef ROCK_COMMONS_H_ +#define ROCK_COMMONS_H_ + +#include <string> +#include <vector> +#include "rock_commons.h" +#include "FqBaseBackend.h" + +/* + * Gather here type and constants definitions that are common to some of the 4 different modules in ROCK + * (fqreader, read processing, CMS filling, result writing) and to the main program. + * + */ + +#define k_max_input_files 15 + +typedef struct { + std::string in_fq_file; + std::string out_fq_file; +} IO_fq_files; + +typedef struct { + IO_fq_files PE1; + IO_fq_files PE2; +}PE_files; + + +typedef std::vector<unsigned long> readNumericValues; + + + + +#endif /* ROCK_COMMONS_H_ */ diff --git a/src/rock_types.h b/src/rock_types.h deleted file mode 100644 index fe80bac75f833b822b98c723fa7dfcbb596c7c8a..0000000000000000000000000000000000000000 --- a/src/rock_types.h +++ /dev/null @@ -1,29 +0,0 @@ -/* - * rock_types2.h - * - * Created on: Jan 20, 2016 - * Author: vlegrand - * Gather here typedefs and structures useful for the main program and "inter-modules" data exchange. - * I divided rock into 4 "modules": - * 1- fqreader (parses fastq and fills 3D array of structures reprensenting all reads). - * 2- read-utils (utility functions for DNA reads processing) - * 3- CMS (fill and use). - * 4- fqwriter (writes fq filtered by CMS). - */ - -#ifndef ROCK_TYPES_H_ -#define ROCK_TYPES_H_ - -#include <map> -#include "FqBaseBackend.h" - -/*typedef struct { - FqBaseBackend& fq_be; - -}fq_io;*/ - -// typedef std::map<unsigned char,FqBaseBackend&> fq_file_map; - -#define k_max_input_files 15 - -#endif /* ROCK_TYPES_H_ */ diff --git a/src/srp_old.h b/src/srp_old.h deleted file mode 100644 index 89f06b6e5456711be0d4fe1f311d6eb5e2a47f64..0000000000000000000000000000000000000000 --- a/src/srp_old.h +++ /dev/null @@ -1,30 +0,0 @@ -/* - * srp.h - * - * Created on: Dec 8, 2015 - * Author: vlegrand - * - * SRP stands for score-read position. Indeed, this structure assoiates a score with a read's position inthe input fq file. - */ - -#ifndef SRP_H_ -#define SRP_H_ - -typedef struct { /* Here store read offset in file whose id is fileid.*/ - char fileid; - unsigned long read_a1; -}rpos; - -char ** l_files; /* array of filenames; indexed on fileid */ - -typedef rpos *** srp; - -/* - * Physically, srp is : - * An array indexed on read total score (Arr1). - * Arr1 contains pointers to other other arrays (Arr2). - * Arr2 is indexed on something that is deduced from offset (read position in a file). - * Arr2 contains pointers to array of rpos structures. - */ - -#endif /* SRP_H_ */ diff --git a/src/unit_test_cms.cpp b/src/unit_test_cms.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3ae712c4be706ee712901c417486c1f29b46e68a --- /dev/null +++ b/src/unit_test_cms.cpp @@ -0,0 +1,99 @@ +/* + * unit_test_cms.cpp + * + * Created on: Feb 9, 2016 + * Author: vlegrand + * + * Here, gather the tests for the ROCK's main component: the CMS. + */ +#include <iostream> +#include <assert.h> +#include <climits> + +using namespace std; +#include "CountMinSketch.hpp" +#include "main_utils.h" + + +void test_CMS(int lambda,int kappa,int kappa_prime) { + CountMinSketch<unsigned char> cms(lambda,kappa,kappa_prime); + int i; + cout<<"size of the CMS component: "<<sizeof(cms)<<endl; + int num=100*lambda; + int rej_expected=0; + int ret; + for (i=0;i<num;i++) { + cms.addKMer(num); + } + // Now, check that our k-mer is present in the CMS. + int min=cms.getEstimatedNbOcc(num); + std::cout<<"min="<<min<<endl; + assert((min==num) || (min==num-get_mask<unsigned char>::value-1)); // addKmer doesn't check kappa. + + unsigned long nb_distinct_k_mers=cms.getNbDistinctKMers(); + assert(nb_distinct_k_mers==1); + + // mimic a read (as given by part 2 output). + readNumericValues read_values; + for (i=1;i<=400;i++) { + read_values.push_back(i*1000-1); + } + ret=cms.addRead(read_values); // read should be accepted since none of the values that it contains is equal to those I inserted previously. + assert(ret!=rej_expected); + + nb_distinct_k_mers=cms.getNbDistinctKMers(); + assert(nb_distinct_k_mers==401); + + // mimic a vector that contains a lot of k_mers already present in the CMS and check that it is rejected. + readNumericValues read_values2; + for (i=1;i<=199;i++) { + read_values2.push_back(i*1000-1); + } + for (i=200;i<=400;i++) { + read_values2.push_back(num); + } + ret=cms.addRead(read_values2); + assert(ret==rej_expected); + + nb_distinct_k_mers=cms.getNbDistinctKMers(); + assert(nb_distinct_k_mers==401); + + // test that reads with median cov<kappa_prime are rejected. + ret=cms.isBeneathMinKappa(read_values); + assert(ret>0); +} + + + +int main(int argc, char **argv) { + int lambda=2; + int kappa=50; + int kappa_prime=20; + cout<<"INT_MAX="<<INT_MAX<<endl; + cout<<"UINT_MAX="<<UINT_MAX<<endl; + cout<<"LONG_MAX="<<LONG_MAX<<endl; + cout<<"ULONG_MAX="<<ULONG_MAX<<endl; + cout<<"ULONG_MAX/UINT_MAX="<<ULONG_MAX/UINT_MAX<<endl; // max value for j + cout<<"ULONG_MAX%UINT_MAX="<<(ULONG_MAX-1)%UINT_MAX<<endl; + cout<<"sizeof(short)="<<sizeof(short)<<endl; + cout<<"sizeof(int)="<<sizeof(int)<<endl; + cout<<"sizeof(long)="<<sizeof(long)<<endl; + + cout<<"checking that your machine has enough RAM to run test"<<endl; + unsigned long ram=getNodePhysMemory(); + cout<<"machine has: "<<ram<<" GB of RAM"<<endl; + + if (ram<5) { + cout<<"!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"<<endl; + cout<<" WARNING: skipping this test, it requires at least 4 gigabytes of RAM to run."<<endl; + cout<<"!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"<<endl; + return -1; + } + + if (argc==1) { + cout<<"testing CMS with lambda="<<lambda<<endl; + test_CMS(lambda,kappa,kappa_prime); // Finally using C arrays (maps implied storing hash keys : 4 Bytes per k_mer overhead) but each array is of size INT_MAX... + cout<<"done"<<endl; + } + return 0; +} diff --git a/src/unit_test_fqreader.cpp b/src/unit_test_fqreader.cpp index cca5adfa5749a2799375a29bd6193d776eea4269..786de5755f3b77043837f898515228965eccb7f9 100644 --- a/src/unit_test_fqreader.cpp +++ b/src/unit_test_fqreader.cpp @@ -7,12 +7,15 @@ * Keep using assert for the moment, don't want to add a dependency on boost (or any other test framework) just for the tests. */ #include <stdio.h> +#include <string.h> #include <iostream> #include <limits.h> #include <assert.h> #include <stdlib.h> +#include "rock_commons.h" #include "FqConstants.h" #include "srp.h" +#include "FqMainBackend.h" #include "fqreader.h" // TODO : Add test case where @ character is inside quality score. @@ -23,7 +26,7 @@ void test_processSingleFile() { //printf("MAX_UINT=%u \n",UINT_MAX); srp sr; unsigned char f_id=1; - processSingleFile((char *) "../data/test_single.fq",f_id,&sr); + processSingleFile((char *) "../test/data/unit/test_single.fq",f_id,&sr); srp::reverse_iterator rit; i_dim::iterator it_offs; k_dim::iterator it_struct; @@ -36,7 +39,8 @@ void test_processSingleFile() { assert(score==5); for (it_offs=rit->second.begin();it_offs!=rit->second.end();it_offs++) { unsigned long offset_quotient=it_offs->first; - assert(offset_quotient==0); + cout<<"offset_quotient="<<offset_quotient<<endl; + assert(offset_quotient==0); for (it_struct=it_offs->second.begin();it_struct!=it_offs->second.end();it_struct++) { unsigned char fid_stored=it_struct->fileid; assert(fid_stored >>4==f_id); @@ -55,9 +59,73 @@ void test_processSingleFile() { assert(cnt_read==6); } +/* + * Test case with other data than those I had until here. + * Quality score contains '@'caracter (usually start of fastq record), + * reads are longer, + * id and '+' line contain additional information. + */ +void test_processPEfilesWithA() { + + char * fq_3_test=(char *) "../test/data/unit/klebsiella_PE1.fq"; + char * fq_4_test=(char *) "../test/data/unit/klebsiella_PE2.fq"; + + unsigned char f_id3=3; + unsigned char f_id4=4; + + srp sr; + processPEFiles(fq_3_test, f_id3,fq_4_test, f_id4,&sr); + srp::reverse_iterator rit; + i_dim::iterator it_offs; + k_dim::iterator it_struct; + int cnt_read=0; + + unsigned char masque=0x0F; + + for (rit=sr.rbegin(); rit!=sr.rend(); ++rit) { //process map in reverse order (by decreasing scores). + cout << "score="<<rit->first<<endl; + unsigned long score=rit->first; + /*if (cnt_read==0 || cnt_read==1) assert(score==10); + if (cnt_read==2) assert(score==9);*/ + for (it_offs=rit->second.begin();it_offs!=rit->second.end();it_offs++) { + unsigned long offset_quotient=it_offs->first; + assert(offset_quotient==0); + for (it_struct=it_offs->second.begin();it_struct!=it_offs->second.end();it_struct++) { + unsigned char fid_stored=it_struct->fileid; + assert(fid_stored >>4==f_id3); + assert((fid_stored &masque)==f_id4); + if (cnt_read==0) { + assert(it_struct->read_a1==0); + assert(it_struct->read_a1==0); + } + if (cnt_read==6) { + // std::cout<<it_struct->read_a1<<" "<<it_struct->read_a2; + assert(score==18); + assert(it_struct->read_a1==558); + assert(it_struct->read_a2==-2); + } + if (cnt_read==3) { + // std::cout<<it_struct->read_a1<<" "<<it_struct->read_a2; + assert(score==19); + assert(it_struct->read_a1==1114); + assert(it_struct->read_a2==0); + } + cnt_read++; + + int tmp1=fid_stored >>4; + int tmp2=fid_stored &masque; + cout<<" fileid1="<<tmp1<<" read_a1="<<it_struct->read_a1<<endl; + cout<<" fileid2="<<tmp2<<" read_a2="<<it_struct->read_a2<<endl; + } + } + } + assert(cnt_read==10); + +} + void test_processPEFiles() { - char * fq_1_test=(char *) "../data/test_PE1.fq"; - char * fq_2_test=(char *) "../data/test_PE2.fq"; + char * fq_1_test=(char *) "../test/data/unit/test_PE1.fq"; + char * fq_2_test=(char *) "../test/data/unit/test_PE2.fq"; unsigned char f_id1=1; unsigned char f_id2=2; @@ -91,12 +159,12 @@ void test_processPEFiles() { if (cnt_read==1) { std::cout<<it_struct->read_a1<<" "<<it_struct->read_a2; assert(it_struct->read_a1==349); - assert(it_struct->read_a2==349); + assert(it_struct->read_a2==0); } if (cnt_read==2) { std::cout<<it_struct->read_a1<<" "<<it_struct->read_a2; assert(it_struct->read_a1==698); - assert(it_struct->read_a2==698); + assert(it_struct->read_a2==0); } cnt_read++; @@ -110,25 +178,12 @@ void test_processPEFiles() { assert(cnt_read==3); } -void test_processAllFiles() { - char * fq_1_test=(char *) "../data/test_PE1.fq"; - char * fq_2_test=(char *) "../data/test_PE2.fq"; - char * fq_single=(char *) "../data/test_single.fq"; - - unsigned char f_id1=1; - unsigned char f_id2=2; - unsigned char f_single=3; - - srp sr; - - processPEFiles(fq_1_test, f_id1,fq_2_test, f_id2,&sr); - processSingleFile(fq_single,f_single,&sr); +void check_processAIFilesResults(srp& sr) { srp::reverse_iterator rit; i_dim::iterator it_offs; k_dim::iterator it_struct; int cnt_read=0; - for (rit=sr.rbegin(); rit!=sr.rend(); ++rit) { //process map in reverse order (by decreasing scores). //cout << "score="<<rit->first<<endl; unsigned long score=rit->first; @@ -149,6 +204,71 @@ void test_processAllFiles() { assert(cnt_read==9); } +void test_processAllFiles() { + char * fq_1_test=(char *) "../test/data/unit/test_PE1.fq"; + char * fq_2_test=(char *) "../test/data/unit/test_PE2.fq"; + char * fq_single=(char *) "../test/data/unit/test_single.fq"; + + unsigned char f_id1=1; + unsigned char f_id2=2; + unsigned char f_single=3; + + srp sr; + + processPEFiles(fq_1_test, f_id1,fq_2_test, f_id2,&sr); + processSingleFile(fq_single,f_single,&sr); + + check_processAIFilesResults(sr); +} + + +void test_processInputFiles() { + char * fq_1_test=(char *) "../test/data/unit/test_PE1.fq"; + char * fq_2_test=(char *) "../test/data/unit/test_PE2.fq"; + char * fq_single=(char *) "../test/data/unit/test_single.fq"; + + IO_fq_files s; + s.in_fq_file=fq_single; + + vector<IO_fq_files> v_single; + v_single.push_back(s); + + PE_files pe; + pe.PE1.in_fq_file=fq_1_test; + pe.PE2.in_fq_file=fq_2_test; + vector<PE_files> v_pe; + v_pe.push_back(pe); + + srp sr; + + FqBaseBackend * array_be[k_max_input_files]; + processInputFiles(v_single,v_pe,array_be,&sr); + + // check that result is correct in sr. + check_processAIFilesResults(sr); + + // check that the 3 backends are correct + FqMainBackend * pbe=(FqMainBackend *) array_be[0]; // TODO see if one can use check_case, static_cast or one of them if they are not in boost. + FqAuxBackend * pbe2=(FqAuxBackend *) array_be[2]; + FqMainBackend * pbe3=(FqMainBackend *) array_be[1]; + + assert(strcmp(pbe->i_filename,fq_single)==0); + assert(pbe->f_id==1); + assert(pbe->p_auxFqProcessor==NULL); + + assert(pbe3->p_auxFqProcessor==pbe2); + assert(strcmp(pbe3->i_filename,fq_1_test)==0); + assert(pbe3->f_id==2); + + assert(strcmp(pbe2->i_filename,fq_2_test)==0); + assert(pbe2->f_id==3); + + + int i; + for (i=0;i<3;i++) delete array_be[i]; + //free(array_be); + +} int main(int argc, char **argv) { @@ -156,6 +276,11 @@ int main(int argc, char **argv) { test_processSingleFile(); cout<<"test for PE files"<<endl; test_processPEFiles(); + cout<<"test the case of records that contain @ character in quality score"<<endl; + test_processPEfilesWithA(); cout<<"test for both single and PE files"<<endl; test_processAllFiles(); /* mix PE together with single; nearly as in real life.*/ + cout<<"testing higher level function processInputFiles"<<endl; + test_processInputFiles(); + cout<<"done"<<endl; } diff --git a/src/unit_test_fqwriter.cpp b/src/unit_test_fqwriter.cpp new file mode 100644 index 0000000000000000000000000000000000000000..69d107f8a67d44f5d7a6cfed6628440729436ffa --- /dev/null +++ b/src/unit_test_fqwriter.cpp @@ -0,0 +1,141 @@ +/* + * unit_test_fqwriter.cpp + * + * Created on: Mar 14, 2016 + * Author: vlegrand + */ +#include <cstdio> +#include <cassert> +#include <unistd.h> +#include <fcntl.h> +#include <string.h> +#include "rock_commons.h" +#include "FqConstants.h" +#include "srp.h" +#include "FqMainBackend.h" +#include "fqwriter.h" + +#include <iostream> +using namespace std; + +#define DEBUG + + +char expected_content_PE1 []="@SRR1222430.3 3 length=236\n\ +CAAACACCTGACGCGGTTCCAGCAGGTACTCCTGCACGCCAATTTCCGGGCGGGCAGTAAAGCGCTGTTTGCAGCCCGTCTGGTGCAGGCGCCCCAGATAGCGGCCAACCCATTCCATCTGATCAAGGTTATCCGCTTCGAACTGACGACCGCCAAGGCTTGGGAAAACGGCAAAGTAGAATCCCTGATGCTGATGAAGCGTGCTGTCATTAAATAAGAGCGGCGCAGCAACGGGC\n\ ++SRR1222430.3 3 length=236\n\ +CCCCCFFCFFFFGGGGGGGGGGHHHHHFGHHHHHHHHGGGGGHHHHHGGGGGGGGGGHHHHHHGGGGGHHHHHHHHGGGGHGHGHHGGHGGGGGGGGHHHHHGGGGGHGGGGHHHHGHHHHHHHHHHHHHHHHHGGGGGGGGGGGGGGGGGGGFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFBFFFFFFFFFFFFFFFABF\n\ +@SRR1222430.2 2 length=250\n\ +AGAAATTCGCCATCAGAATAAAAACCTCATATGCACATTTTCTTGTTATTGCACAGCCTGTGCCACTTTAGCGCCAGCCTCTCCGGCAATCGTGGAGAAATTAAGGAGATAGTGTAATTTATCATGTTGCTTTTGCCGTATCGTAAAGAAACCTCGAGCTTTCCTGCCAGCAGGTAGCGAGTCTGCTTCGTCACCGCAGACCGGCGCATTATCCCTTGCCGGTGTGAAACCTCATTTCATTTAAGTCAAA\n\ ++SRR1222430.2 2 length=250\n\ +BBBBBFFFBBBBFGGGGGGGGGHHGGHHHHHHHHGFGHHHHHHHHGHHHFFFHHFHHHHHHEHGFHHHFGFGFGGGEGGFH@@HGCGGGHHGHHGGHFAFHHHEGFHHGGHHFHFHHHHHEHHHHHHHHHHHGHHHGGGGHHGHFGGFDGFGFHGEGFCDHGGHHHHHHHGHHEHHGHGGGGHGFHHEHGHGGGGDFGGGHHGADA?DBGGGGGGFFFFBBFFFFFFFFFFFFFFFFFBFFFFFFFFF/B\n\ +@SRR1222430.1 1 length=251\n\ +GCCCGCGAAGCGGAGCTGGCCGCCTGCAAAGGCCGTTCGCGCTCGCTGTCGCTGGATCTGTACGCCCAGTGGCGCTGCATGGAGGACAACCACGGCAAGTGGCGCTTCACCTCGCCGACCCATACCGTGCTGGCCTTCGCCCAGGCGCTGAAAGAGCTGGCGCAGGAGGGCGGCGTCAGCGCTCGCCATCAGCGCTACCGCAACAATCAGCGCCGTCTGGTGGCAGGGATGCGCGCGCTCGGCTTCCGGCC\n\ ++SRR1222430.1 1 length=251\n\ +CCCCCCCCCBBCGGCGGGGG@GGGGGHHHGHBHH@GGHGGGGGGGGG@GHGHGGGHHHHHHHHHGGGGGHHHFCGGGGHHHGHHGGHHHHGGGGCCGGGGHFFGGGGGHHHHHGGGGGGCGHHHHGGGGGGGGGGGGGGGGGGAEGGFFFFFFFFFFFFFFAFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEDFFFFFFFFFFF:DA;CCFFBBFDAFFFAFFFFFFFFA-9A>DFFFFFBFFB\n\ +"; + + +char expected_content_PE2[]="@SRR1222430.3 3 length=236\n\ +GCCCGTTGCTGCGCCGCTCTTATTTAATGACAGCACGCTTCATCAGCATCAGGGATTCTACTTTGCCGTTTTCCCAAGCCTTGGCGGTCGTCAGTTCGAAGCGGATAACCTTGATCAGATGGAATGGGTTGGCCGCTATCTGGGGCGCCTGCACCAGACGGGCTGCAAACAGCGCTTTACTGCCCGCCCGGAAATTGGCGTGCAGGAGTACCTGCTGGAACCGCGTCAGGTGTTTG\n\ ++SRR1222430.3 3 length=236\n\ +?BBABBFBBFFFGGGGGGGGGGHHHHHHHHHHFHHHGGGGGHHHHHHHGHHHHHGHGHHHHHHHGHFEGGHGHHHHHHGGHHHHHGGGEGGGHGHHHGHHGGCGGGDHHHHHHHHGHHGHGHHHHHHFHGGGHFGGGGGHHHHEFGGCFGHHHHGGFGGGGGGGGGGGGGGGGGFFFEFFFFFFFBFFFFF=;ABFF/FA:DB;BF.9.BFFFFFFF/AFFFFFABDFFB/9BD.;\n\ +@SRR1222430.2 2 length=251\n\ +AGAAATTCGCCAGGGTGATCAACGTCTCATCGTCGGCGAGGGTCAGCGCCTGCGCGGGACAGTTTTCCACACAGGCCGGCCCCGCCTCGCGGCCCTGGCATAGATCGCACTTGTGCGCGCTGGCTTTCACCAGGCCTGCGGCCTGCGGCGTCACCACCACCTGCATCACCCCGAACGGGCAGGCCACCATGCAGGATTTACAGCCAATGCATTTCTCCTGACGGACCTGAACGCTGTCGCCGGACTGGGCG\n\ ++SRR1222430.2 2 length=251\n\ +>1>AAFFF@11AEGGFFCGGGGGFGHFH2FF1F00EEE/@AEE0FGGFGEGGHGGCGC?EEFHEFEEHDF1EECHEFE/@@/BCCCFGAC@CC@C.CEGFHFHGHFHCEC?FH;CC?CG@@?-AF.BB0BFGF?E./EF;@?;AFF@<-@@??BFF?F-:A?BF999BBBF@@?@@@F-;@B@FF-A-9FF/BFFE/F//B/BBEFBFFFFF/BFFFFFFFEB?-@=B-/BBF--:;/A-B>--;>?EFE9\n\ +@SRR1222430.1 1 length=250\n\ +CGATGCTGCGCTGCATCAATAAGCTGGAAGAGATCACCAGCGGCGATCTGATTGTCGATGGTCTGAAGGTCAACGACCCGAAGGTGGACGAACGTCTGATTCGTCAGGAAGCCGGGATGGTTTTCCAGCAGTTTTATCTGTTCCCGCACCTCACGGCGCTGGAAAACGTGATGTTTGGCCCGCTGCGGGTACGCGGCGCCAGCAAGCAGGCGGCGAGTGCAGGCTATCGTCGAGCAGCGGCCGGAAGCCG\n\ ++SRR1222430.1 1 length=250\n\ +CCCCCCFFFCCCGGGGGGGGGGHHHHHHHHGHHHHHHHHHHGGGGGGGGHHGHHHHGHGHGHHHHHHHHHFHHHGGGGGGGGGGBFFHHGGGGGGHHGGHHHGHGGGHHHHHHGGGGGGGFGHGHHHHHHHHHGHHHFHHHHHHDFGGGGHHHHHGGAGGGGGGGGGGGGGGGFGGGFFFFFFFFFFFA=@FDF-BFFFFFFFFFFFFFEDFBDFFF-:FFFFFF/BFFFEFFFFFFFFFFF;--:DEFF\n\ +"; + + + +void test_write_PE() { + srp sr; + srp::reverse_iterator rit; + i_dim::iterator it_offs; + k_dim::iterator it_struct; + + // step 1 : fill srp with fake data + rpos rp=init_rpos(1,1114); + update_rpos(2,1114,0,&rp); + + rpos rp2=init_rpos(1,558); + update_rpos(2,556,0,&rp2); + + rpos rp3=init_rpos(1,0); + update_rpos(2,0,0,&rp3); + + i_dim& ref_i_dim=sr[600]; + k_dim& ref_k_dim=ref_i_dim[0]; + ref_k_dim.push_back(rp); + + i_dim& ref_i_dim2=sr[500]; + k_dim& ref_k_dim2=ref_i_dim2[0]; + ref_k_dim2.push_back(rp2); + + i_dim& ref_i_dim3=sr[450]; + k_dim& ref_k_dim3=ref_i_dim3[0]; + ref_k_dim3.push_back(rp3); + + // step2 : create the backends + FqBaseBackend * map_id_backend[k_max_input_files]; + FqMainBackend * be_fq1=new FqMainBackend(&sr); + unsigned char f_id1=1; + FqAuxBackend * be_fq2=new FqAuxBackend(); + map_id_backend[0]=be_fq1; + map_id_backend[1]=be_fq2; + + be_fq1->i_filename=(char *) "../test/data/unit/klebsiella_PE1.fq"; + be_fq1->f_id=1; + be_fq2->f_id=2; + be_fq2->i_filename=(char *) "../test/data/unit/klebsiella_PE2.fq"; + be_fq1->setOutputFile((char *) "../test/data/unit/klebsiella_PE1_filtered.fq"); + be_fq2->setOutputFile((char *) "../test/data/unit/klebsiella_PE2_filtered.fq"); + + writeFilteredFastq(map_id_backend,2, sr); + + // step 4 : re-read output files and check their content. + mode_t mode = S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH; + int f_pe1=open((char *) "../test/data/unit/klebsiella_PE1_filtered.fq",O_RDONLY,mode); + assert(f_pe1!=-1); + char* buf=(char *) malloc(FqBaseBackend::bufsize*sizeof(char)); // test files are very small, bufsize is enough to read them entirely. + assert(buf!=NULL); + int nread=read(f_pe1,buf,FqBaseBackend::bufsize); + assert(nread<FqBaseBackend::bufsize); + // std::cout<<buf; +#ifdef DEBUG + int i; + for (i=0;i<nread;i++) { + if (buf[i]!=expected_content_PE1[i]) cout<<" character at position"<<i<<" differs: "<<buf[i]<<"!="<<expected_content_PE1[i]<<endl; + } +#endif + assert(strcmp(buf,expected_content_PE1)==0); + int f_pe2=open((char *) "../test/data/unit/klebsiella_PE2_filtered.fq",O_RDONLY,mode); + assert(f_pe2!=-1); + nread=read(f_pe2,buf,FqBaseBackend::bufsize); + assert(nread<FqBaseBackend::bufsize); +#ifdef DEBUG + for (i=0;i<nread;i++) { + if (buf[i]!=expected_content_PE2[i]) cout<<" character at position"<<i<<" differs: "<<buf[i]<<"!="<<expected_content_PE1[i]<<endl; + } +#endif + assert(strcmp(buf,expected_content_PE2)==0); + free(buf); + + + // step5 : remove output files from disk + + assert(remove((char *) "../test/data/unit/klebsiella_PE1_filtered.fq")==0); + assert(remove((char *) "../test/data/unit/klebsiella_PE2_filtered.fq")==0); + +} + +int main(int argc, char **argv) { + test_write_PE(); + +} + + diff --git a/src/unit_test_main_utils.cpp b/src/unit_test_main_utils.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5d05fa1646b6a1f2688d27bdf3917b2d6e975508 --- /dev/null +++ b/src/unit_test_main_utils.cpp @@ -0,0 +1,113 @@ +/* + * test_main_utils.cpp + * + * Created on: May 25, 2016 + * Author: vlegrand + */ + +#include <iostream> +#include <cassert> +#include <string> +#include <cmath> + + +using namespace std; + +#include "rock_commons.h" +#include "main_utils.h" +#include "ROCKparams.h" + +void test_processIOFileArgs() { + std::vector<IO_fq_files> single_files; + vector<PE_files> v_PE_files; + std::vector<string> v_input_lines; + std::vector<string> v_output_lines; + ROCKparams main_parms; + int f_id=0; + int ret=main_parms.loadInOutFileArgs("../test/data/unit/list_input1.txt","../test/data/unit/list_output1.txt",v_input_lines,v_output_lines); + assert(ret==EXIT_SUCCESS); + ret=main_parms.processInOutFileArgs(v_input_lines,v_output_lines,single_files,v_PE_files,f_id); + assert(ret==EXIT_FAILURE); + f_id=0; + v_PE_files.clear(); + single_files.clear(); + v_input_lines.clear(); + v_output_lines.clear(); + ret=main_parms.loadInOutFileArgs("../test/data/unit/list_input2.txt","../test/data/unit/list_output2.txt",v_input_lines,v_output_lines); + ret=main_parms.processInOutFileArgs(v_input_lines,v_output_lines,single_files,v_PE_files,f_id); + assert(ret==EXIT_FAILURE); + f_id=0; + v_PE_files.clear(); + single_files.clear(); + v_input_lines.clear(); + v_output_lines.clear(); + ret=main_parms.loadInOutFileArgs("../test/data/unit/list_input3.txt","../test/data/unit/list_output3.txt",v_input_lines,v_output_lines); + ret=main_parms.processInOutFileArgs(v_input_lines,v_output_lines,single_files,v_PE_files,f_id); + assert(ret==EXIT_SUCCESS); + assert(f_id==14); + assert(single_files.size()==2); + assert(v_PE_files.size()==6); + IO_fq_files s=single_files[0]; + assert(s.in_fq_file.compare("fifi")==0); + assert(s.out_fq_file.compare("ofifi")==0); + PE_files s2=v_PE_files[5]; + assert(s2.PE2.in_fq_file.compare("nono")==0); + assert(s2.PE2.out_fq_file.compare("onono")==0); + + v_output_lines.clear(); + v_input_lines.clear(); + v_PE_files.clear(); + single_files.clear(); + ret=main_parms.loadInOutFileArgs("../test/data/unit/list_input3.txt","",v_input_lines,v_output_lines); + ret=main_parms.processInOutFileArgs(v_input_lines,v_output_lines,single_files,v_PE_files,f_id); + s=single_files[0]; + assert(s.out_fq_file.compare("fifi.rock.fq")==0); + s2=v_PE_files[5]; + assert(s2.PE2.out_fq_file.compare("nono.rock.fq")==0); + +} + + +void test_getBestLambdaForN() { + int lambda_max=10; + unsigned long nb_k_mer=200000000; + int best=getBestLambdaForN(nb_k_mer,lambda_max); + assert(best==2); + nb_k_mer=600000000; + best=getBestLambdaForN(nb_k_mer,lambda_max); + assert(best==4); + nb_k_mer=2000000000; + best=getBestLambdaForN(nb_k_mer,lambda_max); + // cout<<best<<endl; + assert(best==10); + nb_k_mer=10000000000; + best=getBestLambdaForN(nb_k_mer,lambda_max); + assert(best==lambda_max); +} + +void test_getCollisionProba() { + float p =getCollisionProba(1,2); + // cout<<p<<endl; + assert(p==0.0); + p =getCollisionProba(1000000000,2); + // cout<<p<<endl; + assert(floor(p*1000+0.5)==139); + p =getCollisionProba(1000000000,4); + assert(floor(p*1000+0.5)==19); + p =getCollisionProba(20000000000,10); + assert(floor(p*1000+0.5)==999); +} + + +int main() { + cout<<"testing main_utils"<<endl; + cout<<"testing processing of IOFile arguments ( following 2 error messages are what is expected; don't worry)."<<endl; + test_processIOFileArgs(); + cout<<"test computation of the bast lambda value depending on nb distinct k-mers."<<endl; + test_getBestLambdaForN(); + cout<<"testing computation of collision probability."<<endl; + test_getCollisionProba(); + cout<<"done"<<endl; +} + +