diff --git a/configure.ac b/configure.ac index 8b259ff7492711677522ccde6f4f592d974ca097..0f719e52bfac61cd6e160d08cbec000de3437415 100644 --- a/configure.ac +++ b/configure.ac @@ -1,7 +1,7 @@ dnl Process this file with autoconf to produce a configure script. AC_PREREQ(2.59) -AC_INIT(rock, 1.5) +AC_INIT(rock, 1.5.1) AC_CANONICAL_SYSTEM diff --git a/src/CountMinSketch.hpp b/src/CountMinSketch.hpp index f10fbebc863ae31374f67276c2c4fbd4815b4141..df1c8a16655b409704daaf362b9364d5f83bad68 100644 --- a/src/CountMinSketch.hpp +++ b/src/CountMinSketch.hpp @@ -138,6 +138,7 @@ private: inline int isRCovBelowThres(const T_read_numericValues& read_val, const int& threshold); //inline int isRCovBelowThresAux(const readNumericValues& read_val, const int& threshold); + inline int isRCovBelowThresPartial(const readNumericValues& read_val, const int& threshold, const int& start=0,const int& stop=0); void init(int glambda,int gkappa,int gkappa_prime,int filter_PE_as_single=1); @@ -191,9 +192,49 @@ public: } }; + + /* * Computes median of occurences for all the k_mers in a single read or for a part of a PE read. */ +template<typename T> inline int CountMinSketch<T>::isRCovBelowThresPartial(const readNumericValues& read_val, const int& threshold, const int& start,const int& stop) { + int a=0,b=0; + int j,h; + T min; + readNumericValues::const_iterator it; + readNumericValues::const_iterator it_aux; + if (stop) it_aux=read_val.begin()+stop; + else it_aux=read_val.end(); + for (it=read_val.begin()+start;it!=it_aux;++it) { + j=lambda; + min=mask; + while (--j>=0 && min>threshold) { + h=hash64to32(*it,j); + min=cms_lambda_array[j] [h]; + } + (min<threshold)? a+=1:a; + ++b; + } + return (2*a>b); +} + + +/* + * Determines if read (single or PE) has a median coverage below a given threshold + */ +template<typename T> inline int CountMinSketch<T>::isRCovBelowThres(const T_read_numericValues& read_val, const int& threshold) { + int PE1_below_thres; + int PE2_below_thres=0; + PE1_below_thres=isRCovBelowThresPartial(read_val.single_or_PE_val,threshold,0,read_val.idx_start_PE2); + if (read_val.idx_start_PE2 && !filter_PE_as_single) { + PE2_below_thres=isRCovBelowThresPartial(read_val.single_or_PE_val,threshold,read_val.idx_start_PE2,0); + if (threshold==kappa) return (PE1_below_thres || PE2_below_thres); + else return(PE1_below_thres && PE2_below_thres); + } + return PE1_below_thres; +} + +/* template<typename T> inline int CountMinSketch<T>::isRCovBelowThres(const T_read_numericValues& read_val, const int& threshold) { int PE1_below_thres; int PE2_below_thres=0; @@ -216,6 +257,8 @@ template<typename T> inline int CountMinSketch<T>::isRCovBelowThres(const T_read } PE1_below_thres=(2*a>b); if (read_val.idx_start_PE2 && !filter_PE_as_single) { + a=0; + b=0; for (it=it_aux;it!=read_val.single_or_PE_val.end();++it) { // idx_start_PE2 is 0 if PE are treated as single or in the case of single reads. j=lambda; min=mask; @@ -230,6 +273,7 @@ template<typename T> inline int CountMinSketch<T>::isRCovBelowThres(const T_read } return PE1_below_thres||PE2_below_thres; } +*/ template<typename T> void CountMinSketch<T>::init(int glambda,int gkappa,int gkappa_prime,int gfilter_PE_as_single) { lambda=glambda; diff --git a/src/unit_test_cms.cpp b/src/unit_test_cms.cpp index 0e0f9dd0d1f3d3f477b073d26258739ba89cac11..a93ed50029d43228afaddfec459f26766b95a79f 100644 --- a/src/unit_test_cms.cpp +++ b/src/unit_test_cms.cpp @@ -72,7 +72,7 @@ void testCMS_with_PE_NotasSingle(int lambda,int kappa,int kappa_prime) { CountMinSketch<unsigned char>* cms=new CountMinSketch<unsigned char>(lambda,kappa,kappa_prime,filter_PE_as_single); int i; cout<<"size of the CMS component: "<<sizeof(cms)<<endl; - int num=100*lambda; + int num=200; //int rej_expected=0; int ret; for (i=0;i<kappa;i++) { @@ -83,19 +83,33 @@ void testCMS_with_PE_NotasSingle(int lambda,int kappa,int kappa_prime) { // mimic a PE read (as given by part 2 output). T_read_numericValues read_values; - // mimic a PE read in which PE1 would cntain a majority of k_mers that are not already in the CMS and PE2 would contain a majority of k_mers that are already in the CMS. + // mimic a PE read in which PE1 contains k-mer that appear rarely in the CMS and PE2 contains k_mers that already appear kappa times in the CMS. for (i=1;i<=200;i++) { read_values.single_or_PE_val.push_back(i*1000-1); } - read_values.idx_start_PE2=199; + read_values.idx_start_PE2=200; for (i=1;i<=239;i++) { read_values.single_or_PE_val.push_back(num); } read_values.single_or_PE_val.push_back(50); read_values.single_or_PE_val.push_back(55); - ret=cms->addRead(read_values); // read should be accepted since none of the values that it contains is equal to those I inserted previously. + ret=cms->addRead(read_values); // read should be accepted since PE1 is nor over covered. assert(ret>0); + ret=cms->isBeneathMinKappa(read_values); // read should not be considered under covered since only PE1 is under covered. + assert(ret==0); + + // Now, change PE1 a little and insert it kappa times in the CMS + for (int j=1;j<kappa;j++) { + read_values.single_or_PE_val[2]=j; + ret=cms->addRead(read_values); + if (j<kappa-1) assert(ret>0); // read should be accepted as long as we haven't inserted PE1 kappa times. + else assert(ret>0); //otherwise it is rejected. + } + + ret=cms->isBeneathMinKappa(read_values); // read should not be considered under covered since PE1 was inserted kappa times. + assert(ret==0); + // mimic a PE read (as given by part 2 output). T_read_numericValues read_values2; @@ -111,12 +125,56 @@ void testCMS_with_PE_NotasSingle(int lambda,int kappa,int kappa_prime) { ret=cms->addRead(read_values2); assert(ret>0); } - ret=cms->isBeneathMinKappa(read_values); - assert(ret>0); ret=cms->isBeneathMinKappa(read_values2); assert(ret==0); + //mimic a read for which PE1 is under kappa_prime and PE2 too. Such a read is expected to be removed by low filter + T_read_numericValues read_values3; + + for (i=1;i<=200;i++) { + read_values3.single_or_PE_val.push_back(i); + } + read_values3.idx_start_PE2=201; + for (i=1;i<=205;i++) { + read_values3.single_or_PE_val.push_back(i); + } + ret=cms->addRead(read_values3); + assert(ret>0); + + ret=cms->isBeneathMinKappa(read_values3); + assert(ret>0); + + //mimic a read for which PE1 is over kappa_prime but not PE2 + T_read_numericValues read_values4; + for (i=201;i<=399;i++) { + read_values4.single_or_PE_val.push_back(200); + } + read_values4.idx_start_PE2=199; + for (i=201;i<=399;i++) { + read_values4.single_or_PE_val.push_back(i); + } + ret=cms->addRead(read_values4); + assert(ret>0); + + ret=cms->isBeneathMinKappa(read_values4); + assert(ret==0); + + //mimic a read for which PE2 is over kappa_prime but not PE1 + T_read_numericValues read_values5; + for (i=201;i<=399;i++) { + read_values5.single_or_PE_val.push_back(i); + } + read_values5.idx_start_PE2=199; + for (i=201;i<=399;i++) { + read_values5.single_or_PE_val.push_back(200); + } + ret=cms->addRead(read_values5); + assert(ret>0); + + ret=cms->isBeneathMinKappa(read_values5); + assert(ret==0); + delete cms; }