From f4265c6f0728e7a7ad3ac2686d6a04f786bf233f Mon Sep 17 00:00:00 2001 From: Veronique Legrand <vlegrand@pasteur.fr> Date: Tue, 19 Apr 2016 16:50:14 +0200 Subject: [PATCH] did new implementation for using only the max(k-mer value, reverse complement value) in the CMS; updates unit tests accordingly. --- src/CountMinSketch.cpp | 36 +++++++++++++++++++++++++++------- src/CountMinSketch.h | 25 +++++++++++++++++------- src/FqBaseBackend.cpp | 2 +- src/ReadProcessor.cpp | 7 +------ src/ReadProcessor.h | 2 +- src/fqwriter.cpp | 2 +- src/read_utils.cpp | 10 +++++++++- src/unit_test_cms.cpp | 5 +++-- src/unit_test_read_utils.cpp | 38 ++++++++++++++++-------------------- 9 files changed, 80 insertions(+), 47 deletions(-) diff --git a/src/CountMinSketch.cpp b/src/CountMinSketch.cpp index 4413806..daa24c3 100644 --- a/src/CountMinSketch.cpp +++ b/src/CountMinSketch.cpp @@ -60,13 +60,12 @@ const int CountMinSketch::Pi_js[500]={2147469629, 2147469637, 2147469659, 214746 2147482873, 2147482877, 2147482921, 2147482937, 2147482943, 2147482949, 2147482951, 2147483029, 2147483033, 2147483053}; - +/* void CountMinSketch::addKMer(const unsigned long& val1,const unsigned long& val2) { int h1,h2,h,j; - // short cnt; unsigned char cnt; - //unsigned short min=ushortmask; - for (j=0;j<lambda;j++) { + j=lambda; + while(--j>0) { h1=hash64to32(val1,j); h2=hash64to32(val2,j); (h1>h2)?h=h1:h=h2; @@ -75,8 +74,32 @@ void CountMinSketch::addKMer(const unsigned long& val1,const unsigned long& val2 cms_lambda_array[j] [h]=(cnt & ubytemask); } } +*/ - +/* + * Used to determine if a read must be kept or not. + * Depending on the case, threshold is kappa or kappa_prime + * Computes the number of k-mers for which the number of occurrences is below threshold. + * If more than half of the k-mers have a number of occurrences that is less than threshold, + * return true; else, return false. + */ +int CountMinSketch::isRCovBelowThres(const readNumericValues& read_val,const int& threshold) { + int a=0,b=0; + int j,h; + unsigned char min; + readNumericValues::const_iterator it; + for (it=read_val.begin();it!=read_val.end();++it) { + j=lambda; + min=ubytemask; + while (--j>=0 && min>threshold) { + h=hash64to32(*it,j); + min=cms_lambda_array[j] [h]; + } + (min<threshold)? a+=1: NULL; + ++b; + } + return (2*a>b); + } @@ -85,8 +108,7 @@ int CountMinSketch::addRead(const readNumericValues& read_val) { if (keep_r) { readNumericValues::const_iterator it; for (it=read_val.begin();it!=read_val.end();it++) { - /* Here, note that k-mer go by pair : 1rst 1 is k_mer and next one is its reverse complement*/ - addKMer(*it,*(++it)); + addKMer(*it); } } return keep_r; diff --git a/src/CountMinSketch.h b/src/CountMinSketch.h index cfba37e..2ad20c4 100644 --- a/src/CountMinSketch.h +++ b/src/CountMinSketch.h @@ -39,9 +39,9 @@ class CountMinSketch { unsigned char ** cms_lambda_array; // unsigned short * min_occ_array; - inline int hash64to32(unsigned long w ,int j) { - int pi_j=Pi_js[j]; - return w % pi_j; + inline int hash64to32(const unsigned long& w ,const int& j) { + //int pi_j=Pi_js[j]; + return w % Pi_js[j]; } int hash64to32bs(unsigned long w,int j) { // bit shift version of hash function to start. @@ -64,9 +64,20 @@ class CountMinSketch { // std::map<int,int> getIthArray(int); // utility method provided for testing only. - void addKMer(const unsigned long&,const unsigned long&); // inline? TODO: see later if it can help us gain time. + // void addKMer(const unsigned long&,const unsigned long&); // inline? TODO: see later if it can help us gain time. +inline void addKMer(const unsigned long& val1) { + int h,j; + unsigned char cnt; + j=lambda; + while(--j>=0) { + h=hash64to32(val1,j); + cnt=cms_lambda_array[j] [h]; + cnt++; + cms_lambda_array[j] [h]=(cnt & ubytemask); + } +} - // int isRCovBelowThres(const readNumericValues& read_val, const int& threshold); + int isRCovBelowThres(const readNumericValues& read_val, const int& threshold); /* * Used to determine if a read must be kept or not. * Depending on the case, threshold is kappa or kappa_prime @@ -74,7 +85,7 @@ class CountMinSketch { * If more than half of the k-mers have a number of occurrences that is less than threshold, * return true; else, return false. */ - inline int isRCovBelowThres(const readNumericValues& read_val,const int& threshold) { + /*inline int isRCovBelowThres(const readNumericValues& read_val,const int& threshold) { int a=0; int b=0; int j; @@ -92,7 +103,7 @@ class CountMinSketch { ++b; } return (2*a>b); - } + }*/ /* inline unsigned char CountMinSketch::isKmCoveBelowThres(const unsigned long& val, const int& threshold) { int h; diff --git a/src/FqBaseBackend.cpp b/src/FqBaseBackend.cpp index aab7a80..83e69c8 100644 --- a/src/FqBaseBackend.cpp +++ b/src/FqBaseBackend.cpp @@ -16,7 +16,7 @@ #include "FqBaseBackend.h" -#define DEBUG +//#define DEBUG #ifdef DEBUG #include <cassert> #include <iostream> diff --git a/src/ReadProcessor.cpp b/src/ReadProcessor.cpp index 8eda5ad..85f761b 100644 --- a/src/ReadProcessor.cpp +++ b/src/ReadProcessor.cpp @@ -112,8 +112,6 @@ unsigned long ReadProcessor::kMerToNumberReverse(char * k_m,unsigned long * p_pr } void ReadProcessor::getKMerNumbers(char * dnaStr,int l,std::vector<unsigned long>& my_vect) { // See simple_test.cpp and results. benchmark showed that std::vector is very slightly faster than C array and doesn't require more memory in our case. So, I am using it since it makes code simpler. - /*std::vector<unsigned long> my_vect; - return my_vect;*/ int i; unsigned long num; unsigned long num_rev; @@ -122,13 +120,10 @@ void ReadProcessor::getKMerNumbers(char * dnaStr,int l,std::vector<unsigned long char * p_char=dnaStr; int nb_k_m=l-k+1; for (i=0; i<nb_k_m;i++) { - /*std::cout<<"processing k_mer number: "<<i<<std::endl; - std::cout<<p_char<<std::endl;*/ num=kMerToNumber(p_char,p_prev); num_rev=kMerToNumberReverse(p_char,p_prev_rev); if (n<0) { - my_vect.push_back(num); - my_vect.push_back(num_rev); + (num<num_rev)?my_vect.push_back(num_rev):my_vect.push_back(num); // finding a k-mer is equivalent to finding its reverse complement; as numeric values for k-mers are unique, just take into account the biggest of the 2. } p_char++; p_prev=# diff --git a/src/ReadProcessor.h b/src/ReadProcessor.h index e576c56..be4d4de 100644 --- a/src/ReadProcessor.h +++ b/src/ReadProcessor.h @@ -8,7 +8,7 @@ #include <vector> #include <math.h> -#define DEBUG +//#define DEBUG #ifdef DEBUG #include <iostream> using namespace std; diff --git a/src/fqwriter.cpp b/src/fqwriter.cpp index d39609d..ca6f2b2 100644 --- a/src/fqwriter.cpp +++ b/src/fqwriter.cpp @@ -10,7 +10,7 @@ #include "srp.h" #include "FqBaseBackend.h" -#define DEBUG +//#define DEBUG #ifdef DEBUG #include <iostream> using namespace std; diff --git a/src/read_utils.cpp b/src/read_utils.cpp index f211ef7..01c91ce 100644 --- a/src/read_utils.cpp +++ b/src/read_utils.cpp @@ -16,7 +16,7 @@ #include "srp.h" #include "read_utils.h" #include "rock_commons.h" -#define DEBUG +//#define DEBUG #ifdef DEBUG #include <cassert> @@ -30,7 +30,9 @@ void getFqRecord(FqBaseBackend* fq_files_be[],const unsigned char& f_id,const un try { be->getRead(offset,fq_record); } catch (int e) { +#ifdef DEBUG cout<<"couldn't retrieve read at offset: "<<offset<<endl; +#endif throw e; } } @@ -91,10 +93,13 @@ void getDNASeqstr(FqBaseBackend* fq_files_be [], getFqRecord(fq_files_be,f_id1,offset1,dna_seqs[0].fq_record_buf); processFqRecord(p_dna_seqs); } catch (int e) { +#ifdef DEBUG cout<<"j="<<j<<" sr content: a1="<<sr.read_a1<<" a2= "<<sr.read_a2<<endl; cout<<"pb reading/seeking at offset1="<<offset1<<endl; cout<<"got: "<<endl; cout<<dna_seqs[0].fq_record_buf<<endl; +#endif + exit(e); } if (f_id2!=0) { // case of PE reads. p_dna_seqs+=1; @@ -102,10 +107,13 @@ void getDNASeqstr(FqBaseBackend* fq_files_be [], try { getFqRecord(fq_files_be,f_id2,offset2,dna_seqs[1].fq_record_buf); } catch (int e) { +#ifdef DEBUG cout<<"j="<<j<<" sr content: a1="<<sr.read_a1<<" a2= "<<sr.read_a2<<endl; cout<<"pb reading/seeking at offset2="<<offset2<<endl; cout<<"got: "<<endl; cout<<dna_seqs[1].fq_record_buf<<endl; +#endif + exit(e); } processFqRecord(p_dna_seqs); } diff --git a/src/unit_test_cms.cpp b/src/unit_test_cms.cpp index 42a136b..0162ffb 100644 --- a/src/unit_test_cms.cpp +++ b/src/unit_test_cms.cpp @@ -36,11 +36,12 @@ void test_CMS(int lambda,int kappa,int kappa_prime) { int rej_expected=0; int ret; for (i=0;i<num;i++) { - cms.addKMer(num,num-1); + cms.addKMer(num); } // Now, check that our k-mer is present in the CMS. int min=cms.getEstimatedNbOcc(num); - assert((min==num) || (min==num-CountMinSketch::ushortmask-1)); // addKmer doesn't check kappa. + std::cout<<"min="<<min<<endl; + assert((min==num) || (min==num-CountMinSketch::ubytemask-1)); // addKmer doesn't check kappa. // mimic a read (as given by part 2 output). readNumericValues read_values; diff --git a/src/unit_test_read_utils.cpp b/src/unit_test_read_utils.cpp index 19dc832..2fac279 100644 --- a/src/unit_test_read_utils.cpp +++ b/src/unit_test_read_utils.cpp @@ -146,11 +146,11 @@ void test_decomposeReadInkMerNums() { seq1->start_idx=12; seq1->length=75; decomposeReadInKMerNums(read_p,nbrKmerDecompo,k,a_seqs); - assert(nbrKmerDecompo.size()==142); + assert(nbrKmerDecompo.size()==71); readNumericValues::iterator it; for (it=nbrKmerDecompo.begin();it!=nbrKmerDecompo.end();it++) { // cout<<*it<<endl; - assert(*it==0 || *it==1023); + assert(*it==1023); } DnaSeqStr * seq2=&a_seqs[1]; strcpy(seq2->fq_record_buf,"@another_fake_stuff\nATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCG\n+\n....@\n"); @@ -159,7 +159,7 @@ void test_decomposeReadInkMerNums() { nbrKmerDecompo.clear(); decomposeReadInKMerNums(read_p,nbrKmerDecompo,k,a_seqs); //cout<<nbrKmerDecompo.size()<<endl; - assert(nbrKmerDecompo.size()==294); + assert(nbrKmerDecompo.size()==147); } @@ -244,7 +244,7 @@ void testDNAToNumberMoreComplex() { // TODO in main, check that k<=32. unsigned long expected_rev=read_p.kMerToNumber((char *) "TATGGTGAAAACAGTTGGTGTTATGGTAGC",NULL); read_p.getKMerNumbers((char *) dnaStr,strlen(dnaStr),nbrKmerDecompo); - assert(nbrKmerDecompo.size()==nb_expected_k_mers*2); // for each k_mer, we also expect to have a number for its reverse complement. + assert(nbrKmerDecompo.size()==nb_expected_k_mers); // for each k_mer, we also expect to have a number for its reverse complement. int cnt_k_mer=0; unsigned long num,num2; for (it=nbrKmerDecompo.begin();it!=nbrKmerDecompo.end();it++) { @@ -255,31 +255,32 @@ void testDNAToNumberMoreComplex() { // TODO in main, check that k<=32. assert(*it==1151987406330741231); num=*it; } - if (cnt_k_mer==3) { + if (cnt_k_mer==2) { // 2nd k-mer is : TTTTAGGTGCTACCATAACACCAACTGTTT num2=read_p.kMerToNumber((char *) "TTTTAGGTGCTACCATAACACCAACTGTTT",NULL); assert(*it==num2); unsigned long num3=read_p.kMerToNumber((char *) "TTTTAGGTGCTACCATAACACCAACTGTTT",&num); assert(num3==num2); } - if (cnt_k_mer==5) { + if (cnt_k_mer==3) { num=read_p.kMerToNumber((char *) "TTTAGGTGCTACCATAACACCAACTGTTTT",NULL); - assert(num==*it); + unsigned long num_rev=read_p.kMerToNumber((char *) "AAAACAGTTGGTGTTATGGTAGCACCTAAA",NULL); + assert(max(num,num_rev)==*it); } - if (cnt_k_mer==7) { + if (cnt_k_mer==4) { num=read_p.kMerToNumber((char *) "TTAGGTGCTACCATAACACCAACTGTTTTC",NULL); assert(num==*it); } // k-mer number 10 is GCTACCATAACACCAACTGTTTTCACCATA // its reverse complement is : TATGGTGAAAACAGTTGGTGTTATGGTAGC - if (cnt_k_mer==19) { + if (cnt_k_mer==10) { num=read_p.kMerToNumber((char *) "GCTACCATAACACCAACTGTTTTCACCATA",NULL); - assert(*it==704021989794238796); + assert(*it==max(704021989794238796,930978566989888201)); } - if (cnt_k_mer==20) { + /*if (cnt_k_mer==20) { assert(*it==930978566989888201); - } + }*/ } } @@ -302,17 +303,12 @@ void testDNAToNumberWithN() { nbrKmerDecompo.reserve(nb_expected_k_mers); read_p.getKMerNumbers((char *) dnaStr,strlen(dnaStr),nbrKmerDecompo); - assert(nbrKmerDecompo.size()==nb_expected_k_mers*2); + assert(nbrKmerDecompo.size()==nb_expected_k_mers); it=nbrKmerDecompo.begin(); - it+=22; - // The 1rst expected k-mer is the one after the 3rd N char (starting at position 47 in dnaStr). ATCAAGCATTAGAGACGCTTCTCTAATGTA - assert(*it==234837138459816172); - it++; - assert(*it==886965076742957027); - - - + it+=11; + // The 1rst expected k-mer is the one after the 3rd N char (starting at position 47 in dnaStr). ATCAAGCATTAGAGACGCTTCTCTAATGTA or its reverse complement depending + assert(*it==max(234837138459816172,886965076742957027)); } int main(int argc, char **argv) { -- GitLab