diff --git a/src/CountMinSketch.cpp b/src/CountMinSketch.cpp index 44138065f27ea73ff98d32f653bbf8ff07a71d2a..daa24c337cbc69e00d43d16d2b722c1a60dc7a1a 100644 --- a/src/CountMinSketch.cpp +++ b/src/CountMinSketch.cpp @@ -60,13 +60,12 @@ const int CountMinSketch::Pi_js[500]={2147469629, 2147469637, 2147469659, 214746 2147482873, 2147482877, 2147482921, 2147482937, 2147482943, 2147482949, 2147482951, 2147483029, 2147483033, 2147483053}; - +/* void CountMinSketch::addKMer(const unsigned long& val1,const unsigned long& val2) { int h1,h2,h,j; - // short cnt; unsigned char cnt; - //unsigned short min=ushortmask; - for (j=0;j<lambda;j++) { + j=lambda; + while(--j>0) { h1=hash64to32(val1,j); h2=hash64to32(val2,j); (h1>h2)?h=h1:h=h2; @@ -75,8 +74,32 @@ void CountMinSketch::addKMer(const unsigned long& val1,const unsigned long& val2 cms_lambda_array[j] [h]=(cnt & ubytemask); } } +*/ - +/* + * Used to determine if a read must be kept or not. + * Depending on the case, threshold is kappa or kappa_prime + * Computes the number of k-mers for which the number of occurrences is below threshold. + * If more than half of the k-mers have a number of occurrences that is less than threshold, + * return true; else, return false. + */ +int CountMinSketch::isRCovBelowThres(const readNumericValues& read_val,const int& threshold) { + int a=0,b=0; + int j,h; + unsigned char min; + readNumericValues::const_iterator it; + for (it=read_val.begin();it!=read_val.end();++it) { + j=lambda; + min=ubytemask; + while (--j>=0 && min>threshold) { + h=hash64to32(*it,j); + min=cms_lambda_array[j] [h]; + } + (min<threshold)? a+=1: NULL; + ++b; + } + return (2*a>b); + } @@ -85,8 +108,7 @@ int CountMinSketch::addRead(const readNumericValues& read_val) { if (keep_r) { readNumericValues::const_iterator it; for (it=read_val.begin();it!=read_val.end();it++) { - /* Here, note that k-mer go by pair : 1rst 1 is k_mer and next one is its reverse complement*/ - addKMer(*it,*(++it)); + addKMer(*it); } } return keep_r; diff --git a/src/CountMinSketch.h b/src/CountMinSketch.h index cfba37e55118c02ea0c83ecd367e69cc5b6cc044..2ad20c48d4167e6c0abe29720f11299f426e1b37 100644 --- a/src/CountMinSketch.h +++ b/src/CountMinSketch.h @@ -39,9 +39,9 @@ class CountMinSketch { unsigned char ** cms_lambda_array; // unsigned short * min_occ_array; - inline int hash64to32(unsigned long w ,int j) { - int pi_j=Pi_js[j]; - return w % pi_j; + inline int hash64to32(const unsigned long& w ,const int& j) { + //int pi_j=Pi_js[j]; + return w % Pi_js[j]; } int hash64to32bs(unsigned long w,int j) { // bit shift version of hash function to start. @@ -64,9 +64,20 @@ class CountMinSketch { // std::map<int,int> getIthArray(int); // utility method provided for testing only. - void addKMer(const unsigned long&,const unsigned long&); // inline? TODO: see later if it can help us gain time. + // void addKMer(const unsigned long&,const unsigned long&); // inline? TODO: see later if it can help us gain time. +inline void addKMer(const unsigned long& val1) { + int h,j; + unsigned char cnt; + j=lambda; + while(--j>=0) { + h=hash64to32(val1,j); + cnt=cms_lambda_array[j] [h]; + cnt++; + cms_lambda_array[j] [h]=(cnt & ubytemask); + } +} - // int isRCovBelowThres(const readNumericValues& read_val, const int& threshold); + int isRCovBelowThres(const readNumericValues& read_val, const int& threshold); /* * Used to determine if a read must be kept or not. * Depending on the case, threshold is kappa or kappa_prime @@ -74,7 +85,7 @@ class CountMinSketch { * If more than half of the k-mers have a number of occurrences that is less than threshold, * return true; else, return false. */ - inline int isRCovBelowThres(const readNumericValues& read_val,const int& threshold) { + /*inline int isRCovBelowThres(const readNumericValues& read_val,const int& threshold) { int a=0; int b=0; int j; @@ -92,7 +103,7 @@ class CountMinSketch { ++b; } return (2*a>b); - } + }*/ /* inline unsigned char CountMinSketch::isKmCoveBelowThres(const unsigned long& val, const int& threshold) { int h; diff --git a/src/FqBaseBackend.cpp b/src/FqBaseBackend.cpp index aab7a80f88848b77e927334fc84a1a2401941805..83e69c83ac3c5b3d652b5196c287738088b454e7 100644 --- a/src/FqBaseBackend.cpp +++ b/src/FqBaseBackend.cpp @@ -16,7 +16,7 @@ #include "FqBaseBackend.h" -#define DEBUG +//#define DEBUG #ifdef DEBUG #include <cassert> #include <iostream> diff --git a/src/ReadProcessor.cpp b/src/ReadProcessor.cpp index 8eda5ad02b5ed1e34343fc25a0d5c5323da9f04d..85f761b2f95da1767d917f9465e3cf26470b2094 100644 --- a/src/ReadProcessor.cpp +++ b/src/ReadProcessor.cpp @@ -112,8 +112,6 @@ unsigned long ReadProcessor::kMerToNumberReverse(char * k_m,unsigned long * p_pr } void ReadProcessor::getKMerNumbers(char * dnaStr,int l,std::vector<unsigned long>& my_vect) { // See simple_test.cpp and results. benchmark showed that std::vector is very slightly faster than C array and doesn't require more memory in our case. So, I am using it since it makes code simpler. - /*std::vector<unsigned long> my_vect; - return my_vect;*/ int i; unsigned long num; unsigned long num_rev; @@ -122,13 +120,10 @@ void ReadProcessor::getKMerNumbers(char * dnaStr,int l,std::vector<unsigned long char * p_char=dnaStr; int nb_k_m=l-k+1; for (i=0; i<nb_k_m;i++) { - /*std::cout<<"processing k_mer number: "<<i<<std::endl; - std::cout<<p_char<<std::endl;*/ num=kMerToNumber(p_char,p_prev); num_rev=kMerToNumberReverse(p_char,p_prev_rev); if (n<0) { - my_vect.push_back(num); - my_vect.push_back(num_rev); + (num<num_rev)?my_vect.push_back(num_rev):my_vect.push_back(num); // finding a k-mer is equivalent to finding its reverse complement; as numeric values for k-mers are unique, just take into account the biggest of the 2. } p_char++; p_prev=# diff --git a/src/ReadProcessor.h b/src/ReadProcessor.h index e576c56e69a91cf07736783b8d385b2354745e9e..be4d4def03ee41cbdf7d23af05bdc9284b53be6a 100644 --- a/src/ReadProcessor.h +++ b/src/ReadProcessor.h @@ -8,7 +8,7 @@ #include <vector> #include <math.h> -#define DEBUG +//#define DEBUG #ifdef DEBUG #include <iostream> using namespace std; diff --git a/src/fqwriter.cpp b/src/fqwriter.cpp index d39609d9e4403bf9558b6d1b787937cc7a391475..ca6f2b2b972248ae8f6d6e45dd677206a965fa73 100644 --- a/src/fqwriter.cpp +++ b/src/fqwriter.cpp @@ -10,7 +10,7 @@ #include "srp.h" #include "FqBaseBackend.h" -#define DEBUG +//#define DEBUG #ifdef DEBUG #include <iostream> using namespace std; diff --git a/src/read_utils.cpp b/src/read_utils.cpp index f211ef795d8fc3295bf5423c61ea998b5178851d..01c91ce95049512895634117785bfcc44c5c5300 100644 --- a/src/read_utils.cpp +++ b/src/read_utils.cpp @@ -16,7 +16,7 @@ #include "srp.h" #include "read_utils.h" #include "rock_commons.h" -#define DEBUG +//#define DEBUG #ifdef DEBUG #include <cassert> @@ -30,7 +30,9 @@ void getFqRecord(FqBaseBackend* fq_files_be[],const unsigned char& f_id,const un try { be->getRead(offset,fq_record); } catch (int e) { +#ifdef DEBUG cout<<"couldn't retrieve read at offset: "<<offset<<endl; +#endif throw e; } } @@ -91,10 +93,13 @@ void getDNASeqstr(FqBaseBackend* fq_files_be [], getFqRecord(fq_files_be,f_id1,offset1,dna_seqs[0].fq_record_buf); processFqRecord(p_dna_seqs); } catch (int e) { +#ifdef DEBUG cout<<"j="<<j<<" sr content: a1="<<sr.read_a1<<" a2= "<<sr.read_a2<<endl; cout<<"pb reading/seeking at offset1="<<offset1<<endl; cout<<"got: "<<endl; cout<<dna_seqs[0].fq_record_buf<<endl; +#endif + exit(e); } if (f_id2!=0) { // case of PE reads. p_dna_seqs+=1; @@ -102,10 +107,13 @@ void getDNASeqstr(FqBaseBackend* fq_files_be [], try { getFqRecord(fq_files_be,f_id2,offset2,dna_seqs[1].fq_record_buf); } catch (int e) { +#ifdef DEBUG cout<<"j="<<j<<" sr content: a1="<<sr.read_a1<<" a2= "<<sr.read_a2<<endl; cout<<"pb reading/seeking at offset2="<<offset2<<endl; cout<<"got: "<<endl; cout<<dna_seqs[1].fq_record_buf<<endl; +#endif + exit(e); } processFqRecord(p_dna_seqs); } diff --git a/src/unit_test_cms.cpp b/src/unit_test_cms.cpp index 42a136b3246130d0fb13a097a001302bd3c9389b..0162ffb6477776bcac552e87c1fb46661bda278e 100644 --- a/src/unit_test_cms.cpp +++ b/src/unit_test_cms.cpp @@ -36,11 +36,12 @@ void test_CMS(int lambda,int kappa,int kappa_prime) { int rej_expected=0; int ret; for (i=0;i<num;i++) { - cms.addKMer(num,num-1); + cms.addKMer(num); } // Now, check that our k-mer is present in the CMS. int min=cms.getEstimatedNbOcc(num); - assert((min==num) || (min==num-CountMinSketch::ushortmask-1)); // addKmer doesn't check kappa. + std::cout<<"min="<<min<<endl; + assert((min==num) || (min==num-CountMinSketch::ubytemask-1)); // addKmer doesn't check kappa. // mimic a read (as given by part 2 output). readNumericValues read_values; diff --git a/src/unit_test_read_utils.cpp b/src/unit_test_read_utils.cpp index 19dc8328659bf391e554f8ec7f21bfc3d791316e..2fac27956d66a5a7c07f0c331fc49b69caddd884 100644 --- a/src/unit_test_read_utils.cpp +++ b/src/unit_test_read_utils.cpp @@ -146,11 +146,11 @@ void test_decomposeReadInkMerNums() { seq1->start_idx=12; seq1->length=75; decomposeReadInKMerNums(read_p,nbrKmerDecompo,k,a_seqs); - assert(nbrKmerDecompo.size()==142); + assert(nbrKmerDecompo.size()==71); readNumericValues::iterator it; for (it=nbrKmerDecompo.begin();it!=nbrKmerDecompo.end();it++) { // cout<<*it<<endl; - assert(*it==0 || *it==1023); + assert(*it==1023); } DnaSeqStr * seq2=&a_seqs[1]; strcpy(seq2->fq_record_buf,"@another_fake_stuff\nATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCG\n+\n....@\n"); @@ -159,7 +159,7 @@ void test_decomposeReadInkMerNums() { nbrKmerDecompo.clear(); decomposeReadInKMerNums(read_p,nbrKmerDecompo,k,a_seqs); //cout<<nbrKmerDecompo.size()<<endl; - assert(nbrKmerDecompo.size()==294); + assert(nbrKmerDecompo.size()==147); } @@ -244,7 +244,7 @@ void testDNAToNumberMoreComplex() { // TODO in main, check that k<=32. unsigned long expected_rev=read_p.kMerToNumber((char *) "TATGGTGAAAACAGTTGGTGTTATGGTAGC",NULL); read_p.getKMerNumbers((char *) dnaStr,strlen(dnaStr),nbrKmerDecompo); - assert(nbrKmerDecompo.size()==nb_expected_k_mers*2); // for each k_mer, we also expect to have a number for its reverse complement. + assert(nbrKmerDecompo.size()==nb_expected_k_mers); // for each k_mer, we also expect to have a number for its reverse complement. int cnt_k_mer=0; unsigned long num,num2; for (it=nbrKmerDecompo.begin();it!=nbrKmerDecompo.end();it++) { @@ -255,31 +255,32 @@ void testDNAToNumberMoreComplex() { // TODO in main, check that k<=32. assert(*it==1151987406330741231); num=*it; } - if (cnt_k_mer==3) { + if (cnt_k_mer==2) { // 2nd k-mer is : TTTTAGGTGCTACCATAACACCAACTGTTT num2=read_p.kMerToNumber((char *) "TTTTAGGTGCTACCATAACACCAACTGTTT",NULL); assert(*it==num2); unsigned long num3=read_p.kMerToNumber((char *) "TTTTAGGTGCTACCATAACACCAACTGTTT",&num); assert(num3==num2); } - if (cnt_k_mer==5) { + if (cnt_k_mer==3) { num=read_p.kMerToNumber((char *) "TTTAGGTGCTACCATAACACCAACTGTTTT",NULL); - assert(num==*it); + unsigned long num_rev=read_p.kMerToNumber((char *) "AAAACAGTTGGTGTTATGGTAGCACCTAAA",NULL); + assert(max(num,num_rev)==*it); } - if (cnt_k_mer==7) { + if (cnt_k_mer==4) { num=read_p.kMerToNumber((char *) "TTAGGTGCTACCATAACACCAACTGTTTTC",NULL); assert(num==*it); } // k-mer number 10 is GCTACCATAACACCAACTGTTTTCACCATA // its reverse complement is : TATGGTGAAAACAGTTGGTGTTATGGTAGC - if (cnt_k_mer==19) { + if (cnt_k_mer==10) { num=read_p.kMerToNumber((char *) "GCTACCATAACACCAACTGTTTTCACCATA",NULL); - assert(*it==704021989794238796); + assert(*it==max(704021989794238796,930978566989888201)); } - if (cnt_k_mer==20) { + /*if (cnt_k_mer==20) { assert(*it==930978566989888201); - } + }*/ } } @@ -302,17 +303,12 @@ void testDNAToNumberWithN() { nbrKmerDecompo.reserve(nb_expected_k_mers); read_p.getKMerNumbers((char *) dnaStr,strlen(dnaStr),nbrKmerDecompo); - assert(nbrKmerDecompo.size()==nb_expected_k_mers*2); + assert(nbrKmerDecompo.size()==nb_expected_k_mers); it=nbrKmerDecompo.begin(); - it+=22; - // The 1rst expected k-mer is the one after the 3rd N char (starting at position 47 in dnaStr). ATCAAGCATTAGAGACGCTTCTCTAATGTA - assert(*it==234837138459816172); - it++; - assert(*it==886965076742957027); - - - + it+=11; + // The 1rst expected k-mer is the one after the 3rd N char (starting at position 47 in dnaStr). ATCAAGCATTAGAGACGCTTCTCTAATGTA or its reverse complement depending + assert(*it==max(234837138459816172,886965076742957027)); } int main(int argc, char **argv) {