diff --git a/src/CountMinSketch.hpp b/src/CountMinSketch.hpp index 6beb3f2cbbf4b052949ee2c4e3e3b81fe4f50555..bd1bf2bc51c4c92ea0761f019c4a76fe8c054287 100644 --- a/src/CountMinSketch.hpp +++ b/src/CountMinSketch.hpp @@ -18,19 +18,7 @@ #define BAD_TYPE -1 -// Store the prime numbers for modulo hashing in this array. -/* Use the 50 biggest prime numbers < 2^32. They have the propriety thatthat their binary representation 9i) - * contains at least 13 zeroes and (ii) at least 11 occurrences of '10'. - * This disparity in the binary representation of a primary number guaranties that all bits of a numerator x - * will be well affected during the x%p operation. -*/ -const long int Pi_js[k_max_array_in_cms]={4283781797,4283781461,4283738773,4283738453,4280985941,4280898901,4275393833,4275393173,\ - 4275382933,4274694821,4273296553,4273285717,4273253029,4273121621,4273034581,4272772181,\ - 4272771749,4272769621,4272761509,4272728741,4272630421,4272629077,4272608581,4272608533,\ - 4272599701,4272598181,4272597673,4272597353,4272597349,4272597157,4272596149,4272596117,\ - 4272593557,4272592549,4272592277,4272592229,4272592213,4272592021,4272589493,4272575849,\ - 4272575833,4272575819,4272575701,4272575669,4272515669,4272511657,4272511637,4272510629,\ - 4272510283,4272510101}; + diff --git a/src/ROCKparams.cpp b/src/ROCKparams.cpp index b1c1cf091e926aa155e428e304e032426c4a12e8..078e40ab89480f27e25ba61cb537e08b3be49b9d 100644 --- a/src/ROCKparams.cpp +++ b/src/ROCKparams.cpp @@ -75,7 +75,7 @@ void ROCKparams::optArgConsistency(const string& input_file,const string& output int smallest_kappa; if (parms.kappa_prime==0) smallest_kappa=parms.kappa; else smallest_kappa=parms.kappa_prime; - parms.lambda=getBestLambdaForN(nb_k_mers,smallest_kappa,k_arr_cms_size); + parms.lambda=getBestLambdaForN(nb_k_mers,smallest_kappa,Pi_js[0],k_max_collision_proba); } if (parms.kappa_prime>=parms.kappa) { cout<<"ERROR lower filter is higher than high filter!"<<endl; @@ -370,7 +370,7 @@ void ROCKparams::initFromMainOptsArgs(int argc,char ** argv) { int smallest_kappa; if (parms.kappa_prime==0) smallest_kappa=parms.kappa; else smallest_kappa=parms.kappa_prime; - expected_collision_proba=getCollisionProba(smallest_kappa,nb_k_mers,parms.lambda); + expected_collision_proba=getCollisionProba(smallest_kappa,nb_k_mers,Pi_js[0],parms.lambda); } if ((v_input_lines.empty() || v_output_lines.empty()) && (loadInOutFileArgs(input_file,output_file,v_input_lines,v_output_lines)==EXIT_FAILURE)) throw EXIT_FAILURE; if (processInOutFileArgs(v_input_lines,v_output_lines,single_files,v_PE_files,f_id)!=EXIT_SUCCESS) throw EXIT_FAILURE; diff --git a/src/main_utils.cpp b/src/main_utils.cpp index aad8dc8f8cc266f2f600f52516f5614724abc5d4..6509ca9a1be68f6dae69aaaaf884b46d40ded1b8 100644 --- a/src/main_utils.cpp +++ b/src/main_utils.cpp @@ -69,7 +69,7 @@ std::string checkDirExists(const string o_file) { * logarithm of the gamma function; * from here: https://www.cec.uchile.cl/cinetica/pcordero/MC_libros/NumericalRecipesinC.pdf */ -#include <stdio.h> +//#include <stdio.h> double gammln(const unsigned long& xx) { double x,y,tmp,ser; @@ -79,44 +79,42 @@ double gammln(const unsigned long& xx) { int j; y=x=xx; tmp=x+5.5; - // cout<<"tmp="<<tmp<<endl; tmp -=(x+0.5)*log(tmp); ser=1.000000000190015; - for (j=0;j<=5;j++) ser+=exp(log(cof[j])-log(++y)); - double ret=-tmp+log(2.5066282746310005*ser/x); - printf("ret=%lf\n",ret); - printf("%lf\n",-tmp+log(2.5066282746310005*ser/x)); + for (j=0;j<=5;j++) ser+=cof[j]/++y; return -tmp+log(2.5066282746310005*ser/x); } /* * here x is the smallest value of the counter for a given k-mer in the CMS. - * m is the size of a CMS array. + * m is the size of a CMS array (AC: in fact, we use the biggest prime number as an approximation of the array's size). */ -float pmf(const float&x,const unsigned long& nb_k_mers,const unsigned int& m) { - //unsigned long n=nb_k_mers+1; - float lngnp1=gammln(nb_k_mers+1); - float lngxp1=gammln(x+1); - float lngnmxp1=gammln(nb_k_mers-x+1); - float lnm=log(m); - float lnmm1=log(m-1); - float nlnm=nb_k_mers*lnm; - float nmxlnmm1=(nb_k_mers-x)*lnmm1; - float tmp=lngnp1-lngxp1; - tmp -=lngnmxp1; - tmp -=nlnm; - tmp+=nmxlnmm1; - float res=exp(tmp); - return res; +double pmf(const float&x,const unsigned long& nb_k_mers,const unsigned int& m) { + double res; + double lnn=log(nb_k_mers); + double lnm=log(m); + double diff=lnn-lnm; + double p=x*diff; + diff=double(nb_k_mers)/m; + //double truc=exp(diff); + double tmp=p-diff; + double g=gammln(x+1); + tmp -=g; + res=exp(tmp); + return res; } -float ccdf(const int& smallest_kappa,const unsigned long& nb_k_mers,const unsigned int& m) { - float res; - float s=0.0; +double ccdf(const int& smallest_kappa,const unsigned long& nb_k_mers,const unsigned int& m) { + double res; + double s=0.0; for (int i=0;i<=smallest_kappa;i++) { s+=pmf(i,nb_k_mers,m); } res=1-s; + if (res<0) res=0.00000001; // It happens because pmf if an approximation of the probability distribution. + // So it happens that ccdf is negative but very close to 0... As we need a non null value for the next calculation, + // return a very small constant. + if (res==1) res=0.9999999; // Need to do that because we compule ln(ccdf) to determine lamda and we need it to be non null otherwise we'd get a division by 0. return res; } @@ -126,59 +124,20 @@ float ccdf(const int& smallest_kappa,const unsigned long& nb_k_mers,const unsign * so choose smallest lambda so that ccdf(kappa_prime,nb_k_mer,array_size) exp lambda<=0.01. * Smallest is either low filter either high filter if there is no low filter. */ -int getBestLambdaForN(const unsigned long& nb_k_mers,const int& smallest_kappa,const unsigned int& m) { - float lnp=log(k_max_collision_proba); - float ccfdr=ccdf(smallest_kappa,nb_k_mers,m); - float lnccdf=log(ccfdr); - float tmp=lnp/ccfdr; - int min_l=floor(tmp); +int getBestLambdaForN(const unsigned long& nb_k_mers,const int& smallest_kappa,const unsigned int& m,const float& max_collision_proba) { + float lnp=log(max_collision_proba); + float ccdfr=ccdf(smallest_kappa,nb_k_mers,m); + float lnccdf=log(ccdfr); + float tmp=lnp/lnccdf; + int min_l=ceil(tmp); if (min_l==0) min_l=1; return min_l; } -/* - * Minimize lambda for a given number of k-mers. - * We want p<=0.01, - * so choose smallest lambda so that [1-(1-1/m)exp n] exp lambda<=0.01 - */ -int getBestLambdaForN_old(const unsigned long& nb_k_mers,int lambda_max) { - int lambda=2; - int min_lambda=lambda; - if (lambda_max==0) { // no upper bound specified by the user via the -g option - lambda_max=500; - } - long double tmp=1.0/UINT_MAX; - tmp=1.0-tmp; - tmp=pow(tmp,nb_k_mers); - tmp=1.0-tmp; - while (lambda<=lambda_max) { - min_lambda=lambda; - long double p=pow(tmp,lambda); - if (p<=0.01) break; - lambda+=1; - } - return min_lambda; -} - -/* - * Compute collision probability - */ -float getCollisionProba_old(const unsigned long& nb_k_mers,const int& lambda) { - long double tmp=1.0/UINT_MAX; - tmp=1.0-tmp; - tmp=pow(tmp,nb_k_mers); - tmp=1.0-tmp; - long double p=pow(tmp,lambda); - if (p<0.0001) p=0; - // rounding - p=p*1000+0.5; - return floor(p)/1000; - //return p; -} -float getCollisionProba(const int& smallest_kappa,const unsigned long& nb_k_mers,const int& lambda) { - float ccfdr=ccdf(smallest_kappa,nb_k_mers,k_arr_cms_size); +float getCollisionProba(const int& smallest_kappa,const unsigned long& nb_k_mers,const unsigned int& arr_cms_size,const int& lambda) { + float ccfdr=ccdf(smallest_kappa,nb_k_mers,arr_cms_size); float fpp=pow(ccfdr,lambda); return fpp; } diff --git a/src/main_utils.h b/src/main_utils.h index 1217b5f2bee832ca12db88410b7325a5fd092d4c..63710d4b5d005ca5cd18701dbb5e6b9c65257a1d 100644 --- a/src/main_utils.h +++ b/src/main_utils.h @@ -24,14 +24,11 @@ unsigned long getNodePhysMemory(); std::string checkDirExists(const std::string o_file); -/*int loadInOutFileArgs(const std::string& input_file,const std::string& output_file,std::vector<std::string>& v_input_lines,std::vector<std::string>& v_output_lines); -int processInOutFileArgs(const std::vector<std::string>& v_input_lines,std::vector<std::string>& v_output_lines,std::vector<IO_fq_files>& single_files,std::vector<PE_files>& v_PE_files,int& f_id);*/ -//int getBestLambdaForN(const unsigned long& nb_k_mers); -int getBestLambdaForN(const unsigned long& nb_k_mers,const int& smallest_kappa,const unsigned int& m); -//float getCollisionProba(const unsigned long& nb_k_mers,const int& lambda); +int getBestLambdaForN(const unsigned long& nb_k_mers,const int& smallest_kappa,const unsigned int& m,const float& max_collision_proba); double gammln(const unsigned long& xx); -float pmf(const float&x,const unsigned long& nb_k_mers,const unsigned int& m); -float getCollisionProba(const int& smallest_kappa,const unsigned long& nb_k_mers,const int& lambda); +double pmf(const float&x,const unsigned long& nb_k_mers,const unsigned int& m); +double ccdf(const int& smallest_kappa,const unsigned long& nb_k_mers,const unsigned int& m); +float getCollisionProba(const int& smallest_kappa,const unsigned long& nb_k_mers,const unsigned int& arr_cms_size,const int& lambda); void usage(int status); diff --git a/src/rock.cpp b/src/rock.cpp index 3f8f3b2bcf60df615d9f6b301297d82f855375b6..a09488e3ec4cfbb05f25aedb302b73444eb072d2 100644 --- a/src/rock.cpp +++ b/src/rock.cpp @@ -123,7 +123,7 @@ int main(int argc,char * argv[]) { #endif int smallest_kappa=parms.kappa; if (parms.kappa_prime>0) smallest_kappa=parms.kappa_prime; - float p =getCollisionProba(smallest_kappa,approx_nb_k_mers,parms.lambda); + float p =getCollisionProba(smallest_kappa,approx_nb_k_mers,Pi_js[0],parms.lambda); cout<<"estimated probability of collision:"<<p<<endl; cout<<"estimated number of distinct k-mers:"<<approx_nb_k_mers<<endl; diff --git a/src/rock_commons.h b/src/rock_commons.h index 81446731fe0864edf9eacc539869ccf895d0dc42..0ac2a4f150e3d10914a1221e3a4e6e6c595401ef 100644 --- a/src/rock_commons.h +++ b/src/rock_commons.h @@ -28,6 +28,20 @@ #define k_max_array_in_cms 50 // Theorical maximum of arrays in the count min sketch. #define k_arr_cms_size UINT_MAX // size of 1 CMS array +// Store the prime numbers for modulo hashing in this array. +/* Use the 50 biggest prime numbers < 2^32. They have the propriety thatthat their binary representation 9i) + * contains at least 13 zeroes and (ii) at least 11 occurrences of '10'. + * This disparity in the binary representation of a primary number guaranties that all bits of a numerator x + * will be well affected during the x%p operation. +*/ +const long int Pi_js[k_max_array_in_cms]={4283781797,4283781461,4283738773,4283738453,4280985941,4280898901,4275393833,4275393173,\ + 4275382933,4274694821,4273296553,4273285717,4273253029,4273121621,4273034581,4272772181,\ + 4272771749,4272769621,4272761509,4272728741,4272630421,4272629077,4272608581,4272608533,\ + 4272599701,4272598181,4272597673,4272597353,4272597349,4272597157,4272596149,4272596117,\ + 4272593557,4272592549,4272592277,4272592229,4272592213,4272592021,4272589493,4272575849,\ + 4272575833,4272575819,4272575701,4272575669,4272515669,4272511657,4272511637,4272510629,\ + 4272510283,4272510101}; + typedef struct { std::string in_fq_file; std::string out_fq_file; diff --git a/src/unit_test_main_utils.cpp b/src/unit_test_main_utils.cpp index 1bc054485073f9b072f9f9b3c764b5941314e2bf..86b1eb01e9bd52d9c5a5d570bd065664e890a251 100644 --- a/src/unit_test_main_utils.cpp +++ b/src/unit_test_main_utils.cpp @@ -69,44 +69,59 @@ void test_processIOFileArgs() { void test_getBestLambdaForN() { - int lambda_max=10; - unsigned long nb_k_mer=200000000; - int best=getBestLambdaForN(nb_k_mer,5,k_arr_cms_size); + unsigned long nb_k_mer=5000000000; + int best=getBestLambdaForN(nb_k_mer,2,UINT_MAX,0.05); + assert(best==2); + nb_k_mer=200000000; + best=getBestLambdaForN(nb_k_mer,5,UINT_MAX,k_max_collision_proba); assert(best==1); nb_k_mer=600000000; - best=getBestLambdaForN(nb_k_mer,70,k_arr_cms_size); + best=getBestLambdaForN(nb_k_mer,70,UINT_MAX,k_max_collision_proba); assert(best==1); nb_k_mer=2000000000; - best=getBestLambdaForN(nb_k_mer,10,k_arr_cms_size); + best=getBestLambdaForN(nb_k_mer,10,UINT_MAX,k_max_collision_proba); assert(best==1); nb_k_mer=10000000000; - best=getBestLambdaForN(nb_k_mer,5,k_arr_cms_size); - cout<<best<<endl; + best=getBestLambdaForN(nb_k_mer,5,UINT_MAX,k_max_collision_proba); + //cout<<best<<endl; assert(best==1); - nb_k_mer=100000000000000; - best=getBestLambdaForN(nb_k_mer,50,k_arr_cms_size); + best=getBestLambdaForN(nb_k_mer,50,UINT_MAX,k_max_collision_proba); + cout<<best<<endl; + //assert(best==1); + nb_k_mer=100000000000; + best=getBestLambdaForN(nb_k_mer,50,UINT_MAX,k_max_collision_proba); + cout<<best<<endl; + nb_k_mer=1000000000000; + best=getBestLambdaForN(nb_k_mer,50,UINT_MAX,k_max_collision_proba); cout<<best<<endl; + nb_k_mer=100000000000000; + best=getBestLambdaForN(nb_k_mer,50,UINT_MAX,k_max_collision_proba); + cout<<best<<endl; // VL: get 19315484 } void test_getCollisionProba() { - float p =getCollisionProba(1,2,1); + float p=getCollisionProba(2,5000000000,UINT_MAX,2); + assert(round(p*10000)==127); + p=getCollisionProba(2,5000000000,UINT_MAX,1); + assert(p=0.1128); + p =getCollisionProba(1,2,UINT_MAX,3); cout<<p<<endl; assert(p==0.0); - p =getCollisionProba(1,1000000000,1); + p =getCollisionProba(1,1000000000,UINT_MAX,1); // cout<<p<<endl; assert(floor(p*1000+0.5)==43); - p =getCollisionProba(5,1000000000,1); + p =getCollisionProba(5,1000000000,UINT_MAX,1); assert(floor(p*1000+0.5)==2); - p =getCollisionProba(5,20000000000,1); + p =getCollisionProba(5,20000000000,UINT_MAX,1); assert(floor(p*1000+0.5)==909); } void test_gammln() { float a; - /*a=gammln(1); + a=gammln(1); assert(exp(a)==1); a=gammln(2); - assert(exp(a)==1);*/ + assert(exp(a)==1); a=gammln(3); assert(exp(a)==2); a=gammln(4); @@ -114,13 +129,13 @@ void test_gammln() { a=gammln(5); assert(exp(a)==24); a=gammln(6); - cout<<exp(a)<<endl; + //cout<<exp(a)<<endl; float tmp=exp(a); float tmp2=tmp*10000; float tmp3=round(tmp2)/10000; //printf("%s \n",tmp3); assert(tmp3==120); - unsigned long n=5000; + /* unsigned long n=5000; unsigned long m=n+1; cout<<"m="<<m<<endl; //float i=m; @@ -136,34 +151,25 @@ void test_gammln() { // #include <float.h> void test_pmf() { unsigned long nb_k_mers=5000000000; - /*cout<<sizeof(unsigned int)<<endl; - cout<<sizeof(double)<<endl; - unsigned int tst=UINT_MAX; - cout<<"tst="<<tst<<endl; - double stuff=tst; - cout<<"stuff="<<stuff<<endl; - cout<<sizeof(unsigned long)<<endl; - cout<<sizeof(float)<<endl; - - unsigned long n=nb_k_mers+1; - float t=n; - printf("%f\n",t); - printf("FLT_MAX=%f\n",FLT_MAX); - printf("t=%f\n",t); - cout<<"t="<<t<<endl; - cout<<"n="<<n<<endl;*/ - float p=pmf(0,nb_k_mers,k_arr_cms_size); - cout<<p<<endl; - assert(p==0.3122); - p=pmf(1,nb_k_mers,k_arr_cms_size); - cout<<p<<endl; - assert(p==0.3634); - p=pmf(2,nb_k_mers,k_arr_cms_size); - cout<<p<<endl; - assert(p==0.2115); + float p=pmf(0,nb_k_mers,UINT_MAX); + assert(round(p*10000)==3122); + p=pmf(1,nb_k_mers,UINT_MAX); + assert(round(p*10000)==3634); + p=pmf(2,nb_k_mers,UINT_MAX); + assert(round(p*10000)==2115); +} + +void test_ccdf() { + unsigned long nb_k_mers=5000000000; + double res=ccdf(2,nb_k_mers,UINT_MAX); + assert(round(res*10000)==1128); } + + + + int main() { cout<<"testing main_utils"<<endl; cout<<"testing processing of IOFile arguments ( following 2 error messages are what is expected; don't worry)."<<endl; @@ -172,6 +178,8 @@ int main() { test_gammln(); cout<<"testing the pmf function"<<endl; test_pmf(); + cout<<"testing the ccdf function"<<endl; + test_ccdf(); cout<<"test computation of the bast lambda value depending on nb distinct k-mers."<<endl; test_getBestLambdaForN(); cout<<"testing computation of collision probability."<<endl;