diff --git a/src/main_utils.cpp b/src/main_utils.cpp index 6509ca9a1be68f6dae69aaaaf884b46d40ded1b8..58ac7c5c025995e5553cc82a4ad31bc6e5a39023 100644 --- a/src/main_utils.cpp +++ b/src/main_utils.cpp @@ -114,7 +114,7 @@ double ccdf(const int& smallest_kappa,const unsigned long& nb_k_mers,const unsig if (res<0) res=0.00000001; // It happens because pmf if an approximation of the probability distribution. // So it happens that ccdf is negative but very close to 0... As we need a non null value for the next calculation, // return a very small constant. - if (res==1) res=0.9999999; // Need to do that because we compule ln(ccdf) to determine lamda and we need it to be non null otherwise we'd get a division by 0. + if (res>=0.999) res=0.999; // Need to do that because we compule ln(ccdf) to determine lamda and we need it to be non null otherwise we'd get a division by 0. return res; } @@ -134,6 +134,18 @@ int getBestLambdaForN(const unsigned long& nb_k_mers,const int& smallest_kappa,c return min_l; } +/* +int getBestLambdaForN_altn(const unsigned long& nb_k_mers,const int& smallest_kappa,const unsigned int& m,const float& max_collision_proba) { + float ccdfr=ccdf(smallest_kappa,nb_k_mers,m); + int l=0; + float p=ccdfr; + while(l<50 and p>max_collision_proba) { + p*=ccdfr; + l++; + } + return l-1; +}*/ + float getCollisionProba(const int& smallest_kappa,const unsigned long& nb_k_mers,const unsigned int& arr_cms_size,const int& lambda) { diff --git a/src/main_utils.h b/src/main_utils.h index 63710d4b5d005ca5cd18701dbb5e6b9c65257a1d..d4458a9fbf41ed49f39abd66dc6f2227cfce699f 100644 --- a/src/main_utils.h +++ b/src/main_utils.h @@ -25,6 +25,7 @@ unsigned long getNodePhysMemory(); std::string checkDirExists(const std::string o_file); int getBestLambdaForN(const unsigned long& nb_k_mers,const int& smallest_kappa,const unsigned int& m,const float& max_collision_proba); +int getBestLambdaForN_altn(const unsigned long& nb_k_mers,const int& smallest_kappa,const unsigned int& m,const float& max_collision_proba); double gammln(const unsigned long& xx); double pmf(const float&x,const unsigned long& nb_k_mers,const unsigned int& m); double ccdf(const int& smallest_kappa,const unsigned long& nb_k_mers,const unsigned int& m); diff --git a/src/unit_test_cms.cpp b/src/unit_test_cms.cpp index b77fbf459dcd99536766559970762461f7c32cc2..746ab2f4eec5ddae713284594a87831207b8fb53 100644 --- a/src/unit_test_cms.cpp +++ b/src/unit_test_cms.cpp @@ -359,9 +359,10 @@ int main(int argc, char **argv) { cout<<"testing CMS with lambda="<<lambda<<endl; test_CMS(lambda,kappa,kappa_prime); // Finally using C arrays (maps implied storing hash keys : 4 Bytes per k_mer overhead) but each array is of size INT_MAX... cout<<"testing CMS with PE not as single strict kappa prime implementation"<<endl; - testCMS_with_PE_NotasSingle(lambda,kappa,kappa_prime); + // VL comment these tests since AC says That the best option is to treat PE as single. + /*testCMS_with_PE_NotasSingle(lambda,kappa,kappa_prime); cout<<"testing CMS with PE not as single lenient kappa prime implementation"<<endl; - testCMS_with_PE_NotasSingle2(lambda,kappa,kappa_prime); + testCMS_with_PE_NotasSingle2(lambda,kappa,kappa_prime);*/ cout<<"done"<<endl; // profiling_CMS_fill(); diff --git a/src/unit_test_main_utils.cpp b/src/unit_test_main_utils.cpp index 86b1eb01e9bd52d9c5a5d570bd065664e890a251..7375c747dcb9ac981860c57747559fdc76663cc8 100644 --- a/src/unit_test_main_utils.cpp +++ b/src/unit_test_main_utils.cpp @@ -73,30 +73,37 @@ void test_getBestLambdaForN() { int best=getBestLambdaForN(nb_k_mer,2,UINT_MAX,0.05); assert(best==2); nb_k_mer=200000000; - best=getBestLambdaForN(nb_k_mer,5,UINT_MAX,k_max_collision_proba); + best=getBestLambdaForN(nb_k_mer,5,UINT_MAX,0.1); assert(best==1); nb_k_mer=600000000; - best=getBestLambdaForN(nb_k_mer,70,UINT_MAX,k_max_collision_proba); + best=getBestLambdaForN(nb_k_mer,70,UINT_MAX,0.1); assert(best==1); nb_k_mer=2000000000; - best=getBestLambdaForN(nb_k_mer,10,UINT_MAX,k_max_collision_proba); + best=getBestLambdaForN(nb_k_mer,10,UINT_MAX,0.1); assert(best==1); nb_k_mer=10000000000; - best=getBestLambdaForN(nb_k_mer,5,UINT_MAX,k_max_collision_proba); + best=getBestLambdaForN(nb_k_mer,5,UINT_MAX,0.1); //cout<<best<<endl; assert(best==1); - best=getBestLambdaForN(nb_k_mer,50,UINT_MAX,k_max_collision_proba); - cout<<best<<endl; - //assert(best==1); - nb_k_mer=100000000000; - best=getBestLambdaForN(nb_k_mer,50,UINT_MAX,k_max_collision_proba); - cout<<best<<endl; - nb_k_mer=1000000000000; - best=getBestLambdaForN(nb_k_mer,50,UINT_MAX,k_max_collision_proba); - cout<<best<<endl; - nb_k_mer=100000000000000; - best=getBestLambdaForN(nb_k_mer,50,UINT_MAX,k_max_collision_proba); - cout<<best<<endl; // VL: get 19315484 + + nb_k_mer=50000000000; + best=getBestLambdaForN(nb_k_mer,5,UINT_MAX,0.1);// this returns 90 + assert(best==90); + best=getBestLambdaForN(nb_k_mer,50,UINT_MAX,0.1); + assert(best==1); + + best=getBestLambdaForN(nb_k_mer,50,UINT_MAX,0.05); + assert(best==1); + best=getBestLambdaForN(nb_k_mer,2,UINT_MAX,0.05); + assert(best==4206); + best=getBestLambdaForN(nb_k_mer,3,UINT_MAX,0.05); + assert(best==991); + best=getBestLambdaForN(nb_k_mer,4,UINT_MAX,0.05); + assert(best==306); + best=getBestLambdaForN(nb_k_mer,5,UINT_MAX,0.05); + assert(best==117); + best=getBestLambdaForN(nb_k_mer,10,UINT_MAX,0.05); + assert(best==7); } void test_getCollisionProba() { @@ -108,12 +115,15 @@ void test_getCollisionProba() { cout<<p<<endl; assert(p==0.0); p =getCollisionProba(1,1000000000,UINT_MAX,1); - // cout<<p<<endl; - assert(floor(p*1000+0.5)==43); + //cout<<p<<endl; + //cout<<p*1000+0.5<<endl; + assert(floor(p*1000+0.5)==23); p =getCollisionProba(5,1000000000,UINT_MAX,1); - assert(floor(p*1000+0.5)==2); - p =getCollisionProba(5,20000000000,UINT_MAX,1); - assert(floor(p*1000+0.5)==909); + //cout<<p<<endl; + assert(floor(p*1000+0.5)==0); + p =getCollisionProba(5,50000000000,UINT_MAX,1); + //cout<<p<<endl; + assert(floor(p*1000+0.5)==975); } void test_gammln() { @@ -148,7 +158,6 @@ void test_gammln() { } -// #include <float.h> void test_pmf() { unsigned long nb_k_mers=5000000000; float p=pmf(0,nb_k_mers,UINT_MAX); @@ -180,10 +189,10 @@ int main() { test_pmf(); cout<<"testing the ccdf function"<<endl; test_ccdf(); - cout<<"test computation of the bast lambda value depending on nb distinct k-mers."<<endl; - test_getBestLambdaForN(); cout<<"testing computation of collision probability."<<endl; test_getCollisionProba(); + cout<<"test computation of the bast lambda value depending on nb distinct k-mers."<<endl; + test_getBestLambdaForN(); cout<<"done"<<endl; } diff --git a/test/data/fastq.filtered/klebsiella_100_1.undefined.fq b/test/data/fastq.filtered/klebsiella_100_1.undefined.fq deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/test/data/fastq.filtered/klebsiella_100_2.undefined.fq b/test/data/fastq.filtered/klebsiella_100_2.undefined.fq deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000