diff --git a/src/CountMinSketch.hpp b/src/CountMinSketch.hpp index bd1bf2bc51c4c92ea0761f019c4a76fe8c054287..dd6a1f84e202cdcc1010f8646ab6eb2e4093315e 100644 --- a/src/CountMinSketch.hpp +++ b/src/CountMinSketch.hpp @@ -15,13 +15,9 @@ #include "rock_commons.h" - #define BAD_TYPE -1 - - - typedef struct { int lambda; int kappa; @@ -30,6 +26,7 @@ typedef struct { int filter_PE_separately; // indicates whether PE reads must be treated as single by the cms. Indeed it may appear that 1 end contains useful k-mer but that other end contains k-mer such that "global" median is below threshold. // In this case, read is rejected by the CMS (default behavior). We want to do an experimentation to see if keeping such reads wold lead to better results in assembling. // Values can be 0 (process PE as single),1 (process PE separately with strict implementation for kappa_prime) or 2 (process PE separately with lenient implementation for kappa prime). + float wanted_max_collision_proba; } CMSparams; template <typename T> diff --git a/src/ROCKparams.cpp b/src/ROCKparams.cpp index bdc1833cd279f56d68efe2f3e31db9c1e7bb7ce9..57f11c01b9ebbf8802715fc87792ee808fbb9967 100755 --- a/src/ROCKparams.cpp +++ b/src/ROCKparams.cpp @@ -65,7 +65,7 @@ void ROCKparams::optArgConsistency(const string& input_file,const string& output int smallest_kappa; if (parms.kappa_prime==0) smallest_kappa=parms.kappa; else smallest_kappa=parms.kappa_prime; - parms.lambda=getBestLambdaForN(nb_k_mers,smallest_kappa,Pi_js[0],k_max_collision_proba); + parms.lambda=getBestLambdaForN(nb_k_mers,smallest_kappa,Pi_js[0],parms.wanted_max_collision_proba); } if (parms.kappa_prime>=parms.kappa) { cout<<"ERROR lower filter is higher than high filter!"<<endl; @@ -259,7 +259,7 @@ void ROCKparams::initFromMainOptsArgs(int argc,char ** argv) { std::vector<string> v_output_lines; static int PE_separately=1; - while((i = getopt(argc, argv, "i:o:l:k:c:C:n:vq:m:")) != -1) { + while((i = getopt(argc, argv, "i:o:l:k:c:C:n:vq:m:f:")) != -1) { switch(i) { case 0: break; @@ -267,6 +267,13 @@ void ROCKparams::initFromMainOptsArgs(int argc,char ** argv) { input_file=optarg;break; case 'c': parms.kappa_prime=atoi(optarg);break; + case 'f': + float proba=atof(optarg); + if ((proba<=0.0) or (proba>=1.0)) { + cout<<"maximum for collision probability in the CMS must be > 0.0 and <1.0."<<endl; + usage(EXIT_FAILURE); + } + parms.wanted_max_collision_proba=proba;break; case 'h': usage(EXIT_SUCCESS); break; case 'o': @@ -291,7 +298,7 @@ void ROCKparams::initFromMainOptsArgs(int argc,char ** argv) { cout<<"Bad value for k. Must choose k so that 0<k<=32."<<endl; usage(EXIT_FAILURE); } - qual_thres.k=k; + qual_thres.k=k; break; case 'n': nb_k_mers=atoi(optarg);break; // number of distinct k-mers diff --git a/src/ROCKparams.h b/src/ROCKparams.h index 440453650bd82cce0346c955802114b0958a2d38..fb52950fbbbda5b5d60b02e671f5dd61609320ae 100755 --- a/src/ROCKparams.h +++ b/src/ROCKparams.h @@ -93,6 +93,7 @@ public: parms.kappa_prime=0; parms.lambda=0; parms.filter_PE_separately=0; + parms.wanted_max_collision_proba=k_max_collision_proba; qual_thres.min_correct_k_mers_in_read=1; qual_thres.nucl_score_threshold=k_phred_32; qual_thres.k=k; diff --git a/src/main_utils.cpp b/src/main_utils.cpp index 9ef58ec9822f8662d930d01ce1aa6e468eb92ce1..da28fc7bf72df156f98adfc918636ce580784acc 100644 --- a/src/main_utils.cpp +++ b/src/main_utils.cpp @@ -151,6 +151,7 @@ void usage(int status) { cout<<" -k <int> .... k-mer size. (default 25)."<<endl; cout<<" -c <int> .... lower coverage threshold (default: 0)."<<endl; cout<<" -C <int> .... upper coverage threshold (default: 70; max: 65535)."<<endl; + cout<<" -f <float> .... maximum probability of collision in the count min sketch (default: 0.05)"<<endl; cout<<" -l <int> .... size of the count min sketch (default: at most 4, depending on the available RAM)"<<endl; cout<<" -n <int> .... (expected) number of distinct k-mers within the processed reads."<<endl; cout<<" -q <int> .... minimum quality threshold for nucleotides. Default is 0."<<endl; diff --git a/src/unit_test_main_utils.cpp b/src/unit_test_main_utils.cpp index 1d51f954131ca70ce0335a921016213c34113594..3a7d66dbed6952d0d3fa3a55dfc6ead078c0d6c0 100644 --- a/src/unit_test_main_utils.cpp +++ b/src/unit_test_main_utils.cpp @@ -94,14 +94,6 @@ void test_getBestLambdaForN() { best=getBestLambdaForN(nb_k_mer,50,UINT_MAX,0.05); assert(best==1); - /*best=getBestLambdaForN(nb_k_mer,2,UINT_MAX,0.05); - assert(best==4206); - best=getBestLambdaForN(nb_k_mer,3,UINT_MAX,0.05); - assert(best==991); - best=getBestLambdaForN(nb_k_mer,4,UINT_MAX,0.05); - assert(best==306); - best=getBestLambdaForN(nb_k_mer,5,UINT_MAX,0.05); - assert(best==117);*/ best=getBestLambdaForN(nb_k_mer,10,UINT_MAX,0.05); assert(best==7); } @@ -115,14 +107,10 @@ void test_getCollisionProba() { cout<<p<<endl; assert(p==0.0); p =getCollisionProba(1,1000000000,UINT_MAX,1); - //cout<<p<<endl; - //cout<<p*1000+0.5<<endl; assert(floor(p*1000+0.5)==23); p =getCollisionProba(5,1000000000,UINT_MAX,1); - //cout<<p<<endl; assert(floor(p*1000+0.5)==0); p =getCollisionProba(5,50000000000,UINT_MAX,1); - //cout<<p<<endl; assert(floor(p*1000+0.5)==975); } @@ -139,7 +127,6 @@ void test_gammln() { a=gammln(5); assert(exp(a)==24); a=gammln(6); - //cout<<exp(a)<<endl; float tmp=exp(a); float tmp2=tmp*10000; float tmp3=round(tmp2)/10000; @@ -147,14 +134,11 @@ void test_gammln() { assert(tmp3==120); /* unsigned long n=5000; unsigned long m=n+1; - cout<<"m="<<m<<endl; - //float i=m; a=gammln(m); float b=gammln(n); double truc=exp(a-b); - printf("truc=%lf \n",truc); - //cout<<exp(a)/exp(b)<<endl;// Too big. - //assert(exp(a)/exp(b)==n);*/ + assert(truc==n); + */ }