diff --git a/src/CountMinSketch.hpp b/src/CountMinSketch.hpp index dd6a1f84e202cdcc1010f8646ab6eb2e4093315e..6e43161da8e9ec41d880488b14ac019e58792331 100644 --- a/src/CountMinSketch.hpp +++ b/src/CountMinSketch.hpp @@ -12,6 +12,7 @@ #include <stdlib.h> #include <string.h> #include <limits.h> +#include <cmath> #include "rock_commons.h" @@ -288,7 +289,7 @@ template<typename T> int CountMinSketch<T>::isBeneathMinKappa(const T_read_numer /* * Go through the arrays of the CMS and returns the number of distinct k_mers (smallest nbr of non-zero counters amongst all arrays) */ -template<typename T> unsigned long CountMinSketch<T>::getNbDistinctKMers() { +/*template<typename T> unsigned long CountMinSketch<T>::getNbDistinctKMers() { int j; unsigned int min=UINT_MAX; unsigned int h; @@ -303,7 +304,34 @@ template<typename T> unsigned long CountMinSketch<T>::getNbDistinctKMers() { (nb_k_mers<min) ?min=nb_k_mers: min; } return min; -} +}*/ +/* + * Go through the arrays of the CMS and returns an estimation of the number of distinct k_mers (new formula from Alexis) + */ +template<typename T> unsigned long CountMinSketch<T>::getNbDistinctKMers() { + int j; + unsigned int max=0; + unsigned int h; + unsigned long n; + unsigned long m=Pi_js[0]; + + for (j=lambda-1;j>=0;--j) { + unsigned long z=0; // number of zeroes in a CMS array. + for (h=Pi_js[j]-1;h>0;--h) { // Have to process the case of h=0 separately otherwise as h is now an unsigned int, it is always >=0 which causes an infinite loop. + (cms_lambda_array[j] [h]==0)? z+=1: z; + } + (cms_lambda_array[j] [0]>0)? z+=1: z; + (z>max)? max=z: max; + (max==z)? m=Pi_js[j]:m; + } + unsigned long lnz=log(max); + unsigned long lnm=log(m); + unsigned long lnm1=log(n-1); + unsigned long deno=lnm1-lnm; + unsigned long nume=lnz-lnm; + n=nume/deno; + return n; +} #endif /* COUNTMINSKETCH_HPP_ */ diff --git a/src/ROCKparams.cpp b/src/ROCKparams.cpp index a7d496dfa9745acab1436fac765ef9548d783185..5a0cac8f801c302ac0412777712fd6917a2afa1b 100755 --- a/src/ROCKparams.cpp +++ b/src/ROCKparams.cpp @@ -43,7 +43,7 @@ void ROCKparams::processMainArgs(int optind, const int argc, char ** argv,std::v void ROCKparams::optArgConsistency(const string& input_file,const string& output_file, - const int& g,CMSparams& parms,const int& nb_k_mers, + const int& g,CMSparams& parms,const unsigned long& nb_k_mers, const std::vector<string>& v_input_lines) { if (input_file.empty() && v_input_lines.empty()) { std::cout<<"You must provide filename via -i or arguments to indicate ROCK what to filter." <<std::endl; @@ -257,7 +257,7 @@ void ROCKparams::initFromMainOptsArgs(int argc,char ** argv) { int i,q,m; std::vector<string> v_input_lines; std::vector<string> v_output_lines; - static int PE_separately=1; + // static int PE_separately=1; float proba=k_max_collision_proba; while((i = getopt(argc, argv, "i:o:l:k:c:C:n:vq:m:f:")) != -1) { @@ -302,7 +302,9 @@ void ROCKparams::initFromMainOptsArgs(int argc,char ** argv) { qual_thres.k=k; break; case 'n': - nb_k_mers=atoi(optarg);break; // number of distinct k-mers + char * t; + cout<<optarg<<endl; + nb_k_mers=strtoul(optarg,&t,10);break; // number of distinct k-mers case 'v': verbose_mode=1; break; diff --git a/src/ROCKparams.h b/src/ROCKparams.h index 0c0b79486baa8e0d850c71314ab632d34c98ae6b..93cfaaf07dee1cc8865e2580d3324eb0c79955e3 100755 --- a/src/ROCKparams.h +++ b/src/ROCKparams.h @@ -47,7 +47,7 @@ class ROCKparams{ CMSparams parms; FasqQualThreshold qual_thres; int g; // max RAM to use for the CMS if specified by the user. - int nb_k_mers; // expected number of k-mers in input data if specified by the user. + unsigned long nb_k_mers; // expected number of k-mers in input data if specified by the user. int k; // size of the k-mers int verbose_mode; std::string input_file,output_file; @@ -74,7 +74,7 @@ class ROCKparams{ * Also set some CMS parameters depending on value of other options. */ void optArgConsistency(const string& input_file,const string& output_file, - const int& g,CMSparams& parms,const int& nb_k_mers, + const int& g,CMSparams& parms,const unsigned long& nb_k_mers, const std::vector<string>& v_input_lines); friend void test_processIOFileArgs(); diff --git a/src/main_utils.cpp b/src/main_utils.cpp index da28fc7bf72df156f98adfc918636ce580784acc..c4131cb26f45701b2d32ddcc7199d99151955c3f 100644 --- a/src/main_utils.cpp +++ b/src/main_utils.cpp @@ -143,6 +143,8 @@ float getCollisionProba(const int& smallest_kappa,const unsigned long& nb_k_mers return fpp; } + + void usage(int status) { cout<<"usage: rock [options] [args]"<<endl<<endl; cout<<" -i <file> .... file name containing the list of the FASTQ files to process."<<endl; diff --git a/src/unit_test_main_utils.cpp b/src/unit_test_main_utils.cpp index 3a7d66dbed6952d0d3fa3a55dfc6ead078c0d6c0..9e9a329d37281eb97b37c8a107789d8d18675ee8 100644 --- a/src/unit_test_main_utils.cpp +++ b/src/unit_test_main_utils.cpp @@ -85,6 +85,9 @@ void test_getBestLambdaForN() { best=getBestLambdaForN(nb_k_mer,5,UINT_MAX,0.1); //cout<<best<<endl; assert(best==1); + best=getBestLambdaForN(nb_k_mer,1,UINT_MAX,0.05); + //cout<<best<<endl; + assert(best==8); nb_k_mer=50000000000; best=getBestLambdaForN(nb_k_mer,5,UINT_MAX,0.1);// this returns 90 @@ -95,7 +98,6 @@ void test_getBestLambdaForN() { best=getBestLambdaForN(nb_k_mer,50,UINT_MAX,0.05); assert(best==1); best=getBestLambdaForN(nb_k_mer,10,UINT_MAX,0.05); - assert(best==7); } void test_getCollisionProba() { @@ -104,7 +106,7 @@ void test_getCollisionProba() { p=getCollisionProba(2,5000000000,UINT_MAX,1); assert(p=0.1128); p =getCollisionProba(1,2,UINT_MAX,3); - cout<<p<<endl; + //cout<<p<<endl; assert(p==0.0); p =getCollisionProba(1,1000000000,UINT_MAX,1); assert(floor(p*1000+0.5)==23);