diff --git a/src/CountMinSketch.hpp b/src/CountMinSketch.hpp index 40f671c5a5da7ea519bca7cfab7b9f279c977f08..6beb3f2cbbf4b052949ee2c4e3e3b81fe4f50555 100644 --- a/src/CountMinSketch.hpp +++ b/src/CountMinSketch.hpp @@ -24,7 +24,7 @@ * This disparity in the binary representation of a primary number guaranties that all bits of a numerator x * will be well affected during the x%p operation. */ -const long int Pi_js[50]={4283781797,4283781461,4283738773,4283738453,4280985941,4280898901,4275393833,4275393173,\ +const long int Pi_js[k_max_array_in_cms]={4283781797,4283781461,4283738773,4283738453,4280985941,4280898901,4275393833,4275393173,\ 4275382933,4274694821,4273296553,4273285717,4273253029,4273121621,4273034581,4272772181,\ 4272771749,4272769621,4272761509,4272728741,4272630421,4272629077,4272608581,4272608533,\ 4272599701,4272598181,4272597673,4272597353,4272597349,4272597157,4272596149,4272596117,\ @@ -142,7 +142,7 @@ public: /* * Determines whether median of k-mer coverage is below threshold or not. */ -#include <stdio.h> +//#include <stdio.h> // this is a few seconds faster template<typename T> inline int CountMinSketch<T>::isRCovBelowThres(const T_read_numericValues& read_val, const int& threshold) { int PE1_below_thres; @@ -257,8 +257,8 @@ template<typename T> void CountMinSketch<T>::init(int glambda,int gkappa,int gka mask=get_mask<T>::value; cms_lambda_array= (T**) malloc(lambda*sizeof(T*)); for (j=0;j<lambda;j++) { - cms_lambda_array[j]=(T *) malloc(sizeof(T)*UINT_MAX); - memset(cms_lambda_array[j],0,UINT_MAX); + cms_lambda_array[j]=(T *) malloc(sizeof(T)*k_arr_cms_size); + memset(cms_lambda_array[j],0,k_arr_cms_size); } } diff --git a/src/FqBaseBackend.h b/src/FqBaseBackend.h index 1cd30fef43fb860e5e103611f960828017a9a43d..4dd4e6461ae713fb2546540616bfe76bb0579485 100644 --- a/src/FqBaseBackend.h +++ b/src/FqBaseBackend.h @@ -5,6 +5,8 @@ * Author: vlegrand */ + + #ifndef FQBASEBACKEND_H_ #define FQBASEBACKEND_H_ diff --git a/src/ROCKparams.cpp b/src/ROCKparams.cpp index ca524b44df9d534e0acbbaa8eaa1c281e5ff8098..b1c1cf091e926aa155e428e304e032426c4a12e8 100644 --- a/src/ROCKparams.cpp +++ b/src/ROCKparams.cpp @@ -18,12 +18,14 @@ using namespace std; const int ROCKparams::output_ext; const int ROCKparams::undef_ext; + +/* void ROCKparams::computeLambda() { unsigned long tmp=parms.filter_size; tmp*=1073741824; // this is in fact 1024*1024*1024. parms.lambda=tmp/UINT_MAX; if (parms.kappa>get_mask<unsigned char>::value) parms.lambda=parms.lambda/sizeof(unsigned short); -} +}*/ /* int ROCKparams::getfilterPEMode() { return parms.filter_PE_separately; @@ -61,15 +63,19 @@ void ROCKparams::optArgConsistency(const string& input_file,const string& output } float avail_mem=getNodePhysMemory()-defaultGRPMAXSize; // cout<<avail_mem; - if (g>avail_mem) { + /*if (g>avail_mem) { std::cout<<"This machine only has "<<getNodePhysMemory()<<" Gigabytes of RAM. This is not enough to run ROCK with a CMS of size: "<<g<<endl; exit(EXIT_FAILURE); } else if (g!=0) { parms.filter_size=g; computeLambda(); - } + }*/ if (nb_k_mers) { // user indicated number of k-mers; use it to minimize parms.lambda - parms.lambda=getBestLambdaForN(nb_k_mers,parms.lambda); + //parms.lambda=getBestLambdaForN(nb_k_mers,parms.lambda); + int smallest_kappa; + if (parms.kappa_prime==0) smallest_kappa=parms.kappa; + else smallest_kappa=parms.kappa_prime; + parms.lambda=getBestLambdaForN(nb_k_mers,smallest_kappa,k_arr_cms_size); } if (parms.kappa_prime>=parms.kappa) { cout<<"ERROR lower filter is higher than high filter!"<<endl; @@ -85,18 +91,18 @@ void ROCKparams::optArgConsistency(const string& input_file,const string& output exit(EXIT_FAILURE); } if (parms.lambda==0) { - // This happens when the user doesn't specify lambda nor g nor nb_k_mers. - computeLambda(); - parms.lambda=min(parms.lambda,proposedLambda); - if (parms.lambda==0) { + // This happens when the user doesn't specify lambda nor nb_k_mers. + //computeLambda(); + parms.lambda=k_proposed_lamda; + /* if (parms.lambda==0) { cout<<"Not enough RAM on the machine. Total RAM in GB must be bigger than UINT_MAX to allow at least 1 array in the CMS."<<endl; exit(EXIT_FAILURE); - } + }*/ } #ifdef DEBUG cout<<"parms.lambda="<<parms.lambda<<" UINT_MAX="<<UINT_MAX<<endl; #endif - unsigned long cms_size=UINT_MAX; + unsigned long cms_size=k_arr_cms_size; cms_size*=parms.lambda; //cout<<"minimum cms size:"<<cms_size<<endl; if (parms.kappa>get_mask<unsigned char>::value) cms_size=cms_size*sizeof(unsigned short); @@ -279,7 +285,7 @@ void ROCKparams::initFromMainOptsArgs(int argc,char ** argv) { //int option_index=0; // while((i = getopt_long(argc, argv, "i:o:l:k:c:C:g:n:vq:m:",long_options,&option_index)) != -1) { - while((i = getopt(argc, argv, "i:o:l:k:c:C:g:n:vq:m:")) != -1) { + while((i = getopt(argc, argv, "i:o:l:k:c:C:n:vq:m:")) != -1) { switch(i) { case 0: break; @@ -314,10 +320,10 @@ void ROCKparams::initFromMainOptsArgs(int argc,char ** argv) { break; case 'n': nb_k_mers=atoi(optarg);break; // number of distinct k-mers - case 'g': +/* case 'g': // cout<<optarg<<endl; g=atoi(optarg); - break; + break;*/ case 'v': verbose_mode=1; break; @@ -361,7 +367,10 @@ void ROCKparams::initFromMainOptsArgs(int argc,char ** argv) { processMainArgs(optind, argc, argv,v_input_lines); optArgConsistency(input_file,output_file,g,parms,nb_k_mers,v_input_lines); if (nb_k_mers) { - expected_collision_proba=getCollisionProba(nb_k_mers,parms.lambda); + int smallest_kappa; + if (parms.kappa_prime==0) smallest_kappa=parms.kappa; + else smallest_kappa=parms.kappa_prime; + expected_collision_proba=getCollisionProba(smallest_kappa,nb_k_mers,parms.lambda); } if ((v_input_lines.empty() || v_output_lines.empty()) && (loadInOutFileArgs(input_file,output_file,v_input_lines,v_output_lines)==EXIT_FAILURE)) throw EXIT_FAILURE; if (processInOutFileArgs(v_input_lines,v_output_lines,single_files,v_PE_files,f_id)!=EXIT_SUCCESS) throw EXIT_FAILURE; diff --git a/src/ROCKparams.h b/src/ROCKparams.h index 15564013cb0542d2be672426ab758615691501d0..cfc0baa2288f3e88c2480eb3a8494971bd991e0b 100644 --- a/src/ROCKparams.h +++ b/src/ROCKparams.h @@ -23,7 +23,10 @@ // This constant is used only if you use rock's default behavior (use 90% of the machine's RAM). // If you specify a lambda, it is possible to use more memory for storing more reads and less memory for the CMS. -#define proposedLambda 4 +#define k_sep_pair_end ',' +#define k_ext '.' +#define path_sep '/' +#define k_proposed_lamda 1 @@ -56,7 +59,7 @@ class ROCKparams{ float expected_collision_proba; - void computeLambda(); + //void computeLambda(); void processMainArgs(int optind, int argc, char ** argv,std::vector<string>& v_input_lines); int loadInOutFileArgs(const std::string& input_file,const std::string& output_file,std::vector<string>& v_input_lines,std::vector<string>& v_output_lines); int loadFileArgs(const std::string& afile,std::vector<string>& v_lines); diff --git a/src/main_utils.cpp b/src/main_utils.cpp index c6eda0150f2d4caecd3d76f4eab78745c2468f87..b41af415c48c5372e97f815c4607aff50df8ae30 100644 --- a/src/main_utils.cpp +++ b/src/main_utils.cpp @@ -65,13 +65,78 @@ std::string checkDirExists(const string o_file) { return parent_dir; } +/* + * logarithm of the gamma function; + * from here: https://www.cec.uchile.cl/cinetica/pcordero/MC_libros/NumericalRecipesinC.pdf + */ +float gammln(const float& xx) { + double x,y,tmp,ser; + + static double cof[6]={76.18009172947146,-86.50532032941677,\ + 24.01409824083091,-1.231739572450155,\ + 0.1208650973866179e-2,-0.5395239384953e-5}; + int j; + y=x=xx; + tmp=x+5.5; + tmp -=(x+0.5)*log(tmp); + ser=1.000000000190015; + for (j=0;j<=5;j++) ser+=cof[j]/++y; + return -tmp+log(2.5066282746310005*ser/x); +} + +/* + * here x is the smallest value of the counter for a given k-mer in the CMS. + * m is the size of a CMS array. + */ +float pmf(const float&x,const unsigned long& nb_k_mers,const unsigned int& m) { + float lngnp1=gammln(nb_k_mers+1); + float lngxp1=gammln(x+1); + float lngnmxp1=gammln(nb_k_mers-x+1); + float lnm=log(m); + float lnmm1=log(m-1); + float nlnm=nb_k_mers*lnm; + float nmxlnmm1=(nb_k_mers-x)*lnmm1; + float tmp=lngnp1-lngxp1; + tmp -=lngnmxp1; + tmp -=nlnm; + tmp+=nlnm; + float res=exp(tmp); + return res; +} + +float ccdf(const int& smallest_kappa,const unsigned long& nb_k_mers,const unsigned int& m) { + float res; + float s=0.0; + for (int i=1;i<=smallest_kappa;i++) { + s+=pmf(i,nb_k_mers,m); + } + res=1-s; + return res; +} + +/* + * Minimize lambda for a given number of k-mers. + * We want p<=0.01, + * so choose smallest lambda so that ccdf(kappa_prime,nb_k_mer,array_size) exp lambda<=0.01. + * Smallest is either low filter either high filter if there is no low filter. + */ +int getBestLambdaForN(const unsigned long& nb_k_mers,const int& smallest_kappa,const unsigned int& m) { + float lnp=log(k_max_collision_proba); + float ccfdr=ccdf(smallest_kappa,nb_k_mers,m); + float lnccdf=log(ccfdr); + float tmp=lnp/ccfdr; + int min_l=floor(tmp); + if (min_l==0) min_l=1; + return min_l; +} + /* * Minimize lambda for a given number of k-mers. * We want p<=0.01, * so choose smallest lambda so that [1-(1-1/m)exp n] exp lambda<=0.01 */ -int getBestLambdaForN(const unsigned long& nb_k_mers,int lambda_max) { +int getBestLambdaForN_old(const unsigned long& nb_k_mers,int lambda_max) { int lambda=2; int min_lambda=lambda; if (lambda_max==0) { // no upper bound specified by the user via the -g option @@ -93,7 +158,7 @@ int getBestLambdaForN(const unsigned long& nb_k_mers,int lambda_max) { /* * Compute collision probability */ -float getCollisionProba(const unsigned long& nb_k_mers,const int& lambda) { +float getCollisionProba_old(const unsigned long& nb_k_mers,const int& lambda) { long double tmp=1.0/UINT_MAX; tmp=1.0-tmp; tmp=pow(tmp,nb_k_mers); @@ -106,6 +171,12 @@ float getCollisionProba(const unsigned long& nb_k_mers,const int& lambda) { //return p; } +float getCollisionProba(const int& smallest_kappa,const unsigned long& nb_k_mers,const int& lambda) { + float ccfdr=ccdf(smallest_kappa,nb_k_mers,k_arr_cms_size); + float fpp=pow(ccfdr,lambda); + return fpp; +} + void usage(int status) { cout<<"usage: rock [options] [args]"<<endl<<endl; cout<<" -i <file> .... file name containing the list of the FASTQ files to process."<<endl; @@ -116,8 +187,8 @@ void usage(int status) { cout<<" -C <int> .... upper coverage threshold (default: 70; max: 65535)."<<endl; cout<<" -l <int> .... size of the count min sketch (default: at most 4, depending on the available RAM)"<<endl; cout<<" -n <int> .... (expected) number of distinct k-mers within the processed reads."<<endl; - cout<<" -g <int> .... maximum RAM to use (in Gb) for the CMS."<<endl; - cout<<" -p <int> .... treat PE as single, separately with strict -c filter, separately with lenient -c fileter. Default is 1."<<endl; + // cout<<" -g <int> .... maximum RAM to use (in Gb) for the CMS."<<endl; + // cout<<" -p <int> .... treat PE as single, separately with strict -c filter, separately with lenient -c fileter. Default is 1."<<endl; cout<<" -q <int> .... minimum quality threshold for nucleotides. Default is 10."<<endl; cout<<" -m <int> .... minimum of valid k-mer threshold for reads. Default is 1."<<endl; cout<<" -v .... verbose"<<endl; diff --git a/src/main_utils.h b/src/main_utils.h index 080e584c9a2089639c762a9480f4376682c31563..4fb56969e1dc0a847d77b0e1a981e96da2230f0f 100644 --- a/src/main_utils.h +++ b/src/main_utils.h @@ -8,10 +8,12 @@ #ifndef MAIN_UTILS_H_ #define MAIN_UTILS_H_ -#define k_max_input_files 15 -#define k_sep_pair_end ',' +//#define k_max_input_files 15 +/*#define k_sep_pair_end ',' #define k_ext '.' -#define path_sep '/' +#define path_sep '/'*/ + +#define k_max_collision_proba 0.1 #include <string> #include <vector> @@ -24,8 +26,10 @@ unsigned long getNodePhysMemory(); std::string checkDirExists(const std::string o_file); /*int loadInOutFileArgs(const std::string& input_file,const std::string& output_file,std::vector<std::string>& v_input_lines,std::vector<std::string>& v_output_lines); int processInOutFileArgs(const std::vector<std::string>& v_input_lines,std::vector<std::string>& v_output_lines,std::vector<IO_fq_files>& single_files,std::vector<PE_files>& v_PE_files,int& f_id);*/ -int getBestLambdaForN(const unsigned long& nb_k_mers,int lambda_max); -float getCollisionProba(const unsigned long& nb_k_mers,const int& lambda); +//int getBestLambdaForN(const unsigned long& nb_k_mers); +int getBestLambdaForN(const unsigned long& nb_k_mers,const int& smallest_kappa,const unsigned int& m); +//float getCollisionProba(const unsigned long& nb_k_mers,const int& lambda); +float getCollisionProba(const int& smallest_kappa,const unsigned long& nb_k_mers,const int& lambda); void usage(int status); diff --git a/src/rock.cpp b/src/rock.cpp index a52eecfeffc5e08fa3adbda6f5d3c6d4774b5088..3f8f3b2bcf60df615d9f6b301297d82f855375b6 100644 --- a/src/rock.cpp +++ b/src/rock.cpp @@ -121,8 +121,9 @@ int main(int argc,char * argv[]) { cout<<"going to compute collision probability"<<endl; printRUsage(); #endif - - float p =getCollisionProba(approx_nb_k_mers,parms.lambda); + int smallest_kappa=parms.kappa; + if (parms.kappa_prime>0) smallest_kappa=parms.kappa_prime; + float p =getCollisionProba(smallest_kappa,approx_nb_k_mers,parms.lambda); cout<<"estimated probability of collision:"<<p<<endl; cout<<"estimated number of distinct k-mers:"<<approx_nb_k_mers<<endl; diff --git a/src/rock_commons.h b/src/rock_commons.h index 72ecb5aad6d5d63b006a3ab73dc3e7f6a7e1de2e..81446731fe0864edf9eacc539869ccf895d0dc42 100644 --- a/src/rock_commons.h +++ b/src/rock_commons.h @@ -25,6 +25,8 @@ */ #define k_max_input_files 15 +#define k_max_array_in_cms 50 // Theorical maximum of arrays in the count min sketch. +#define k_arr_cms_size UINT_MAX // size of 1 CMS array typedef struct { std::string in_fq_file; diff --git a/src/unit_test_main_utils.cpp b/src/unit_test_main_utils.cpp index cd1a1e4075f4b4fe31aeabf9ac0e2e598a284a09..3ba375bbb96e2a5ea4f20e34739c92cc6483dee6 100644 --- a/src/unit_test_main_utils.cpp +++ b/src/unit_test_main_utils.cpp @@ -71,30 +71,33 @@ void test_processIOFileArgs() { void test_getBestLambdaForN() { int lambda_max=10; unsigned long nb_k_mer=200000000; - int best=getBestLambdaForN(nb_k_mer,lambda_max); - assert(best==2); + int best=getBestLambdaForN(nb_k_mer,5,k_arr_cms_size); + assert(best==1); nb_k_mer=600000000; - best=getBestLambdaForN(nb_k_mer,lambda_max); - assert(best==3); + best=getBestLambdaForN(nb_k_mer,70,k_arr_cms_size); + assert(best==1); nb_k_mer=2000000000; - best=getBestLambdaForN(nb_k_mer,lambda_max); - // cout<<best<<endl; - assert(best==5); + best=getBestLambdaForN(nb_k_mer,10,k_arr_cms_size); + assert(best==1); nb_k_mer=10000000000; - best=getBestLambdaForN(nb_k_mer,lambda_max); - assert(best==lambda_max); + best=getBestLambdaForN(nb_k_mer,5,k_arr_cms_size); + cout<<best<<endl; + assert(best==1); + nb_k_mer=100000000000000; + best=getBestLambdaForN(nb_k_mer,50,k_arr_cms_size); + cout<<best<<endl; } void test_getCollisionProba() { - float p =getCollisionProba(1,2); - // cout<<p<<endl; + float p =getCollisionProba(1,2,k_arr_cms_size); + cout<<p<<endl; assert(p==0.0); - p =getCollisionProba(1000000000,2); + p =getCollisionProba(1,1000000000,k_arr_cms_size); // cout<<p<<endl; assert(floor(p*1000+0.5)==43); - p =getCollisionProba(1000000000,4); + p =getCollisionProba(5,1000000000,k_arr_cms_size); assert(floor(p*1000+0.5)==2); - p =getCollisionProba(20000000000,10); + p =getCollisionProba(5,20000000000,k_arr_cms_size); assert(floor(p*1000+0.5)==909); }