diff --git a/src/CountMinSketch.cpp b/src/CountMinSketch.cpp index 2029e2ba015d3c36b22477f4f9e587a209d5ecd4..0d6073c1d242eb332b911b85407385ad0cc94a50 100644 --- a/src/CountMinSketch.cpp +++ b/src/CountMinSketch.cpp @@ -10,38 +10,65 @@ const int max_pow=30; -/* This method is used to determine if a number is a prime number or not. - * It is incomplete. TODO find an effective method to get lamdba prime numbers when we'll be sure whether we use the - * "prime number" version of the hash functions. */ -/*int CountMinSketch::isPrime(unsigned int num) { - if ((num % 2 ==0) || (num==2)) return 0; - if ((num % 3 ==0) || (num==3)) return 0; - if ((num % 5 ==0) || (num==5)) return 0; - if ((num % 7 ==0) || (num==7)) return 0; - return 1; -} -int CountMinSketch::isMersenne(unsigned int num) { - int cur_pow=max_pow; - unsigned int mers_nbr=pow(2,cur_pow)-1; - while (num!=mers_nbr && cur_pow>=1) { - cur_pow-=1; - mers_nbr=pow(2,cur_pow)-1; - } - if (cur_pow==0) return 0; - else return 1; +// Table of the 500 non-Mersenne prime numbers (all below 2^31) used as moduli by hash64to32; hash function j reads entry j-1. 
+int Pi_js[500]={ + 2147469629, 2147469637, 2147469659, 2147469679, 2147469703, 2147469781, 2147469817, 2147469823, 2147469829, 2147469881,\ + 2147469917, 2147469943, 2147469949, 2147469983, 2147470007, 2147470019, 2147470027, 2147470043, 2147470057, 2147470067,\ + 2147470081, 2147470111, 2147470123, 2147470139, 2147470147, 2147470177, 2147470183, 2147470211, 2147470229, 2147470249,\ + 2147470313, 2147470327, 2147470333, 2147470361, 2147470427, 2147470453, 2147470511, 2147470513, 2147470529, 2147470531,\ + 2147470553, 2147470579, 2147470597, 2147470603, 2147470627, 2147470643, 2147470673, 2147470679, 2147470723, 2147470727,\ + 2147470733, 2147470751, 2147470769, 2147470771,2147483059, 2147483069, 2147483077, 2147483123, 2147483137, 2147483171,\ + 2147473897, 2147473921, 2147473963, 2147474009, 2147474027, 2147474029, 2147474071, 2147474093, 2147474113, 2147474123,\ + 2147474149, 2147474159, 2147474201, 2147474213, 2147474239, 2147474279, 2147474359, 2147474383, 2147474393, 2147474477,\ + 2147474479, 2147474491, 2147474513, 2147474519, 2147474531, 2147474551, 2147474597, 2147474627, 2147474657, 2147474711,\ + 2147474717, 2147474789, 2147474803, 2147474807, 2147474809, 2147474831, 2147474837, 2147474843, 2147474851, 2147474881,\ + 2147474887, 2147474891, 2147474921, 2147474929, 2147474947, 2147474951, 2147474963, 2147475047, 2147475061, 2147475103,\ + 2147475107, 2147475149, 2147475179, 2147475181, 2147475193, 2147475203, 2147475221, 2147475229, 2147475233, 2147475251,\ + 2147475257, 2147475269, 2147475277, 2147475331, 2147475347, 2147475349, 2147475367, 2147475373, 2147475397, 2147475401,\ + 2147475413, 2147475439, 2147475481, 2147475487, 2147475497, 2147475503, 2147475509, 2147475521, 2147475541, 2147475553,\ + 2147475559, 2147475563, 2147475587, 2147475593, 2147475601, 2147475641, 2147475653, 2147475691, 2147475713, 2147475721,\ + 2147475739, 2147475787, 2147475791, 2147475797, 2147475829, 2147475851, 2147475859, 2147475871, 2147475899, 2147475929,\ + 2147475971, 
2147475973, 2147475977, 2147475997, 2147476031, 2147476073, 2147476087, 2147476109, 2147476127, 2147476139,\ + 2147476141, 2147476169, 2147476183, 2147476211, 2147476249, 2147476291, 2147476321, 2147476327, 2147476367, 2147476381,\ + 2147476399, 2147476417, 2147476517, 2147476519, 2147476543, 2147476607, 2147476619, 2147476649, 2147476663, 2147476687,\ + 2147476693, 2147476699, 2147476739, 2147476741, 2147476763, 2147476769, 2147476777, 2147476789, 2147476819, 2147476823,\ + 2147476841, 2147476871, 2147476897, 2147476927, 2147476931, 2147476937, 2147476943, 2147476951, 2147476963, 2147476979,\ + 2147477021, 2147477029, 2147477063, 2147477093, 2147477107, 2147477113, 2147477159, 2147477191, 2147477201, 2147477203,\ + 2147477207, 2147477209, 2147477237, 2147477249, 2147477273, 2147477323, 2147477393, 2147477399, 2147477419, 2147477443,\ + 2147477467, 2147477473, 2147477503, 2147477513, 2147477531, 2147477533, 2147477599, 2147477627, 2147477681, 2147477687,\ + 2147477699, 2147477701, 2147477737, 2147477807, 2147477809, 2147477833, 2147477851, 2147477861, 2147477873, 2147477879,\ + 2147477881, 2147477933, 2147477953, 2147477989, 2147478013, 2147478017, 2147478049, 2147478079, 2147478083, 2147483179,\ + 2147478089, 2147478127, 2147478133, 2147478149, 2147478253, 2147478259, 2147478293, 2147478299, 2147478331, 2147478349,\ + 2147478373, 2147478461, 2147478481, 2147478491, 2147478497, 2147478503, 2147478517, 2147478521, 2147478563, 2147478569,\ + 2147478581, 2147478601, 2147478611, 2147478647, 2147478649, 2147478653, 2147478659, 2147478661, 2147478673, 2147478701,\ + 2147478703, 2147478719, 2147478721, 2147478727, 2147478731, 2147478733, 2147478763, 2147478791, 2147478821, 2147478859,\ + 2147478863, 2147478889, 2147478899, 2147478911, 2147478919, 2147478937, 2147478959, 2147478961, 2147478967, 2147478997,\ + 2147479013, 2147479031, 2147479057, 2147479063, 2147479079, 2147479091, 2147479097, 2147479121, 2147479129, 2147479133,\ + 2147479171, 2147479189, 2147479231, 
2147479259, 2147479273, 2147479307, 2147479339, 2147479349, 2147479361, 2147479381,\ + 2147479403, 2147479421, 2147479447, 2147479489, 2147479507, 2147479513, 2147479517, 2147479531, 2147479547, 2147479549,\ + 2147479573, 2147479589, 2147479601, 2147479619, 2147479637, 2147479643, 2147479657, 2147479681, 2147479751, 2147479753,\ + 2147479757, 2147479781, 2147479787, 2147479819, 2147479823, 2147479879, 2147479891, 2147479897, 2147479907, 2147479937,\ + 2147479991, 2147480009, 2147480011, 2147480039, 2147480161, 2147480197, 2147480207, 2147480219, 2147480227, 2147480297,\ + 2147480299, 2147480311, 2147480327, 2147480369, 2147480429, 2147480437, 2147480459, 2147480471, 2147480507, 2147480519,\ + 2147480527, 2147480551, 2147480591, 2147480611, 2147480623, 2147480641, 2147480651, 2147480677, 2147480683, 2147480707,\ + 2147480723, 2147480743, 2147480747, 2147480791, 2147480837, 2147480843, 2147480849, 2147480893, 2147480897, 2147480899,\ + 2147480921, 2147480927, 2147480941, 2147480957, 2147480969, 2147480971, 2147480989, 2147481019, 2147481031, 2147481053,\ + 2147481071, 2147481139, 2147481143, 2147481151, 2147481173, 2147481179, 2147481199, 2147481209, 2147481247, 2147481263,\ + 2147481269, 2147481283, 2147481311, 2147481317, 2147481337, 2147481353, 2147481359, 2147481367, 2147481373, 2147481487,\ + 2147481491, 2147481499, 2147481509, 2147481529, 2147481563, 2147481571, 2147481629, 2147481673, 2147481793, 2147481797,\ + 2147481811, 2147481827, 2147481863, 2147481883, 2147481893, 2147481899, 2147481901, 2147481907, 2147481937, 2147481949,\ + 2147481967, 2147481997, 2147482021, 2147482063, 2147482081, 2147482091, 2147482093, 2147482121, 2147482223, 2147482231,\ + 2147482237, 2147482273, 2147482291, 2147482327, 2147482343, 2147482349, 2147482361, 2147482367, 2147482409, 2147482417,\ + 2147482481, 2147482501, 2147482507, 2147482577, 2147482583, 2147482591, 2147482621, 2147482661, 2147482663, 2147482681,\ + 2147482693, 2147482697, 2147482739, 2147482763, 2147482801, 
2147482811, 2147482817, 2147482819, 2147482859, 2147482867,\ + 2147482873, 2147482877, 2147482921, 2147482937, 2147482943, 2147482949, 2147482951, 2147483029, 2147483033, 2147483053}; + +int CountMinSketch::hash64to32(unsigned long w,int j) { // Modulo hash: returns w modulo the j-th table entry. j is 1-based and is NOT range-checked here, so it must stay within 1..500. + int pi_j=Pi_js[j-1]; // modulus assigned to hash function j + return w % pi_j; } -void CountMinSketch::findNonMersPrime() { - int i; - unsigned int num=pi_j_max; - for (i=0;i<lambda;i++) { - num-=1; - while (!isPrime(num)) num-=1; - - } -}*/ - std::map<int,int> CountMinSketch::getIthArray(int i) { std::map<int,int> tmp; return tmp; diff --git a/src/CountMinSketch.h b/src/CountMinSketch.h index cb5faadcd26082506612f14351712c42282ace51..7f47d07e44a3d9efe6feb9e68bb65214e8224887 100644 --- a/src/CountMinSketch.h +++ b/src/CountMinSketch.h @@ -11,11 +11,16 @@ #include <vector> #include <map> -typedef std::vector<unsigned long> readNumericValues; // TODO move this definition to a common include file between ReadProcessor and CountMinSketch. +#include "rock_commons.h" + +typedef struct { // bundle of the three sketch parameters accepted by the CountMinSketch(CMSparams) constructor + int lambda; + int kappa; + int kappa_prime; +} CMSparams; class CountMinSketch { - static const unsigned int pi_j_max=2147483647; - static const unsigned long mask1=1; + static const unsigned long mask1=1; // used only for hash64to32bs static const unsigned long mask2=2095103; static const unsigned long mask3=1023; @@ -29,12 +34,12 @@ class CountMinSketch { typedef std::map<int,short> internal_array; std::vector<internal_array> cms_lambda_array; - std::vector<int> pi_j_array; + // std::vector<int> pi_j_array; + - // void findNonMersPrime(); // fills pi_j_array with lambda non mersenne prime numbers. 
- // int hash64to32(unsigned long,int); + int hash64to32(unsigned long,int); // modulo version of the hash; defined out of line in CountMinSketch.cpp - int hash64to32(unsigned long w,int j) { // bit shift version of hash function to start. + int hash64to32bs(unsigned long w,int j) { // bit shift version of hash function to start. unsigned long h_tmp; unsigned long h=~w; h+=w<<18; @@ -57,14 +62,7 @@ class CountMinSketch { void addKMer(unsigned long); // inline? 
TODO: see later if it can help us gain time. int isRCovBelowThres(const readNumericValues& read_val,int threshold) ; - // for unit tests. - friend void test_CMS(int lambda,int kappa,int kappa_prime); - /*friend void test_findNonMersPrime(int lambda,int kappa,int kappa_prime); - friend void test_hash();*/ - -public: - - CountMinSketch(int glambda,int gkappa,int gkappa_prime) { + void init(int glambda,int gkappa,int gkappa_prime) { /* common set-up called by both constructors: records the three parameters, then appends lambda copies of cpt_array to cms_lambda_array */ lambda=glambda; kappa=gkappa; kappa_prime=gkappa_prime; @@ -74,8 +72,21 @@ public: for (j=0; j<lambda;j++) { cms_lambda_array.push_back(cpt_array); } - pi_j_array.reserve(lambda); - // findNonMersPrime(); + } + + // for unit tests. + friend void test_CMS(int lambda,int kappa,int kappa_prime); + /* + friend void test_hash();*/ + +public: + + CountMinSketch(int glambda,int gkappa,int gkappa_prime) { + init(glambda,gkappa,gkappa_prime); + } + + CountMinSketch(CMSparams parms) { // overload taking the parameters as a CMSparams: passes its three fields to init + init(parms.lambda,parms.kappa,parms.kappa_prime); } diff --git a/src/unit_test_cms.cpp b/src/unit_test_cms.cpp index c64fd05aaf1c34a14abb705e72fd37e5582c47b5..294181ff14da39fee06f14e85d3e61514677ad91 100644 --- a/src/unit_test_cms.cpp +++ b/src/unit_test_cms.cpp @@ -26,15 +26,7 @@ void test_hash(int lambda,int kappa,int kappa_prime) { } } -void test_findNonMersPrime(int lambda,int kappa,int kappa_prime) { - CountMinSketch cms=CountMinSketch(lambda,kappa,kappa_prime); - assert(cms.pi_j_array.size()==lambda); - std::vector<int>::iterator it; - for (it=cms.pi_j_array.begin();it!=cms.pi_j_array.end();it++) { - assert(*it<CountMinSketch::pi_j_max); - } - assert(int (cms.pi_j_array[lambda-1])==2747483641); -} + */ void test_CMS(int lambda,int kappa,int kappa_prime) { CountMinSketch cms=CountMinSketch(lambda,kappa,kappa_prime);