From fb37d10fbd892c33678e59870d7c4ec11264edef Mon Sep 17 00:00:00 2001
From: Veronique Legrand <vlegrand@pasteur.fr>
Date: Fri, 11 Mar 2016 16:38:51 +0100
Subject: [PATCH] bugfix+ finally chose C array as underlying implementation for CMS

---
Notes (not part of the commit message):
  * CountMinSketch.cpp: hash64to32() now reads Pi_js[j] instead of
    Pi_js[j-1], because addKMer() and getEstimatedNbOcc() now loop with
    j running from 0 to lambda-1. getIthArray() is removed.
  * CountMinSketch.h: the std::vector<std::map<int,short>> storage is
    replaced by an "unsigned short **" table of lambda rows, each row
    being malloc'ed with room for INT_MAX unsigned short counters.
    ushortmask goes from 32767 to 65535 and ubytemask (255) is added.
  * CountMinSketch.h: the memset() length is a byte count, so it is
    sizeof(unsigned short)*INT_MAX (the size that was malloc'ed) and not
    INT_MAX, which would leave part of every row uninitialised.
  * CountMinSketch.h: a destructor frees every row, then the row table.
  * unit_test_cms.cpp: the test runs with lambda=2 only; the lambda=6 and
    lambda=8 runs are commented out.

 src/CountMinSketch.cpp | 25 +++++++-----------------
 src/CountMinSketch.h   | 44 ++++++++++++++++++++++++++++--------------
 src/unit_test_cms.cpp  | 20 +++++++------------
 3 files changed, 44 insertions(+), 45 deletions(-)

diff --git a/src/CountMinSketch.cpp b/src/CountMinSketch.cpp
index 0d6073c..d63a83d 100644
--- a/src/CountMinSketch.cpp
+++ b/src/CountMinSketch.cpp
@@ -65,40 +65,29 @@ int Pi_js[500]={
 2147482873, 2147482877, 2147482921, 2147482937, 2147482943, 2147482949, 2147482951, 2147483029, 2147483033, 2147483053};
 
 int CountMinSketch::hash64to32(unsigned long w,int j) {
-    int pi_j=Pi_js[j-1];
+    int pi_j=Pi_js[j];
     return w % pi_j;
 }
 
-std::map<int,int> CountMinSketch::getIthArray(int i) {
-    std::map<int,int> tmp;
-    return tmp;
-}
 
 
 void CountMinSketch::addKMer(unsigned long val) {
     int h,j;
     short cnt;
-    j=1;
-    std::vector<internal_array>::iterator it_j_array;
-    for (it_j_array=cms_lambda_array.begin();it_j_array!=cms_lambda_array.end();it_j_array++) {
+    for (j=0;j<lambda;j++) {
         h=hash64to32(val,j);
-        cnt=(*it_j_array)[h];
+        cnt=cms_lambda_array[j] [h];
         cnt++;
-        (*it_j_array)[h]=(cnt & ushortmask);
-        j++;
+        cms_lambda_array[j] [h]=(cnt & ushortmask);
     }
-
 }
 
 int CountMinSketch::getEstimatedNbOcc(unsigned long val) {
     int j,h;
-    std::vector<internal_array>::iterator it;
-    short min=ushortmask;
-    j=1;
-    for (it=cms_lambda_array.begin();it!=cms_lambda_array.end();it++) {
+    unsigned short min=ushortmask;
+    for (j=0;j<lambda;j++) {
         h=hash64to32(val,j);
-        if ((*it)[h]<min) min=(*it)[h];
-        j++;
+        if (cms_lambda_array[j] [h] <min) min=cms_lambda_array[j] [h];
     }
     return min;
 }
diff --git a/src/CountMinSketch.h b/src/CountMinSketch.h
index 7f47d07..f57ed7e 100644
--- a/src/CountMinSketch.h
+++ b/src/CountMinSketch.h
@@ -8,9 +8,15 @@
 #ifndef COUNTMINSKETCH_H_
 #define COUNTMINSKETCH_H_
 
-#include <vector>
-#include <map>
-
+/*
+#ifdef __linux__ // TODO refactor and find appropriate ifdef; the pb is not linux it is GCC/C++11
+#include <hash_map> // only in gcc and MS visual studio.
+#else
+#include <unordered_map> // only in the C++11 standard
+#endif*/
+
+#include <stdlib.h>
+#include <string.h>
 #include "rock_commons.h"
 
 typedef struct {
@@ -24,20 +30,24 @@ class CountMinSketch {
     static const unsigned long mask2=2095103;
     static const unsigned long mask3=1023;
 
-    static const unsigned short ushortmask=32767;
+    static const unsigned short ushortmask=65535;
+    static const unsigned char ubytemask=255;
 
     int lambda;
     int kappa;
     int kappa_prime;
+/*
+#ifdef __linux__
+typedef __gnu_cxx::hash_map<int,short> internal_array;
+#else
+typedef std::unordered_map<int,short> internal_array;
+#endif*/
 
 
-    typedef std::map<int,short> internal_array;
-    std::vector<internal_array> cms_lambda_array;
-
-    // std::vector<int> pi_j_array;
+    unsigned short ** cms_lambda_array;
 
 
-	int hash64to32(unsigned long,int);
+    int hash64to32(unsigned long,int);
 
     int hash64to32bs(unsigned long w,int j) { // bit shift version of hash function to start.
         unsigned long h_tmp;
@@ -66,11 +76,11 @@ class CountMinSketch {
         lambda=glambda;
         kappa=gkappa;
         kappa_prime=gkappa_prime;
-        cms_lambda_array.reserve(lambda);
         int j;
-        internal_array cpt_array;
-        for (j=0; j<lambda;j++) {
-            cms_lambda_array.push_back(cpt_array);
+        cms_lambda_array=(unsigned short **) malloc(lambda*sizeof(unsigned short*));
+        for (j=0;j<lambda;j++) {
+            cms_lambda_array[j]=(unsigned short *) malloc(sizeof(unsigned short)*INT_MAX);
+            memset(cms_lambda_array[j],0,sizeof(unsigned short)*INT_MAX);
         }
     }
 
@@ -89,7 +99,13 @@ public:
         init(parms.lambda,parms.kappa,parms.kappa_prime);
     }
 
-
+    ~CountMinSketch() {
+        int j;
+        for (j=0;j<lambda;j++) {
+            free(cms_lambda_array[j]);
+        }
+        free(cms_lambda_array);
+    }
 
     int getEstimatedNbOcc(unsigned long);
     int addRead(const readNumericValues&);
diff --git a/src/unit_test_cms.cpp b/src/unit_test_cms.cpp
index 8175905..de667c0 100644
--- a/src/unit_test_cms.cpp
+++ b/src/unit_test_cms.cpp
@@ -31,14 +31,7 @@ void test_hash(int lambda,int kappa,int kappa_prime) {
 void test_CMS(int lambda,int kappa,int kappa_prime) {
     CountMinSketch cms=CountMinSketch(lambda,kappa,kappa_prime);
     int i;
-    /*std::map<int,int> ithMap;
-    std::map<int, int>::iterator it;
-    for (i=0;i<lambda;i++) {
-        ithMap=cms.getIthArray(i);
-        for (it=ithMap.begin();it!=ithMap.end();it++) {
-            assert(it->second==0);
-        }
-    }*/ // doesn't make much sense since all the maps are empty.
+    cout<<"size of the CMS component: "<<sizeof(CountMinSketch)<<endl;
     int num=100*lambda;
     int rej_expected=0;
     int ret;
@@ -75,19 +68,20 @@ void test_CMS(int lambda,int kappa,int kappa_prime) {
 
 
 int main(int argc, char **argv) {
-    int lambda=10;
+    int lambda=2;
     int kappa=50;
     int kappa_prime=20;
     cout<<"INT_MAX="<<INT_MAX<<endl;
     cout<<"sizeof(short)="<<sizeof(short)<<endl;
     cout<<"testing CMS with lambda="<<lambda<<endl;
-    test_CMS(lambda,kappa,kappa_prime);
-    lambda=100;
+    test_CMS(lambda,kappa,kappa_prime); // Finally using C arrays (maps implied storing hash keys : 4 Bytes per k_mer overhead) but each array is of size INT_MAX...
+
+    /*lambda=6;
     cout<<"testing CMS with lambda="<<lambda<<endl;
     test_CMS(lambda,kappa,kappa_prime);
-    lambda=500;
+    lambda=8;
     cout<<"testing CMS with lambda="<<lambda<<endl;
-    test_CMS(lambda,kappa,kappa_prime);
+    test_CMS(lambda,kappa,kappa_prime);*/
     cout<<"done"<<endl;
 }
 
-- 
GitLab