From fb37d10fbd892c33678e59870d7c4ec11264edef Mon Sep 17 00:00:00 2001
From: Veronique Legrand <vlegrand@pasteur.fr>
Date: Fri, 11 Mar 2016 16:38:51 +0100
Subject: [PATCH] bugfix+ finally chose C array as underlying implementation
 for CMS

---
 src/CountMinSketch.cpp | 25 +++++++-----------------
 src/CountMinSketch.h   | 44 ++++++++++++++++++++++++++++--------------
 src/unit_test_cms.cpp  | 20 +++++++------------
 3 files changed, 44 insertions(+), 45 deletions(-)

diff --git a/src/CountMinSketch.cpp b/src/CountMinSketch.cpp
index 0d6073c..d63a83d 100644
--- a/src/CountMinSketch.cpp
+++ b/src/CountMinSketch.cpp
@@ -65,40 +65,29 @@ int Pi_js[500]={
         2147482873, 2147482877, 2147482921, 2147482937, 2147482943, 2147482949, 2147482951, 2147483029, 2147483033, 2147483053};
 
 int CountMinSketch::hash64to32(unsigned long w,int j) {
-    int pi_j=Pi_js[j-1];
+    int pi_j=Pi_js[j]; // j is used directly as a 0-based index into Pi_js (callers in this file now loop from 0)
     return w % pi_j;
 }
 
-std::map<int,int> CountMinSketch::getIthArray(int i) {
-    std::map<int,int> tmp;
-    return tmp;
-}
 
 
 void CountMinSketch::addKMer(unsigned long val) {
     int h,j;
     short cnt;
-    j=1;
-    std::vector<internal_array>::iterator it_j_array;
-    for (it_j_array=cms_lambda_array.begin();it_j_array!=cms_lambda_array.end();it_j_array++) {
+    for (j=0;j<lambda;j++) { // one counter is incremented in each of the lambda rows; j is 0-based
         h=hash64to32(val,j);
-        cnt=(*it_j_array)[h];
+        cnt=cms_lambda_array[j] [h];
         cnt++;
-        (*it_j_array)[h]=(cnt & ushortmask);
-        j++;
+        cms_lambda_array[j] [h]=(cnt & ushortmask); // stored back masked by ushortmask; NOTE(review): cnt is a signed short but cells are unsigned short - confirm behaviour above 32767
     }
-
 }
 
 int CountMinSketch::getEstimatedNbOcc(unsigned long val) {
     int j,h;
-    std::vector<internal_array>::iterator it;
-    short min=ushortmask;
-    j=1;
-    for (it=cms_lambda_array.begin();it!=cms_lambda_array.end();it++) {
+    unsigned short min=ushortmask; // start from ushortmask so that any smaller cell value replaces it
+    for (j=0;j<lambda;j++) { // look at the cell val hashes to in each of the lambda rows
         h=hash64to32(val,j);
-        if ((*it)[h]<min) min=(*it)[h];
-        j++;
+        if (cms_lambda_array[j] [h] <min) min=cms_lambda_array[j] [h]; // keep the smallest cell value seen so far
     }
     return min;
 }
diff --git a/src/CountMinSketch.h b/src/CountMinSketch.h
index 7f47d07..f57ed7e 100644
--- a/src/CountMinSketch.h
+++ b/src/CountMinSketch.h
@@ -8,9 +8,15 @@
 #ifndef COUNTMINSKETCH_H_
 #define COUNTMINSKETCH_H_
 
-#include <vector>
-#include <map>
-
+/*
+#ifdef __linux__ // TODO: refactor and find the appropriate #ifdef; the problem is not Linux itself, it is GCC vs. C++11 support
+#include <hash_map> // only in gcc and MS visual studio.
+#else
+#include <unordered_map> // only in the C++11 standard
+#endif*/
+
+#include <stdlib.h>
+#include <string.h>
 #include "rock_commons.h"
 
 typedef struct {
@@ -24,20 +30,24 @@ class CountMinSketch {
     static const unsigned long mask2=2095103;
     static const unsigned long mask3=1023;
 
-    static const unsigned short ushortmask=32767;
+    static const unsigned short ushortmask=65535;
+    static const unsigned char ubytemask=255;
 
 
     int lambda;
     int kappa;
     int kappa_prime;
+/*
+#ifdef __linux__
+typedef __gnu_cxx::hash_map<int,short> internal_array;
+#else
+typedef std::unordered_map<int,short> internal_array;
+#endif*/
 
-    typedef std::map<int,short> internal_array;
-    std::vector<internal_array> cms_lambda_array;
-
-    // std::vector<int> pi_j_array;
 
+   unsigned short ** cms_lambda_array;
 
-    int hash64to32(unsigned long,int);
+   int hash64to32(unsigned long,int);
 
     int hash64to32bs(unsigned long w,int j) { // bit shift version of hash function to start.
         unsigned long h_tmp;
@@ -66,11 +76,11 @@ class CountMinSketch {
         lambda=glambda;
         kappa=gkappa;
         kappa_prime=gkappa_prime;
-        cms_lambda_array.reserve(lambda);
         int j;
-        internal_array cpt_array;
-        for (j=0; j<lambda;j++) {
-            cms_lambda_array.push_back(cpt_array);
+        cms_lambda_array=(unsigned short **) malloc(lambda*sizeof(unsigned short*));
+        for (j=0;j<lambda;j++) { // allocate and clear one row of INT_MAX unsigned short counters per j
+            cms_lambda_array[j]=(unsigned short *) malloc(sizeof(unsigned short)*INT_MAX);
+            memset(cms_lambda_array[j],0,sizeof(unsigned short)*INT_MAX); // memset takes a byte count: clear the whole row, not only its first INT_MAX bytes
         }
     }
 
@@ -89,7 +99,13 @@ public:
         init(parms.lambda,parms.kappa,parms.kappa_prime);
     }
 
-
+    ~CountMinSketch() { // NOTE(review): no copy constructor/assignment is visible in this patch; a copied object would free the same rows twice - confirm
+    int j;
+    for (j=0;j<lambda;j++) {
+        free(cms_lambda_array[j]); // release the counters of row j
+    }
+    free(cms_lambda_array); // then release the array of row pointers itself
+    }
 
     int getEstimatedNbOcc(unsigned long);
     int addRead(const readNumericValues&);
diff --git a/src/unit_test_cms.cpp b/src/unit_test_cms.cpp
index 8175905..de667c0 100644
--- a/src/unit_test_cms.cpp
+++ b/src/unit_test_cms.cpp
@@ -31,14 +31,7 @@ void test_hash(int lambda,int kappa,int kappa_prime) {
 void test_CMS(int lambda,int kappa,int kappa_prime) {
     CountMinSketch cms=CountMinSketch(lambda,kappa,kappa_prime);
     int i;
-    /*std::map<int,int> ithMap;
-    std::map<int, int>::iterator it;
-    for (i=0;i<lambda;i++) {
-        ithMap=cms.getIthArray(i);
-        for (it=ithMap.begin();it!=ithMap.end();it++) {
-            assert(it->second==0);
-        }
-    }*/ // doesn't make much sense since all the maps are empty.
+    cout<<"size of the CMS component: "<<sizeof(CountMinSketch)<<endl;
     int num=100*lambda;
     int rej_expected=0;
     int ret;
@@ -75,19 +68,20 @@ void test_CMS(int lambda,int kappa,int kappa_prime) {
 
 
 int main(int argc, char **argv) {
-    int lambda=10;
+    int lambda=2;
     int kappa=50;
     int kappa_prime=20;
     cout<<"INT_MAX="<<INT_MAX<<endl;
     cout<<"sizeof(short)="<<sizeof(short)<<endl;
 
     cout<<"testing CMS with lambda="<<lambda<<endl;
-    test_CMS(lambda,kappa,kappa_prime);
-    lambda=100;
+    test_CMS(lambda,kappa,kappa_prime); // The CMS now uses C arrays (maps had to store the hash keys: 4 bytes of overhead per k-mer), but each array has INT_MAX entries.
+    
+    /*lambda=6;
     cout<<"testing CMS with lambda="<<lambda<<endl;
     test_CMS(lambda,kappa,kappa_prime);
-    lambda=500;
+    lambda=8;
     cout<<"testing CMS with lambda="<<lambda<<endl;
-    test_CMS(lambda,kappa,kappa_prime);
+    test_CMS(lambda,kappa,kappa_prime);*/
     cout<<"done"<<endl;
 }
-- 
GitLab