From f4265c6f0728e7a7ad3ac2686d6a04f786bf233f Mon Sep 17 00:00:00 2001
From: Veronique Legrand <vlegrand@pasteur.fr>
Date: Tue, 19 Apr 2016 16:50:14 +0200
Subject: [PATCH] did new implementation for using only the max(k-mer value,
 reverse complement value) in the CMS; updates unit tests accordingly.

---
 src/CountMinSketch.cpp       | 36 +++++++++++++++++++++++++++-------
 src/CountMinSketch.h         | 25 +++++++++++++++++-------
 src/FqBaseBackend.cpp        |  2 +-
 src/ReadProcessor.cpp        |  7 +------
 src/ReadProcessor.h          |  2 +-
 src/fqwriter.cpp             |  2 +-
 src/read_utils.cpp           | 10 +++++++++-
 src/unit_test_cms.cpp        |  5 +++--
 src/unit_test_read_utils.cpp | 38 ++++++++++++++++--------------------
 9 files changed, 80 insertions(+), 47 deletions(-)

diff --git a/src/CountMinSketch.cpp b/src/CountMinSketch.cpp
index 4413806..daa24c3 100644
--- a/src/CountMinSketch.cpp
+++ b/src/CountMinSketch.cpp
@@ -60,13 +60,12 @@ const int CountMinSketch::Pi_js[500]={2147469629, 2147469637, 2147469659, 214746
             2147482873, 2147482877, 2147482921, 2147482937, 2147482943, 2147482949, 2147482951, 2147483029, 2147483033, 2147483053};
 
 
-
+/*
 void CountMinSketch::addKMer(const unsigned long& val1,const unsigned long& val2) {
     int h1,h2,h,j;
-    // short cnt;
     unsigned char cnt;
-    //unsigned short min=ushortmask;
-    for (j=0;j<lambda;j++) {
+    j=lambda;
+    while(--j>0) {
         h1=hash64to32(val1,j);
         h2=hash64to32(val2,j);
         (h1>h2)?h=h1:h=h2;
@@ -75,8 +74,32 @@ void CountMinSketch::addKMer(const unsigned long& val1,const unsigned long& val2
         cms_lambda_array[j] [h]=(cnt & ubytemask);
     }
 }
+*/
 
-
+/*
+     * Used to determine if a read must be kept or not.
+     * Depending on the case, threshold is kappa or kappa_prime
+     * Computes the number of k-mers for which the number of occurrences is below threshold.
+     * If more than half of the k-mers have a number of occurrences that is less than threshold,
+     * return true; else, return false.
+     */
+int CountMinSketch::isRCovBelowThres(const readNumericValues& read_val,const int& threshold) {
+        int a=0,b=0;
+        int j,h;
+        unsigned char min;
+        readNumericValues::const_iterator it;
+        for (it=read_val.begin();it!=read_val.end();++it) {
+            j=lambda;
+            min=ubytemask;
+            while (--j>=0 && min>threshold) {
+                h=hash64to32(*it,j);
+                min=cms_lambda_array[j] [h];
+            }
+            (min<threshold)? a+=1: NULL;
+            ++b;
+        }
+        return (2*a>b);
+    }
 
 
 
@@ -85,8 +108,7 @@ int CountMinSketch::addRead(const readNumericValues& read_val) {
     if (keep_r) {
         readNumericValues::const_iterator it;
         for (it=read_val.begin();it!=read_val.end();it++) {
-            /* Here, note that k-mer go by pair : 1rst 1 is k_mer and next one is its reverse complement*/
-            addKMer(*it,*(++it));
+            addKMer(*it);
         }
     }
     return keep_r;
diff --git a/src/CountMinSketch.h b/src/CountMinSketch.h
index cfba37e..2ad20c4 100644
--- a/src/CountMinSketch.h
+++ b/src/CountMinSketch.h
@@ -39,9 +39,9 @@ class CountMinSketch {
     unsigned char ** cms_lambda_array;
    // unsigned short * min_occ_array;
 
-   inline int hash64to32(unsigned long w ,int j) {
-       int pi_j=Pi_js[j];
-       return w % pi_j;
+   inline int hash64to32(const unsigned long& w ,const int& j) {
+       //int pi_j=Pi_js[j];
+       return w % Pi_js[j];
    }
 
     int hash64to32bs(unsigned long w,int j) { // bit shift version of hash function to start.
@@ -64,9 +64,20 @@ class CountMinSketch {
 
 
     // std::map<int,int> getIthArray(int); // utility method provided for testing only.
-    void addKMer(const unsigned long&,const unsigned long&); // inline? TODO: see later if it can help us gain time.
+    // void addKMer(const unsigned long&,const unsigned long&); // inline? TODO: see later if it can help us gain time.
+inline void addKMer(const unsigned long& val1) {
+    int h,j;
+    unsigned char cnt;
+    j=lambda;
+    while(--j>=0) {
+        h=hash64to32(val1,j);
+        cnt=cms_lambda_array[j] [h];
+        cnt++;
+        cms_lambda_array[j] [h]=(cnt & ubytemask);
+    }
+}
 
-    // int isRCovBelowThres(const readNumericValues& read_val, const int& threshold);
+    int isRCovBelowThres(const readNumericValues& read_val, const int& threshold);
     /*
      * Used to determine if a read must be kept or not.
      * Depending on the case, threshold is kappa or kappa_prime
@@ -74,7 +85,7 @@ class CountMinSketch {
      * If more than half of the k-mers have a number of occurrences that is less than threshold,
      * return true; else, return false.
      */
-    inline int isRCovBelowThres(const readNumericValues& read_val,const int& threshold) {
+    /*inline int isRCovBelowThres(const readNumericValues& read_val,const int& threshold) {
         int a=0;
         int b=0;
         int j;
@@ -92,7 +103,7 @@ class CountMinSketch {
             ++b;
         }
         return (2*a>b);
-    }
+    }*/
 
  /*   inline unsigned char CountMinSketch::isKmCoveBelowThres(const unsigned long& val, const int& threshold) {
         int h;
diff --git a/src/FqBaseBackend.cpp b/src/FqBaseBackend.cpp
index aab7a80..83e69c8 100644
--- a/src/FqBaseBackend.cpp
+++ b/src/FqBaseBackend.cpp
@@ -16,7 +16,7 @@
 
 #include "FqBaseBackend.h"
 
-#define DEBUG
+//#define DEBUG
 #ifdef DEBUG
 #include <cassert>
 #include <iostream>
diff --git a/src/ReadProcessor.cpp b/src/ReadProcessor.cpp
index 8eda5ad..85f761b 100644
--- a/src/ReadProcessor.cpp
+++ b/src/ReadProcessor.cpp
@@ -112,8 +112,6 @@ unsigned long ReadProcessor::kMerToNumberReverse(char * k_m,unsigned long * p_pr
 }
 
 void ReadProcessor::getKMerNumbers(char * dnaStr,int l,std::vector<unsigned long>& my_vect) { // See simple_test.cpp and results. benchmark showed that std::vector is very slightly faster than C array and doesn't require more memory in our case. So, I am using it since it makes code simpler.
-    /*std::vector<unsigned long> my_vect;
-    return my_vect;*/
     int i;
     unsigned long num;
     unsigned long num_rev;
@@ -122,13 +120,10 @@ void ReadProcessor::getKMerNumbers(char * dnaStr,int l,std::vector<unsigned long
     char * p_char=dnaStr;
     int nb_k_m=l-k+1;
     for (i=0; i<nb_k_m;i++) {
-        /*std::cout<<"processing k_mer number: "<<i<<std::endl;
-        std::cout<<p_char<<std::endl;*/
         num=kMerToNumber(p_char,p_prev);
         num_rev=kMerToNumberReverse(p_char,p_prev_rev);
         if (n<0) {
-            my_vect.push_back(num);
-            my_vect.push_back(num_rev);
+            (num<num_rev)?my_vect.push_back(num_rev):my_vect.push_back(num); // finding a k-mer is equivalent to finding its reverse complement; as numeric values for k-mers are unique, just take into account the biggest of the 2.
         }
         p_char++;
         p_prev=&num;
diff --git a/src/ReadProcessor.h b/src/ReadProcessor.h
index e576c56..be4d4de 100644
--- a/src/ReadProcessor.h
+++ b/src/ReadProcessor.h
@@ -8,7 +8,7 @@
 #include <vector>
 #include <math.h>
 
-#define DEBUG
+//#define DEBUG
 #ifdef DEBUG
 #include <iostream>
 using namespace std;
diff --git a/src/fqwriter.cpp b/src/fqwriter.cpp
index d39609d..ca6f2b2 100644
--- a/src/fqwriter.cpp
+++ b/src/fqwriter.cpp
@@ -10,7 +10,7 @@
 #include "srp.h"
 #include "FqBaseBackend.h"
 
-#define DEBUG
+//#define DEBUG
 #ifdef DEBUG
 #include <iostream>
 using namespace std;
diff --git a/src/read_utils.cpp b/src/read_utils.cpp
index f211ef7..01c91ce 100644
--- a/src/read_utils.cpp
+++ b/src/read_utils.cpp
@@ -16,7 +16,7 @@
 #include "srp.h"
 #include "read_utils.h"
 #include "rock_commons.h"
-#define DEBUG
+//#define DEBUG
 
 #ifdef DEBUG
 #include <cassert>
@@ -30,7 +30,9 @@ void getFqRecord(FqBaseBackend* fq_files_be[],const unsigned char& f_id,const un
     try {
         be->getRead(offset,fq_record);
     } catch (int e) {
+#ifdef DEBUG
         cout<<"couldn't retrieve read at offset: "<<offset<<endl;
+#endif
         throw e;
     }
 }
@@ -91,10 +93,13 @@ void getDNASeqstr(FqBaseBackend* fq_files_be [],
         getFqRecord(fq_files_be,f_id1,offset1,dna_seqs[0].fq_record_buf);
         processFqRecord(p_dna_seqs);
     } catch (int e) {
+#ifdef DEBUG
         cout<<"j="<<j<<" sr content: a1="<<sr.read_a1<<" a2= "<<sr.read_a2<<endl;
         cout<<"pb reading/seeking at offset1="<<offset1<<endl;
         cout<<"got: "<<endl;
         cout<<dna_seqs[0].fq_record_buf<<endl;
+#endif
+        exit(e); 
     }
     if (f_id2!=0) { // case of PE reads.
         p_dna_seqs+=1;
@@ -102,10 +107,13 @@ void getDNASeqstr(FqBaseBackend* fq_files_be [],
         try {
             getFqRecord(fq_files_be,f_id2,offset2,dna_seqs[1].fq_record_buf);
         } catch (int e) {
+#ifdef DEBUG
             cout<<"j="<<j<<" sr content: a1="<<sr.read_a1<<" a2= "<<sr.read_a2<<endl;
             cout<<"pb reading/seeking at offset2="<<offset2<<endl;
             cout<<"got: "<<endl;
             cout<<dna_seqs[1].fq_record_buf<<endl;
+#endif
+            exit(e);
         }
         processFqRecord(p_dna_seqs);
     }
diff --git a/src/unit_test_cms.cpp b/src/unit_test_cms.cpp
index 42a136b..0162ffb 100644
--- a/src/unit_test_cms.cpp
+++ b/src/unit_test_cms.cpp
@@ -36,11 +36,12 @@ void test_CMS(int lambda,int kappa,int kappa_prime) {
     int rej_expected=0;
     int ret;
     for (i=0;i<num;i++) {
-        cms.addKMer(num,num-1);
+        cms.addKMer(num);
     }
     // Now, check that our k-mer is present in the CMS.
     int min=cms.getEstimatedNbOcc(num);
-    assert((min==num) || (min==num-CountMinSketch::ushortmask-1)); // addKmer doesn't check kappa.
+    std::cout<<"min="<<min<<endl;
+    assert((min==num) || (min==num-CountMinSketch::ubytemask-1)); // addKmer doesn't check kappa.
     
     // mimic a read (as given by part 2 output).
     readNumericValues read_values;
diff --git a/src/unit_test_read_utils.cpp b/src/unit_test_read_utils.cpp
index 19dc832..2fac279 100644
--- a/src/unit_test_read_utils.cpp
+++ b/src/unit_test_read_utils.cpp
@@ -146,11 +146,11 @@ void test_decomposeReadInkMerNums() {
     seq1->start_idx=12;
     seq1->length=75;
     decomposeReadInKMerNums(read_p,nbrKmerDecompo,k,a_seqs);
-    assert(nbrKmerDecompo.size()==142);
+    assert(nbrKmerDecompo.size()==71);
     readNumericValues::iterator it;
     for (it=nbrKmerDecompo.begin();it!=nbrKmerDecompo.end();it++) {
         // cout<<*it<<endl;
-        assert(*it==0 || *it==1023);
+        assert(*it==1023);
     }
     DnaSeqStr * seq2=&a_seqs[1];
     strcpy(seq2->fq_record_buf,"@another_fake_stuff\nATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCG\n+\n....@\n");
@@ -159,7 +159,7 @@ void test_decomposeReadInkMerNums() {
     nbrKmerDecompo.clear();
     decomposeReadInKMerNums(read_p,nbrKmerDecompo,k,a_seqs);
     //cout<<nbrKmerDecompo.size()<<endl;
-    assert(nbrKmerDecompo.size()==294);
+    assert(nbrKmerDecompo.size()==147);
 }
 
 
@@ -244,7 +244,7 @@ void testDNAToNumberMoreComplex() { // TODO in main, check that k<=32.
     unsigned long expected_rev=read_p.kMerToNumber((char *) "TATGGTGAAAACAGTTGGTGTTATGGTAGC",NULL);
 
     read_p.getKMerNumbers((char *) dnaStr,strlen(dnaStr),nbrKmerDecompo);
-    assert(nbrKmerDecompo.size()==nb_expected_k_mers*2); // for each k_mer, we also expect to have a number for its reverse complement.
+    assert(nbrKmerDecompo.size()==nb_expected_k_mers); // for each k_mer, we also expect to have a number for its reverse complement.
     int cnt_k_mer=0;
     unsigned long num,num2;
     for (it=nbrKmerDecompo.begin();it!=nbrKmerDecompo.end();it++) {
@@ -255,31 +255,32 @@ void testDNAToNumberMoreComplex() { // TODO in main, check that k<=32.
             assert(*it==1151987406330741231);
             num=*it;
         }
-        if (cnt_k_mer==3) {
+        if (cnt_k_mer==2) {
             // 2nd k-mer is : TTTTAGGTGCTACCATAACACCAACTGTTT
             num2=read_p.kMerToNumber((char *) "TTTTAGGTGCTACCATAACACCAACTGTTT",NULL);
             assert(*it==num2);
             unsigned long num3=read_p.kMerToNumber((char *) "TTTTAGGTGCTACCATAACACCAACTGTTT",&num);
             assert(num3==num2);
         }
-        if (cnt_k_mer==5) {
+        if (cnt_k_mer==3) {
             num=read_p.kMerToNumber((char *) "TTTAGGTGCTACCATAACACCAACTGTTTT",NULL);
-            assert(num==*it);
+            unsigned long num_rev=read_p.kMerToNumber((char *) "AAAACAGTTGGTGTTATGGTAGCACCTAAA",NULL);
+            assert(max(num,num_rev)==*it);
         }
-        if (cnt_k_mer==7) {
+        if (cnt_k_mer==4) {
             num=read_p.kMerToNumber((char *) "TTAGGTGCTACCATAACACCAACTGTTTTC",NULL);
             assert(num==*it);
         }
 
         // k-mer number 10 is GCTACCATAACACCAACTGTTTTCACCATA
         // its reverse complement is : TATGGTGAAAACAGTTGGTGTTATGGTAGC
-        if (cnt_k_mer==19) {
+        if (cnt_k_mer==10) {
             num=read_p.kMerToNumber((char *) "GCTACCATAACACCAACTGTTTTCACCATA",NULL);
-            assert(*it==704021989794238796);
+            assert(*it==max(704021989794238796,930978566989888201));
         }
-        if (cnt_k_mer==20) {
+        /*if (cnt_k_mer==20) {
             assert(*it==930978566989888201);
-        }
+        }*/
     }
 }
 
@@ -302,17 +303,12 @@ void testDNAToNumberWithN() {
 
     nbrKmerDecompo.reserve(nb_expected_k_mers);
     read_p.getKMerNumbers((char *) dnaStr,strlen(dnaStr),nbrKmerDecompo);
-    assert(nbrKmerDecompo.size()==nb_expected_k_mers*2);
+    assert(nbrKmerDecompo.size()==nb_expected_k_mers);
 
     it=nbrKmerDecompo.begin();
-    it+=22;
-    // The 1rst expected k-mer is the one after the 3rd N char (starting at position 47 in dnaStr). ATCAAGCATTAGAGACGCTTCTCTAATGTA
-    assert(*it==234837138459816172);
-    it++;
-    assert(*it==886965076742957027);
-
-
-
+    it+=11;
+    // The 1rst expected k-mer is the one after the 3rd N char (starting at position 47 in dnaStr). ATCAAGCATTAGAGACGCTTCTCTAATGTA or its reverse complement depending
+    assert(*it==max(234837138459816172,886965076742957027));
 }
 
 int main(int argc, char **argv) {
-- 
GitLab