From 91c02af3acbae520e4371b9a2819226fec36b6da Mon Sep 17 00:00:00 2001
From: Veronique Legrand <vlegrand@pasteur.fr>
Date: Fri, 15 Apr 2022 11:28:09 +0200
Subject: [PATCH] version 2.0_beta where PE reads are processed separately and
 selected or removed from the cms if at least one of the PE has a median
 coverage below threshold.

---
 configure.ac                 |   2 +-
 src/CountMinSketch.hpp       |  51 +++++++++++++----
 src/Filter.hpp               |   8 +--
 src/FqMainBackend.cpp        |  15 ++---
 src/FqMainBackend.h          |   6 +-
 src/ReadProcessor.cpp        |  10 ++--
 src/ReadProcessor.h          |   3 +-
 src/fqreader.cpp             |   4 +-
 src/read_utils.cpp           |  11 ++--
 src/read_utils.h             |   2 +-
 src/rock_commons.h           |   8 ++-
 src/unit_test_cms.cpp        |  30 ++++++++--
 src/unit_test_fqreader.cpp   | 105 +++++++++--------------------------
 src/unit_test_read_utils.cpp |  41 +++++---------
 test/rock_mem.sh             |  14 +++--
 15 files changed, 150 insertions(+), 160 deletions(-)

diff --git a/configure.ac b/configure.ac
index e2143e2..37c685a 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1,7 +1,7 @@
 dnl Process this file with autoconf to produce a configure script.
 
 AC_PREREQ(2.59)
-AC_INIT(rock, 1.9.6)
+AC_INIT(rock, 2.0_beta)
 
 
 AC_CANONICAL_SYSTEM
diff --git a/src/CountMinSketch.hpp b/src/CountMinSketch.hpp
index a8f0702..24d5d0d 100644
--- a/src/CountMinSketch.hpp
+++ b/src/CountMinSketch.hpp
@@ -80,7 +80,9 @@ private:
     }
 
 
-    inline int isRCovBelowThres(const readNumericValues& read_val, const int& threshold);
+
+
+    inline int isRCovBelowThres(const T_read_numericValues& read_val, const int& threshold);
 
     inline int isRCovBelowThresPartial(const readNumericValues& read_val, const int& threshold, const int& start=0,const int& stop=0);
 
@@ -94,10 +96,12 @@ public:
 
     CountMinSketch(int glambda,int gkappa,int gkappa_prime) {
         init(glambda,gkappa,gkappa_prime);
+        unsigned int test=hash64to32(657922856560023,0);
     }
 
     CountMinSketch(CMSparams parms) {
         init(parms.lambda,parms.kappa,parms.kappa_prime);
+        unsigned int test=hash64to32(657922856560023,0);
     }
 
     ~CountMinSketch() {
@@ -130,9 +134,9 @@ public:
     // keep that just for unit testing purpose
     T getEstimatedNbOcc(const unsigned long& val);
 
-    int addRead(const readNumericValues& read_val);
+    int addRead(const T_read_numericValues& read_val);
 
-    int isBeneathMinKappa(const readNumericValues&read_val);
+    int isBeneathMinKappa(const T_read_numericValues&read_val);
 
     unsigned long getNbDistinctKMers();
 
@@ -145,31 +149,54 @@ public:
  * Determines whether median of k-mer coverage is below threshold or not.
  */
 // this is a few seconds faster
-template<typename T> inline int CountMinSketch<T>::isRCovBelowThres(const readNumericValues& read_val, const int& threshold) {
+template<typename T> inline int CountMinSketch<T>::isRCovBelowThres(const T_read_numericValues& read_val, const int& threshold) {
     int PE1_below_thres;
+    int PE2_below_thres=0;
     int a1=0,a2=0;
-    int b1;
+    int b1,b2=0;
     int j;
     unsigned int h;
     T min;
 
-    int num=read_val.size();
-    b1=num;
+    int num=read_val.single_or_PE_val.size();
+
+    read_val.idx_start_PE2?b1=read_val.idx_start_PE2:b1=num;
+    read_val.idx_start_PE2?b2=num-b1:b2;
 
     // this proves faster than using iterators on these data (short)
-    long const * p_start=(long const * )&(read_val[0]); // TODO: when we'll use C++11: use the data() method of the vector class to access the underlying C array.
+    long const * p_start=(long const * )&(read_val.single_or_PE_val[0]); // TODO: when we'll use C++11: use the data() method of the vector class to access the underlying C array.
     int nb=0;
     while(nb<b1 && 2*a1<=b1) {
         j=lambda;
         min=mask;
         while (--j>=0 && min>threshold) {
+            //std::cout<<*(p_start+nb)<<std::endl;
+            //std::cout<<j<<std::endl;
             h=hash64to32(*(p_start+nb),j);
+            //std::cout<<h<<std::endl;
+            //unsigned int test=hash64to32(657922856560023,0);
+            
             min=cms_lambda_array[j] [h];
         }
         (min<threshold)? a1+=1:a1;
         nb++;
     }
     PE1_below_thres=2*a1>b1;
+    if (b2) {
+        nb=b1;
+        while(nb<num && 2*a2<=b2) {
+            j=lambda;
+            min=mask;
+            while (--j>=0 && min>threshold) {
+                h=hash64to32(*(p_start+nb),j);
+                min=cms_lambda_array[j] [h];
+            }
+            (min<threshold)? a2+=1:a2;
+            nb++;
+        }
+        PE2_below_thres=2*a2>b2;
+        return (PE1_below_thres || PE2_below_thres);
+    }
     return PE1_below_thres;
 }
 
@@ -252,14 +279,14 @@ template<typename T> T CountMinSketch<T>::getEstimatedNbOcc(const unsigned long&
     return min;
 }
 
-template<typename T> int CountMinSketch<T>::addRead(const readNumericValues& read_val) {
+template<typename T> int CountMinSketch<T>::addRead(const T_read_numericValues& read_val) {
     int keep_r=isRCovBelowThres(read_val,kappa);
     if (keep_r) {
         readNumericValues::const_iterator it;
         // according to the intel profiler, we spend a lot of time in this loop. Try to use something slightly faster.
-        long const * p=(long const *) &read_val[0];
+        long const * p=(long const *) &read_val.single_or_PE_val[0];
         int cnt;
-        int stop=read_val.size();
+        int stop=read_val.single_or_PE_val.size();
         for (cnt=0;cnt<stop;cnt++) {
             this->addKMer(*p);
             p=p+1;
@@ -268,7 +295,7 @@ template<typename T> int CountMinSketch<T>::addRead(const readNumericValues& rea
     return keep_r;
 }
 
-template<typename T> int CountMinSketch<T>::isBeneathMinKappa(const readNumericValues& read_val) {
+template<typename T> int CountMinSketch<T>::isBeneathMinKappa(const T_read_numericValues& read_val) {
     int res=isRCovBelowThres(read_val,kappa_prime);
     return res;
 }
diff --git a/src/Filter.hpp b/src/Filter.hpp
index 8537fa2..e439979 100644
--- a/src/Filter.hpp
+++ b/src/Filter.hpp
@@ -85,7 +85,7 @@ template <typename T> void Filter::underlyingfillCMS(FqBaseBackend* map_id_backe
     i_dim::iterator it_offs;
     k_dim::iterator it_struct;
     ReadProcessor read_p(k);
-    readNumericValues nbrKmerDecompo;
+    T_read_numericValues nbrKmerDecompo;
 
     // 1rst step, open files before fetching reads in them
     int i;
@@ -110,7 +110,7 @@ template <typename T> void Filter::underlyingfillCMS(FqBaseBackend* map_id_backe
                     // read is rejected, update rpos structure accordingly.
                     it_struct->fileid=0;
                 }
-                nbrKmerDecompo.clear();
+                nbrKmerDecompo.single_or_PE_val.clear();
             }
         }
     }
@@ -126,7 +126,7 @@ template <typename T> void Filter::underlyinglowFilterCMS(FqBaseBackend* map_id_
     k_dim::iterator it_struct;
     int ret;
     ReadProcessor read_p(k);
-    readNumericValues nbrKmerDecompo;
+    T_read_numericValues nbrKmerDecompo;
 
     // 1rst step, open files before fetching reads in them
     int i;
@@ -148,7 +148,7 @@ template <typename T> void Filter::underlyinglowFilterCMS(FqBaseBackend* map_id_
                 decomposeReadInKMerNums(read_p, nbrKmerDecompo,k,a_seqs);
                 ret=cms->isBeneathMinKappa(nbrKmerDecompo);
                 if (ret) it_struct->fileid=0;
-                nbrKmerDecompo.clear();
+                nbrKmerDecompo.single_or_PE_val.clear();
                 }
             }
         }
diff --git a/src/FqMainBackend.cpp b/src/FqMainBackend.cpp
index a259053..2e73578 100644
--- a/src/FqMainBackend.cpp
+++ b/src/FqMainBackend.cpp
@@ -71,9 +71,6 @@ void FqMainBackend::processFile(char * filename,unsigned char f_id) {
     size_t buffer_size;
     (test_mode)?buffer_size=test_bufsize:buffer_size=FqBaseBackend::bufsize;
 
-    //this->openUndefFile();
-    //if (p_auxFqProcessor!=NULL) p_auxFqProcessor->openUndefFile();
-
     int f_single=open(filename,O_RDONLY,mode);
     if (f_single==-1) {
         err(errno,"cannot open file: %s.",filename);
@@ -90,9 +87,6 @@ void FqMainBackend::processFile(char * filename,unsigned char f_id) {
         processBuf(buf_info,f_id,cur_offset);
     }
     close(f_single);
-    /*this->closeUndefFile();
-    if (p_auxFqProcessor!=NULL) p_auxFqProcessor->closeUndefFile();*/
-
     free(buf);
 }
 
@@ -117,14 +111,17 @@ void FqMainBackend::onEndFastqRecord(T_fq_rec_info& rec_info,const T_buf_info& b
      //debug_processBuf(on_record_end,bufinfo,rec_info.rstart_offset);
      int nb_k_mer=rec_info.nb_nucleotides_in_read+1-qual_thres.k;
      rec_info.nb_nucleotides_in_read_PE2>0?nb_k_mer_PE2=rec_info.nb_nucleotides_in_read_PE2+1-qual_thres.k:nb_k_mer_PE2=0;
-     if (!treat_PE_separately || (p_auxFqProcessor==NULL)) {
+     if (p_auxFqProcessor==NULL) { //case of single reads
          nb_k_mer+=nb_k_mer_PE2;
          //(nb_k_mer-rec_info.nb_k_mers_in_error-rec_info.nb_k_mers_in_error_in_PE2>=qual_thres.min_correct_k_mers_in_read)?ref_k_dim.push_back(rp):writeToUndefFile(bufinfo);
          if (nb_k_mer-rec_info.nb_k_mers_in_error-rec_info.nb_k_mers_in_error_in_PE2>=qual_thres.min_correct_k_mers_in_read) ref_k_dim.push_back(rp);
-     } else {
+     } else { // PE reads are processed separately by default
          // ((nb_k_mer_PE2-rec_info.nb_k_mers_in_error_in_PE2>=1) && (nb_k_mer-rec_info.nb_k_mers_in_error>=1))?ref_k_dim.push_back(rp):writeToUndefFile(bufinfo);
          // ((nb_k_mer_PE2-rec_info.nb_k_mers_in_error_in_PE2>=qual_thres.min_correct_k_mers_in_read) && (nb_k_mer-rec_info.nb_k_mers_in_error>=qual_thres.min_correct_k_mers_in_read))?ref_k_dim.push_back(rp):writeToUndefFile(bufinfo);
-    	 if ((nb_k_mer_PE2-rec_info.nb_k_mers_in_error_in_PE2>=qual_thres.min_correct_k_mers_in_read) && (nb_k_mer-rec_info.nb_k_mers_in_error>=qual_thres.min_correct_k_mers_in_read)) ref_k_dim.push_back(rp);
+    	 //if ((nb_k_mer_PE2-rec_info.nb_k_mers_in_error_in_PE2>=qual_thres.min_correct_k_mers_in_read) && (nb_k_mer-rec_info.nb_k_mers_in_error>=qual_thres.min_correct_k_mers_in_read)) ref_k_dim.push_back(rp);
+    	 cout<<"nb_k_mer_PE2="<<nb_k_mer_PE2<<" nb_k_mer="<<nb_k_mer<<endl;
+    	 cout<<" nb_k_mers_in_error_in_PE2="<<rec_info.nb_k_mers_in_error_in_PE2<<" nb_k_mers_in_error="<<rec_info.nb_k_mers_in_error<<endl;
+    	 if ((nb_k_mer_PE2-rec_info.nb_k_mers_in_error_in_PE2+nb_k_mer-rec_info.nb_k_mers_in_error>=qual_thres.min_correct_k_mers_in_read)) ref_k_dim.push_back(rp);
      }
      // empty buffer keeping current record
      cur_fq_record[0]='\0';
diff --git a/src/FqMainBackend.h b/src/FqMainBackend.h
index aad846b..050f1c9 100644
--- a/src/FqMainBackend.h
+++ b/src/FqMainBackend.h
@@ -32,7 +32,7 @@
 
 class FqMainBackend : public FqBaseBackend {
     // static int treat_PE_as_single;
-    static int treat_PE_separately;
+    //static int treat_PE_separately;
     unsigned long cnt_k_mers; //! counter for the total number of k-mers found in a single file or in a pair of PE files.
                               //! This includes k-mers in error.
     FqAuxBackend * p_auxFqProcessor; /* Another fastq processor component is necessary for handling the case of PE reads.*/
@@ -56,9 +56,9 @@ public:
 
     void processBuf(T_buf_info&,unsigned char f_id,unsigned long cur_offset);
 
-    static void setTreatPEMode(const int& treat_PE){
+    /*static void setTreatPEMode(const int& treat_PE){
         FqMainBackend::treat_PE_separately=treat_PE;
-    }
+    }*/
 
     unsigned long getCntKMers() {
     	return cnt_k_mers;
diff --git a/src/ReadProcessor.cpp b/src/ReadProcessor.cpp
index dde6617..66bec29 100644
--- a/src/ReadProcessor.cpp
+++ b/src/ReadProcessor.cpp
@@ -59,17 +59,17 @@ void ReadProcessor::kMerAndRevToNumber(char * k_m,KmRevNumbers& numbers) {
 
 
 
-void ReadProcessor::getKMerNumbers(char * dnaStr,int l,std::vector<unsigned long>& my_vect) { // See simple_test.cpp and results. benchmark showed that std::vector is very slightly faster than C array and doesn't require more memory in our case. So, I am using it since it makes code simpler.
+void ReadProcessor::getKMerNumbers(char * dnaStr,int l,readNumericValues& all_values) { // See simple_test.cpp and results. benchmark showed that std::vector is very slightly faster than C array and doesn't require more memory in our case. So, I am using it since it makes code simpler.
     int i;
     KmRevNumbers numbers;
     initNumbers(numbers);
     char * p_char=dnaStr;
     int nb_k_m=l-k+1;
     for (i=0; i<nb_k_m;i++) {
-	kMerAndRevToNumber(p_char,numbers);
-        if (n<0) {
-            (numbers.nbr<numbers.nbr_rev)?my_vect.push_back(numbers.nbr_rev):my_vect.push_back(numbers.nbr); // finding a k-mer is equivalent to finding its reverse complement; as numeric values for k-mers are unique, just take into account the biggest of the 2.
-        }
+    	kMerAndRevToNumber(p_char,numbers);
+    	if (n<0) {
+    		(numbers.nbr<numbers.nbr_rev)?all_values.push_back(numbers.nbr_rev):all_values.push_back(numbers.nbr); // finding a k-mer is equivalent to finding its reverse complement; as numeric values for k-mers are unique, just take into account the biggest of the 2.
+    	}
         p_char++;
     }
 }
diff --git a/src/ReadProcessor.h b/src/ReadProcessor.h
index 8caa552..f3396cd 100644
--- a/src/ReadProcessor.h
+++ b/src/ReadProcessor.h
@@ -33,6 +33,7 @@
 
 
 #include <iostream>
+#include "rock_commons.h"
 using namespace std;
 
 
@@ -132,7 +133,7 @@ public:
     void kMerAndRevToNumber(char * k_m,KmRevNumbers& numbers);
     unsigned long nucleoToNumberWrap(char s);
 
-    void getKMerNumbers(char * dnaStr,int l,std::vector<unsigned long>& my_vect);
+    void getKMerNumbers(char * dnaStr,int l,readNumericValues&);
 
 
 };
diff --git a/src/fqreader.cpp b/src/fqreader.cpp
index 80f7fde..0a0a3a4 100644
--- a/src/fqreader.cpp
+++ b/src/fqreader.cpp
@@ -38,7 +38,7 @@
 
 
 FasqQualThreshold FqBaseBackend::qual_thres;
-int FqMainBackend::treat_PE_separately;
+//int FqMainBackend::treat_PE_separately;
 
 /*
  * Processes 1 file containing single reads.
@@ -80,7 +80,7 @@ unsigned long processInputFiles(const std::vector<IO_fq_files>& single_files,\
     unsigned long tot_nb_kmers=0;
 	unsigned char f_id=1;
     FqBaseBackend::setQualThreshold(a_qual_thres);
-    FqMainBackend::setTreatPEMode(PE_process_mode);
+    //FqMainBackend::setTreatPEMode(PE_process_mode);
     std::vector<IO_fq_files>::const_iterator it_single;
     for (it_single=single_files.begin();it_single!=single_files.end();it_single++) {
         FqMainBackend * be_fq=new FqMainBackend(io_sr);
diff --git a/src/read_utils.cpp b/src/read_utils.cpp
index d223fca..7f71a4f 100644
--- a/src/read_utils.cpp
+++ b/src/read_utils.cpp
@@ -149,19 +149,22 @@ void getDNASeqstr(FqBaseBackend* fq_files_be [],
 
 }
 
-void decomposeReadInKMerNums(ReadProcessor& read_p, readNumericValues& nbrKmerDecompo,int k,DnaSeqStr a_seqs[2]) {
+void decomposeReadInKMerNums(ReadProcessor& read_p, T_read_numericValues& nbrKmerDecompo,int k,DnaSeqStr a_seqs[2]) {
     int nb_expected_k_mers,nb_expected_k_mers_PE1,nb_expected_k_mers_PE2=0;
     nb_expected_k_mers_PE1=a_seqs[0].length+1-k;
+    nbrKmerDecompo.idx_start_PE2=0;
     if (a_seqs[1].length) nb_expected_k_mers_PE2=a_seqs[1].length+1-k;
     nb_expected_k_mers=nb_expected_k_mers_PE1+nb_expected_k_mers_PE2;
-    nbrKmerDecompo.reserve(nb_expected_k_mers);
+    nbrKmerDecompo.single_or_PE_val.reserve(nb_expected_k_mers);
+    //nbrKmerDecompo.nb_kmer_PE1=nb_expected_k_mers_PE1;
 
     char * start_dna_str=a_seqs[0].fq_record_buf+a_seqs[0].start_idx;
-    read_p.getKMerNumbers(start_dna_str,a_seqs[0].length,nbrKmerDecompo);
+    read_p.getKMerNumbers(start_dna_str,a_seqs[0].length,nbrKmerDecompo.single_or_PE_val);
 
     if (a_seqs[1].length) { // case of PE reads
+    	nbrKmerDecompo.idx_start_PE2=nb_expected_k_mers_PE1;
         start_dna_str=a_seqs[1].fq_record_buf+a_seqs[1].start_idx;
-        read_p.getKMerNumbers(start_dna_str,a_seqs[1].length,nbrKmerDecompo);
+        read_p.getKMerNumbers(start_dna_str,a_seqs[1].length,nbrKmerDecompo.single_or_PE_val);
     }
 }
 
diff --git a/src/read_utils.h b/src/read_utils.h
index 5bb504c..aaf064f 100644
--- a/src/read_utils.h
+++ b/src/read_utils.h
@@ -36,6 +36,6 @@ void getDNASeqstr(FqBaseBackend* [],
 
 void init_DnaSeqStr(DnaSeqStr * dna_seq);
 
-void decomposeReadInKMerNums(ReadProcessor& read_p, readNumericValues& nbrKmerDecompo,int k,DnaSeqStr a_seqs[2]);
+void decomposeReadInKMerNums(ReadProcessor& read_p, T_read_numericValues& nbrKmerDecompo,int k,DnaSeqStr a_seqs[2]);
 
 #endif /* READ_UTILS_H_ */
diff --git a/src/rock_commons.h b/src/rock_commons.h
index f4647d3..cfaf09c 100644
--- a/src/rock_commons.h
+++ b/src/rock_commons.h
@@ -70,11 +70,13 @@ typedef struct {
 }PE_files;
 
 
-typedef std::vector<unsigned long> readNumericValues;
+typedef struct std::vector<unsigned long> readNumericValues;
 
-/*typedef struct {
+typedef struct {
     readNumericValues single_or_PE_val;
-}T_read_numericValues;*/
+    int idx_start_PE2; // In he case where PE reads are not treated as single;
+                       // indicate at which index the single_or_PE_val array starts containing values for PE2 k-mers.
+}T_read_numericValues;
 
 typedef struct {
 	unsigned int nb_input_reads;
diff --git a/src/unit_test_cms.cpp b/src/unit_test_cms.cpp
index 21bee4a..8a78c55 100644
--- a/src/unit_test_cms.cpp
+++ b/src/unit_test_cms.cpp
@@ -141,9 +141,10 @@ void test_CMS(int lambda,int kappa,int kappa_prime) {
     assert(nb_distinct_k_mers==1);
 
     // mimic a read (as given by part 2 output).
-    readNumericValues read_values;
+    T_read_numericValues read_values;
+    read_values.idx_start_PE2=0;
     for (i=1;i<=400;i++) {
-        read_values.push_back(i*1000-1);
+        read_values.single_or_PE_val.push_back(i*1000-1);
     }
     ret=cms.addRead(read_values); // read should be accepted since none of the values that it contains is equal to those I inserted previously.
     assert(ret!=rej_expected);
@@ -151,15 +152,32 @@ void test_CMS(int lambda,int kappa,int kappa_prime) {
     nb_distinct_k_mers=cms.getNbDistinctKMers();
     assert(nb_distinct_k_mers==401);
 
-    // mimic a vector that contains a lot of k_mers already present in the CMS and check that it is rejected.
-    readNumericValues read_values2;
+    // mimic a vector that contains a lot of k_mers already present in the CMS in its PE2 but not in its PE1 and check that it is kept.
+    T_read_numericValues read_values2;
+    read_values2.idx_start_PE2=200;
     for (i=1;i<=199;i++) {
-        read_values2.push_back(i*1000-1);
+        read_values2.single_or_PE_val.push_back(i*1000-1);
     }
     for (i=200;i<=400;i++) {
-        read_values2.push_back(num);
+        read_values2.single_or_PE_val.push_back(num);
     }
     ret=cms.addRead(read_values2);
+    assert(ret!=rej_expected);
+
+    nb_distinct_k_mers=cms.getNbDistinctKMers();
+    assert(nb_distinct_k_mers==401);
+    
+    //mimic a vector that contains in both its PE1 and PE2 parts a lot of kmers already in the CMS and check that it is rejected.
+    T_read_numericValues read_values3;
+    read_values3.idx_start_PE2=200;
+    for (i=1;i<=160;i++) {
+        read_values3.single_or_PE_val.push_back(num);
+    }
+    for (i=61;i<=99;i++) read_values3.single_or_PE_val.push_back(num-1);
+    for (i=200;i<=400;i++) {
+        read_values3.single_or_PE_val.push_back(num);
+    }
+    ret=cms.addRead(read_values3);
     assert(ret==rej_expected);
 
     nb_distinct_k_mers=cms.getNbDistinctKMers();
diff --git a/src/unit_test_fqreader.cpp b/src/unit_test_fqreader.cpp
index 46c3df9..9756c32 100644
--- a/src/unit_test_fqreader.cpp
+++ b/src/unit_test_fqreader.cpp
@@ -47,7 +47,7 @@ const int treat_PE_as_single=0;
 void test_processSingleFile() {
     srp sr;
     unsigned char f_id=1;
-    FqMainBackend::setTreatPEMode(0);
+    //FqMainBackend::setTreatPEMode(0);
     processSingleFile((char *) "../test/data/unit/test_single.fq",f_id,&sr);
     srp::reverse_iterator rit;
     i_dim::iterator it_offs;
@@ -92,7 +92,7 @@ void test_processSingleFileWithMQOption() {
     qual_thres.nucl_score_threshold=14+k_phred_32;
     qual_thres.min_correct_k_mers_in_read=100;
     FqBaseBackend::setQualThreshold(qual_thres);
-    FqMainBackend::setTreatPEMode(0);
+    //FqMainBackend::setTreatPEMode(0);
     FqMainBackend be_fq=FqMainBackend(&sr);
     
     //be_fq.setUndefFile((char *) "../test/data/unit/test_single.undef.fq");
@@ -170,7 +170,7 @@ void test_processPEfilesWithA() {
     unsigned char f_id4=4;
 
     srp sr;
-    FqMainBackend::setTreatPEMode(0);
+    //FqMainBackend::setTreatPEMode(0);
     processPEFiles(fq_3_test, f_id3,fq_4_test, f_id4,&sr);
     srp::reverse_iterator rit;
     i_dim::iterator it_offs;
@@ -227,7 +227,7 @@ void test_processPEFiles() {
     unsigned char f_id2=2;
 
     srp sr;
-    FqMainBackend::setTreatPEMode(0);
+    //FqMainBackend::setTreatPEMode(0);
     processPEFiles(fq_1_test, f_id1,fq_2_test, f_id2,&sr);
     srp::reverse_iterator rit;
     i_dim::iterator it_offs;
@@ -286,7 +286,7 @@ void aux_testPEFilesMQ(FasqQualThreshold qual_thres,int nb_expected_reads) {
 
     unsigned char f_id1=1;
     unsigned char f_id2=2;
-    FqMainBackend::setTreatPEMode(0);
+    //FqMainBackend::setTreatPEMode(0);
     FqBaseBackend::setQualThreshold(qual_thres);
     processPEFiles(fq_1_test, f_id1,fq_2_test, f_id2,&sr);
 
@@ -315,7 +315,7 @@ void test_processPEFilesWithMQOptions() {
     qual_thres.k=20;
     qual_thres.nucl_score_threshold=14+k_phred_32;
     qual_thres.min_correct_k_mers_in_read=76;
-    FqMainBackend::setTreatPEMode(0);
+    //FqMainBackend::setTreatPEMode(0);
 
     aux_testPEFilesMQ(qual_thres,4); // last fq records contains only 77 correct k-mers.
 
@@ -368,23 +368,17 @@ void test_processAllFiles() {
     unsigned char f_single=3;
 
     srp sr;
-    FqMainBackend::setTreatPEMode(0);
+    //FqMainBackend::setTreatPEMode(0);
     processPEFiles(fq_1_test, f_id1,fq_2_test, f_id2,&sr);
     processSingleFile(fq_single,f_single,&sr);
 
     check_processAllFilesResults(sr);
 }
 
-void test_processPE_not_as_single() {
+void test_processPE_nuclthreshold() {
     char * fq_1_test=(char *) "../test/data/unit/test_PE1_PE_not_as_single.fq";
     char * fq_2_test=(char *) "../test/data/unit/test_PE2_PE_not_as_single.fq";
 
-    char * fq_1_test_undef=(char *) "../test/data/unit/test_PE1_PE_not_as_single.undef.fastq";
-    char * fq_2_test_undef=(char *) "../test/data/unit/test_PE2_PE_not_as_single.undef.fastq";
-
-    char * fq_1_expected_undef=(char *) "../test/data/unit/expected_PE1_PE_not_as_single.undef.fastq";
-    char * fq_2_expected_undef=(char *) "../test/data/unit/expected_PE2_PE_not_as_single.undef.fastq";
-
     unsigned char f_id1=1;
     unsigned char f_id2=2;
     srp sr;
@@ -395,7 +389,6 @@ void test_processPE_not_as_single() {
     qual_thres.nucl_score_threshold=2+k_phred_32;
 
     FqBaseBackend::setQualThreshold(qual_thres);
-    FqMainBackend::setTreatPEMode(1);
     processPEFiles(fq_1_test, f_id1,fq_2_test, f_id2,&sr,1000);
 
     srp::reverse_iterator rit;
@@ -411,18 +404,13 @@ void test_processPE_not_as_single() {
             }
         }
     }
-    assert(cnt_read==1);
-
-    /*assert(compareFilesLileByLine(fq_1_test_undef,fq_1_expected_undef)==0);
-    assert(compareFilesLileByLine(fq_2_test_undef,fq_2_expected_undef)==0);
+    assert(cnt_read==3);
 
-    assert(remove((char *) "../test/data/unit/test_PE1_PE_not_as_single.undef.fastq")==0);
-    assert(remove((char *) "../test/data/unit/test_PE2_PE_not_as_single.undef.fastq")==0);*/
 
 }
 
 
-void test_processPE_not_as_single2() {
+void test_processPE_nuclthreshold2() {
     char * fq_1_test=(char *) "../test/data/unit/test_PE1_PE_not_as_single_pathological.fq";
     char * fq_2_test=(char *) "../test/data/unit/test_PE2_PE_not_as_single_pathological.fq";
 
@@ -435,11 +423,10 @@ void test_processPE_not_as_single2() {
 
     FasqQualThreshold qual_thres;
     qual_thres.k=30;
-    qual_thres.min_correct_k_mers_in_read=1;
+    qual_thres.min_correct_k_mers_in_read=221;
     qual_thres.nucl_score_threshold=10+k_phred_32;
 
     FqBaseBackend::setQualThreshold(qual_thres);
-    FqMainBackend::setTreatPEMode(1);
     processPEFiles(fq_1_test, f_id1,fq_2_test, f_id2,&sr,1000);
 
     srp::reverse_iterator rit;
@@ -448,20 +435,14 @@ void test_processPE_not_as_single2() {
     int cnt_read=0;
 
     for (rit=sr.rbegin(); rit!=sr.rend(); ++rit) { //process map in reverse order (by decreasing scores).
-        //unsigned long score=rit->first;
         for (it_offs=rit->second.begin();it_offs!=rit->second.end();it_offs++) {
             for (it_struct=it_offs->second.begin();it_struct!=it_offs->second.end();it_struct++) {
                 cnt_read++;
             }
         }
     }
-    assert(cnt_read==0);
-
-    /*assert(compareFilesLileByLine(fq_1_test_undef,fq_1_test)==0);
-    assert(compareFilesLileByLine(fq_2_test_undef,fq_2_test)==0);
+    assert(cnt_read==2);
 
-    assert(remove((char *) "../test/data/unit/test_PE1_PE_not_as_single_pathological.undef.fastq")==0);
-    assert(remove((char *) "../test/data/unit/test_PE2_PE_not_as_single_pathological.undef.fastq")==0);*/
 
 }
 
@@ -469,9 +450,6 @@ void test_processPE_not_as_singleWithMQ() {
     char * fq_1_test=(char *) "../test/data/unit/fake_PE1.fq";
     char * fq_2_test=(char *) "../test/data/unit/fake_PE2.fq";
 
-    char * fq_1_test_undef=(char *) "../test/data/unit/fake_PE1.undef.fastq";
-    char * fq_2_test_undef=(char *) "../test/data/unit/fake_PE2.undef.fastq";
-
     unsigned char f_id1=1;
     unsigned char f_id2=2;
     srp sr;
@@ -482,7 +460,6 @@ void test_processPE_not_as_singleWithMQ() {
     qual_thres.nucl_score_threshold=10+k_phred_32;
 
     FqBaseBackend::setQualThreshold(qual_thres);
-    FqMainBackend::setTreatPEMode(1);
     processPEFiles(fq_1_test, f_id1,fq_2_test, f_id2,&sr,1000);
 
     srp::reverse_iterator rit;
@@ -491,7 +468,6 @@ void test_processPE_not_as_singleWithMQ() {
     int cnt_read=0;
 
     for (rit=sr.rbegin(); rit!=sr.rend(); ++rit) { //process map in reverse order (by decreasing scores).
-        //unsigned long score=rit->first;
         for (it_offs=rit->second.begin();it_offs!=rit->second.end();it_offs++) {
             for (it_struct=it_offs->second.begin();it_struct!=it_offs->second.end();it_struct++) {
                 cnt_read++;
@@ -499,27 +475,19 @@ void test_processPE_not_as_singleWithMQ() {
         }
     }
     assert(cnt_read==1);
-    /*struct stat fileStat;
-    stat(fq_1_test_undef, &fileStat);
-    assert(fileStat.st_size==0);
-    stat(fq_2_test_undef, &fileStat);
-    assert(fileStat.st_size==0);*/
     sr.clear();
 
+    cout<<"try with a different min_correct_k_mers_in_read=87"<<endl;
     qual_thres.k=30;
-    qual_thres.min_correct_k_mers_in_read=45;
+    qual_thres.min_correct_k_mers_in_read=87;
     qual_thres.nucl_score_threshold=15+k_phred_32;
 
     FqBaseBackend::setQualThreshold(qual_thres);
-    FqMainBackend::setTreatPEMode(1);
     processPEFiles(fq_1_test, f_id1,fq_2_test, f_id2,&sr,1000);
 
-
-
     cnt_read=0;
 
     for (rit=sr.rbegin(); rit!=sr.rend(); ++rit) { //process map in reverse order (by decreasing scores).
-        //unsigned long score=rit->first;
         for (it_offs=rit->second.begin();it_offs!=rit->second.end();it_offs++) {
             for (it_struct=it_offs->second.begin();it_struct!=it_offs->second.end();it_struct++) {
                 cnt_read++;
@@ -527,25 +495,21 @@ void test_processPE_not_as_singleWithMQ() {
         }
     }
     assert(cnt_read==0);
-
     sr.clear();
 
-    /*assert(compareFilesLileByLine(fq_1_test_undef,fq_1_test)==0);
-    assert(compareFilesLileByLine(fq_2_test_undef,fq_2_test)==0);*/
+    cout<<"try with a different min_correct_k_mers_in_read=13"<<endl;
 
     qual_thres.k=30;
     qual_thres.min_correct_k_mers_in_read=13;
     qual_thres.nucl_score_threshold=18+k_phred_32;
 
     FqBaseBackend::setQualThreshold(qual_thres);
-    FqMainBackend::setTreatPEMode(1);
     processPEFiles(fq_1_test, f_id1,fq_2_test, f_id2,&sr,1000);
 
 
     cnt_read=0;
 
     for (rit=sr.rbegin(); rit!=sr.rend(); ++rit) { //process map in reverse order (by decreasing scores).
-        //unsigned long score=rit->first;
         for (it_offs=rit->second.begin();it_offs!=rit->second.end();it_offs++) {
             for (it_struct=it_offs->second.begin();it_struct!=it_offs->second.end();it_struct++) {
                 cnt_read++;
@@ -555,16 +519,6 @@ void test_processPE_not_as_singleWithMQ() {
     assert(cnt_read==1);
     sr.clear();
 
-
-    /*stat(fq_1_test_undef, &fileStat);
-    assert(fileStat.st_size==0);
-    stat(fq_2_test_undef, &fileStat);
-    assert(fileStat.st_size==0);
-
-
-    assert(remove((char *) fq_1_test_undef)==0);
-    assert(remove((char *) fq_2_test_undef)==0);*/
-
 }
 
 
@@ -591,7 +545,7 @@ void test_processInputFiles() {
     default_qual_thres.min_correct_k_mers_in_read=0; // aim of that test is not to check undef file creation. Disable it by putting 0 here
     default_qual_thres.nucl_score_threshold=0; // leave default values for that test
     
-    FqMainBackend::setTreatPEMode(treat_PE_as_single);
+    //FqMainBackend::setTreatPEMode(treat_PE_as_single);
 
     FqBaseBackend * array_be[k_max_input_files];
     processInputFiles(v_single,v_pe,array_be,default_qual_thres,&sr,1);
@@ -638,7 +592,7 @@ AAAAAEEAEEEEEEEEE6EE/EEEEEEEEAEEAEEEEEEEEEEEEEEAEEEA/A/EEEAEEEEEE/EE</EAEEEEEE/E
     qual_thres.min_correct_k_mers_in_read=78;
     T_buf_info buf_info;
     FqBaseBackend::setQualThreshold(qual_thres);
-    FqMainBackend::setTreatPEMode(treat_PE_as_single);
+    //FqMainBackend::setTreatPEMode(treat_PE_as_single);
     FqMainBackend be(&sr);
     buf_info.buf=buf;
     buf_info.pchar=buf;
@@ -680,7 +634,7 @@ AAAAAEEEEEEEEEEEEE/EEEEEEEEEEEEEEEEEEEEE<EEEAEEEEA<EAEEEEEEEAAAAE<EEEE/EEEEAAEEE
 	qual_thres.nucl_score_threshold=k_phred_32;
 	qual_thres.k=25;
 	FqBaseBackend::setQualThreshold(qual_thres);
-	FqMainBackend::setTreatPEMode(treat_PE_as_single);
+	//FqMainBackend::setTreatPEMode(treat_PE_as_single);
 	FqMainBackend be(&sr);
 	buf_info.buf=buf;
 	buf_info.pchar=buf;
@@ -751,7 +705,7 @@ AAAAAEEEEEEEEEEEEE/EEEEEEEEEEEEEEEEEEEEE<EEEAEEEEA<EAEEEEEEEAAAAE<EEEE/EEEEAAEEE
     qual_thres.nucl_score_threshold=k_phred_32;
     qual_thres.k=25;
     FqBaseBackend::setQualThreshold(qual_thres);
-    FqMainBackend::setTreatPEMode(treat_PE_as_single);
+    //FqMainBackend::setTreatPEMode(treat_PE_as_single);
     FqMainBackend be(&sr);
     FqAuxBackend be2;
 
@@ -820,7 +774,7 @@ AAAAAEEEEEEEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEAEEEEEEEEEEEEEEAEEEEEEEEEEEE
     T_buf_info buf_info;
     T_buf_info PE2_buf_info;
     FqBaseBackend::setQualThreshold(qual_thres);
-    FqMainBackend::setTreatPEMode(0);
+    //FqMainBackend::setTreatPEMode(0);
 
     FqMainBackend be(&sr);
 
@@ -861,18 +815,11 @@ void Aux_MimicBigPEFilesWithMQOptions(const FasqQualThreshold& qual_thres,const
     char * fq_1_test_undef=(char *) "../test/data/unit/09-4607_S43_R1.undef.fastq";
     char * fq_2_test_undef=(char *) "../test/data/unit/09-4607_S43_R2.undef.fastq";
 
-    char * fq_1_expected_undef_100=(char *) "../test/data/unit/thres_100_09-4607_S43_R1.undef.fastq";
-    char * fq_2_expected_undef_100=(char *) "../test/data/unit/thres_100_09-4607_S43_R2.undef.fastq";
-
-    char * fq_1_expected_undef_180=(char *) "../test/data/unit/thres_180_09-4607_S43_R1.undef.fastq";
-    char * fq_2_expected_undef_180=(char *) "../test/data/unit/thres_180_09-4607_S43_R2.undef.fastq";
-
     unsigned char f_id1=1;
     unsigned char f_id2=2;
 
-    // FqBaseBackend::bufsize=bufsize; // choose a very small buf size to mimic behavior on big files and reun the test in a short time
+
     FqBaseBackend::setQualThreshold(qual_thres);
-    FqMainBackend::setTreatPEMode(0);
 
     processPEFiles(fq_1_test, f_id1,fq_2_test, f_id2,&sr,bufsize);
 
@@ -937,11 +884,11 @@ int main(int argc, char **argv) {
     test_processPEFilesWithMQOptions();
     cout<<"test processing files with thresholds (nucleotide quality score and minimum number of valid k_mers in read) mimic big files that need several read operations. "<<endl;
     test_MimicBigPEFilesWithMQOptions();
-    cout<<"test processing PE files with nucleotide quality threshold and PE not treated as single option."<<endl;
-    test_processPE_not_as_single();
-    cout<<"test processing PE files with lots of errors in PE2; nucleotide quality threshold and PE not treated as single option."<<endl;
-    test_processPE_not_as_single2();
-    cout<<"test processing PE files with PE not treated as single; quality score threshold and minimum number of valid k-mer."<<endl;
+    cout<<"test processing PE files with nucleotide quality threshold."<<endl;
+    test_processPE_nuclthreshold();
+    cout<<"test processing PE files with lots of errors in PE2; nucleotide quality threshold"<<endl;
+    test_processPE_nuclthreshold2();
+    cout<<"test processing PE files; quality score threshold and minimum number of valid k-mer."<<endl;
     test_processPE_not_as_singleWithMQ();
     cout<<"done"<<endl;
 }
diff --git a/src/unit_test_read_utils.cpp b/src/unit_test_read_utils.cpp
index 44560b9..af81c08 100644
--- a/src/unit_test_read_utils.cpp
+++ b/src/unit_test_read_utils.cpp
@@ -42,8 +42,6 @@ void test_getReadSingle() {
     my_struct1=init_rpos(4,639);
     my_struct2=init_rpos(4,1228);
 
-    //std::cout<<my_struct1.read_a1<<endl;
-
     assert(my_struct1.read_a1==639);
     assert(my_struct2.read_a1==1228);
     srp io_sr;
@@ -147,7 +145,6 @@ void test_getReadPE() {
 
     be_fq1.closeInputFile();
     be_fq2.closeFile();
-    //cout<<"done"<<endl;
 }
 
 void test_getReadPEWithNQST() {
@@ -191,19 +188,17 @@ void test_getReadPEWithNQST() {
     std::cout<<dna_read<<endl;
     assert(strcmp(dna_read,"GGNTCTGTTGGCTCAAACGCCNTCACNTCNTTGNTCAAAAGCTTATAATGCGNGCCAAANTCCGCCATCGNGACNNCTACGCCTTCCCCNGCTNTCCCGNCAAANNCNNGTCTTGCCGGANCTTCNCNNNCTCCNNTCGAAAGCGGCGAAATCTTAGNGGAAGGNGGANATAATGCNNTCNCNTCGNACNNTGAANNTNTANNCGGCANNCNGCAGNNTCNAGGNCTTNCNGNGNAACGNTTAANNGCAGATGGCTACGGNTTTGCNGGGGNANGAGACNGGANANCNNNGNCGNTCGNCCN")==0);
 
-    /*getDnaStr(fic_map,my_struct2,a_seqs,dna_read);
-    assert(strcmp(dna_read,"ATTGTGGGGTTCCTTTTTGTAGCATTGGAATGGAAATTAAAAATGGGGCTTCAGGATGCCCGCTCCATTATTTAATTCCAGAATGTAACGATGCTGTTTACCGGGGGGACTGGAAAGATGCACTTGAGCTTTTGATTAAAACAAATAACATGCCCAGAACCAATCACTGCAATTTTTTTATCCCACCGCACTATCGGTGGAGTCGGCATGAACCAACCTAAACCAAACCCACGGTCAATAATAGCCCGTTCAATCGAATTAATACCCACAGCAGGATCAGAAATTGCAACCGTACAACTTC")==0);
-*/
+
     be_fq1.closeInputFile();
     be_fq2.closeFile();
-    //cout<<"done"<<endl;
 }
 
 void test_decomposeReadInkMerNums() {
     int k=5;
     ReadProcessor read_p(k);
-    readNumericValues nbrKmerDecompo;
     
+    T_read_numericValues nbrKmerDecompo;
+
     DnaSeqStr a_seqs[2];
     init_DnaSeqStr(&a_seqs[0]);
     init_DnaSeqStr(&a_seqs[1]);
@@ -212,26 +207,23 @@ void test_decomposeReadInkMerNums() {
     strcpy(seq1->fq_record_buf,"@fake_stuff\nAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\n+\n....@\n");
     seq1->start_idx=12;
     seq1->length=75;
-    decomposeReadInKMerNums(read_p,nbrKmerDecompo,k,a_seqs);
-    assert(nbrKmerDecompo.size()==71);
-    readNumericValues::iterator it;
-    for (it=nbrKmerDecompo.begin();it!=nbrKmerDecompo.end();it++) {
-        assert(*it==1023);
-    }
+
     DnaSeqStr * seq2=&a_seqs[1];
     strcpy(seq2->fq_record_buf,"@another_fake_stuff\nATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCG\n+\n....@\n");
     seq2->start_idx=20;
     seq2->length=80;
-    nbrKmerDecompo.clear();
     decomposeReadInKMerNums(read_p,nbrKmerDecompo,k,a_seqs);
-    assert(nbrKmerDecompo.size()==147);
+    assert(nbrKmerDecompo.single_or_PE_val.size()==71+76);
+    assert(nbrKmerDecompo.idx_start_PE2==71);
+    readNumericValues::iterator it;
+    readNumericValues::iterator it_end_PE1=nbrKmerDecompo.single_or_PE_val.begin()+71;
+    for (it=nbrKmerDecompo.single_or_PE_val.begin();it!=it_end_PE1;it++) {
+        assert(*it==1023);
+    }
+
+    nbrKmerDecompo.single_or_PE_val.clear();
 }
-/*
-void initNumbers(KmRevNumbers& numbers) {
-	numbers.first=1;
-	numbers.nbr=0;
-	numbers.nbr_rev=0;
-}*/
+
 
 void savNumbers(ReadProcessor::KmRevNumbers& numbers,ReadProcessor::KmRevNumbers& numbers_sav) {
 	numbers_sav.first=numbers.first;
@@ -384,12 +376,9 @@ void testDNAToNumberMoreComplex() { // TODO in main, check that k<=32.
         // k-mer number 10 is GCTACCATAACACCAACTGTTTTCACCATA
         // its reverse complement is : TATGGTGAAAACAGTTGGTGTTATGGTAGC
         if (cnt_k_mer==10) {
-            //num=read_p.kMerToNumber((char *) "GCTACCATAACACCAACTGTTTTCACCATA",NULL);
             assert(*it==max(704021989794238796,930978566989888201));
         }
-        /*if (cnt_k_mer==20) {
-            assert(*it==930978566989888201);
-        }*/
+
     }
 }
 
diff --git a/test/rock_mem.sh b/test/rock_mem.sh
index 7f3fcb0..1d9b0c0 100755
--- a/test/rock_mem.sh
+++ b/test/rock_mem.sh
@@ -139,17 +139,23 @@ test $ret2 -eq 0 || exit 143
 
 ../src/rock -C 100 -k 30 -c 1 -l 2 -q 13 -o ${srcdir}/data/iofiles.args/output_files_NQ_Thres_13.txt ${srcdir}/data/fastq.raw/klebsiella_100_1.fq,${srcdir}/data/fastq.raw/klebsiella_100_2.fq >/dev/null ||exit 145
 
-ret1=`diff tmp/klebsiella_100_1_13_qual_thres.fq tmp/klebsiella_100_1_12_qual_thres.fq|grep -c "@SRR1222430.37"`
+ret1=`diff tmp/klebsiella_100_1_13_qual_thres.fq tmp/klebsiella_100_1_12_qual_thres.fq|grep -c "@SRR1222430.14"`
 test $ret1 -eq  1 || exit 146
 
+ret1=`diff tmp/klebsiella_100_1_13_qual_thres.fq tmp/klebsiella_100_1_12_qual_thres.fq|grep -c "@SRR1222430.24"`
+test $ret1 -eq  1 || exit 1461
+
 ret2=`diff tmp/klebsiella_100_1_13_qual_thres.fq tmp/klebsiella_100_1_12_qual_thres.fq|grep -c "length"`
-test $ret2 -eq  2 || exit 147
+test $ret2 -eq  4 || exit 147
 
-ret1=`diff tmp/klebsiella_100_2_13_qual_thres.fq tmp/klebsiella_100_2_12_qual_thres.fq|grep -c "@SRR1222430.37"`
+ret1=`diff tmp/klebsiella_100_2_13_qual_thres.fq tmp/klebsiella_100_2_12_qual_thres.fq|grep -c "@SRR1222430.14"`
 test $ret1 -eq  1 || exit 148
 
+ret1=`diff tmp/klebsiella_100_2_13_qual_thres.fq tmp/klebsiella_100_2_12_qual_thres.fq|grep -c "@SRR1222430.24"`
+test $ret1 -eq  1 || exit 1481
+
 ret2=`diff tmp/klebsiella_100_2_13_qual_thres.fq tmp/klebsiella_100_2_12_qual_thres.fq|grep -c "length"`
-test $ret2 -eq  2 || exit 149
+test $ret2 -eq  4 || exit 149
 
 rm -fr tmp
 
-- 
GitLab