From 6ccf1b03248a425b542e9df577be6b2f9f78bd66 Mon Sep 17 00:00:00 2001
From: Veronique Legrand <vlegrand@pasteur.fr>
Date: Fri, 16 Jun 2017 14:33:45 +0200
Subject: [PATCH] version 1.7: users now have the possibility to specify a
 minimum number of k-mer when processing PE separately.

---
 src/FqMainBackend.cpp      |   3 +-
 src/ROCKparams.cpp         |   4 +-
 src/unit_test_fqreader.cpp | 104 +++++++++++++++++++++++++++++++++++++
 test/Makefile.am           |   7 +--
 test/data/unit/fake_PE1.fq |   4 ++
 test/data/unit/fake_PE2.fq |   4 ++
 test/rock.sh               |   2 +-
 test/rock_mem.sh           |  77 ++++++++++++++++++++++++++-
 8 files changed, 197 insertions(+), 8 deletions(-)
 create mode 100644 test/data/unit/fake_PE1.fq
 create mode 100644 test/data/unit/fake_PE2.fq

diff --git a/src/FqMainBackend.cpp b/src/FqMainBackend.cpp
index a76d6a8..025e505 100644
--- a/src/FqMainBackend.cpp
+++ b/src/FqMainBackend.cpp
@@ -108,7 +108,8 @@ void FqMainBackend::onEndFastqRecord(T_fq_rec_info& rec_info,const T_buf_info& b
          nb_k_mer+=nb_k_mer_PE2;
          (nb_k_mer-rec_info.nb_k_mers_in_error-rec_info.nb_k_mers_in_error_in_PE2>=qual_thres.min_correct_k_mers_in_read)?ref_k_dim.push_back(rp):writeToUndefFile(bufinfo);
      } else {
-         ((nb_k_mer_PE2-rec_info.nb_k_mers_in_error_in_PE2>=1) && (nb_k_mer-rec_info.nb_k_mers_in_error>=1))?ref_k_dim.push_back(rp):writeToUndefFile(bufinfo);
+         // ((nb_k_mer_PE2-rec_info.nb_k_mers_in_error_in_PE2>=1) && (nb_k_mer-rec_info.nb_k_mers_in_error>=1))?ref_k_dim.push_back(rp):writeToUndefFile(bufinfo);
+         ((nb_k_mer_PE2-rec_info.nb_k_mers_in_error_in_PE2>=qual_thres.min_correct_k_mers_in_read) && (nb_k_mer-rec_info.nb_k_mers_in_error>=qual_thres.min_correct_k_mers_in_read))?ref_k_dim.push_back(rp):writeToUndefFile(bufinfo);
      }
      // empty buffer keeping current record
      cur_fq_record[0]='\0';
diff --git a/src/ROCKparams.cpp b/src/ROCKparams.cpp
index db1fb94..7b9b086 100644
--- a/src/ROCKparams.cpp
+++ b/src/ROCKparams.cpp
@@ -339,10 +339,10 @@ void ROCKparams::initFromMainOptsArgs(int argc,char ** argv) {
             default:
                 usage(EXIT_FAILURE); break; }
     }
-    if (!PE_as_single && qual_thres.min_correct_k_mers_in_read!=1) {
+    /*if (!PE_as_single && qual_thres.min_correct_k_mers_in_read!=1) {
         cout<<"Incompatible options: when PE are processed independently, minimum number of correct k-mers in each end is 1."<<endl;
         usage(EXIT_FAILURE);
-    }
+    }*/
     parms.filter_PE_as_single=PE_as_single;
     //cout<<PE_as_single<<endl;
     processMainArgs(optind, argc, argv,v_input_lines);
diff --git a/src/unit_test_fqreader.cpp b/src/unit_test_fqreader.cpp
index 536ad49..d9771e4 100644
--- a/src/unit_test_fqreader.cpp
+++ b/src/unit_test_fqreader.cpp
@@ -448,6 +448,108 @@ void test_processPE_not_as_single2() {
 
 }
 
+void test_processPE_not_as_singleWithMQ() {
+    char * fq_1_test=(char *) "../test/data/unit/fake_PE1.fq";
+    char * fq_2_test=(char *) "../test/data/unit/fake_PE2.fq";
+
+    char * fq_1_test_undef=(char *) "../test/data/unit/fake_PE1.undef.fastq";
+    char * fq_2_test_undef=(char *) "../test/data/unit/fake_PE2.undef.fastq";
+
+    unsigned char f_id1=1;
+    unsigned char f_id2=2;
+    srp sr;
+
+    FasqQualThreshold qual_thres;
+    qual_thres.k=30;
+    qual_thres.min_correct_k_mers_in_read=64;
+    qual_thres.nucl_score_threshold=10+k_phred_32;
+
+    FqBaseBackend::setQualThreshold(qual_thres);
+    FqMainBackend::setTreatPEAsSingle(0);
+    processPEFiles(fq_1_test, f_id1,fq_2_test, f_id2,&sr,fq_1_test_undef,fq_2_test_undef,1000);
+
+    srp::reverse_iterator rit;
+    i_dim::iterator it_offs;
+    k_dim::iterator it_struct;
+    int cnt_read=0;
+
+    for (rit=sr.rbegin(); rit!=sr.rend(); ++rit) { //process map in reverse order (by decreasing scores).
+        //unsigned long score=rit->first;
+        for (it_offs=rit->second.begin();it_offs!=rit->second.end();it_offs++) {
+            for (it_struct=it_offs->second.begin();it_struct!=it_offs->second.end();it_struct++) {
+                cnt_read++;
+            }
+        }
+    }
+    assert(cnt_read==1);
+    struct stat fileStat;
+    stat(fq_1_test_undef, &fileStat);
+    assert(fileStat.st_size==0);
+    stat(fq_2_test_undef, &fileStat);
+    assert(fileStat.st_size==0);
+    sr.clear();
+
+    qual_thres.k=30;
+    qual_thres.min_correct_k_mers_in_read=45;
+    qual_thres.nucl_score_threshold=15+k_phred_32;
+
+    FqBaseBackend::setQualThreshold(qual_thres);
+    FqMainBackend::setTreatPEAsSingle(0);
+    processPEFiles(fq_1_test, f_id1,fq_2_test, f_id2,&sr,fq_1_test_undef,fq_2_test_undef,1000);
+
+
+
+    cnt_read=0;
+
+    for (rit=sr.rbegin(); rit!=sr.rend(); ++rit) { //process map in reverse order (by decreasing scores).
+        //unsigned long score=rit->first;
+        for (it_offs=rit->second.begin();it_offs!=rit->second.end();it_offs++) {
+            for (it_struct=it_offs->second.begin();it_struct!=it_offs->second.end();it_struct++) {
+                cnt_read++;
+            }
+        }
+    }
+    assert(cnt_read==0);
+
+    sr.clear();
+
+    assert(compareFilesLileByLine(fq_1_test_undef,fq_1_test)==0);
+    assert(compareFilesLileByLine(fq_2_test_undef,fq_2_test)==0);
+
+    qual_thres.k=30;
+    qual_thres.min_correct_k_mers_in_read=13;
+    qual_thres.nucl_score_threshold=18+k_phred_32;
+
+    FqBaseBackend::setQualThreshold(qual_thres);
+    FqMainBackend::setTreatPEAsSingle(0);
+    processPEFiles(fq_1_test, f_id1,fq_2_test, f_id2,&sr,fq_1_test_undef,fq_2_test_undef,1000);
+
+
+    cnt_read=0;
+
+    for (rit=sr.rbegin(); rit!=sr.rend(); ++rit) { //process map in reverse order (by decreasing scores).
+        //unsigned long score=rit->first;
+        for (it_offs=rit->second.begin();it_offs!=rit->second.end();it_offs++) {
+            for (it_struct=it_offs->second.begin();it_struct!=it_offs->second.end();it_struct++) {
+                cnt_read++;
+            }
+        }
+    }
+    assert(cnt_read==1);
+    sr.clear();
+
+
+    stat(fq_1_test_undef, &fileStat);
+    assert(fileStat.st_size==0);
+    stat(fq_2_test_undef, &fileStat);
+    assert(fileStat.st_size==0);
+
+
+    assert(remove((char *) fq_1_test_undef)==0);
+    assert(remove((char *) fq_2_test_undef)==0);
+
+}
+
 
 void test_processInputFiles() {
     char * fq_1_test=(char *) "../test/data/unit/test_PE1.fq";
@@ -722,5 +824,7 @@ int main(int argc, char **argv) {
     test_processPE_not_as_single();
     cout<<"test processing PE files with lots of errors in PE2; nucleotide quality threshold and PE not treated as single option."<<endl;
     test_processPE_not_as_single2();
+    cout<<"test processing PE files with PE not treated as single; quality score threshold and minimum number of valid k-mer."<<endl;
+    test_processPE_not_as_singleWithMQ();
     cout<<"done"<<endl;
 }
diff --git a/test/Makefile.am b/test/Makefile.am
index 029c5f5..cbaa008 100755
--- a/test/Makefile.am
+++ b/test/Makefile.am
@@ -17,8 +17,9 @@ data/unit/09-4607_S43_R2_big.fastq data/unit/thres_100_09-4607_S43_R1.undef.fast
 data/unit/thres_100_09-4607_S43_R2.undef.fastq data/unit/thres_180_09-4607_S43_R1.undef.fastq \
 data/unit/thres_180_09-4607_S43_R2.undef.fastq data/fastq.raw/klebsiella_5_1_bad_scores.fq \
 data/fastq.raw/klebsiella_5_2_bad_scores.fq data/iofiles.args/output_files_2_PE_separated.txt \
-data/unit/test_PE1_PE_not_as_single.fq data/unit/test_PE2_PE_not_as_single.fq data/unit/expected_PE1_PE_not_as_single.undef.fastq\
-data/unit/expected_PE2_PE_not_as_single.undef.fastq \
-data/unit/test_PE1_PE_not_as_single_pathological.fq data/unit/test_PE2_PE_not_as_single_pathological.fq
+data/unit/test_PE1_PE_not_as_single.fq data/unit/test_PE2_PE_not_as_single.fq \
+data/unit/expected_PE1_PE_not_as_single.undef.fastq data/unit/expected_PE2_PE_not_as_single.undef.fastq \
+data/unit/test_PE1_PE_not_as_single_pathological.fq data/unit/test_PE2_PE_not_as_single_pathological.fq \
+data/unit/fake_PE1.fq data/unit/fake_PE2.fq
 
 
diff --git a/test/data/unit/fake_PE1.fq b/test/data/unit/fake_PE1.fq
new file mode 100644
index 0000000..3e855a2
--- /dev/null
+++ b/test/data/unit/fake_PE1.fq
@@ -0,0 +1,4 @@
+@SRR1222430.1378154 1378154 length=94
+CAGGAACCGCCTTCTGGTGATTTGCAAGAACGCAGATCGGAAGAGCGGTTCAGCAGGAATGCCGAGACCGATCTCGTATGCCGTCTTCTGCTT
++SRR1222430.1378154 1378154 length=94
+CCCCDFFDCDDCGGGGGGGGGGHHHHHHHHHGEGGCHFGEBFEHHHGGEGGHHHHHHGHHHGHGGEGFGGGE/FGHGGHHHHGGGHGHGGHHH
diff --git a/test/data/unit/fake_PE2.fq b/test/data/unit/fake_PE2.fq
new file mode 100644
index 0000000..3e855a2
--- /dev/null
+++ b/test/data/unit/fake_PE2.fq
@@ -0,0 +1,4 @@
+@SRR1222430.1378154 1378154 length=94
+CAGGAACCGCCTTCTGGTGATTTGCAAGAACGCAGATCGGAAGAGCGGTTCAGCAGGAATGCCGAGACCGATCTCGTATGCCGTCTTCTGCTT
++SRR1222430.1378154 1378154 length=94
+CCCCDFFDCDDCGGGGGGGGGGHHHHHHHHHGEGGCHFGEBFEHHHGGEGGHHHHHHGHHHGHGGEGFGGGE/FGHGGHHHHGGGHGHGGHHH
diff --git a/test/rock.sh b/test/rock.sh
index 59a3ba5..7c30d3e 100755
--- a/test/rock.sh
+++ b/test/rock.sh
@@ -65,7 +65,7 @@ echo "doing more checks on options and error messages"
 ../src/rock -C 500 -c 400 -k 10 -l 12 -n 85000000 -p -i ${srcdir}/data/iofiles.args/extract_klebsiella_long_reads_100.txt -o ${srcdir}/data/iofiles.args/extract_klebsiella_long_reads_100_filtered.txt|grep "options are mutually exclusive">/dev/null || exit 14
 ../src/rock -C 500 -c 400 -k 10 -q 3 -m 0 -p -i ${srcdir}/test/data/iofiles.args/extract_klebsiella_long_reads_100.txt|grep "minimum number of valid k-mer for keeping a read must be an integer">/dev/null || exit 15
 ../src/rock -C 500 -c 400 -k 10 -q -1 -m 2 -p -i ${srcdir}/test/data/iofiles.args/extract_klebsiella_long_reads_100.txt|grep "q must be">/dev/null || exit 16
-../src/rock -C 500 -c 400 -k 10 -q 2 -m 2 -i ${srcdir}/test/data/iofiles.args/extract_klebsiella_long_reads_100.txt|grep "Incompatible options">/dev/null || exit 161
+#../src/rock -C 500 -c 400 -k 10 -q 2 -m 2 -i ${srcdir}/test/data/iofiles.args/extract_klebsiella_long_reads_100.txt|grep "Incompatible options">/dev/null || exit 161
 
 # here check that we have enough memory for running the tests.
 ../src/unit_test_cms CHECK_MEM
diff --git a/test/rock_mem.sh b/test/rock_mem.sh
index 02455ec..e21876f 100755
--- a/test/rock_mem.sh
+++ b/test/rock_mem.sh
@@ -255,11 +255,86 @@ rm -f tmp/klebsiella_5_1_2_qual_thres.fq
 rm -f tmp/klebsiella_5_2_2_qual_thres.fq
 
 
+echo " "
+echo "##################################################################################"
+echo "testing ROCK with a quality score threshold for nucleotides, PE processed separately and -m option"
+../src/rock -C 100 -c 1 -l 2 -q 2 -m 2 -o ${srcdir}/data/iofiles.args/output_files_2_PE_separated.txt ${srcdir}/data/fastq.raw/klebsiella_5_1_bad_scores.fq,${srcdir}/data/fastq.raw/klebsiella_5_2_bad_scores.fq || exit 202
+
+[ -f "tmp/klebsiella_5_1_bad_scores.undefined.fq" ] || exit 20200
+[ -f "tmp/klebsiella_5_2_bad_scores.undefined.fq" ] || exit 20201
+
+ret1=`cat tmp/klebsiella_5_1_bad_scores.undefined.fq|wc -l`
+ret2=`cat tmp/klebsiella_5_2_bad_scores.undefined.fq|wc -l`
+
+test $ret1=8 || exit 202100
+test $ret1=8 || exit 202101
+
+ret1=`grep -c SRR1222430.1 tmp/klebsiella_5_1_bad_scores.undefined.fq`
+ret2=`grep -c SRR1222430.1 tmp/klebsiella_5_2_bad_scores.undefined.fq`
+
+test $ret1=2 || exit 202200
+test $ret1=2 || exit 202201
+
+ret1=`grep -c SRR1222430.5 tmp/klebsiella_5_1_bad_scores.undefined.fq`
+ret2=`grep -c SRR1222430.5 tmp/klebsiella_5_2_bad_scores.undefined.fq`
+
+test $ret1=2 || exit 202300
+test $ret1=2 || exit 202301
+
+ret1=`grep -c SRR1222430.2 tmp/klebsiella_5_1_bad_scores.undefined.fq`
+ret2=`grep -c SRR1222430.2 tmp/klebsiella_5_2_bad_scores.undefined.fq`
+
+test $ret1=0 || exit 202400
+test $ret1=0 || exit 202401
+
+ret1=`grep -c SRR1222430.3 tmp/klebsiella_5_1_bad_scores.undefined.fq`
+ret2=`grep -c SRR1222430.3 tmp/klebsiella_5_2_bad_scores.undefined.fq`
+
+test $ret1=0 || exit 202500
+test $ret1=0 || exit 202501
+
+ret1=`grep -c SRR1222430.4 tmp/klebsiella_5_1_bad_scores.undefined.fq`
+ret2=`grep -c SRR1222430.4 tmp/klebsiella_5_2_bad_scores.undefined.fq`
+
+test $ret1=0 || exit 202600
+test $ret1=0 || exit 202601
+
+ret1=`cat tmp/klebsiella_5_1_2_qual_thres.fq|wc -l`
+ret2=`cat tmp/klebsiella_5_1_2_qual_thres.fq|wc -l`
+
+test $ret1=12 || exit 202700
+test $ret1=12 || exit 202701
+
+ret1=`grep -c SRR1222430.2 tmp/klebsiella_5_1_2_qual_thres.fq`
+ret2=`grep -c SRR1222430.2 tmp/klebsiella_5_2_2_qual_thres.fq`
+
+test $ret1=2 || exit 202800
+test $ret1=2 || exit 202801
+
+ret1=`grep -c SRR1222430.3 tmp/klebsiella_5_1_2_qual_thres.fq`
+ret2=`grep -c SRR1222430.3 tmp/klebsiella_5_2_2_qual_thres.fq`
+
+test $ret1=2 || exit 202802
+test $ret1=2 || exit 202803
+
+ret1=`grep -c SRR1222430.4 tmp/klebsiella_5_1_2_qual_thres.fq`
+ret2=`grep -c SRR1222430.4 tmp/klebsiella_5_2_2_qual_thres.fq`
+
+test $ret1=2 || exit 202804
+test $ret1=2 || exit 202805
+
+rm -f tmp/klebsiella_5_1_bad_scores.undefined.fq
+rm -f tmp/klebsiella_5_2_bad_scores.undefined.fq
+rm -f tmp/klebsiella_5_1_2_qual_thres.fq
+rm -f tmp/klebsiella_5_2_2_qual_thres.fq
+
+
+
 echo " "
 echo "##################################################################################"
 echo "testing ROCK with no quality score threshold for nucleotides and PE processed separately"
 
-../src/rock -C 100 -c 1 -l 2 -o ${srcdir}/data/iofiles.args/output_files_2_PE_separated.txt ${srcdir}/data/fastq.raw/klebsiella_5_1_bad_scores.fq,${srcdir}/data/fastq.raw/klebsiella_5_2_bad_scores.fq || exit 201
+../src/rock -C 100 -c 1 -l 2 -o ${srcdir}/data/iofiles.args/output_files_2_PE_separated.txt ${srcdir}/data/fastq.raw/klebsiella_5_1_bad_scores.fq,${srcdir}/data/fastq.raw/klebsiella_5_2_bad_scores.fq || exit 203
 
 [ -f "tmp/klebsiella_5_1_bad_scores.undefined.fq" ] || exit 30100
 [ -f "tmp/klebsiella_5_2_bad_scores.undefined.fq" ] || exit 30101
-- 
GitLab