Now handle the case of k-mers that contain unknown nucleotides (labelled 'N')...

Now handle the case of k-mers that contain unknown nucleotides (labelled 'N') and who must be processed but not taken into account for CMS. Former-commit-id: 4df2b1be

Now handle the case of k-mers that contain unknown nucleotides (labelled 'N')...
de35e3b4 · Veronique Legrand · c5a10623 · de35e3b4 · de35e3b4 · de35e3b4
Commit de35e3b4 authored 9 years ago by Veronique Legrand
--- a/src/ReadProcessor.cpp
+++ b/src/ReadProcessor.cpp
@@ -25,6 +25,12 @@ inline void ReadProcessor::init_mask(int k) {
    mask_kMer=mask_kMer-1;
 }*/

+inline unsigned long ReadProcessor::nucleoToNumberWrap(char s) {
+    if (s=='N') n=k;
+    n--;
+    return nucleoToNumber(s);
+}
+
 inline unsigned long ReadProcessor::nucleoToNumber(char s) {
    unsigned long nbr;
    switch(s)
@@ -43,11 +49,13 @@ inline unsigned long ReadProcessor::nucleoToNumber(char s) {
            break;
        case 'N':
            nbr=0;
+            // n=2*k; // do that because n will also be decremented for reverse complement
            break;
        default:
            throw -1; // TODO Benchmark this inside try catch statement to see if try catch+exception really costs so long.
                      // throw an integer for the moment. An exception object may not be the most optimal choice in terms of performance.
    }
+    // n--;
    return nbr;
 }

@@ -57,13 +65,13 @@ inline unsigned long ReadProcessor::kMerToNumber(char * k_m,unsigned long * p_pr
    if (p_prev==NULL) { // first k_mer conversion
        int i;
        for (i=0;i<k;i++) {
-            c=nucleoToNumber(k_m[i]); // do not catch exception for the moment (not until I have checked it doesn't slow down execution). If nucleoToNumber returns -1, program will simply crash
+            c=nucleoToNumberWrap(k_m[i]); // do not catch exception for the moment (not until I have checked it doesn't slow down execution). If nucleoToNumber returns -1, program will simply crash
            nbr=nbr<<2;
            nbr=nbr|c;
        }
    } else {
        nbr=*p_prev;
-        c=nucleoToNumber(k_m[k-1]);
+        c=nucleoToNumberWrap(k_m[k-1]);
        nbr=nbr<<2;
        nbr=nbr&mask_kMer;
        nbr=nbr|c;
@@ -110,9 +118,11 @@ void ReadProcessor::getKMerNumbers(char * dnaStr,int l,std::vector<unsigned long
    int nb_k_m=l-k+1;
    for (i=0; i<nb_k_m;i++) {
        num=kMerToNumber(p_char,p_prev);
-        my_vect.push_back(num);
        num_rev=kMerToNumberReverse(p_char,p_prev_rev);
+        if (n<0) {
+            my_vect.push_back(num);
            my_vect.push_back(num_rev);
+        }
        p_char++;
        p_prev=&num;
        p_prev_rev=&num_rev;

--- a/src/ReadProcessor.h
+++ b/src/ReadProcessor.h
@@ -12,6 +12,7 @@
 class ReadProcessor {

    int k;
+    int n;

    std::map<char, char> nucleo_rev_cpl;
    unsigned long mask_kMer;
@@ -33,7 +34,7 @@ class ReadProcessor {

 public:
    ReadProcessor(int k=3) {
-        // init_mask(k);
+        n=0;
        set_k(k);
        init_nucleo_rev_cpl();
    }
@@ -49,7 +50,7 @@ public:
    unsigned long kMerToNumber(char * k_m,unsigned long * p_prev);

    unsigned long nucleoToNumber(char s);
-
+    unsigned long nucleoToNumberWrap(char s);

    void getKMerNumbers(char * dnaStr,int l,std::vector<unsigned long>& my_vect);


--- a/src/unit_test_read_utils.cpp
+++ b/src/unit_test_read_utils.cpp
@@ -203,7 +203,6 @@ void testDNAToNumberMoreComplex() { // TODO in main, check that k<=32.
    ReadProcessor read_p(k);
    std::vector<unsigned long> nbrKmerDecompo;

-
    std::vector<unsigned long>::iterator it;
    int nb_expected_k_mers=strlen(dnaStr)+1;
    nb_expected_k_mers-=k;
@@ -251,6 +250,37 @@ void testDNAToNumberMoreComplex() { // TODO in main, check that k<=32.
    }
 }

+// NTTTTNGGTGCTACCATAACACCAACTGTTTTCACCATAATTTTAANATCAAGCATTAGAGACGCTTCTCTAATGTATTGCAAATCTAGTTCTACCATTTGATGAAAATCTAATTCATTTCTTCCACTAACCTGCCATAATCCAGTACAACCTGGTATAACGGTCAAACGCTTTTTATCATAGGAACTGTATTCTCCTACCTCACGTGGCAAAGGAGGTCTTGGACCAACAATTGCCATGTCTCCTTTAACCACATTCCAAAGCTGTGGTA
+
+/*
+ * Tests the handling of N nucleotides.
+ * k-mers containing such nucleotides must not be converted into number and they are not in the result returned to the caller.
+ */
+void testDNAToNumberWithN() {
+    char dnaStr[]="NTTTTNGGTGCTACCATAACACCAACTGTTTTCACCATAATTTTAANATCAAGCATTAGAGACGCTTCTCTAATGTATTGCAAATCTAGTTCTACCATTTGATGAAAATCTAATTCATTTCTTCCACTAACCTGCCATAATCCAGTACAACCTGGTATAACGGTCAAACGCTTTTTATCATAGGAACTGTATTCTCCTACCTCACGTGGCAAAGGAGGTCTTGGACCAACAATTGCCATGTCTCCTTTAACCACATTCCAAAGCTGTGGTA";
+    int k=30;
+    ReadProcessor read_p(k);
+    std::vector<unsigned long> nbrKmerDecompo;
+    std::vector<unsigned long>::iterator it;
+    int nb_expected_k_mers=strlen(dnaStr)+1;
+    nb_expected_k_mers-=k;
+    nb_expected_k_mers-=47; // expect the 1rts 76 k-mers to be skipped due to the presence of the N (unknown) nucleotide.
+    nb_expected_k_mers+=11; // between the 2nd and 3rd N character, there are 11 k-mers of length 30 that are correct (don't contain any N). no
+
+    nbrKmerDecompo.reserve(nb_expected_k_mers);
+    read_p.getKMerNumbers((char *) dnaStr,strlen(dnaStr),nbrKmerDecompo);
+    assert(nbrKmerDecompo.size()==nb_expected_k_mers*2);
+
+    it=nbrKmerDecompo.begin();
+    it+=22;
+    // The 1rst expected k-mer is the one after the 3rd N char (starting at position 47 in dnaStr). ATCAAGCATTAGAGACGCTTCTCTAATGTA
+    assert(*it==234837138459816172);
+    it++;
+    assert(*it==886965076742957027);
+
+
+
+}

 int main(int argc, char **argv) {
    cout<<"test getting single reads."<<endl;
@@ -261,6 +291,8 @@ int main(int argc, char **argv) {
    testDNAToNumberSimple();
    cout<<"test converting an entire read to a series of numbers"<<endl;
    testDNAToNumberMoreComplex();
+    cout<<"testing the case of N nucleotides"<<endl;
+    testDNAToNumberWithN();
    cout<<"done"<<endl;
 }