Skip to content
Snippets Groups Projects
Select Git revision
  • 37533eafdf88060b186f8272cd2be9d6e9edd6d6
  • main default protected
  • interactive-widget
  • v0.2.5
  • v0.2.4
  • v0.2.3
  • v0.2.2
  • v0.2.1
  • v0.2.0
  • v0.1.4
  • v0.1.3
  • v0.1.2
  • v0.1.1
  • v0.1
14 results

_widget.py

Blame
  • unit_test_read_utils.cpp 13.38 KiB
    /*
     * unit_test_read_utils.cpp
     *
     *  Created on: Jan 26, 2016
     *      Author: vlegrand
     *      unit and non regression testing for the read_utils component.
     *      Keep using assert for the moment, don't want to add a dependency on boost (or any other test framework) just for the tests.
     */
    #include <iostream>
    
    #include <cassert>
    #include <cstring>
    
    #include "read_utils.h"
    #include "FqMainBackend.h"
    #include "rock_commons.h"
    #include "srp.h"
    #include "ReadProcessor.h"
    
    using namespace std;
    
    void test_getReadSingle() {
        rpos my_struct1,my_struct2;
        my_struct1=init_rpos(4,639);
        my_struct2=init_rpos(4,1228);
    
        //std::cout<<my_struct1.read_a1<<endl;
    
        assert(my_struct1.read_a1==639);
        assert(my_struct2.read_a1==1228);
        srp io_sr;
        unsigned int j=0;
        DnaSeqStr a_seqs;
        char dna_read[MAX_READ_LENGTH];
    
        char * fq_single2=(char *) "../data/test_single2.fq";
        FqMainBackend be_fq=FqMainBackend(&io_sr); // TODO, remove argument from constructor
        be_fq.openInputFile(fq_single2, 4);
    
    
        FqBaseBackend * fic_map[k_max_input_files];
        fic_map[3]=&be_fq;
    
        init_DnaSeqStr(&a_seqs);
        getDNASeqstr(fic_map, my_struct1, 0, &a_seqs);
        char * tmp=(char *) a_seqs.fq_record_buf;
        tmp+=a_seqs.start_idx;
        memcpy(dna_read,tmp,a_seqs.length);
        // std::cout<<dna_read<<endl;
        assert(strcmp(dna_read,"TTTTTAGGTGCTACCATAACACCAACTGTTTTCACCATAATTTTAAAATCAAGCATTAGAGACGCTTCTCTAATGTATTGCAAATCTAGTTCTACCATTTGATGAAAATCTAATTCATTTCTTCCACTAACCTGCCATAATCCAGTACAACCTGGTATAACGGTCAAACGCTTTTTATCATAGGAACTGTATTCTCCTACCTCACGTGGCAAAGGAGGTCTTGGACCAACAATTGCCATGTCTCCTTTAACCACATTCCAAAGCTGTGGTA")==0);
    
    
        init_DnaSeqStr(&a_seqs);
        assert(strcmp(a_seqs.fq_record_buf,"")==0);
        assert(a_seqs.length==0);
        assert(a_seqs.start_idx==0);
    
        getDNASeqstr(fic_map, my_struct2, 0, &a_seqs);
        tmp=(char *) a_seqs.fq_record_buf;
        tmp+=a_seqs.start_idx;
        memcpy(dna_read,tmp,a_seqs.length);
        if (a_seqs.length<MAX_READ_LENGTH) dna_read[a_seqs.length]='\0';
        std::cout<<dna_read<<endl;
        assert(strcmp(dna_read,"ACCCAAACTTGCCAGACTTGTGTAGAACGTCCAATATGTATCGGCATCGCTTCCACATGAATGAATCCTTGTTCCACACTTTTTATATGATTCGCATTAATTTCTTGTCCGAAAATCAACTGATTTTTTGCAACATTTTCTCCCGCTCCAAGACTGGCTGCATGTTCTGCAAGCGCAACAGAAACACCACCATGCAAGTAGCCAAAGGGTTGTTTGACCTGATCTGTTATTTCAAGCGCCAGTTCCACTC")==0);
        be_fq.closeInputFile();
    }
    
    void getDnaStr(FqBaseBackend * fic_map[],rpos my_struct,DnaSeqStr* a_seqs,char * dna_read) { // Auxilliary method for test_getReadPE().
        getDNASeqstr(fic_map, my_struct, 0, a_seqs);
    
        char * tmp=(char *) a_seqs[0].fq_record_buf;
        tmp+=a_seqs[0].start_idx;
    
        memcpy(dna_read,tmp,a_seqs[0].length);
    
        tmp=(char *) a_seqs[1].fq_record_buf;
        tmp+=a_seqs[1].start_idx;
    
        char * b=(char *) dna_read+a_seqs[0].length;
        memcpy(b,tmp,a_seqs[1].length);
    
        if (a_seqs[0].length+a_seqs[1].length<MAX_READ_LENGTH) dna_read[a_seqs[0].length+a_seqs[1].length]='\0';
    }
    
    
    void test_getReadPE() {
        rpos my_struct1,my_struct2;
        char * fq_PE1=(char *) "../data/test_PE1_2.fq";
        char * fq_PE2=(char *) "../data/test_PE2_2.fq";
        srp io_sr;
        unsigned int j=0;
        char dna_read[MAX_READ_LENGTH];
    
        FqMainBackend be_fq1=FqMainBackend(&io_sr);
        FqAuxBackend be_fq2;
        be_fq1.setAuxProcessor(&be_fq2);
    
        be_fq1.openInputFile(fq_PE1, 4);
        be_fq2.openFile(fq_PE2, 5);
    
        FqBaseBackend * fic_map[k_max_input_files];
        fic_map[3]=&be_fq1;
        fic_map[4]=&be_fq2;
    
        DnaSeqStr a_seqs[2];
    
        init_DnaSeqStr(&a_seqs[0]);
        init_DnaSeqStr(&a_seqs[1]);
    
        my_struct1=init_rpos(4,0);
        update_rpos(5,0,j,&my_struct1);
    
        my_struct2=init_rpos(4,13647);
        update_rpos(5,13653,j,&my_struct2);
    
        assert(my_struct1.read_a1==0);
        assert(my_struct1.read_a2==0);
        assert(my_struct2.read_a1==13647);
        assert(my_struct2.read_a2==6);
    
        getDnaStr(fic_map,my_struct1,a_seqs,dna_read);
        std::cout<<dna_read<<endl;
    
        assert(strcmp(dna_read,"GGTTCTGTTGGCTCAAACGCCGTCACTTCATTGATCAAAAGCTTATAATGCGTGCCAAAGTCCGCCATCGAGACGACTACGCCTTCCCCTGCTTTCCCGTCAAAAACGAGTCTTGCCGGATCTTCACGGTCTCCCCTCGAAAGCGGCGAAATCTTAGAGGAAGGTGGATATAATGCCGTCACATCGAACTTTGAAGATCTATACGGCATGCAGCAGCTTCCAGGTCTTGCGGTGCAACGTTTAATGGCAGATGGCTACGGTTTTGCGGGGGAAGGAGACTGGAAAACGGCGGCGATCGACCG")==0);
    
        getDnaStr(fic_map,my_struct2,a_seqs,dna_read);
        assert(strcmp(dna_read,"ATTGTGGGGTTCCTTTTTGTAGCATTGGAATGGAAATTAAAAATGGGGCTTCAGGATGCCCGCTCCATTATTTAATTCCAGAATGTAACGATGCTGTTTACCGGGGGGACTGGAAAGATGCACTTGAGCTTTTGATTAAAACAAATAACATGCCCAGAACCAATCACTGCAATTTTTTTATCCCACCGCACTATCGGTGGAGTCGGCATGAACCAACCTAAACCAAACCCACGGTCAATAATAGCCCGTTCAATCGAATTAATACCCACAGCAGGATCAGAAATTGCAACCGTACAACTTC")==0);
    
        be_fq1.closeInputFile();
        be_fq2.closeFile();
        //cout<<"done"<<endl;
    
    }
    
    void test_decomposeReadInkMerNums() {
        int k=5;
        ReadProcessor read_p(k);
        readNumericValues nbrKmerDecompo;
        
        DnaSeqStr a_seqs[2];
        init_DnaSeqStr(&a_seqs[0]);
        init_DnaSeqStr(&a_seqs[1]);
        DnaSeqStr * seq1=a_seqs;
        //cout<<strlen("AAAAAAAAAA")<<endl;
        strcpy(seq1->fq_record_buf,"@fake_stuff\nAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\n+\n....@\n");
        seq1->start_idx=12;
        seq1->length=75;
        decomposeReadInKMerNums(read_p,nbrKmerDecompo,k,a_seqs);
        assert(nbrKmerDecompo.size()==71);
        readNumericValues::iterator it;
        for (it=nbrKmerDecompo.begin();it!=nbrKmerDecompo.end();it++) {
            // cout<<*it<<endl;
            assert(*it==1023);
        }
        DnaSeqStr * seq2=&a_seqs[1];
        strcpy(seq2->fq_record_buf,"@another_fake_stuff\nATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCG\n+\n....@\n");
        seq2->start_idx=20;
        seq2->length=80;
        nbrKmerDecompo.clear();
        decomposeReadInKMerNums(read_p,nbrKmerDecompo,k,a_seqs);
        //cout<<nbrKmerDecompo.size()<<endl;
        assert(nbrKmerDecompo.size()==147);
    }
    
    
    /*
     * Test the basic methods of the ReadProcessor object.
     */
    void testDNAToNumberSimple() {
        unsigned long nbr;
        assert(sizeof(nbr)==8); // This may not be true on some platform; so better know it now.
    
        ReadProcessor read_p;
    
        assert(read_p.nucleoToNumber('A')==0);
        assert(read_p.nucleoToNumber('C')==1);
        assert(read_p.nucleoToNumber('G')==2);
        assert(read_p.nucleoToNumber('T')==3);
        assert(read_p.nucleoToNumber('N')==0);
    
        unsigned long * p_prev=NULL;
        read_p.set_k(4);
        assert(read_p.kMerToNumber((char *) "AAAA",p_prev)==0); // TODO : several of these methods should be private in ReadProcessor. Design a friend class to test them or declare this testing function friend in ReadProcessor.
        assert(read_p.kMerToNumberReverse((char *) "AAAA",p_prev)==255);
        read_p.set_k(5);
        assert(read_p.kMerToNumber((char *) "AANAA",p_prev)==0);
        assert(read_p.kMerToNumberReverse((char *)"AANAA",p_prev)==975);
        read_p.set_k(14);
        assert(read_p.kMerToNumber((char *) "AANAATCCGATTGG",p_prev)==219386);
        assert(read_p.kMerToNumberReverse((char *)"AANAATCCGATTGG",p_prev)==84779983);
    
        unsigned long prev=219386;
        p_prev=&prev;
    
        read_p.set_k(15);
        assert(read_p.kMerToNumber((char *) "AANAATCCGATTGGA",p_prev)==877544);
        assert(read_p.kMerToNumber((char *) "AANAATCCGATTGGN",p_prev)==877544);
        assert(read_p.kMerToNumber((char *) "AANAATCCGATTGGC",p_prev)==877545);
        assert(read_p.kMerToNumber((char *) "AANAATCCGATTGGG",p_prev)==877546);
        assert(read_p.kMerToNumber((char *) "AANAATCCGATTGGT",p_prev)==877547);
    
        read_p.set_k(1);
        assert(read_p.nucleoToNumberReverse('A',1)==3);
        assert(read_p.nucleoToNumberReverse('C',1)==2);
        assert(read_p.nucleoToNumberReverse('G',1)==1);
        assert(read_p.nucleoToNumberReverse('T',1)==0);
        assert(read_p.nucleoToNumberReverse('N',1)==0);
    
        read_p.set_k(20);
        assert(read_p.nucleoToNumberReverse('A',20)==824633720832);
        assert(read_p.nucleoToNumberReverse('C',20)==549755813888);
        assert(read_p.nucleoToNumberReverse('G',20)==274877906944);
        assert(read_p.nucleoToNumberReverse('T',20)==0);
        assert(read_p.nucleoToNumberReverse('N',20)==0);
    
        prev=84779983;
        p_prev=&prev;
        read_p.set_k(14);
        assert(read_p.kMerToNumberReverse((char *)"ANAATCCGATTGGA",p_prev)==222521587);
        assert(read_p.kMerToNumberReverse((char *)"ANAATCCGATTGGN",p_prev)==21194995);
        assert(read_p.kMerToNumberReverse((char *)"ANAATCCGATTGGT",p_prev)==21194995);
        assert(read_p.kMerToNumberReverse((char *)"ANAATCCGATTGGC",p_prev)==155412723);
        assert(read_p.kMerToNumberReverse((char *)"ANAATCCGATTGGG",p_prev)==88303859);
    
    }
    
    /*
     * Test the ReadProcessor method that will be called by the main program : the one who takes as input a dna string and returns
     * a serie of numbers all corresponding to its k-mer composition + reverse complement.
     */
    
    void testDNAToNumberMoreComplex() { // TODO in main, check that k<=32.
        // from a dna string a found in fq files, return its k-mer decomposition; k-mers are numbers.
        char dnaStr[]="TTTTTAGGTGCTACCATAACACCAACTGTTTTCACCATAATTTTAAAATCAAGCATTAGAGACGCTTCTCTAATGTATTGCAAATCTAGTTCTACCATTTGATGAAAATCTAATTCATTTCTTCCACTAACCTGCCATAATCCAGTACAACCTGGTATAACGGTCAAACGCTTTTTATCATAGGAACTGTATTCTCCTACCTCACGTGGCAAAGGAGGTCTTGGACCAACAATTGCCATGTCTCCTTTAACCACATTCCAAAGCTGTGGTA";
        int k=30;
        ReadProcessor read_p(k);
        readNumericValues nbrKmerDecompo;
    
        readNumericValues::iterator it;
        int nb_expected_k_mers=strlen(dnaStr)+1;
        nb_expected_k_mers-=k;
        nbrKmerDecompo.reserve(nb_expected_k_mers);
        unsigned long expected=read_p.kMerToNumber((char *) "GCTACCATAACACCAACTGTTTTCACCATA",NULL);
        unsigned long expected_rev=read_p.kMerToNumber((char *) "TATGGTGAAAACAGTTGGTGTTATGGTAGC",NULL);
    
        read_p.getKMerNumbers((char *) dnaStr,strlen(dnaStr),nbrKmerDecompo);
        assert(nbrKmerDecompo.size()==nb_expected_k_mers); // for each k_mer, we also expect to have a number for its reverse complement.
        int cnt_k_mer=0;
        unsigned long num,num2;
        for (it=nbrKmerDecompo.begin();it!=nbrKmerDecompo.end();it++) {
            cnt_k_mer++;
            // 1rst k-mer is : TTTTTAGGTGCTACCATAACACCAACTGTT.
            // its reverse complement is : AACAGTTGGTGTTATGGTAGCACCTAAAAA
            if (cnt_k_mer==1) {
                assert(*it==1151987406330741231);
                num=*it;
            }
            if (cnt_k_mer==2) {
                // 2nd k-mer is : TTTTAGGTGCTACCATAACACCAACTGTTT
                num2=read_p.kMerToNumber((char *) "TTTTAGGTGCTACCATAACACCAACTGTTT",NULL);
                assert(*it==num2);
                unsigned long num3=read_p.kMerToNumber((char *) "TTTTAGGTGCTACCATAACACCAACTGTTT",&num);
                assert(num3==num2);
            }
            if (cnt_k_mer==3) {
                num=read_p.kMerToNumber((char *) "TTTAGGTGCTACCATAACACCAACTGTTTT",NULL);
                unsigned long num_rev=read_p.kMerToNumber((char *) "AAAACAGTTGGTGTTATGGTAGCACCTAAA",NULL);
                assert(max(num,num_rev)==*it);
            }
            if (cnt_k_mer==4) {
                num=read_p.kMerToNumber((char *) "TTAGGTGCTACCATAACACCAACTGTTTTC",NULL);
                assert(num==*it);
            }
    
            // k-mer number 10 is GCTACCATAACACCAACTGTTTTCACCATA
            // its reverse complement is : TATGGTGAAAACAGTTGGTGTTATGGTAGC
            if (cnt_k_mer==10) {
                num=read_p.kMerToNumber((char *) "GCTACCATAACACCAACTGTTTTCACCATA",NULL);
                assert(*it==max(704021989794238796,930978566989888201));
            }
            /*if (cnt_k_mer==20) {
                assert(*it==930978566989888201);
            }*/
        }
    }
    
    // NTTTTNGGTGCTACCATAACACCAACTGTTTTCACCATAATTTTAANATCAAGCATTAGAGACGCTTCTCTAATGTATTGCAAATCTAGTTCTACCATTTGATGAAAATCTAATTCATTTCTTCCACTAACCTGCCATAATCCAGTACAACCTGGTATAACGGTCAAACGCTTTTTATCATAGGAACTGTATTCTCCTACCTCACGTGGCAAAGGAGGTCTTGGACCAACAATTGCCATGTCTCCTTTAACCACATTCCAAAGCTGTGGTA
    
    /*
     * Tests the handling of N nucleotides.
     * k-mers containing such nucleotides must not be converted into number and they are not in the result returned to the caller.
     */
    void testDNAToNumberWithN() {
        char dnaStr[]="NTTTTNGGTGCTACCATAACACCAACTGTTTTCACCATAATTTTAANATCAAGCATTAGAGACGCTTCTCTAATGTATTGCAAATCTAGTTCTACCATTTGATGAAAATCTAATTCATTTCTTCCACTAACCTGCCATAATCCAGTACAACCTGGTATAACGGTCAAACGCTTTTTATCATAGGAACTGTATTCTCCTACCTCACGTGGCAAAGGAGGTCTTGGACCAACAATTGCCATGTCTCCTTTAACCACATTCCAAAGCTGTGGTA";
        int k=30;
        ReadProcessor read_p(k);
        std::vector<unsigned long> nbrKmerDecompo;
        std::vector<unsigned long>::iterator it;
        int nb_expected_k_mers=strlen(dnaStr)+1;
        nb_expected_k_mers-=k;
        nb_expected_k_mers-=47; // expect the 1rts 76 k-mers to be skipped due to the presence of the N (unknown) nucleotide.
        nb_expected_k_mers+=11; // between the 2nd and 3rd N character, there are 11 k-mers of length 30 that are correct (don't contain any N). no
    
        nbrKmerDecompo.reserve(nb_expected_k_mers);
        read_p.getKMerNumbers((char *) dnaStr,strlen(dnaStr),nbrKmerDecompo);
        assert(nbrKmerDecompo.size()==nb_expected_k_mers);
    
        it=nbrKmerDecompo.begin();
        it+=11;
        // The 1rst expected k-mer is the one after the 3rd N char (starting at position 47 in dnaStr). ATCAAGCATTAGAGACGCTTCTCTAATGTA or its reverse complement depending
        assert(*it==max(234837138459816172,886965076742957027));
    }
    
    int main(int argc, char **argv) {
        cout<<"test getting single reads."<<endl;
        test_getReadSingle();
        cout<<"test getting PE reads."<<endl;
        test_getReadPE();
        cout<<"test converting k_mers of different size into number"<<endl;
        testDNAToNumberSimple();
        cout<<"test converting an entire read to a series of numbers"<<endl;
        testDNAToNumberMoreComplex();
        cout<<"testing the case of N nucleotides"<<endl;
        testDNAToNumberWithN();
        cout<<"testing higher level function: decomposeReadInKMerNums"<<endl;
        test_decomposeReadInkMerNums();
        cout<<"done"<<endl;
    }