Select Git revision
unit_test_read_utils.cpp 13.38 KiB
/*
* unit_test_read_utils.cpp
*
* Created on: Jan 26, 2016
* Author: vlegrand
* unit and non regression testing for the read_utils component.
* Keep using assert for the moment, don't want to add a dependency on boost (or any other test framework) just for the tests.
*/
#include <iostream>
#include <cassert>
#include <cstring>
#include "read_utils.h"
#include "FqMainBackend.h"
#include "rock_commons.h"
#include "srp.h"
#include "ReadProcessor.h"
using namespace std;
void test_getReadSingle() {
rpos my_struct1,my_struct2;
my_struct1=init_rpos(4,639);
my_struct2=init_rpos(4,1228);
//std::cout<<my_struct1.read_a1<<endl;
assert(my_struct1.read_a1==639);
assert(my_struct2.read_a1==1228);
srp io_sr;
unsigned int j=0;
DnaSeqStr a_seqs;
char dna_read[MAX_READ_LENGTH];
char * fq_single2=(char *) "../data/test_single2.fq";
FqMainBackend be_fq=FqMainBackend(&io_sr); // TODO, remove argument from constructor
be_fq.openInputFile(fq_single2, 4);
FqBaseBackend * fic_map[k_max_input_files];
fic_map[3]=&be_fq;
init_DnaSeqStr(&a_seqs);
getDNASeqstr(fic_map, my_struct1, 0, &a_seqs);
char * tmp=(char *) a_seqs.fq_record_buf;
tmp+=a_seqs.start_idx;
memcpy(dna_read,tmp,a_seqs.length);
// std::cout<<dna_read<<endl;
assert(strcmp(dna_read,"TTTTTAGGTGCTACCATAACACCAACTGTTTTCACCATAATTTTAAAATCAAGCATTAGAGACGCTTCTCTAATGTATTGCAAATCTAGTTCTACCATTTGATGAAAATCTAATTCATTTCTTCCACTAACCTGCCATAATCCAGTACAACCTGGTATAACGGTCAAACGCTTTTTATCATAGGAACTGTATTCTCCTACCTCACGTGGCAAAGGAGGTCTTGGACCAACAATTGCCATGTCTCCTTTAACCACATTCCAAAGCTGTGGTA")==0);
init_DnaSeqStr(&a_seqs);
assert(strcmp(a_seqs.fq_record_buf,"")==0);
assert(a_seqs.length==0);
assert(a_seqs.start_idx==0);
getDNASeqstr(fic_map, my_struct2, 0, &a_seqs);
tmp=(char *) a_seqs.fq_record_buf;
tmp+=a_seqs.start_idx;
memcpy(dna_read,tmp,a_seqs.length);
if (a_seqs.length<MAX_READ_LENGTH) dna_read[a_seqs.length]='\0';
std::cout<<dna_read<<endl;
assert(strcmp(dna_read,"ACCCAAACTTGCCAGACTTGTGTAGAACGTCCAATATGTATCGGCATCGCTTCCACATGAATGAATCCTTGTTCCACACTTTTTATATGATTCGCATTAATTTCTTGTCCGAAAATCAACTGATTTTTTGCAACATTTTCTCCCGCTCCAAGACTGGCTGCATGTTCTGCAAGCGCAACAGAAACACCACCATGCAAGTAGCCAAAGGGTTGTTTGACCTGATCTGTTATTTCAAGCGCCAGTTCCACTC")==0);
be_fq.closeInputFile();
}
void getDnaStr(FqBaseBackend * fic_map[],rpos my_struct,DnaSeqStr* a_seqs,char * dna_read) { // Auxilliary method for test_getReadPE().
getDNASeqstr(fic_map, my_struct, 0, a_seqs);
char * tmp=(char *) a_seqs[0].fq_record_buf;
tmp+=a_seqs[0].start_idx;
memcpy(dna_read,tmp,a_seqs[0].length);
tmp=(char *) a_seqs[1].fq_record_buf;
tmp+=a_seqs[1].start_idx;
char * b=(char *) dna_read+a_seqs[0].length;
memcpy(b,tmp,a_seqs[1].length);
if (a_seqs[0].length+a_seqs[1].length<MAX_READ_LENGTH) dna_read[a_seqs[0].length+a_seqs[1].length]='\0';
}
void test_getReadPE() {
rpos my_struct1,my_struct2;
char * fq_PE1=(char *) "../data/test_PE1_2.fq";
char * fq_PE2=(char *) "../data/test_PE2_2.fq";
srp io_sr;
unsigned int j=0;
char dna_read[MAX_READ_LENGTH];
FqMainBackend be_fq1=FqMainBackend(&io_sr);
FqAuxBackend be_fq2;
be_fq1.setAuxProcessor(&be_fq2);
be_fq1.openInputFile(fq_PE1, 4);
be_fq2.openFile(fq_PE2, 5);
FqBaseBackend * fic_map[k_max_input_files];
fic_map[3]=&be_fq1;
fic_map[4]=&be_fq2;
DnaSeqStr a_seqs[2];
init_DnaSeqStr(&a_seqs[0]);
init_DnaSeqStr(&a_seqs[1]);
my_struct1=init_rpos(4,0);
update_rpos(5,0,j,&my_struct1);
my_struct2=init_rpos(4,13647);
update_rpos(5,13653,j,&my_struct2);
assert(my_struct1.read_a1==0);
assert(my_struct1.read_a2==0);
assert(my_struct2.read_a1==13647);
assert(my_struct2.read_a2==6);
getDnaStr(fic_map,my_struct1,a_seqs,dna_read);
std::cout<<dna_read<<endl;
assert(strcmp(dna_read,"GGTTCTGTTGGCTCAAACGCCGTCACTTCATTGATCAAAAGCTTATAATGCGTGCCAAAGTCCGCCATCGAGACGACTACGCCTTCCCCTGCTTTCCCGTCAAAAACGAGTCTTGCCGGATCTTCACGGTCTCCCCTCGAAAGCGGCGAAATCTTAGAGGAAGGTGGATATAATGCCGTCACATCGAACTTTGAAGATCTATACGGCATGCAGCAGCTTCCAGGTCTTGCGGTGCAACGTTTAATGGCAGATGGCTACGGTTTTGCGGGGGAAGGAGACTGGAAAACGGCGGCGATCGACCG")==0);
getDnaStr(fic_map,my_struct2,a_seqs,dna_read);
assert(strcmp(dna_read,"ATTGTGGGGTTCCTTTTTGTAGCATTGGAATGGAAATTAAAAATGGGGCTTCAGGATGCCCGCTCCATTATTTAATTCCAGAATGTAACGATGCTGTTTACCGGGGGGACTGGAAAGATGCACTTGAGCTTTTGATTAAAACAAATAACATGCCCAGAACCAATCACTGCAATTTTTTTATCCCACCGCACTATCGGTGGAGTCGGCATGAACCAACCTAAACCAAACCCACGGTCAATAATAGCCCGTTCAATCGAATTAATACCCACAGCAGGATCAGAAATTGCAACCGTACAACTTC")==0);
be_fq1.closeInputFile();
be_fq2.closeFile();
//cout<<"done"<<endl;
}
void test_decomposeReadInkMerNums() {
int k=5;
ReadProcessor read_p(k);
readNumericValues nbrKmerDecompo;
DnaSeqStr a_seqs[2];
init_DnaSeqStr(&a_seqs[0]);
init_DnaSeqStr(&a_seqs[1]);
DnaSeqStr * seq1=a_seqs;
//cout<<strlen("AAAAAAAAAA")<<endl;
strcpy(seq1->fq_record_buf,"@fake_stuff\nAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\n+\n....@\n");
seq1->start_idx=12;
seq1->length=75;
decomposeReadInKMerNums(read_p,nbrKmerDecompo,k,a_seqs);
assert(nbrKmerDecompo.size()==71);
readNumericValues::iterator it;
for (it=nbrKmerDecompo.begin();it!=nbrKmerDecompo.end();it++) {
// cout<<*it<<endl;
assert(*it==1023);
}
DnaSeqStr * seq2=&a_seqs[1];
strcpy(seq2->fq_record_buf,"@another_fake_stuff\nATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCG\n+\n....@\n");
seq2->start_idx=20;
seq2->length=80;
nbrKmerDecompo.clear();
decomposeReadInKMerNums(read_p,nbrKmerDecompo,k,a_seqs);
//cout<<nbrKmerDecompo.size()<<endl;
assert(nbrKmerDecompo.size()==147);
}
/*
* Test the basic methods of the ReadProcessor object.
*/
void testDNAToNumberSimple() {
unsigned long nbr;
assert(sizeof(nbr)==8); // This may not be true on some platform; so better know it now.
ReadProcessor read_p;
assert(read_p.nucleoToNumber('A')==0);
assert(read_p.nucleoToNumber('C')==1);
assert(read_p.nucleoToNumber('G')==2);
assert(read_p.nucleoToNumber('T')==3);
assert(read_p.nucleoToNumber('N')==0);
unsigned long * p_prev=NULL;
read_p.set_k(4);
assert(read_p.kMerToNumber((char *) "AAAA",p_prev)==0); // TODO : several of these methods should be private in ReadProcessor. Design a friend class to test them or declare this testing function friend in ReadProcessor.
assert(read_p.kMerToNumberReverse((char *) "AAAA",p_prev)==255);
read_p.set_k(5);
assert(read_p.kMerToNumber((char *) "AANAA",p_prev)==0);
assert(read_p.kMerToNumberReverse((char *)"AANAA",p_prev)==975);
read_p.set_k(14);
assert(read_p.kMerToNumber((char *) "AANAATCCGATTGG",p_prev)==219386);
assert(read_p.kMerToNumberReverse((char *)"AANAATCCGATTGG",p_prev)==84779983);
unsigned long prev=219386;
p_prev=&prev;
read_p.set_k(15);
assert(read_p.kMerToNumber((char *) "AANAATCCGATTGGA",p_prev)==877544);
assert(read_p.kMerToNumber((char *) "AANAATCCGATTGGN",p_prev)==877544);
assert(read_p.kMerToNumber((char *) "AANAATCCGATTGGC",p_prev)==877545);
assert(read_p.kMerToNumber((char *) "AANAATCCGATTGGG",p_prev)==877546);
assert(read_p.kMerToNumber((char *) "AANAATCCGATTGGT",p_prev)==877547);
read_p.set_k(1);
assert(read_p.nucleoToNumberReverse('A',1)==3);
assert(read_p.nucleoToNumberReverse('C',1)==2);
assert(read_p.nucleoToNumberReverse('G',1)==1);
assert(read_p.nucleoToNumberReverse('T',1)==0);
assert(read_p.nucleoToNumberReverse('N',1)==0);
read_p.set_k(20);
assert(read_p.nucleoToNumberReverse('A',20)==824633720832);
assert(read_p.nucleoToNumberReverse('C',20)==549755813888);
assert(read_p.nucleoToNumberReverse('G',20)==274877906944);
assert(read_p.nucleoToNumberReverse('T',20)==0);
assert(read_p.nucleoToNumberReverse('N',20)==0);
prev=84779983;
p_prev=&prev;
read_p.set_k(14);
assert(read_p.kMerToNumberReverse((char *)"ANAATCCGATTGGA",p_prev)==222521587);
assert(read_p.kMerToNumberReverse((char *)"ANAATCCGATTGGN",p_prev)==21194995);
assert(read_p.kMerToNumberReverse((char *)"ANAATCCGATTGGT",p_prev)==21194995);
assert(read_p.kMerToNumberReverse((char *)"ANAATCCGATTGGC",p_prev)==155412723);
assert(read_p.kMerToNumberReverse((char *)"ANAATCCGATTGGG",p_prev)==88303859);
}
/*
* Test the ReadProcessor method that will be called by the main program : the one who takes as input a dna string and returns
* a serie of numbers all corresponding to its k-mer composition + reverse complement.
*/
void testDNAToNumberMoreComplex() { // TODO in main, check that k<=32.
// from a dna string a found in fq files, return its k-mer decomposition; k-mers are numbers.
char dnaStr[]="TTTTTAGGTGCTACCATAACACCAACTGTTTTCACCATAATTTTAAAATCAAGCATTAGAGACGCTTCTCTAATGTATTGCAAATCTAGTTCTACCATTTGATGAAAATCTAATTCATTTCTTCCACTAACCTGCCATAATCCAGTACAACCTGGTATAACGGTCAAACGCTTTTTATCATAGGAACTGTATTCTCCTACCTCACGTGGCAAAGGAGGTCTTGGACCAACAATTGCCATGTCTCCTTTAACCACATTCCAAAGCTGTGGTA";
int k=30;
ReadProcessor read_p(k);
readNumericValues nbrKmerDecompo;
readNumericValues::iterator it;
int nb_expected_k_mers=strlen(dnaStr)+1;
nb_expected_k_mers-=k;
nbrKmerDecompo.reserve(nb_expected_k_mers);
unsigned long expected=read_p.kMerToNumber((char *) "GCTACCATAACACCAACTGTTTTCACCATA",NULL);
unsigned long expected_rev=read_p.kMerToNumber((char *) "TATGGTGAAAACAGTTGGTGTTATGGTAGC",NULL);
read_p.getKMerNumbers((char *) dnaStr,strlen(dnaStr),nbrKmerDecompo);
assert(nbrKmerDecompo.size()==nb_expected_k_mers); // for each k_mer, we also expect to have a number for its reverse complement.
int cnt_k_mer=0;
unsigned long num,num2;
for (it=nbrKmerDecompo.begin();it!=nbrKmerDecompo.end();it++) {
cnt_k_mer++;
// 1rst k-mer is : TTTTTAGGTGCTACCATAACACCAACTGTT.
// its reverse complement is : AACAGTTGGTGTTATGGTAGCACCTAAAAA
if (cnt_k_mer==1) {
assert(*it==1151987406330741231);
num=*it;
}
if (cnt_k_mer==2) {
// 2nd k-mer is : TTTTAGGTGCTACCATAACACCAACTGTTT
num2=read_p.kMerToNumber((char *) "TTTTAGGTGCTACCATAACACCAACTGTTT",NULL);
assert(*it==num2);
unsigned long num3=read_p.kMerToNumber((char *) "TTTTAGGTGCTACCATAACACCAACTGTTT",&num);
assert(num3==num2);
}
if (cnt_k_mer==3) {
num=read_p.kMerToNumber((char *) "TTTAGGTGCTACCATAACACCAACTGTTTT",NULL);
unsigned long num_rev=read_p.kMerToNumber((char *) "AAAACAGTTGGTGTTATGGTAGCACCTAAA",NULL);
assert(max(num,num_rev)==*it);
}
if (cnt_k_mer==4) {
num=read_p.kMerToNumber((char *) "TTAGGTGCTACCATAACACCAACTGTTTTC",NULL);
assert(num==*it);
}
// k-mer number 10 is GCTACCATAACACCAACTGTTTTCACCATA
// its reverse complement is : TATGGTGAAAACAGTTGGTGTTATGGTAGC
if (cnt_k_mer==10) {
num=read_p.kMerToNumber((char *) "GCTACCATAACACCAACTGTTTTCACCATA",NULL);
assert(*it==max(704021989794238796,930978566989888201));
}
/*if (cnt_k_mer==20) {
assert(*it==930978566989888201);
}*/
}
}
// NTTTTNGGTGCTACCATAACACCAACTGTTTTCACCATAATTTTAANATCAAGCATTAGAGACGCTTCTCTAATGTATTGCAAATCTAGTTCTACCATTTGATGAAAATCTAATTCATTTCTTCCACTAACCTGCCATAATCCAGTACAACCTGGTATAACGGTCAAACGCTTTTTATCATAGGAACTGTATTCTCCTACCTCACGTGGCAAAGGAGGTCTTGGACCAACAATTGCCATGTCTCCTTTAACCACATTCCAAAGCTGTGGTA
/*
* Tests the handling of N nucleotides.
* k-mers containing such nucleotides must not be converted into number and they are not in the result returned to the caller.
*/
void testDNAToNumberWithN() {
char dnaStr[]="NTTTTNGGTGCTACCATAACACCAACTGTTTTCACCATAATTTTAANATCAAGCATTAGAGACGCTTCTCTAATGTATTGCAAATCTAGTTCTACCATTTGATGAAAATCTAATTCATTTCTTCCACTAACCTGCCATAATCCAGTACAACCTGGTATAACGGTCAAACGCTTTTTATCATAGGAACTGTATTCTCCTACCTCACGTGGCAAAGGAGGTCTTGGACCAACAATTGCCATGTCTCCTTTAACCACATTCCAAAGCTGTGGTA";
int k=30;
ReadProcessor read_p(k);
std::vector<unsigned long> nbrKmerDecompo;
std::vector<unsigned long>::iterator it;
int nb_expected_k_mers=strlen(dnaStr)+1;
nb_expected_k_mers-=k;
nb_expected_k_mers-=47; // expect the 1rts 76 k-mers to be skipped due to the presence of the N (unknown) nucleotide.
nb_expected_k_mers+=11; // between the 2nd and 3rd N character, there are 11 k-mers of length 30 that are correct (don't contain any N). no
nbrKmerDecompo.reserve(nb_expected_k_mers);
read_p.getKMerNumbers((char *) dnaStr,strlen(dnaStr),nbrKmerDecompo);
assert(nbrKmerDecompo.size()==nb_expected_k_mers);
it=nbrKmerDecompo.begin();
it+=11;
// The 1rst expected k-mer is the one after the 3rd N char (starting at position 47 in dnaStr). ATCAAGCATTAGAGACGCTTCTCTAATGTA or its reverse complement depending
assert(*it==max(234837138459816172,886965076742957027));
}
int main(int argc, char **argv) {
cout<<"test getting single reads."<<endl;
test_getReadSingle();
cout<<"test getting PE reads."<<endl;
test_getReadPE();
cout<<"test converting k_mers of different size into number"<<endl;
testDNAToNumberSimple();
cout<<"test converting an entire read to a series of numbers"<<endl;
testDNAToNumberMoreComplex();
cout<<"testing the case of N nucleotides"<<endl;
testDNAToNumberWithN();
cout<<"testing higher level function: decomposeReadInKMerNums"<<endl;
test_decomposeReadInkMerNums();
cout<<"done"<<endl;
}