diff --git a/src/FqAuxBackend.cpp b/src/FqAuxBackend.cpp index f97f7857e0c1c248a93ca501f47d5a36bad4c942..26912250b7e89aa6cc47f66329b976f637bd2013 100644 --- a/src/FqAuxBackend.cpp +++ b/src/FqAuxBackend.cpp @@ -12,17 +12,24 @@ #include <unistd.h> #include <errno.h> #include <err.h> +#include <stdlib.h> #include "FqAuxBackend.h" +#include <iostream> +using namespace std; + +//#define DEBUG int FqAuxBackend::getNextRead(rinfo * p_nr) { rinfo nr; int rfound=0; int eof=0; - while (!rfound) { - if (pos_in_buf>=nread-1) readBuffer(); + if (nread==0 || (pos_in_buf>=nread)) { + // if (pos_in_buf==nread-1) cout<<"BE2 : last char of buf before reading new buffer: "<<buf[pos_in_buf]<<endl; + readBuffer(); + } if (nread==0) { eof=1; break;} rfound=processBuffer(p_nr); } @@ -37,30 +44,63 @@ int FqAuxBackend::processBuffer(rinfo * p_nr) { static unsigned long rstart_offset; char * pchar=buf+pos_in_buf; int rfound=0; +#ifdef DEBUG + static int tot_nb_start=0; + static int tot_nb_stop=0; + static int nb_stop=0; + static int nb_start=0; + + static int idx_id; + static int is_id=0; + if (pos_in_buf==0) { + nb_start=0; + nb_stop=0; + } +#endif while (pos_in_buf<=nread-1 && !rfound) { switch (*pchar){ - case k_read_id_start: { + case k_read_id_start: + if (qual_score) goto inc_score; + else { +#ifdef DEBUG + is_id=1; + idx_id=0; + nb_start++; +#endif + //cout<<k_read_id_start<<" found in PE2."<<endl; rstart_offset=cur_offset-nread+pos_in_buf; num_l_in_rec=1; } break; - case k_read_qual_start: { + case k_read_qual_start: qual_score=1; - st=0;} + st=0; break; - case '\n': { + case '\n': num_l_in_rec+=1; - if (num_l_in_rec==5) {qual_score=0;/* end of fastq record */ - // debug stuff - /*printf("\nquality score is : %d \n",st); - printf("read start_offset is %lu \n",rstart_offset);*/ +#ifdef DEBUG + is_id=0; +#endif + if (num_l_in_rec==5) { qual_score=0;/* end of fastq record */ +#ifdef DEBUG + nb_stop++; +#endif p_nr->f_id=f_id; p_nr->score=st; p_nr->rstart_offset=rstart_offset; rfound=1; } - } break; default: +#ifdef DEBUG + store_id: + { if (is_id==1) { + if (*pchar!='\/') current_id[idx_id]=*pchar; + else current_id[idx_id]='\0'; + idx_id+=1; + } + } +#endif + inc_score: { if (qual_score==1) { s=(int)*pchar; s-=k_phred_32; @@ -71,6 +111,15 @@ int FqAuxBackend::processBuffer(rinfo * p_nr) { pchar++; pos_in_buf++; } +#ifdef DEBUG + std::cout<<"BE2, position in buffer: "<<pos_in_buf<<endl; + if (pos_in_buf==nread) { + cout<<"BE2 Found "<<nb_start<<" and "<<nb_stop<<" start/stop of record in this buffer."<<endl; + tot_nb_start+=nb_start; + tot_nb_stop+=nb_stop; + cout<<"BE2 Total cumulated number of start char found: "<<tot_nb_start<< " Total number of stop: "<<tot_nb_stop<<endl; + } +#endif return rfound; } @@ -102,13 +151,19 @@ void FqAuxBackend::openFile(char * ficname, unsigned char id) { if (fp2==NULL) { err(errno,"cannot open file: %s.",filename); } - + buf=(char *) malloc(bufsize*sizeof(char)); + if (buf==NULL) { + err(errno,"cannot allocate memory: %lu bytes.",bufsize); + } + pos_in_buf=bufsize; // do that to force first reading in buffer. } void FqAuxBackend::closeFile() { if (filename!=NULL) { close(f_pe2); filename=NULL; + free(buf); + buf=NULL; } } diff --git a/src/FqAuxBackend.h b/src/FqAuxBackend.h index bc767e6c1c98960879a264c8f1c325feea2d9909..badb2ee563ba93f36bef92477f88ba4cf08f6875 100644 --- a/src/FqAuxBackend.h +++ b/src/FqAuxBackend.h @@ -9,22 +9,25 @@ #define FQAUXBACKEND_H_ #include "srp.h" -const size_t bufsize=10240; +const size_t bufsize=6048000; // It seems that I can't have a much bigger value than that or else my objects construction throws exception. Strange. +// const size_t bufsize=500000; //try that in valgrind class FqAuxBackend { char * filename; unsigned char f_id; - char buf[bufsize]; - int nread; + size_t nread; int f_pe2; // for calling read FILE * fp2; // for calling ftell unsigned long cur_offset; - int pos_in_buf; + char * buf; void readBuffer(); int processBuffer(rinfo * p_nr); public: + + size_t pos_in_buf; + char current_id[50]; FqAuxBackend() { filename=NULL; @@ -33,7 +36,7 @@ public: f_id=0; cur_offset=0; nread=0; - pos_in_buf=bufsize; // do that to force first reading in buffer. + buf=NULL; } void openFile(char * ficname, unsigned char id); diff --git a/src/FqMainBackend.cpp b/src/FqMainBackend.cpp index c8d4214adf2d8d9f180d5bfcfad195b1fbd29aa3..30ae15071c4d0747bfa0ff1e1520fd9d3b25b83b 100644 --- a/src/FqMainBackend.cpp +++ b/src/FqMainBackend.cpp @@ -13,10 +13,17 @@ #include <err.h> #include <limits.h> #include <assert.h> +#include <stdlib.h> + +// for debug only. TODO : remove that. +#include <iostream> +using namespace std; #include "FqMainBackend.h" #include "srp.h" +//#define DEBUG + rpos init_rpos(unsigned char f_id, unsigned long rstart_offset) { rpos rp; rp.fileid=f_id <<4; @@ -33,13 +40,23 @@ void update_rpos(unsigned char f_id,unsigned long rstart_offset,unsigned long j, } + +FqMainBackend::FqMainBackend(srp * io_sr) { + p_auxFqProcessor=NULL; + p_scoreReadStruct=io_sr; +} + + void FqMainBackend::processFile(char * filename,unsigned char f_id) { FILE * fp; int st,s; int cnt,nread; - unsigned long cur_offset; + long cur_offset; + int nb_read_oper=0; mode_t mode = S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH; - char buf[bufsize]; + //cout<<"INT_MAX="<<INT_MAX<<endl; + // cout<<filename<<endl; + int f_single=open(filename,O_RDONLY,mode); if (f_single==-1) { err(errno,"cannot open file: %s.",filename); @@ -49,66 +66,128 @@ void FqMainBackend::processFile(char * filename,unsigned char f_id) { if (fp==NULL) { err(errno,"cannot open file: %s.",filename); } - + cur_offset=ftell(fp); + char* buf=(char *) malloc(bufsize*sizeof(char)); + if (buf==NULL) { + err(errno,"cannot allocate memory: %lu bytes.",bufsize); + } /* for each read, we want : offset, filenb, total quality score. */ while ((nread=read(f_single,buf,bufsize))!=0) { - // printf("%s",buf); + nb_read_oper++; + //cout<<"BE1: read operation nbr: "<<nb_read_oper<<endl; + //printf("going to call ftell on : %p \n",fp); cur_offset=ftell(fp); - processBuf((char *)&buf,nread,f_id,cur_offset); + processBuf((char *)buf,nread,f_id,cur_offset); } close(f_single); - + free(buf); } + void FqMainBackend::processBuf(char * buf,int nread,unsigned char f_id,unsigned long cur_offset) { int cnt=0; + // debug stuff, TODO: remove it + // static int nb_read_PE1=0; unsigned int s; static unsigned int st; static int num_l_in_rec; /* counter to know on which line inside the fastq record we are */ static int qual_score=0; static unsigned long rstart_offset; +#ifdef DEBUG + static int nb_start; + static int nb_stop; + static int tot_nb_stop=0; + static int tot_nb_start=0; + static int idx_id; + static int is_id=0; + if (cnt==0) { + nb_start=0; + nb_stop=0; + } +#endif char * pchar=buf; while (cnt<=nread-1) { switch (*pchar){ - case k_read_id_start: { + case k_read_id_start: + if (qual_score) goto inc_score; + else { +#ifdef DEBUG + is_id=1; + idx_id=0; + nb_start++; +#endif rstart_offset=cur_offset-nread+cnt; num_l_in_rec=1; } break; - case k_read_qual_start: { + case k_read_qual_start: qual_score=1; - st=0;} + st=0; break; - case '\n': { + case '\n': +#ifdef DEBUG + is_id=0; +#endif num_l_in_rec+=1; - if (num_l_in_rec==5) {qual_score=0;/* end of fastq record */ - // debug stuff - /*printf("\nquality score is : %d \n",st); - printf("read start_offset is %lu \n",rstart_offset);*/ + if (num_l_in_rec==5) { + // std::cout<<"BE1 position in buffer "<<cnt+1<<endl; //add 1 to compare value with BE2. + qual_score=0;/* end of fastq record */ rpos rp=init_rpos(f_id,rstart_offset); + // nb_read_PE1++; // TODO: remove, this is debug stuff if (p_auxFqProcessor!=NULL) { rinfo pe2info; int eof=p_auxFqProcessor->getNextRead(&pe2info); assert(!eof); // There should be the same number of reads in both PE files. + + // if (eof) cout<<"warning, reached eof in file PE2 after: "<<nb_read_PE1<<" reads in PE1"<<endl; st+=pe2info.score; unsigned long j=rstart_offset/INT_MAX; update_rpos(pe2info.f_id,pe2info.rstart_offset,j,&rp); +#ifdef DEBUG + nb_stop++; + cout<<"Just processed : "<<endl; + if (strcmp(current_id,"NS500443:42:H3MH2AFXX:1:11208:4026:13258")==0) { + cout<<"put breakpoint here"<<endl; + } + cout<<current_id<<endl; + cout<<p_auxFqProcessor->current_id<<endl; + assert(strcmp(current_id,p_auxFqProcessor->current_id)==0); +#endif } i_dim& ref_i_dim=(*p_scoreReadStruct)[st/K_SCORE_NORM_FACTOR]; k_dim& ref_k_dim=ref_i_dim[rstart_offset/INT_MAX]; ref_k_dim.push_back(rp); } - } break; default: +#ifdef DEBUG + store_id: + { if (is_id==1) { + if (*pchar!='\/') current_id[idx_id]=*pchar; + else current_id[idx_id]='\0'; + idx_id+=1; + } + } +#endif + inc_score: { if (qual_score==1) { s=(int)*pchar; s-=k_phred_32; st+=s; } } + break; } pchar++; cnt++; } +#ifdef DEBUG + std::cout<<"BE1 position in buffer "<<cnt<<endl; + if (cnt==nread) { + cout<<"BE1 Found "<<nb_start<<" and "<<nb_stop<<" start/stop of record in this buffer."<<endl; + tot_nb_start+=nb_start; + tot_nb_stop+=nb_stop; + cout<<"BE1 Total cumulated number of start: "<<tot_nb_start<<" Total number of stop: "<<tot_nb_stop<<endl; + } +#endif } diff --git a/src/FqMainBackend.h b/src/FqMainBackend.h index 610a6a3b7be85d1de7fe1f7ff8c9a3659cb13dad..38a9bf30f70da4f1b823c018c89f8a535917dadf 100644 --- a/src/FqMainBackend.h +++ b/src/FqMainBackend.h @@ -13,17 +13,15 @@ #include "FqAuxBackend.h" #include "FqConstants.h" + class FqMainBackend { FqAuxBackend * p_auxFqProcessor; /* Another fastq processor component is necessary for handling the case of PE reads.*/ srp * p_scoreReadStruct; /* Where we store information about the reads. */ + char current_id[50]; public: - FqMainBackend(srp * io_sr) { - p_scoreReadStruct=io_sr; - p_auxFqProcessor=NULL; - } - - + + FqMainBackend(srp*); void setAuxProcessor(FqAuxBackend* fq2ndProc) { p_auxFqProcessor=fq2ndProc; diff --git a/src/fqreader.cpp b/src/fqreader.cpp index 182a1d738c43be73808bb3e43ccb6816f2162023..7b8169f1536e35aba574015fc8e20199c18b3dc9 100644 --- a/src/fqreader.cpp +++ b/src/fqreader.cpp @@ -17,6 +17,8 @@ #include "FqMainBackend.h" + + /* typedef struct { char buf[bufsize]; @@ -40,12 +42,19 @@ void processSingleFile(char * fq_s,unsigned char f_id, srp* io_sr) { /* Processes 1 pair of files containing PE reads.*/ void processPEFiles(char * fq_1, unsigned char f_id1,char * fq_2, unsigned char f_id2,srp * io_sr ) { - FqMainBackend be_fq1=FqMainBackend(io_sr); - FqAuxBackend be_fq2=FqAuxBackend(); + FqAuxBackend be_fq2; + FqMainBackend be_fq1(io_sr); + be_fq2.openFile(fq_2,f_id2); be_fq1.setAuxProcessor(&be_fq2); be_fq1.processFile(fq_1,f_id1); be_fq2.closeFile(); } +/* + * This function processes the content of an rpos structure and returns the character string "ATCG" corresponding to the read record. + * PE reads are concatenated. + */ +void getRead(char ** fname_id_array, const rpos& sr, unsigned int j, char ** dna_read){ +} diff --git a/src/fqreader.h b/src/fqreader.h index cc664c7274125cabe3b764fa456a17a220720ec8..bd823d944facebd88ffdf2ebe43d2b4fe4516925 100644 --- a/src/fqreader.h +++ b/src/fqreader.h @@ -8,10 +8,11 @@ #define FQREADER_H #include "srp.h" - +const int read_length=1000; // this will become a parameter later. /*void processBuf(char * buf,int nread,int cur_offset);*/ void processSingleFile(char *, unsigned char, srp*); void processPEFiles(char * fq_1, unsigned char f_id1,char * gq_2, unsigned char f_id2,srp *io_sr ); +void getRead(char ** fname_id_array, const rpos& sr, unsigned int j, char ** read); #endif diff --git a/src/perf_test_fqreader.cpp b/src/perf_test_fqreader.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d6a8951486b12759b03b40ee8fda9b173f393f6c --- /dev/null +++ b/src/perf_test_fqreader.cpp @@ -0,0 +1,143 @@ +/* + * unit_test_fqreader.cpp + * + * Created on: Dec 8, 2015 + * Author: vlegrand + * unit testing for the fqreader component. + * Keep using assert for the moment, don't want to add a dependency on boost (or any other test framework) just for the tests. + */ +#include <stdio.h> +#include <iostream> +#include <limits.h> +#include <assert.h> +#include <stdlib.h> +#include "srp.h" +#include "fqreader.h" + +using namespace std; + +void test_processSingleFile() { + //printf("MAX_UINT=%u \n",UINT_MAX); + srp sr; + unsigned char f_id=1; + processSingleFile((char *) "../data_perf_tests/LM_single10x.fq",f_id,&sr); + srp::reverse_iterator rit; + i_dim::iterator it_offs; + k_dim::iterator it_struct; + + int cnt_read=0; + + for (rit=sr.rbegin(); rit!=sr.rend(); ++rit) { //process map in reverse order (by decreasing scores). + unsigned long score=rit->first; + cout << "score="<<rit->first<<endl; + for (it_offs=rit->second.begin();it_offs!=rit->second.end();it_offs++) { + unsigned long offset_quotient=it_offs->first; + for (it_struct=it_offs->second.begin();it_struct!=it_offs->second.end();it_struct++) { + unsigned char fid_stored=it_struct->fileid; + cnt_read++; + int tmp=fid_stored >>4; + cout<<" fileid="<<tmp<<" read_a1="<<it_struct->read_a1<<endl; + } + } + } + cout<<"number of reads: "<<cnt_read; +} + +void test_processPEFiles() { + char * fq_1_test=(char *) "data/test_PE1.fq"; + char * fq_2_test=(char *) "data/test_PE2.fq"; + + unsigned char f_id1=1; + unsigned char f_id2=2; + + srp sr; + + processPEFiles(fq_1_test, f_id1,fq_2_test, f_id2,&sr); + srp::reverse_iterator rit; + i_dim::iterator it_offs; + k_dim::iterator it_struct; + int cnt_read=0; + + unsigned char masque=0x0F; + + for (rit=sr.rbegin(); rit!=sr.rend(); ++rit) { //process map in reverse order (by decreasing scores). + // cout << "score="<<rit->first<<endl; + unsigned long score=rit->first; + if (cnt_read==0 || cnt_read==1) assert(score==10); + if (cnt_read==2) assert(score==9); + for (it_offs=rit->second.begin();it_offs!=rit->second.end();it_offs++) { + unsigned long offset_quotient=it_offs->first; + assert(offset_quotient==0); + for (it_struct=it_offs->second.begin();it_struct!=it_offs->second.end();it_struct++) { + unsigned char fid_stored=it_struct->fileid; + assert(fid_stored >>4==f_id1); + assert((fid_stored &masque)==f_id2); + if (cnt_read==0) { + assert(it_struct->read_a1==0); + assert(it_struct->read_a1==0); + } + if (cnt_read==1) { + //std::cout<<it_struct->read_a1<<" "<<it_struct->read_a2; + assert(it_struct->read_a1==349); + assert(it_struct->read_a2==349); + } + if (cnt_read==2) { + // std::cout<<it_struct->read_a1<<" "<<it_struct->read_a2; + assert(it_struct->read_a1==698); + assert(it_struct->read_a2==698); + } + cnt_read++; + /* + int tmp1=fid_stored >>4; + int tmp2=fid_stored &masque; + cout<<" fileid1="<<tmp1<<" read_a1="<<it_struct->read_a1<<endl; + cout<<" fileid2="<<tmp2<<" read_a2="<<it_struct->read_a2<<endl; */ + } + } + } + assert(cnt_read==3); +} + +void test_processAllFiles() { + char * fq_1_test=(char *) "../data_perf_tests/LM_R1_10x.fq"; + char * fq_2_test=(char *) "../data_perf_tests/LM_R2_10x.fq"; + char * fq_single=(char *) "../data_perf_tests/LM_single10x.fq"; + + unsigned char f_id1=1; + unsigned char f_id2=2; + unsigned char f_single=3; + + srp sr; + + processPEFiles(fq_1_test, f_id1,fq_2_test, f_id2,&sr); + processSingleFile(fq_single,f_single,&sr); + + srp::reverse_iterator rit; + i_dim::iterator it_offs; + k_dim::iterator it_struct; + int cnt_read=0; + unsigned char masque=0x0F; + + for (rit=sr.rbegin(); rit!=sr.rend(); ++rit) { //process map in reverse order (by decreasing scores). + unsigned long score=rit->first; + cout << "score="<<rit->first<<endl; + for (it_offs=rit->second.begin();it_offs!=rit->second.end();it_offs++) { + unsigned long offset_quotient=it_offs->first; + assert(offset_quotient==0); + for (it_struct=it_offs->second.begin();it_struct!=it_offs->second.end();it_struct++) { + cnt_read++; + unsigned char fid_stored=it_struct->fileid; + int tmp1=fid_stored >>4; + int tmp2=fid_stored &masque; + cout<<" fileid1="<<tmp1<<" read_a1="<<it_struct->read_a1<<" fileid2="<<tmp2<<" read_a2="<<it_struct->read_a2<<endl; + } + } + } + cout<<"number of reads: "<<cnt_read; +} + +int main(int argc, char **argv) { + // test_processSingleFile(); + // test_processPEFiles(); + test_processAllFiles(); /* mix PE together with single; nearly as in real life.*/ +} diff --git a/src/unit_test_fqreader.cpp b/src/unit_test_fqreader.cpp index 61db57c805ce0d701827f256565fede68b4287ee..64160bd84da0ceb0180be6714267a6ed1ffb1145 100644 --- a/src/unit_test_fqreader.cpp +++ b/src/unit_test_fqreader.cpp @@ -20,7 +20,7 @@ void test_processSingleFile() { //printf("MAX_UINT=%u \n",UINT_MAX); srp sr; unsigned char f_id=1; - processSingleFile((char *) "data/test_single.fq",f_id,&sr); + processSingleFile((char *) "../data/test_single.fq",f_id,&sr); srp::reverse_iterator rit; i_dim::iterator it_offs; k_dim::iterator it_struct; @@ -53,8 +53,8 @@ void test_processSingleFile() { } void test_processPEFiles() { - char * fq_1_test=(char *) "data/test_PE1.fq"; - char * fq_2_test=(char *) "data/test_PE2.fq"; + char * fq_1_test=(char *) "../data/test_PE1.fq"; + char * fq_2_test=(char *) "../data/test_PE2.fq"; unsigned char f_id1=1; unsigned char f_id2=2; @@ -86,21 +86,21 @@ void test_processPEFiles() { assert(it_struct->read_a1==0); } if (cnt_read==1) { - //std::cout<<it_struct->read_a1<<" "<<it_struct->read_a2; + std::cout<<it_struct->read_a1<<" "<<it_struct->read_a2; assert(it_struct->read_a1==349); assert(it_struct->read_a2==349); } if (cnt_read==2) { - // std::cout<<it_struct->read_a1<<" "<<it_struct->read_a2; + std::cout<<it_struct->read_a1<<" "<<it_struct->read_a2; assert(it_struct->read_a1==698); assert(it_struct->read_a2==698); } cnt_read++; - /* + int tmp1=fid_stored >>4; int tmp2=fid_stored &masque; cout<<" fileid1="<<tmp1<<" read_a1="<<it_struct->read_a1<<endl; - cout<<" fileid2="<<tmp2<<" read_a2="<<it_struct->read_a2<<endl; */ + cout<<" fileid2="<<tmp2<<" read_a2="<<it_struct->read_a2<<endl; } } } @@ -108,9 +108,9 @@ void test_processPEFiles() { } void test_processAllFiles() { - char * fq_1_test=(char *) "data/test_PE1.fq"; - char * fq_2_test=(char *) "data/test_PE2.fq"; - char * fq_single=(char *) "data/test_single.fq"; + char * fq_1_test=(char *) "../data/test_PE1.fq"; + char * fq_2_test=(char *) "../data/test_PE2.fq"; + char * fq_single=(char *) "../data/test_single.fq"; unsigned char f_id1=1; unsigned char f_id2=2; @@ -127,7 +127,7 @@ void test_processAllFiles() { int cnt_read=0; for (rit=sr.rbegin(); rit!=sr.rend(); ++rit) { //process map in reverse order (by decreasing scores). - // cout << "score="<<rit->first<<endl; + //cout << "score="<<rit->first<<endl; unsigned long score=rit->first; if (cnt_read==0 || cnt_read==1) assert(score==10); else if (cnt_read==2) assert(score==9); @@ -147,8 +147,12 @@ void test_processAllFiles() { } int main(int argc, char **argv) { + cout<<"sizeof(char)"<<sizeof(char); /*char* fq_s_test="data/test_single.fq";*/ + cout<<"test for single file"<<endl; test_processSingleFile(); + cout<<"test for PE files"<<endl; test_processPEFiles(); + cout<<"test for both single and PE files"; test_processAllFiles(); /* mix PE together with single; nearly as in real life.*/ }