diff --git a/configure.ac b/configure.ac index cb0ff451e22f57c5074c502c22f75bf5bea26029..b3950b388c19e5e013c0273b679eb11fd3542a88 100644 --- a/configure.ac +++ b/configure.ac @@ -1,7 +1,7 @@ dnl Process this file with autoconf to produce a configure script. AC_PREREQ(2.59) -AC_INIT(rock, 1.3) +AC_INIT(rock, 1.4) AC_CANONICAL_SYSTEM diff --git a/doc/rock.pod b/doc/rock.pod index fd5ca12022ad7b57fed8747bec1ae9fa679732e5..a48a3dea1b42959815db7ea9770e3926c6d434e2 100644 --- a/doc/rock.pod +++ b/doc/rock.pod @@ -39,7 +39,15 @@ Specify wanted k-mer size. Default is 30; maximum is 32. =item -q -Specify minimum threshold for nucleotides quality score. Nucleotides that have a score below thta threshold will be considered as errors (just like N nucleotides). +Specify minimum threshold for nucleotides quality score. Nucleotides that have a score below that threshold will be considered as errors (just like N nucleotides). +Default is 0. + +=item -m + +To be used together with -q. +Specify minimum number of correct k-mers required to keep a read for CMS filter. +Indicate only a integer in version 1.4. +Default is 1. =item -C diff --git a/src/Filter.hpp b/src/Filter.hpp index 05e80df5c23f61eb794380f90b5cf66994a0deb6..176d284a7fd9350176eb0cc99d94754819be40d2 100644 --- a/src/Filter.hpp +++ b/src/Filter.hpp @@ -28,7 +28,7 @@ class Filter { ByteCountMinSketch * pByteCMS; ShortCountMinSketch * pShortCMS; -#// long CMS_size_in_Bytes; + // long CMS_size_in_Bytes; long nb_bytes_before_fill_CMS,nb_bytes_after_fill_CMS; void getRSS(int before_fill=0); diff --git a/src/FqAuxBackend.cpp b/src/FqAuxBackend.cpp index c237db37941aca75d728d6bcaa282ee2e95048f7..8cc77dad13933c101c14ed941e188b1cdc2d567f 100644 --- a/src/FqAuxBackend.cpp +++ b/src/FqAuxBackend.cpp @@ -1,4 +1,4 @@ -/* + /* * FqAuxProcessor.cpp * * Created on: Jan 7, 2016 @@ -23,95 +23,77 @@ using namespace std; // #define DEBUG int FqAuxBackend::getNextRead(rinfo * p_nr) { - rinfo nr; int rfound=0; int eof=0; while (!rfound) { - if (nread==0 || (pos_in_buf>=nread)) { + if (buf_info.real_bufsize==0 || (buf_info.cnt>=buf_info.real_bufsize)) { // if (pos_in_buf==nread-1) cout<<"BE2 : last char of buf before reading new buffer: "<<buf[pos_in_buf]<<endl; readBuffer(); } - if (nread==0) { eof=1; break;} + if (buf_info.real_bufsize==0) { eof=1; break;} rfound=processBuffer(p_nr); } return eof; } + int FqAuxBackend::processBuffer(rinfo * p_nr) { - unsigned int s; - static unsigned int st; + static T_fq_rec_info fq_rec_info; static int num_l_in_rec; /* counter to know on which line inside the fastq record we are */ static int qual_score=0; + static int n; + static int start_rec_in_buf; + static unsigned long rstart_offset; - char * pchar=buf+pos_in_buf; + buf_info.pchar=buf_info.buf+buf_info.cnt; int rfound=0; -#ifdef DEBUG - static int tot_nb_start=0; - static int tot_nb_stop=0; - static int nb_stop=0; - static int nb_start=0; - - static int idx_id; - static int is_id=0; - if (pos_in_buf==0) { - nb_start=0; - nb_stop=0; - } -#endif - while (pos_in_buf<=nread-1 && !rfound) { - switch (*pchar){ + + while (buf_info.cnt<=buf_info.real_bufsize-1 && !rfound) { + switch (*buf_info.pchar){ case k_read_id_start: if (qual_score) goto inc_score; else { -#ifdef DEBUG - is_id=1; - idx_id=0; - nb_start++; -#endif - //cout<<k_read_id_start<<" found in PE2."<<endl; - rstart_offset=cur_offset-nread+pos_in_buf; - num_l_in_rec=1; } + debug_processBuf(on_record_new,buf_info,fq_rec_info.rstart_offset); + rstart_offset=cur_offset-buf_info.real_bufsize+buf_info.cnt; + start_rec_in_buf=buf_info.cnt; + buf_info.p_start_cur_rec=buf_info.pchar; + num_l_in_rec=1; + fq_rec_info.nb_nucleotides_in_read=0;} break; case k_read_qual_start: qual_score=1; - st=0; + fq_rec_info.nb_k_mers_in_error=0; + fq_rec_info.st=0; + fq_rec_info.idx_nucl_in_read=0; + n=0; break; case '\n': + debug_processBuf(on_line_end,buf_info,fq_rec_info.rstart_offset); + (qual_score==1)?qual_score++:qual_score; num_l_in_rec+=1; -#ifdef DEBUG - is_id=0; -#endif if (num_l_in_rec==5) { qual_score=0;/* end of fastq record */ #ifdef DEBUG nb_stop++; #endif p_nr->f_id=f_id; - p_nr->score=st; + p_nr->score=fq_rec_info.st; p_nr->rstart_offset=rstart_offset; + p_nr->nb_nucleotides_in_read=fq_rec_info.nb_nucleotides_in_read; + p_nr->nb_k_mers_in_error=fq_rec_info.nb_k_mers_in_error; rfound=1; } break; default: -#ifdef DEBUG - store_id: - { if (is_id==1) { - if (*pchar!='/' and *pchar!=' ') current_id[idx_id]=*pchar; - else current_id[idx_id]='\0'; - idx_id+=1; - } - } -#endif + debug_processBuf(on_store_read_id,buf_info,fq_rec_info.rstart_offset); + (num_l_in_rec==2)?fq_rec_info.nb_nucleotides_in_read++:fq_rec_info.nb_nucleotides_in_read; inc_score: - { if (qual_score==1) { - s=(int)*pchar; - s-=k_phred_32; - st+=s; - } + { if (qual_score==2) onIncScore(fq_rec_info,buf_info,n); } } - pchar++; - pos_in_buf++; + buf_info.pchar++; + buf_info.cnt++; } + if (!rfound) keepCurFastqRecord(buf_info.buf,start_rec_in_buf,buf_info.real_bufsize); #ifdef DEBUG std::cout<<"BE2, position in buffer: "<<pos_in_buf<<endl; if (pos_in_buf==nread) { @@ -126,30 +108,54 @@ int FqAuxBackend::processBuffer(rinfo * p_nr) { void FqAuxBackend::readBuffer() { - if ((nread=read(i_f_desc,buf,bufsize))!=0) { + size_t buffer_size; + (test_mode)?buffer_size=test_bufsize:buffer_size=bufsize; + if ((buf_info.real_bufsize=read(i_f_desc,buf_info.buf,buffer_size))!=0) { cur_offset=lseek(i_f_desc,0,SEEK_CUR); -//ftell(f_stream); - pos_in_buf=0; + buf_info.cnt=0; + buf_info.p_start_cur_rec=buf_info.buf; + buf_info.pchar=buf_info.buf; } } /* * Opens file and performs the fist read operation. */ void FqAuxBackend::openFile(char * ficname, unsigned char id) { + size_t buffer_size; + (test_mode)?buffer_size=test_bufsize:buffer_size=bufsize; FqBaseBackend::openInputFile(ficname,id); - buf=(char *) malloc(bufsize*sizeof(char)); - if (buf==NULL) { - err(errno,"cannot allocate memory: %lu bytes.",bufsize); + + buf_info.buf=(char *) malloc(buffer_size*sizeof(char)); + if (buf_info.buf==NULL) { + err(errno,"cannot allocate memory: %lu bytes.",buffer_size); } - pos_in_buf=bufsize; // do that to force first reading in buffer. + buf_info.real_bufsize=buffer_size; // do that to force first reading in buffer. } void FqAuxBackend::closeFile() { if (i_f_desc!=-1) { FqBaseBackend::closeInputFile(); - free(buf); - buf=NULL; + free(buf_info.buf); + buf_info.buf=NULL; } } +void FqAuxBackend::writeToUndefFile() { // here duplicate code for the moment since in the case of Auxbackend, we have processed 1 more char from buffer than in the case of MainBackend. TODO refactor this. + // FqBaseBackend::writeToUndefFile(buf_info); + int l=strlen(cur_fq_record); + if (l==0) { + writeStrToUndefFile(buf_info.p_start_cur_rec,buf_info.pchar-buf_info.p_start_cur_rec); + } else { + memcpy(cur_fq_record+l,buf_info.p_start_cur_rec,buf_info.cnt); + char tmp=buf_info.buf[buf_info.cnt]; + l+=buf_info.cnt; + cur_fq_record[l]='\0'; + writeStrToUndefFile(cur_fq_record,strlen(cur_fq_record)); // Try that, if there is performance problem due to resolving inheritance, will find a different way + } + +} + +void FqAuxBackend::resetCurFqRecord() { + cur_fq_record[0]='\0'; +} diff --git a/src/FqAuxBackend.h b/src/FqAuxBackend.h index 13178722c2a069e4c376dca2f4cfb44d184eb0f7..6e093ea162f614a6b9464e7c2fc80403b15ffe01 100644 --- a/src/FqAuxBackend.h +++ b/src/FqAuxBackend.h @@ -15,27 +15,32 @@ // const size_t bufsize=500000; //try that in valgrind class FqAuxBackend:public FqBaseBackend { - size_t nread; + //size_t nread; unsigned long cur_offset; - char * buf; + // char * buf; + T_buf_info buf_info; void readBuffer(); int processBuffer(rinfo * p_nr); + friend void test_processBufPE(); + public: - size_t pos_in_buf; + //size_t pos_in_buf; char current_id[50]; FqAuxBackend() { cur_offset=0; - nread=0; - buf=NULL; - pos_in_buf=bufsize; + buf_info.real_bufsize=0; + buf_info.buf=NULL; + buf_info.cnt=bufsize; } void openFile(char * ficname, unsigned char id); void closeFile(); int getNextRead(rinfo *); + void writeToUndefFile(); + void resetCurFqRecord(); }; diff --git a/src/FqBaseBackend.cpp b/src/FqBaseBackend.cpp index 473b0bc2ddc4866ecc67f826ad12854a130872b7..90a52ac6851d39cd7d41589560980a5cbc2411f4 100644 --- a/src/FqBaseBackend.cpp +++ b/src/FqBaseBackend.cpp @@ -24,8 +24,18 @@ using namespace std; #endif +T_buf_info init_buf_info(int& nread,char * buf) { + T_buf_info buf_info; + buf_info.real_bufsize=nread; + buf_info.cnt=0; + buf_info.pchar=buf; + buf_info.buf=buf; + buf_info.p_start_cur_rec=buf; + return buf_info; +} + void FqBaseBackend::openInputFile(char * ficname, unsigned char id) { - int st,s; + // int st,s; // unsigned long cur_offset; mode_t mode = S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH; @@ -57,6 +67,10 @@ void FqBaseBackend::setOutputFile(char * ofilename) { o_filename=ofilename; } +void FqBaseBackend::setUndefFile(char * ficname) { + undef_filename=ficname; +} + void FqBaseBackend::openInputFile() { if (i_filename==NULL || f_id==0) throw std::runtime_error("No file currently associated to this backend"); @@ -71,21 +85,34 @@ void FqBaseBackend::openInputFile() { void FqBaseBackend::openOutputFile(){ - mode_t mode = S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH; if (o_filename==NULL) throw std::runtime_error("No output file currently associated to this backend"); - if (o_f_desc!=-1) { + openFile4Output(o_filename,&o_f_desc); +} + +void FqBaseBackend::openUndefFile(){ + //std::cout<<qual_thres.min_correct_k_mers_in_read<<endl; + if (qual_thres.min_correct_k_mers_in_read>1) { + if (undef_filename==NULL) throw std::runtime_error("No file currently associated to this backend for undefined reads."); + openFile4Output(undef_filename,&undef_f_desc); + } +} + +void FqBaseBackend::openFile4Output(char * filename, int * f_desc) { + mode_t mode = S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH; + if (*f_desc!=-1) { std::string err_msg("file: "); - err_msg+=o_filename; + err_msg+=filename; err_msg+=" is already open!"; throw std::runtime_error(err_msg); } - o_f_desc=open(o_filename,O_WRONLY|O_CREAT|O_TRUNC,mode); - if (o_f_desc==-1) { - err(errno,"cannot open file: %s.",o_filename); + *f_desc=open(filename,O_WRONLY|O_CREAT|O_TRUNC,mode); + if (*f_desc==-1) { + err(errno,"cannot open file: %s.",filename); } } + void FqBaseBackend::closeOutputFile() { if (pos_in_w_buf!=o_buf) { // check if some fq records remains in buffer before closing file, if so, flush. if (write(o_f_desc, o_buf, pos_in_w_buf-o_buf)==-1) err(errno,"cannot write in file: %s.",o_filename); @@ -97,6 +124,35 @@ void FqBaseBackend::closeOutputFile() { } } +void FqBaseBackend::closeUndefFile() { + if (undef_f_desc!=-1) { + if (close(undef_f_desc)==-1) err(errno,"cannot close file: %s.",undef_filename); + undef_f_desc=-1; + } +} + +void FqBaseBackend::writeStrToUndefFile(char * start_in_buf, int len) { + if (write(undef_f_desc,start_in_buf,len)==-1) err(errno,"cannot write in file: %s.",undef_filename); +} + +void FqBaseBackend::writeToUndefFile(const T_buf_info& buf_info) { + int l=strlen(cur_fq_record); + if (l==0) { + writeStrToUndefFile(buf_info.p_start_cur_rec,buf_info.pchar-buf_info.p_start_cur_rec+1); + } else { + /*cout<<"cur_fq_record actually contains: "<<l<<"chars"<<endl; + cout<<cur_fq_record<<endl; + cout<<"going to add to it :"<<buf_info.cnt<<" char from buf"<<endl;*/ + memcpy(cur_fq_record+l,buf_info.p_start_cur_rec,buf_info.cnt+1); + char tmp=buf_info.buf[buf_info.cnt]; + l+=buf_info.cnt; + cur_fq_record[l+1]='\0'; + writeStrToUndefFile(cur_fq_record,strlen(cur_fq_record)); // Try that, if there is performance problem due to resolving inheritance, will find a different way + } +} + + + void FqBaseBackend::writeToOutput(const unsigned long& offset) { // static char * pos_in_w_buf; if (o_buf==NULL) { @@ -128,12 +184,13 @@ void FqBaseBackend::writeToOutput(const unsigned long& offset) { * use C style char* rather than std::string for a matter of performance. * use read instead of fread for the same reason. * - * Note: It is up to the caller to determine where the end of the record is (5th \n character in the fq_record string). + * Note: length of the string containing the read is returned. */ int FqBaseBackend::getRead(const unsigned long& offset, char * fq_record) { int nread,i,nb_lines,fq_rec_size; char * pchar; + fq_rec_size=-1; if (i_f_desc==-1) throw std::runtime_error("No open file currently associated to this backend"); #ifdef DEBUG @@ -169,5 +226,96 @@ int FqBaseBackend::getRead(const unsigned long& offset, char * fq_record) { #ifdef DEBUG cout<<"found that fq record size is : "<<fq_rec_size<<endl; #endif + if (fq_rec_size==-1) errx(1,"Cannot get read!"); // here use errx rather than C++ exception because I don't want to catch an exception. If this happens, program must exit with error message and that's it. return fq_rec_size; } + +/* keep the end of current fastq record in memory. Used for records that are incomplete (when reading big files an fq record'start may be read in buffer 1 + whereas its end may be in buffer 2...*/ +void FqBaseBackend::keepCurFastqRecord(char * buf,const int& start_rec_in_buf,const int &nread) { + int len=nread-start_rec_in_buf; + if (len+1>=MAX_FQ_RECORD_LENGTH) errx(1,"Buffer for current record is too small given record length."); + memcpy(cur_fq_record,&buf[start_rec_in_buf],len); + cur_fq_record[len]='\0'; + /*cout<<"keeping in memory "<<len+1<<" char from previous read"<<endl; + cout<<cur_fq_record<<endl;*/ +} + +void FqBaseBackend::onIncScore(T_fq_rec_info& rec_info,T_buf_info& buf_info,int& n) { + unsigned int s=(int)*buf_info.pchar; + unsigned int remaining_nucl=rec_info.nb_nucleotides_in_read-rec_info.idx_nucl_in_read; + if (s<=qual_thres.nucl_score_threshold) { // TODO rewrite this with chained ternary operators once it is clear to see if it improves performances. + if (rec_info.idx_nucl_in_read<=qual_thres.k-1) { + rec_info.nb_k_mers_in_error=rec_info.idx_nucl_in_read+1; + } + else if (n>0 && remaining_nucl>n) { + (qual_thres.k-n<remaining_nucl-n)?rec_info.nb_k_mers_in_error+=qual_thres.k-n:rec_info.nb_k_mers_in_error+=remaining_nucl-n; + } else if (n<=0) { // there should be a way to make it simplier by making sure n is never negative. + (qual_thres.k<remaining_nucl)?rec_info.nb_k_mers_in_error+=qual_thres.k:rec_info.nb_k_mers_in_error+=remaining_nucl; + } + n=qual_thres.k; + } + n--; + rec_info.st+=s; + rec_info.idx_nucl_in_read++; +} + +void FqBaseBackend::debug_processBuf(int evt,const T_buf_info& buf_info,const unsigned long& rstart_offset) { // put all the bebug stuff in a separate method so that processBuf is shorter and more "lisible". +#ifdef DEBUG + static int nb_start; // number of fq record start and stop found in current buffer. + static int nb_stop; + static int tot_nb_stop=0; + static int tot_nb_start=0; + static int idx_id; + static int is_id=0; + if (buf_info.cnt==0) { + nb_start=0; + nb_stop=0; + } + + switch(evt) { + case on_record_new: + cout<<"nread="<<buf_info.real_bufsize<<endl; + cout<<"cnt="<<buf_info.cnt<<endl; + cout<<"rstart_offset="<<rstart_offset<<endl; + is_id=1; + idx_id=0; + nb_start++; + break; + case on_line_end: + is_id=0; + break; + case on_record_end: + nb_stop++; + cout<<"just finished processing : "<<endl; + cout<<current_id<<endl; + cout<<"j obttained: rstart_offset/UINT_MAX="<<rstart_offset/UINT_MAX<<endl; + break; + case on_store_read_id: + if (is_id==1) { + if (*buf_info.pchar!='/' and *buf_info.pchar !=' ') current_id[idx_id]=*buf_info.pchar; + else current_id[idx_id]='\0'; + idx_id+=1; + } + break; + case on_buf_end: + std::cout<<" Main BE position in buffer "<<cnt<<endl; + assert(buf_info.cnt==buf_info.real_bufsize); + cout<<"Main BE Found "<<nb_start<<" and "<<nb_stop<<" start/stop of record in this buffer."<<endl; + tot_nb_start+=nb_start; + tot_nb_stop+=nb_stop; + cout<<"Main BE Total cumulated number of start: "<<tot_nb_start<<" Total number of stop: "<<tot_nb_stop<<endl; + break; + case on_record_end_pe: + cout<<p_auxFqProcessor->current_id<<endl; + assert(strcmp(current_id,p_auxFqProcessor->current_id)==0); + break; + } +#endif +} + +/*static void FqBaseBackend::setQualThreshold(const FasqQualThreshold& a_qual_thres) { + qual_thres.min_correct_k_mers_in_read=a_qual_thres.min_correct_k_mers_in_read; + qual_thres.nucl_score_threshold=a_qual_thres.nucl_score_threshold; + qual_thres.k=a_qual_thres.k; +}*/ diff --git a/src/FqBaseBackend.h b/src/FqBaseBackend.h index 29f7b2ab4336508347fa1f349e07712fde7be5c9..bd9d36a90b9c13269304f1e73197488bae783398 100644 --- a/src/FqBaseBackend.h +++ b/src/FqBaseBackend.h @@ -13,9 +13,40 @@ #include "FqConstants.h" #include "srp.h" +#include "ROCKparams.h" +//#define init_debug 1 +#define on_record_new 2 +#define on_record_end 3 +#define on_buf_end 4 +#define on_line_end 5 +#define on_store_read_id 6 +#define on_record_end_pe 7 +/* + * Auxilliary structure for buffer processing. + */ +typedef struct { + int cnt; // number of char already processed in buffer + char * pchar; // point on current char in buffer + int real_bufsize; // total number of char in buffer + char * buf; // pointer to start of buffer + char * p_start_cur_rec; // pointer to the start of the current record. +}T_buf_info; + +/* + * Auxilliary structure for fastq parsing; gather here information on the fastq record before we can put it inthe srp data structure. + */ +typedef struct { + unsigned long rstart_offset; // fq record start offset in file. + int nb_k_mers_in_error; // number of k-mers that contain nucleoties whose quality score is below given threshold. + unsigned int nb_nucleotides_in_read; // number of nucleotides in read (single or PE1) + unsigned int nb_nucleotides_in_read_PE2; // number of nucleotides in PE2. + unsigned int st; // totalread score (sum of the nucleotides quality score). + unsigned int idx_nucl_in_read; +}T_fq_rec_info; +T_buf_info init_buf_info(int& nread,char * buf); class FqBaseBackend { @@ -25,6 +56,8 @@ class FqBaseBackend { protected: static const size_t bufsize=6048000; // It seems that I can't have a much bigger value than that or else my objects construction throws exception. Strange. + + char * i_filename; unsigned char f_id; int i_f_desc; // for calling read @@ -36,8 +69,32 @@ protected: char * o_buf; char * pos_in_w_buf; + // for writing undefined (reads that don't contain a sufficient number of correct k-mers) + // correct k-mers are k-mers that contain only nucleotides with a quality score greater than a given threshold (default is 0). + // expected minimum number of correct k-mers is provided by the user. + + char * undef_filename; + int undef_f_desc; + + static FasqQualThreshold qual_thres; + + char cur_fq_record[MAX_FQ_RECORD_LENGTH]; + + void onIncScore(T_fq_rec_info& rec_info,T_buf_info& buf_info,int& n); + void debug_processBuf(int evt,const T_buf_info& buf_info,const unsigned long& rstart_offset); + friend void test_processInputFiles(); friend void test_write_PE(); + friend void processPEFiles(char *, unsigned char,char * , unsigned char,srp *,char *,char *, size_t); + + /* for testing only */ + int test_mode; + size_t test_bufsize; + + void setTestMode(size_t new_buf_size) { + test_mode=1; + test_bufsize=new_buf_size; + } public: @@ -48,8 +105,14 @@ public: f_id=0; o_f_desc=-1; o_filename=NULL; + undef_f_desc=-1; + undef_filename=NULL; + o_buf=NULL; pos_in_w_buf=NULL; + strcpy(cur_fq_record,""); + test_mode=0; + test_bufsize=0; } ~FqBaseBackend() { @@ -67,6 +130,20 @@ public: void openOutputFile(); void writeToOutput(const unsigned long&); void closeOutputFile(); + void setUndefFile(char * ficname); + void openUndefFile(); + void writeStrToUndefFile(char * start_in_buf, int len); + void writeToUndefFile(const T_buf_info& buf_info); + void closeUndefFile(); + + static void setQualThreshold(const FasqQualThreshold& a_qual_thres){ + FqBaseBackend::qual_thres.min_correct_k_mers_in_read=a_qual_thres.min_correct_k_mers_in_read; + FqBaseBackend::qual_thres.nucl_score_threshold=a_qual_thres.nucl_score_threshold; + FqBaseBackend::qual_thres.k=a_qual_thres.k; + } + + void openFile4Output(char * filename, int * f_desc); + void keepCurFastqRecord(char * buf,const int& start_rec_in_buf,const int &nread); }; diff --git a/src/FqMainBackend.cpp b/src/FqMainBackend.cpp index af0faa9175bfa970e7d1b8389e577cfd350d231d..784012342b59ccfb9b89ee98330bbc115fc3b36d 100644 --- a/src/FqMainBackend.cpp +++ b/src/FqMainBackend.cpp @@ -39,18 +39,22 @@ FqMainBackend::FqMainBackend(srp * io_sr):FqBaseBackend() { } + void FqMainBackend::processFile(char * filename,unsigned char f_id) { FILE * fp; - int st,s; - int cnt,nread; + int nread; long cur_offset; - // int nb_read_oper=0; mode_t mode = S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH; - //cout<<"INT_MAX="<<INT_MAX<<endl; - // cout<<filename<<endl; this->i_filename=filename; this->f_id=f_id; + char* buf; + size_t buffer_size; + (test_mode)?buffer_size=test_bufsize:buffer_size=FqBaseBackend::bufsize; + + this->openUndefFile(); + if (p_auxFqProcessor!=NULL) p_auxFqProcessor->openUndefFile(); + int f_single=open(filename,O_RDONLY,mode); if (f_single==-1) { err(errno,"cannot open file: %s.",filename); @@ -60,162 +64,116 @@ void FqMainBackend::processFile(char * filename,unsigned char f_id) { if (fp==NULL) { err(errno,"cannot open file: %s.",filename); } - //cur_offset=ftell(fp); - //cout<<"cur_offset="<<cur_offset<<endl; - char* buf=(char *) malloc(FqBaseBackend::bufsize*sizeof(char)); + buf=(char *) malloc(buffer_size*sizeof(char)); if (buf==NULL) { - err(errno,"cannot allocate memory: %lu bytes.",bufsize); + err(errno,"cannot allocate memory: %lu bytes.",buffer_size); } /* for each read, we want : offset, filenb, total quality score. */ - while ((nread=read(f_single,buf,FqBaseBackend::bufsize))!=0) { - // nb_read_oper++; - //cout<<"BE1: read operation nbr: "<<nb_read_oper<<endl; - //printf("going to call ftell on : %p \n",fp); - //cur_offset=ftell(fp); // don't know why but it seems that this doesn't work under linux; always returns 0... + while ((nread=read(f_single,buf,buffer_size))!=0) { cur_offset = lseek(f_single, 0, SEEK_CUR); - //cout<<"cur_offset="<<cur_offset<<endl; - processBuf((char *)buf,nread,f_id,cur_offset); + T_buf_info buf_info=init_buf_info(nread,buf); + processBuf(buf_info,f_id,cur_offset); + //cout<<"going to read new buffer." <<endl; } close(f_single); + this->closeUndefFile(); + if (p_auxFqProcessor!=NULL) p_auxFqProcessor->closeUndefFile(); + free(buf); } -#ifdef DEBUG -#define init_debug 1 -#define on_record_new 2 -#define on_record_end 3 -#define on_buf_end 4 -#define on_line_end 5 -#define on_store_read_id 6 -#define on_record_end_pe 7 - -void FqMainBackend::debug_processBuf(int evt,int& cnt,int& nread,char * pchar,unsigned long& rstart_offset) { // put all the bebug stuff in a separate method so that processBuf is shorter and more "lisible". - static int nb_start; // number of fq record start and stop found in current buffer. - static int nb_stop; - static int tot_nb_stop=0; - static int tot_nb_start=0; - static int idx_id; - static int is_id=0; - if (cnt==0) { - nb_start=0; - nb_stop=0; - } +void FqMainBackend::onEndFastqRecord(T_fq_rec_info& rec_info,const T_buf_info& bufinfo) { + rpos rp=init_rpos(f_id,rec_info.rstart_offset); + if (p_auxFqProcessor!=NULL) { + rinfo pe2info; + int eof=p_auxFqProcessor->getNextRead(&pe2info); + assert(!eof); // There should be the same number of reads in both PE files. + rec_info.nb_k_mers_in_error+=pe2info.nb_k_mers_in_error; + rec_info.nb_nucleotides_in_read_PE2=pe2info.nb_nucleotides_in_read; + rec_info.st+=pe2info.score; + unsigned long j=rec_info.rstart_offset/UINT_MAX; + update_rpos(pe2info.f_id,pe2info.rstart_offset,j,&rp); + debug_processBuf(on_record_end_pe,bufinfo,rec_info.rstart_offset); + } + i_dim& ref_i_dim=(*p_scoreReadStruct)[rec_info.st/K_SCORE_NORM_FACTOR]; + k_dim& ref_k_dim=ref_i_dim[rec_info.rstart_offset/UINT_MAX]; + debug_processBuf(on_record_end,bufinfo,rec_info.rstart_offset); + int nb_k_mer=rec_info.nb_nucleotides_in_read+1-qual_thres.k; + (rec_info.nb_nucleotides_in_read_PE2>0)?nb_k_mer+=rec_info.nb_nucleotides_in_read_PE2+1-qual_thres.k:nb_k_mer; + (nb_k_mer-rec_info.nb_k_mers_in_error>=qual_thres.min_correct_k_mers_in_read)?ref_k_dim.push_back(rp):writeToUndefFile(bufinfo); + // empty buffer keeping current record + cur_fq_record[0]='\0'; + if (p_auxFqProcessor!=NULL) p_auxFqProcessor->resetCurFqRecord(); +} + - switch(evt) { - case on_record_new: - cout<<"nread="<<nread<<endl; - cout<<"cnt="<<cnt<<endl; - cout<<"rstart_offset="<<rstart_offset<<endl; - is_id=1; - idx_id=0; - nb_start++; - break; - case on_line_end: - is_id=0; - break; - case on_record_end: - nb_stop++; - cout<<"just finished processing : "<<endl; - cout<<current_id<<endl; - cout<<"j obttained: rstart_offset/UINT_MAX="<<rstart_offset/UINT_MAX<<endl; - break; - case on_store_read_id: - if (is_id==1) { - if (*pchar!='/' and *pchar !=' ') current_id[idx_id]=*pchar; - else current_id[idx_id]='\0'; - idx_id+=1; - } - break; - case on_buf_end: - std::cout<<" Main BE position in buffer "<<cnt<<endl; - assert(cnt==nread); - cout<<"Main BE Found "<<nb_start<<" and "<<nb_stop<<" start/stop of record in this buffer."<<endl; - tot_nb_start+=nb_start; - tot_nb_stop+=nb_stop; - cout<<"Main BE Total cumulated number of start: "<<tot_nb_start<<" Total number of stop: "<<tot_nb_stop<<endl; - break; - case on_record_end_pe: - cout<<p_auxFqProcessor->current_id<<endl; - assert(strcmp(current_id,p_auxFqProcessor->current_id)==0); - break; - } - } -#endif -void FqMainBackend::processBuf(char * buf,int nread,unsigned char f_id,unsigned long cur_offset) { - int cnt=0; - unsigned int s; - static unsigned int st; +void FqMainBackend::processBuf(T_buf_info& buf_info,unsigned char f_id,unsigned long cur_offset) { + static T_fq_rec_info fq_rec_info; static int num_l_in_rec; /* counter to know on which line inside the fastq record we are */ static int qual_score=0; - static unsigned long rstart_offset; -#ifdef DEBUG - debug_processBuf(init_debug,cnt,nread,NULL,rstart_offset); -#endif - char * pchar=buf; - while (cnt<=nread-1) { - switch (*pchar){ + static int n; + int start_rec_in_buf=0; + + // debug_processBuf(init_debug,buf_info,fq_rec_info.rstart_offset); + while (buf_info.cnt<=buf_info.real_bufsize-1) { + switch (*buf_info.pchar){ case k_read_id_start: if (qual_score) goto inc_score; else { - rstart_offset=cur_offset-nread+cnt; -#ifdef DEBUG - debug_processBuf(on_record_new,cnt,nread,NULL,rstart_offset); -#endif - num_l_in_rec=1; } - break; + buf_info.p_start_cur_rec=buf_info.pchar; + fq_rec_info.rstart_offset=cur_offset-buf_info.real_bufsize+buf_info.cnt; + start_rec_in_buf=buf_info.cnt; + debug_processBuf(on_record_new,buf_info,fq_rec_info.rstart_offset); + num_l_in_rec=1; + fq_rec_info.nb_nucleotides_in_read=0; + fq_rec_info.nb_nucleotides_in_read_PE2=0; + } + break; case k_read_qual_start: qual_score=1; - st=0; + fq_rec_info.nb_k_mers_in_error=0; + fq_rec_info.st=0; + fq_rec_info.idx_nucl_in_read=0; + n=0; // counter for k-mers in error break; case '\n': -#ifdef DEBUG - debug_processBuf(on_line_end,cnt,nread,pchar,rstart_offset); -#endif + debug_processBuf(on_line_end,buf_info,fq_rec_info.rstart_offset); + (qual_score==1)?qual_score++:qual_score; num_l_in_rec+=1; if (num_l_in_rec==5) { qual_score=0;/* end of fastq record */ - rpos rp=init_rpos(f_id,rstart_offset); - if (p_auxFqProcessor!=NULL) { - rinfo pe2info; - int eof=p_auxFqProcessor->getNextRead(&pe2info); - assert(!eof); // There should be the same number of reads in both PE files. - st+=pe2info.score; - unsigned long j=rstart_offset/UINT_MAX; - update_rpos(pe2info.f_id,pe2info.rstart_offset,j,&rp); -#ifdef DEBUG - debug_processBuf(on_record_end_pe,cnt,nread,pchar,rstart_offset); -#endif - } - i_dim& ref_i_dim=(*p_scoreReadStruct)[st/K_SCORE_NORM_FACTOR]; - k_dim& ref_k_dim=ref_i_dim[rstart_offset/UINT_MAX]; -#ifdef DEBUG - debug_processBuf(on_record_end,cnt,nread,pchar,rstart_offset); -#endif - ref_k_dim.push_back(rp); + onEndFastqRecord(fq_rec_info,buf_info); } break; default: -#ifdef DEBUG - store_id: debug_processBuf(on_store_read_id,cnt,nread,pchar,rstart_offset); -#endif + debug_processBuf(on_store_read_id,buf_info,fq_rec_info.rstart_offset); + (num_l_in_rec==2)?fq_rec_info.nb_nucleotides_in_read++:fq_rec_info.nb_nucleotides_in_read; inc_score: - { if (qual_score==1) { - s=(int)*pchar; - s-=k_phred_32; - st+=s; - } + { if (qual_score==2) onIncScore(fq_rec_info,buf_info,n); } break; } - pchar++; - cnt++; + buf_info.pchar++; + buf_info.cnt++; + } + keepCurFastqRecord(buf_info.buf,start_rec_in_buf,buf_info.real_bufsize); + debug_processBuf(on_buf_end,buf_info,fq_rec_info.rstart_offset); +} + + +/* + * 2 cases when writin records to undef file. + * 1- record was all inside current buffer => write it as is. + * 2- beginning of record was at the end of one buffer and the end of the record was processed in a 2nd buffer; so, record was stored in memory. In that case, write cur_fq_record to undef file. + */ +void FqMainBackend::writeToUndefFile(const T_buf_info& buf_info) { + FqBaseBackend::writeToUndefFile(buf_info); // TODO : if this is slow due to inheritance resolution, use template method or other solution. + if (p_auxFqProcessor!=NULL) { + p_auxFqProcessor->writeToUndefFile(); } -#ifdef DEBUG - debug_processBuf(on_buf_end,cnt,nread,pchar,rstart_offset); -#endif } diff --git a/src/FqMainBackend.h b/src/FqMainBackend.h index db7414e12ff2262db36f3d37bef0152cd4cd3cbe..6f42599fbe47aa776e1d968b3c796fd13d52b60a 100644 --- a/src/FqMainBackend.h +++ b/src/FqMainBackend.h @@ -21,7 +21,10 @@ class FqMainBackend : public FqBaseBackend { srp * p_scoreReadStruct; /* Where we store information about the reads. */ char current_id[50]; - void debug_processBuf(int evt,int& cnt,int& nread,char * pchar, unsigned long &); + // void debug_processBuf(int evt,const T_buf_info&, const unsigned long &); + void writeToUndefFile(const T_buf_info&); + void onEndFastqRecord(T_fq_rec_info& rec_info,const T_buf_info& bufinfo); + friend void test_processInputFiles(); public: @@ -34,7 +37,7 @@ public: void processFile(char * filename,unsigned char f_id); - void processBuf(char * buf,int nread,unsigned char f_id,unsigned long cur_offset); + void processBuf(T_buf_info&,unsigned char f_id,unsigned long cur_offset); }; #endif /* FQMAINBACKEND_H_ */ diff --git a/src/ROCKparams.cpp b/src/ROCKparams.cpp index a863b2ee2481f73ee0ac5396913f9fac0e08b092..ddad4f103e7e8648f96de203a90996948f9e4d2d 100644 --- a/src/ROCKparams.cpp +++ b/src/ROCKparams.cpp @@ -9,9 +9,12 @@ #include <string> #include <vector> #include <cmath> +#include "FqConstants.h" #include "ROCKparams.h" using namespace std; +const int ROCKparams::output_ext; +const int ROCKparams::undef_ext; void ROCKparams::computeLambda() { unsigned long tmp=parms.filter_size; @@ -66,7 +69,15 @@ void ROCKparams::optArgConsistency(const string& input_file,const string& output cout<<"ERROR lower filter is higher than high filter!"<<endl; exit(EXIT_FAILURE); } + if (parms.nucl_score_threshold<0) { + cout<<"ERROR nucleotide score threshold must be positive."<<endl; + exit(EXIT_FAILURE); + } + if (qual_thres.min_correct_k_mers_in_read<1) { + cout<<"minimum number of correct k-mers in read must be a positive integer."<<endl; + exit(EXIT_FAILURE); + } if (parms.lambda==0) { // This happens when the user doesn't specify lambda nor g nor nb_k_mers. computeLambda(); @@ -137,10 +148,29 @@ void ROCKparams::removePathfromFName(string& FName) { } } -void ROCKparams::changeExtension(string& FName) { // changes .fq into .rock.fq or adds .rock.fq +void ROCKparams::changeExtension(string& FName,const int& extension_type) { // changes .fq into .rock.fq or adds .rock.fq std::size_t o_found = FName.find_last_of(k_ext); - if (o_found!=std::string::npos) FName.replace(o_found,1,".rock."); - else FName.append(".rock.fq"); + if (extension_type==ROCKparams::output_ext) { + if (o_found!=std::string::npos) FName.replace(o_found,1,".rock."); + else FName.append(".rock.fq"); + } + else { + if (o_found!=std::string::npos) FName.replace(o_found,1,".undefined."); + else FName.append(".undefined.fq"); + } +} + +string ROCKparams::genUndefFilename(const string& fname,const string& dname) { + string undef_filename=""; + string new_name=fname; + removePathfromFName(new_name); + if (dname.compare(".")!=0) { + undef_filename=dname; + undef_filename.append("/"); + } + changeExtension(new_name,ROCKparams::undef_ext); + undef_filename.append(new_name); + return undef_filename; } void ROCKparams::genOutFilenames(const std::vector<string>& v_input_lines,std::vector<string>& v_output_lines) { @@ -153,8 +183,8 @@ void ROCKparams::genOutFilenames(const std::vector<string>& v_input_lines,std::v removePathfromFName(i_name_PE1); string i_name_PE2=(*it_in).substr(i_found+1); removePathfromFName(i_name_PE2); - changeExtension(i_name_PE1); - changeExtension(i_name_PE2); + changeExtension(i_name_PE1,ROCKparams::output_ext); + changeExtension(i_name_PE2,ROCKparams::output_ext); o_line=i_name_PE1; o_line+=k_sep_pair_end; o_line.append(i_name_PE2); @@ -162,7 +192,7 @@ void ROCKparams::genOutFilenames(const std::vector<string>& v_input_lines,std::v } else { o_line=*it_in; removePathfromFName(o_line); - changeExtension(o_line); + changeExtension(o_line,ROCKparams::output_ext); } v_output_lines.push_back(o_line); } @@ -194,15 +224,17 @@ int ROCKparams::processInOutFileArgs(const std::vector<string>& v_input_lines,st return EXIT_FAILURE; } string o_name_PE1=(*it_out).substr(0,o_found); - checkDirExists(o_name_PE1); + string o_dir_PE1=checkDirExists(o_name_PE1); string o_name_PE2=(*it_out).substr(o_found+1); + string o_dir_PE2=checkDirExists(o_name_PE2); //cout<<o_name_PE2<<endl; - checkDirExists(o_name_PE2); PE_files pe; pe.PE1.in_fq_file=i_name_PE1; pe.PE1.out_fq_file=o_name_PE1; + pe.PE1.undef_fq_file=genUndefFilename(i_name_PE1,o_dir_PE1); pe.PE2.in_fq_file=i_name_PE2; pe.PE2.out_fq_file=o_name_PE2; + pe.PE2.undef_fq_file=genUndefFilename(i_name_PE2,o_dir_PE2); v_PE_files.push_back(pe); } else { @@ -210,7 +242,8 @@ int ROCKparams::processInOutFileArgs(const std::vector<string>& v_input_lines,st f_id+=1; IO_fq_files p; p.in_fq_file=*it_in; - checkDirExists(*it_out); + string o_dir_single=checkDirExists(*it_out); + p.undef_fq_file=genUndefFilename(*it_in,o_dir_single); p.out_fq_file=*it_out; single_files.push_back(p); } @@ -226,62 +259,81 @@ int ROCKparams::processInOutFileArgs(const std::vector<string>& v_input_lines,st } void ROCKparams::initFromMainOptsArgs(int argc,char ** argv) { - int i; - std::vector<string> v_input_lines; - std::vector<string> v_output_lines; + int i,q,m; + std::vector<string> v_input_lines; + std::vector<string> v_output_lines; - while((i = getopt(argc, argv, "i:o:l:k:c:C:g:n:vq:")) != -1) { - //printf("found option: %c\n",i); - switch(i) { - case 'i': - input_file=optarg;break; - case 'c': - parms.kappa_prime=atoi(optarg);break; - case 'h': - usage(EXIT_SUCCESS); break; - case 'o': - output_file=optarg; break; - case 'C': - parms.kappa=atoi(optarg); - if (parms.kappa<=0 || parms.kappa>get_mask<unsigned short>::value) { - cout<<"Bad value for kappa. Must choose kappa so that 0<kappa<="<<get_mask<unsigned short>::value<<endl; - usage(EXIT_FAILURE); - } - break; - case 'l': - parms.lambda = atoi(optarg); - if (parms.lambda<=0) { - cout<<"Bad value for lambda. Choose a value that is >0 or let ROCK choose for you."<<endl; - usage(EXIT_FAILURE); - } - break; - case 'k': - k=atoi(optarg); - if (k<=0 || k>32) { - cout<<"Bad value for k. Must choose k so that 0<k<=32."<<endl; - usage(EXIT_FAILURE); - } - break; - case 'n': - nb_k_mers=atoi(optarg);break; // number of distinct k-mers - case 'g': - // cout<<optarg<<endl; - g=atoi(optarg); - break; - case 'v': - verbose_mode=1; - break; - case 'q': - parms.nucl_score_threshold=atoi(optarg); - break; - default: - usage(EXIT_FAILURE); break; } - } - processMainArgs(optind, argc, argv,v_input_lines); - optArgConsistency(input_file,output_file,g,parms,nb_k_mers,v_input_lines); - if (nb_k_mers) { - expected_collision_proba=getCollisionProba(nb_k_mers,parms.lambda); - } - if ((v_input_lines.empty() || v_output_lines.empty()) && (loadInOutFileArgs(input_file,output_file,v_input_lines,v_output_lines)==EXIT_FAILURE)) throw EXIT_FAILURE; - if (processInOutFileArgs(v_input_lines,v_output_lines,single_files,v_PE_files,f_id)!=EXIT_SUCCESS) throw EXIT_FAILURE; - } + while((i = getopt(argc, argv, "i:o:l:k:c:C:g:n:vq:m:")) != -1) { + //printf("found option: %c\n",i); + switch(i) { + case 'i': + input_file=optarg;break; + case 'c': + parms.kappa_prime=atoi(optarg);break; + case 'h': + usage(EXIT_SUCCESS); break; + case 'o': + output_file=optarg; break; + case 'C': + parms.kappa=atoi(optarg); + if (parms.kappa<=0 || parms.kappa>get_mask<unsigned short>::value) { + cout<<"Bad value for kappa. Must choose kappa so that 0<kappa<="<<get_mask<unsigned short>::value<<endl; + usage(EXIT_FAILURE); + } + break; + case 'l': + parms.lambda = atoi(optarg); + if (parms.lambda<=0) { + cout<<"Bad value for lambda. Choose a value that is >0 or let ROCK choose for you."<<endl; + usage(EXIT_FAILURE); + } + break; + case 'k': + k=atoi(optarg); + if (k<=0 || k>32) { + cout<<"Bad value for k. Must choose k so that 0<k<=32."<<endl; + usage(EXIT_FAILURE); + } + break; + case 'n': + nb_k_mers=atoi(optarg);break; // number of distinct k-mers + case 'g': + // cout<<optarg<<endl; + g=atoi(optarg); + break; + case 'v': + verbose_mode=1; + break; + case 'q': + q=atoi(optarg); + if (q<=0) { + cout<<"q must be >=0"<<endl; + usage(EXIT_FAILURE); + } + parms.nucl_score_threshold=atoi(optarg)+k_phred_32; + qual_thres.nucl_score_threshold=parms.nucl_score_threshold; + break; + case 'm': + m=atoi(optarg); + if (m<1) { + cout<<"minimum number of valid k-mer for keeping a read must be an integer >=1"<<endl; + usage(EXIT_FAILURE); + } + qual_thres.min_correct_k_mers_in_read=atoi(optarg); + break; + default: + usage(EXIT_FAILURE); break; } + } + processMainArgs(optind, argc, argv,v_input_lines); + optArgConsistency(input_file,output_file,g,parms,nb_k_mers,v_input_lines); + if (nb_k_mers) { + expected_collision_proba=getCollisionProba(nb_k_mers,parms.lambda); + } + if ((v_input_lines.empty() || v_output_lines.empty()) && (loadInOutFileArgs(input_file,output_file,v_input_lines,v_output_lines)==EXIT_FAILURE)) throw EXIT_FAILURE; + if (processInOutFileArgs(v_input_lines,v_output_lines,single_files,v_PE_files,f_id)!=EXIT_SUCCESS) throw EXIT_FAILURE; +} + +FasqQualThreshold ROCKparams::getQualThreshold() { + qual_thres.k=k; + return qual_thres; +} diff --git a/src/ROCKparams.h b/src/ROCKparams.h index 511d9c290de9793da898300caf0abf591bb338bc..7d899465989c9546f7657790a9b085f65800e026 100644 --- a/src/ROCKparams.h +++ b/src/ROCKparams.h @@ -13,6 +13,7 @@ #include <string> #include <vector> #include <getopt.h> +#include <iostream> #include "CountMinSketch.hpp" #include "main_utils.h" @@ -23,11 +24,24 @@ #define proposedLambda 4 + + +typedef struct { + int nucl_score_threshold; + int min_correct_k_mers_in_read; + int k; +}FasqQualThreshold; + + using namespace std; class ROCKparams{ + static const int output_ext=1; + static const int undef_ext=2; + CMSparams parms; + FasqQualThreshold qual_thres; int g; // max RAM to use for the CMS if specified by the user. int nb_k_mers; // expected number of k-mers in input data if specified by the user. int k; // size of the k-mers @@ -39,6 +53,7 @@ class ROCKparams{ int f_id; unsigned long cms_size; float expected_collision_proba; + int min_correct_k_mers_in_read; void computeLambda(); @@ -46,7 +61,8 @@ class ROCKparams{ int loadInOutFileArgs(const std::string& input_file,const std::string& output_file,std::vector<string>& v_input_lines,std::vector<string>& v_output_lines); int loadFileArgs(const std::string& afile,std::vector<string>& v_lines); void removePathfromFName(string& FName); - void changeExtension(string& FName); + void changeExtension(string& FName,const int &); + string genUndefFilename(const string& ,const string&); void genOutFilenames(const std::vector<string>& v_input_lines,std::vector<string>& v_output_lines); int processInOutFileArgs(const std::vector<string>& v_input_lines,std::vector<string>& v_output_lines,std::vector<IO_fq_files>& single_files,vector<PE_files>& v_PE_files,int& f_id); @@ -62,6 +78,8 @@ class ROCKparams{ public: + FasqQualThreshold getQualThreshold(); + ROCKparams() { f_id=0; // to generate id of input/output fastq files. +1=total number of files. g=0; @@ -71,9 +89,12 @@ public: parms.kappa_prime=0; parms.lambda=0; parms.nucl_score_threshold=0; + qual_thres.min_correct_k_mers_in_read=1; + qual_thres.nucl_score_threshold=0; verbose_mode=0; cms_size=0; expected_collision_proba=0.0; + min_correct_k_mers_in_read=0; parms.filter_size=getNodePhysMemory()/100.0*90-defaultGRPMAXSize; // Prefer not to use all the machine's memory if (parms.filter_size==0) throw EXIT_FAILURE; } diff --git a/src/fqreader.cpp b/src/fqreader.cpp index 0cd758e782b6b1615244bf39b85d11ef238957ab..5dcccf6bb504c75d106ce566b4b64a782ac14fde 100644 --- a/src/fqreader.cpp +++ b/src/fqreader.cpp @@ -20,7 +20,7 @@ - +FasqQualThreshold FqBaseBackend::qual_thres; /* * Processes 1 file containing single reads. @@ -33,9 +33,15 @@ void processSingleFile(char * fq_s,unsigned char f_id, srp* io_sr) { /* Processes 1 pair of files containing PE reads. * Used for unit testing only.*/ -void processPEFiles(char * fq_1, unsigned char f_id1,char * fq_2, unsigned char f_id2,srp * io_sr ) { +void processPEFiles(char * fq_1, unsigned char f_id1,char * fq_2, unsigned char f_id2,srp * io_sr ,char * fq_1_test_undef,char * fq_2_test_undef, size_t test_bufsize) { FqAuxBackend be_fq2; FqMainBackend be_fq1(io_sr); + if (test_bufsize) { + be_fq2.setTestMode(test_bufsize); + be_fq1.setTestMode(test_bufsize); + } + if (fq_1_test_undef!=NULL) be_fq1.setUndefFile(fq_1_test_undef); + if (fq_2_test_undef!=NULL) be_fq2.setUndefFile(fq_2_test_undef); be_fq2.openFile(fq_2,f_id2); be_fq1.setAuxProcessor(&be_fq2); @@ -43,16 +49,19 @@ void processPEFiles(char * fq_1, unsigned char f_id1,char * fq_2, unsigned char be_fq2.closeFile(); } + /* * Processes a list of single files and a list of PE files and returns an plain old C array containing * pointers to Fqbackend objects. * This function ALLOCATES MEMORY with new. */ -void processInputFiles(const std::vector<IO_fq_files>& single_files,const vector<PE_files>& v_PE_files,FqBaseBackend * array_be[],srp* io_sr) { +void processInputFiles(const std::vector<IO_fq_files>& single_files,const vector<PE_files>& v_PE_files,FqBaseBackend * array_be[],const FasqQualThreshold& a_qual_thres, srp* io_sr) { unsigned char f_id=1; + FqBaseBackend::setQualThreshold(a_qual_thres); std::vector<IO_fq_files>::const_iterator it_single; for (it_single=single_files.begin();it_single!=single_files.end();it_single++) { FqMainBackend * be_fq=new FqMainBackend(io_sr); + be_fq->setUndefFile((char *) it_single->undef_fq_file.c_str()); be_fq->processFile((char *) it_single->in_fq_file.c_str(),f_id); be_fq->setOutputFile((char *) it_single->out_fq_file.c_str()); array_be[f_id-1]=be_fq; @@ -64,9 +73,12 @@ void processInputFiles(const std::vector<IO_fq_files>& single_files,const vector unsigned char f_id1=f_id; f_id++; FqAuxBackend * be_fq2=new FqAuxBackend(); + be_fq2->setUndefFile((char *) it_pe->PE2.undef_fq_file.c_str()); be_fq2->openFile((char *) (it_pe->PE2.in_fq_file).c_str(),(unsigned char) f_id); be_fq2->setOutputFile((char *) (it_pe->PE2.out_fq_file).c_str()); be_fq1->setAuxProcessor(be_fq2); + + be_fq1->setUndefFile((char *) it_pe->PE1.undef_fq_file.c_str()); be_fq1->processFile((char *) (it_pe->PE1.in_fq_file).c_str(),f_id-1); be_fq1->setOutputFile((char *) (it_pe->PE1.out_fq_file).c_str()); be_fq2->closeFile(); diff --git a/src/fqreader.h b/src/fqreader.h index 1247137b3cce71996213fd197422da7d0b08f900..a8c95ca4432d7ef40a08d96b50ab0e06f203bc15 100644 --- a/src/fqreader.h +++ b/src/fqreader.h @@ -18,6 +18,6 @@ using namespace std; void processSingleFile(char *, unsigned char, srp*); -void processPEFiles(char * fq_1, unsigned char f_id1,char * gq_2, unsigned char f_id2,srp *io_sr ); -void processInputFiles(const std::vector<IO_fq_files>& ,const vector<PE_files>& ,FqBaseBackend * [], srp* ); +void processPEFiles(char * fq_1, unsigned char f_id1,char * gq_2, unsigned char f_id2,srp *io_sr,char * fq_1_test_undef=NULL,char * fq_2_test_undef=NULL,size_t test_bufsize=0); +void processInputFiles(const std::vector<IO_fq_files>& ,const vector<PE_files>& ,FqBaseBackend * [], const FasqQualThreshold&,srp* ); #endif diff --git a/src/main_utils.cpp b/src/main_utils.cpp index dc1ba44e4f854658141f2ed8bf5dd009590cfdb7..d7dfd51e3a49373841291e456459056f342b0891 100644 --- a/src/main_utils.cpp +++ b/src/main_utils.cpp @@ -48,10 +48,11 @@ unsigned long getNodePhysMemory() { * Given a filename (including full path), determine if parent directory exists. * If is doesn't, display error message and exit. */ -void checkDirExists(const string o_file) { +std::string checkDirExists(const string o_file) { + string parent_dir="."; std::size_t o_found = o_file.find_last_of("/"); if (o_found!=string::npos) { - string parent_dir=o_file.substr(0,o_found); + parent_dir=o_file.substr(0,o_found); // cout<<parent_dir<<endl; struct stat info; if (stat(parent_dir.c_str(),&info)!=0) { @@ -59,6 +60,7 @@ void checkDirExists(const string o_file) { exit(EXIT_FAILURE); } } + return parent_dir; } @@ -113,6 +115,8 @@ void usage(int status) { cout<<" -l <int> .... size of the count min sketch (default: at most 4, depending on the available RAM)"<<endl; cout<<" -n <int> .... (expected) number of distinct k-mers within the processed reads."<<endl; cout<<" -g <int> .... maximum RAM to use (in Gb) for the CMS."<<endl; + cout<<" -q <int> .... minimum quality threshold for nucleotides. Default is 0."<<endl; + cout<<" -m <int> .... minimum of valid k-mer threshold for reads. Default is 1."<<endl; cout<<" -v .... verbose"<<endl; exit(status); } diff --git a/src/main_utils.h b/src/main_utils.h index ed50a9cea471b62d7e0f7cc1303248a54778bc8a..080e584c9a2089639c762a9480f4376682c31563 100644 --- a/src/main_utils.h +++ b/src/main_utils.h @@ -17,8 +17,11 @@ #include <vector> #include "rock_commons.h" + + + unsigned long getNodePhysMemory(); -void checkDirExists(const std::string o_file); +std::string checkDirExists(const std::string o_file); /*int loadInOutFileArgs(const std::string& input_file,const std::string& output_file,std::vector<std::string>& v_input_lines,std::vector<std::string>& v_output_lines); int processInOutFileArgs(const std::vector<std::string>& v_input_lines,std::vector<std::string>& v_output_lines,std::vector<IO_fq_files>& single_files,std::vector<PE_files>& v_PE_files,int& f_id);*/ int getBestLambdaForN(const unsigned long& nb_k_mers,int lambda_max); diff --git a/src/read_utils.cpp b/src/read_utils.cpp index 23830ce5cecec30e3fc5d7989290f32c190eeb1b..2a73709b4b9e2446b52d3ef834ce77f34218979b 100644 --- a/src/read_utils.cpp +++ b/src/read_utils.cpp @@ -27,7 +27,7 @@ void UpdateReadForScoreThreshold(DnaSeqStr * dna_seq,const int & nucl_qual_score char * qual_score_start=dna_seq->fq_record_buf+idx_start_qual_score; int idx=0; while (idx<dna_seq->length) { - if (*(qual_score_start+idx)-k_phred_32<=nucl_qual_score_thres) *(read_data_start+idx)=k_nucl_in_error; + if (*(qual_score_start+idx)<=nucl_qual_score_thres) *(read_data_start+idx)=k_nucl_in_error; idx++; } } @@ -37,7 +37,6 @@ void getFqRecord(FqBaseBackend* fq_files_be[],const unsigned char& f_id,const un char * fq_record) { FqBaseBackend * be=fq_files_be[f_id-1]; try { - int fq_rec_size=be->getRead(offset,fq_record); be->getRead(offset,fq_record); } catch (int e) { #ifdef DEBUG diff --git a/src/read_utils.h b/src/read_utils.h index 55119f16ecf85b729eec5359a8000fdefec6260c..b6a643cc2af3f84d41960f26d0ab8305bf43ebda 100644 --- a/src/read_utils.h +++ b/src/read_utils.h @@ -11,6 +11,7 @@ #include "FqConstants.h" #include "rock_commons.h" #include "ReadProcessor.h" +#include "FqBaseBackend.h" #define k_nucl_in_error 'N' diff --git a/src/rock.cpp b/src/rock.cpp index 7bd1657d0eb7149d4117244f83a169cb865334b1..b74bba1a70e67e5d31605478cf2f03e4cc7119a5 100644 --- a/src/rock.cpp +++ b/src/rock.cpp @@ -33,7 +33,7 @@ using namespace std; -//#define BENCHMARK +// #define BENCHMARK #ifdef BENCHMARK #include <sys/time.h> #include <sys/resource.h> @@ -48,7 +48,6 @@ void printRUsage() { - int main(int argc,char * argv[]) { #ifdef BENCHMARK cout<<"program startup"<<endl; @@ -69,7 +68,7 @@ int main(int argc,char * argv[]) { cout<<"processed input args; going to start reading fastq files"<<endl; printRUsage(); #endif - processInputFiles(single_files,v_PE_files,map_id_backend,&sr); + processInputFiles(single_files,v_PE_files,map_id_backend,main_parms.getQualThreshold(),&sr); #ifdef BENCHMARK cout<<"finished loading fastq file into sr structure"<<endl; cout<<"size of srp structure="<<sizeof(sr)<<endl; diff --git a/src/rock_commons.h b/src/rock_commons.h index e27fa0587fc459bcde9531e2f28031f357d62ea0..dbd0a7f62c68c8ff8fd7684b7305af470b3d36d1 100644 --- a/src/rock_commons.h +++ b/src/rock_commons.h @@ -16,8 +16,7 @@ #include <string> #include <vector> -#include "rock_commons.h" -#include "FqBaseBackend.h" + /* * Gather here type and constants definitions that are common to some of the 4 different modules in ROCK @@ -30,6 +29,7 @@ typedef struct { std::string in_fq_file; std::string out_fq_file; + std::string undef_fq_file; } IO_fq_files; typedef struct { diff --git a/src/srp.h b/src/srp.h index f77ff1e7a54b70e24abb00c2f9999369c2cc44d4..a72b48ff03a72de595754dfe6abbed6b10d1695b 100644 --- a/src/srp.h +++ b/src/srp.h @@ -28,6 +28,8 @@ typedef struct { unsigned int score; // total quality score for read. unsigned long rstart_offset; // read offset in file unsigned char f_id; // identifier of the file. + int nb_k_mers_in_error; + int nb_nucleotides_in_read; }rinfo; typedef std::vector<rpos> k_dim; diff --git a/src/unit_test_fqreader.cpp b/src/unit_test_fqreader.cpp index 786de5755f3b77043837f898515228965eccb7f9..49fa8df565cd5ffce60e29356b1317c0fd2be696 100644 --- a/src/unit_test_fqreader.cpp +++ b/src/unit_test_fqreader.cpp @@ -6,9 +6,12 @@ * unit and non regression testing for the fqreader component. * Keep using assert for the moment, don't want to add a dependency on boost (or any other test framework) just for the tests. */ +#include <sys/stat.h> #include <stdio.h> #include <string.h> #include <iostream> +#include <fstream> +#include <streambuf> #include <limits.h> #include <assert.h> #include <stdlib.h> @@ -18,12 +21,10 @@ #include "FqMainBackend.h" #include "fqreader.h" -// TODO : Add test case where @ character is inside quality score. using namespace std; void test_processSingleFile() { - //printf("MAX_UINT=%u \n",UINT_MAX); srp sr; unsigned char f_id=1; processSingleFile((char *) "../test/data/unit/test_single.fq",f_id,&sr); @@ -36,7 +37,7 @@ void test_processSingleFile() { for (rit=sr.rbegin(); rit!=sr.rend(); ++rit) { //process map in reverse order (by decreasing scores). //cout << "score="<<rit->first<<endl; unsigned long score=rit->first; - assert(score==5); + assert(score==10); // reads are all 152 character long in this test. for (it_offs=rit->second.begin();it_offs!=rit->second.end();it_offs++) { unsigned long offset_quotient=it_offs->first; cout<<"offset_quotient="<<offset_quotient<<endl; @@ -59,6 +60,79 @@ void test_processSingleFile() { assert(cnt_read==6); } +/* + * Expects a given minimum of k_mers to be present in the read or else it is dumped as undefined + */ +void test_processSingleFileWithMQOption() { + srp sr; + unsigned char f_id=1; + FasqQualThreshold qual_thres; + qual_thres.k=10; + qual_thres.nucl_score_threshold=14+k_phred_32; + qual_thres.min_correct_k_mers_in_read=100; + FqBaseBackend::setQualThreshold(qual_thres); + FqMainBackend be_fq=FqMainBackend(&sr); + + be_fq.setUndefFile((char *) "../test/data/unit/test_single.undef.fq"); + be_fq.processFile((char *) "../test/data/unit/test_single.fq",f_id); + + srp::reverse_iterator rit; + i_dim::iterator it_offs; + k_dim::iterator it_struct; + + int cnt_read=0; + for (rit=sr.rbegin(); rit!=sr.rend(); ++rit) { //process map in reverse order (by decreasing scores). + for (it_offs=rit->second.begin();it_offs!=rit->second.end();it_offs++) { + for (it_struct=it_offs->second.begin();it_struct!=it_offs->second.end();it_struct++) { + cnt_read++; + } + } + } + assert(cnt_read==6); + assert(remove((char *) "../test/data/unit/test_single.undef.fq")==0); + sr.clear(); + + cnt_read=0; + qual_thres.min_correct_k_mers_in_read=130; + FqBaseBackend::setQualThreshold(qual_thres); + FqMainBackend be_fq2=FqMainBackend(&sr); + be_fq2.setUndefFile((char *) "../test/data/unit/test_single2.undef.fq"); + be_fq2.processFile((char *) "../test/data/unit/test_single.fq",f_id); + + for (rit=sr.rbegin(); rit!=sr.rend(); ++rit) { //process map in reverse order (by decreasing scores). + for (it_offs=rit->second.begin();it_offs!=rit->second.end();it_offs++) { + for (it_struct=it_offs->second.begin();it_struct!=it_offs->second.end();it_struct++) { + cnt_read++; + if (cnt_read==1) assert(it_struct->read_a1==0); + if (cnt_read==2) assert(it_struct->read_a1==1400); + if (cnt_read==3) assert(it_struct->read_a1==1751); + } + } + } + assert(cnt_read==3); + assert(remove((char *) "../test/data/unit/test_single2.undef.fq")==0); + sr.clear(); + + cnt_read=0; + qual_thres.nucl_score_threshold=36+k_phred_32; + qual_thres.k=30; + FqBaseBackend::setQualThreshold(qual_thres); + + FqMainBackend be_fq3=FqMainBackend(&sr); + be_fq3.setUndefFile((char *) "../test/data/unit/test_single3.undef.fq"); + be_fq3.processFile((char *) "../test/data/unit/test_single.fq",f_id); + + for (rit=sr.rbegin(); rit!=sr.rend(); ++rit) { //process map in reverse order (by decreasing scores). + for (it_offs=rit->second.begin();it_offs!=rit->second.end();it_offs++) { + for (it_struct=it_offs->second.begin();it_struct!=it_offs->second.end();it_struct++) { + cnt_read++; + } + } + } + assert(cnt_read==0); + assert(remove((char *) "../test/data/unit/test_single3.undef.fq")==0); +} + /* * Test case with other data than those I had until here. * Quality score contains '@'caracter (usually start of fastq record), @@ -96,17 +170,17 @@ void test_processPEfilesWithA() { assert((fid_stored &masque)==f_id4); if (cnt_read==0) { assert(it_struct->read_a1==0); - assert(it_struct->read_a1==0); + assert(it_struct->read_a2==0); } - if (cnt_read==6) { + if (cnt_read==4) { // std::cout<<it_struct->read_a1<<" "<<it_struct->read_a2; - assert(score==18); + assert(score==33); assert(it_struct->read_a1==558); assert(it_struct->read_a2==-2); } - if (cnt_read==3) { + if (cnt_read==5) { // std::cout<<it_struct->read_a1<<" "<<it_struct->read_a2; - assert(score==19); + assert(score==33); assert(it_struct->read_a1==1114); assert(it_struct->read_a2==0); } @@ -120,7 +194,6 @@ void test_processPEfilesWithA() { } } assert(cnt_read==10); - } void test_processPEFiles() { @@ -143,8 +216,8 @@ void test_processPEFiles() { for (rit=sr.rbegin(); rit!=sr.rend(); ++rit) { //process map in reverse order (by decreasing scores). // cout << "score="<<rit->first<<endl; unsigned long score=rit->first; - if (cnt_read==0 || cnt_read==1) assert(score==10); - if (cnt_read==2) assert(score==9); + if (cnt_read==0 || cnt_read==1) assert(score==20); + if (cnt_read==2) assert(score==19); for (it_offs=rit->second.begin();it_offs!=rit->second.end();it_offs++) { unsigned long offset_quotient=it_offs->first; assert(offset_quotient==0); @@ -178,6 +251,62 @@ void test_processPEFiles() { assert(cnt_read==3); } +/* + * Auxilliary function for testing the processing of PE files while using m and q options. + */ +void aux_testPEFilesMQ(FasqQualThreshold qual_thres,int nb_expected_reads) { + srp sr; + char * fq_1_test=(char *) "../test/data/unit/09-4607_S43_R1.fastq"; + char * fq_2_test=(char *) "../test/data/unit/09-4607_S43_R2.fastq"; + char * fq_1_test_undef=(char *) "../test/data/unit/09-4607_S43_R1.undef.fastq"; + char * fq_2_test_undef=(char *) "../test/data/unit/09-4607_S43_R2.undef.fastq"; + + unsigned char f_id1=1; + unsigned char f_id2=2; + FqBaseBackend::setQualThreshold(qual_thres); + processPEFiles(fq_1_test, f_id1,fq_2_test, f_id2,&sr,fq_1_test_undef,fq_2_test_undef); + + srp::reverse_iterator rit; + i_dim::iterator it_offs; + k_dim::iterator it_struct; + int cnt_read=0; + + for (rit=sr.rbegin(); rit!=sr.rend(); ++rit) { //process map in reverse order (by decreasing scores). + //unsigned long score=rit->first; + for (it_offs=rit->second.begin();it_offs!=rit->second.end();it_offs++) { + for (it_struct=it_offs->second.begin();it_struct!=it_offs->second.end();it_struct++) { + cnt_read++; + } + } + } + assert(cnt_read==nb_expected_reads); + assert(remove((char *) "../test/data/unit/09-4607_S43_R1.undef.fastq")==0); + assert(remove((char *) "../test/data/unit/09-4607_S43_R2.undef.fastq")==0); +} + +void test_processPEFilesWithMQOptions() { + + + FasqQualThreshold qual_thres; + qual_thres.k=20; + qual_thres.nucl_score_threshold=14+k_phred_32; + qual_thres.min_correct_k_mers_in_read=76; + + aux_testPEFilesMQ(qual_thres,4); // last fq records contains only 77 correct k-mers. + + qual_thres.min_correct_k_mers_in_read=100; + aux_testPEFilesMQ(qual_thres,3); + + qual_thres.min_correct_k_mers_in_read=150; + aux_testPEFilesMQ(qual_thres,2); + + qual_thres.min_correct_k_mers_in_read=180; + aux_testPEFilesMQ(qual_thres,2); + + qual_thres.min_correct_k_mers_in_read=230; + aux_testPEFilesMQ(qual_thres,0); +} + void check_processAIFilesResults(srp& sr) { srp::reverse_iterator rit; @@ -187,11 +316,11 @@ void check_processAIFilesResults(srp& sr) { for (rit=sr.rbegin(); rit!=sr.rend(); ++rit) { //process map in reverse order (by decreasing scores). //cout << "score="<<rit->first<<endl; unsigned long score=rit->first; - if (cnt_read==0 || cnt_read==1) assert(score==10); - else if (cnt_read==2) assert(score==9); + if (cnt_read==0 || cnt_read==1) assert(score==20); + else if (cnt_read==2) assert(score==19); else { //cout << "score="<<rit->first<<endl; - assert(score==5); + assert(score==10); } for (it_offs=rit->second.begin();it_offs!=rit->second.end();it_offs++) { unsigned long offset_quotient=it_offs->first; @@ -222,6 +351,7 @@ void test_processAllFiles() { } + void test_processInputFiles() { char * fq_1_test=(char *) "../test/data/unit/test_PE1.fq"; char * fq_2_test=(char *) "../test/data/unit/test_PE2.fq"; @@ -240,9 +370,13 @@ void test_processInputFiles() { v_pe.push_back(pe); srp sr; + FasqQualThreshold default_qual_thres; + default_qual_thres.k=30; + default_qual_thres.min_correct_k_mers_in_read=1; + default_qual_thres.nucl_score_threshold=0; // leave default values for that test FqBaseBackend * array_be[k_max_input_files]; - processInputFiles(v_single,v_pe,array_be,&sr); + processInputFiles(v_single,v_pe,array_be,default_qual_thres,&sr); // check that result is correct in sr. check_processAIFilesResults(sr); @@ -270,6 +404,252 @@ void test_processInputFiles() { } +void test_processBufSingle() { + srp::reverse_iterator rit; + i_dim::iterator it_offs; + k_dim::iterator it_struct; + int cnt_read=0; + unsigned char f_id1=1; + char * buf="@NS500443:65:H573HAFXX:1:11101:20964:1048/1\n\ +ACATTTCACTGGTCATGAGCTTGGTGAGAGAAGCGATACGAATGACCGAATCAAGCTGTGGACGAACATTATTGCCCGGTCTGGTGTCACCGAAACTACGAAACACGCGTTGATTGCCGTCGATCACCACCAGCGCCATGCCCGTGGCGC\n\ ++\n\ +AAAAAEEAEEEEEEEEE6EE/EEEEEEEEAEEAEEEEEEEEEEEEEEAEEEA/A/EEEAEEEEEE/EE</EAEEEEEE/EEE/<//AEEE<E/E/EEEEEEEEE/AAA6EAE<<EE<EEEEEA6E<EAEEAA<EEEAEAEEA<<<<EEAA\n"; + srp sr; + FasqQualThreshold qual_thres; + qual_thres.k=20; + qual_thres.nucl_score_threshold=14+k_phred_32; + qual_thres.min_correct_k_mers_in_read=78; + T_buf_info buf_info; + FqBaseBackend::setQualThreshold(qual_thres); + FqMainBackend be(&sr); + be.setUndefFile((char *) "../test/data/unit/test_processBuf.undef.fq"); + buf_info.buf=buf; + buf_info.pchar=buf; + buf_info.cnt=0; + buf_info.real_bufsize=strlen(buf); + be.openUndefFile(); + be.processBuf(buf_info,f_id1,348); + be.closeUndefFile(); + // check that read is rejected + for (rit=sr.rbegin(); rit!=sr.rend(); ++rit) { //process map in reverse order (by decreasing scores). + for (it_offs=rit->second.begin();it_offs!=rit->second.end();it_offs++) { + for (it_struct=it_offs->second.begin();it_struct!=it_offs->second.end();it_struct++) { + cnt_read++; + } + } + } + assert(cnt_read==0); + std::ifstream t("../test/data/unit/test_processBuf.undef.fq"); + std::string rej_str; + + t.seekg(0, std::ios::end); + rej_str.reserve(t.tellg()); + t.seekg(0, std::ios::beg); + + rej_str.assign((std::istreambuf_iterator<char>(t)), + std::istreambuf_iterator<char>()); + assert(strcmp(rej_str.c_str(),buf)==0); + + assert(remove((char *) "../test/data/unit/test_processBuf.undef.fq")==0); +} + +void test_processBufPE() { + srp::reverse_iterator rit; + i_dim::iterator it_offs; + k_dim::iterator it_struct; + int cnt_read=0; + unsigned char f_id1=1; + char * buf_PE1="@NS500443:65:H573HAFXX:1:11101:20964:1048/1\n\ +ACATTTCACTGGTCATGAGCTTGGTGAGAGAAGCGATACGAATGACCGAATCAAGCTGTGGACGAACATTATTGCCCGGTCTGGTGTCACCGAAACTACGAAACACGCGTTGATTGCCGTCGATCACCACCAGCGCCATGCCCGTGGCGC\n\ ++\n\ +AAAAAEEAEEEEEEEEE6EE/EEEEEEEEAEEAEEEEEEEEEEEEEEAEEEA/A/EEEAEEEEEE/EE</EAEEEEEE/EEE/<//AEEE<E/E/EEEEEEEEE/AAA6EAE<<EE<EEEEEA6E<EAEEAA<EEEAEAEEA<<<<EEAA\n"; + + char * buf_PE2="@NS500443:65:H573HAFXX:1:11101:20964:1048/2\n\ +CTGATATCGTTGATCGTTATGCCGAGCATATCTTTTATGGTAGCGGCGCCACGGGCATGGCGCTGGTGGTGATCGACGGCAATCAACGCGTGTTTCGTAGTTTCGGTGACACCAGACCGGGCAATAATGTTCGTCCACAGCTTGATTCGGT\n\ ++\n\ +AAAAAEEEEEEEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEAEEEEEEEEEEEEEEAEEEEEEEEEEEEEAEEE/EEEEAE<EEEEEEEEEEAEEA<<E<EAEAAEE<EEEEEEEEEEEEEAE</A<AEEEE<EAAA<AA\n"; + + + srp sr; + FasqQualThreshold qual_thres; + qual_thres.k=20; + qual_thres.nucl_score_threshold=14+k_phred_32; + qual_thres.min_correct_k_mers_in_read=78; + T_buf_info buf_info; + T_buf_info PE2_buf_info; + FqBaseBackend::setQualThreshold(qual_thres); + + FqMainBackend be(&sr); + be.setUndefFile((char *) "../test/data/unit/test_processBuf_PE1.undef.fq"); + + FqAuxBackend be2; + be2.setUndefFile((char *) "../test/data/unit/test_processBuf_PE2.undef.fq"); + + + buf_info.buf=buf_PE1; + PE2_buf_info.buf=buf_PE2; + buf_info.pchar=buf_PE1; + PE2_buf_info.pchar=buf_PE2; + buf_info.cnt=0; + PE2_buf_info.cnt=0; + buf_info.real_bufsize=strlen(buf_PE1); + PE2_buf_info.real_bufsize=strlen(buf_PE2); + + be2.buf_info=PE2_buf_info; + + be.setAuxProcessor(&be2); + + be.openUndefFile(); + be2.openUndefFile(); + be.processBuf(buf_info,f_id1,348); + be.closeUndefFile(); + be2.closeUndefFile(); + + // check that read is selected + for (rit=sr.rbegin(); rit!=sr.rend(); ++rit) { //process map in reverse order (by decreasing scores). + for (it_offs=rit->second.begin();it_offs!=rit->second.end();it_offs++) { + for (it_struct=it_offs->second.begin();it_struct!=it_offs->second.end();it_struct++) { + cnt_read++; + } + } + } + assert(cnt_read==1); + + struct stat fileStat; + stat("../test/data/unit/test_processBuf_PE1.undef.fq", &fileStat); + assert(fileStat.st_size==0); + + stat("../test/data/unit/test_processBuf_PE2.undef.fq", &fileStat); + assert(fileStat.st_size==0); + + assert(remove((char *) "../test/data/unit/test_processBuf_PE1.undef.fq")==0); + assert(remove((char *) "../test/data/unit/test_processBuf_PE2.undef.fq")==0); +} + + +/* + * Auxilliary method to compare 2 files and make ure that they are identical. + */ +int compareFilesLileByLine(char * filename1,char* filename2) { + ifstream file1,file2; + file1.open(filename1,ios::binary); + file2.open(filename2,ios::binary); +//---------- compare number of lines in both files ------------------ + int c1,c2; + c1 = 0; c2 = 0; + string str; + while(!file1.eof()) + { + getline(file1,str); + c1++; + } + file1.clear(); // sets a new value for the error control state + file1.seekg(0,ios::beg); + while(!file2.eof()) + { + getline(file2,str); + c2++; + } + file2.clear(); + file2.seekg(0,ios::beg); + + if(c1 != c2) + { + cout << "Different number of lines in files!" << "\n"; + cout << "file1 has " << c1 << " lines and file2 has " + << c2 << " lines" << "\n"; + return -1; + } +//---------- compare two files line by line ------------------ + char string1[1000], string2[1000]; // assume a read in afastq record is never longer than that. + int j = 0; + while(!file1.eof()) + { + file1.getline(string1,256); + file2.getline(string2,256); + j++; + if(strcmp(string1,string2) != 0) + { + cout << j << "-th strings are not equal" << "\n"; + cout << " " << string1 << "\n"; + cout << " " << string2 << "\n"; + return -1; + } + } + + return 0; +} + + +void Aux_MimicBigPEFilesWithMQOptions(const FasqQualThreshold& qual_thres,const int bufsize,const int nb_expected_reads) { + srp sr; + char * fq_1_test=(char *) "../test/data/unit/09-4607_S43_R1_big.fastq"; + char * fq_2_test=(char *) "../test/data/unit/09-4607_S43_R2_big.fastq"; + char * fq_1_test_undef=(char *) "../test/data/unit/09-4607_S43_R1.undef.fastq"; + char * fq_2_test_undef=(char *) "../test/data/unit/09-4607_S43_R2.undef.fastq"; + + char * fq_1_expected_undef_100=(char *) "../test/data/unit/thres_100_09-4607_S43_R1.undef.fastq"; + char * fq_2_expected_undef_100=(char *) "../test/data/unit/thres_100_09-4607_S43_R2.undef.fastq"; + + char * fq_1_expected_undef_180=(char *) "../test/data/unit/thres_180_09-4607_S43_R1.undef.fastq"; + char * fq_2_expected_undef_180=(char *) "../test/data/unit/thres_180_09-4607_S43_R2.undef.fastq"; + + unsigned char f_id1=1; + unsigned char f_id2=2; + + // FqBaseBackend::bufsize=bufsize; // choose a very small buf size to mimic behavior on big files and reun the test in a short time + FqBaseBackend::setQualThreshold(qual_thres); + + processPEFiles(fq_1_test, f_id1,fq_2_test, f_id2,&sr,fq_1_test_undef,fq_2_test_undef,bufsize); + + srp::reverse_iterator rit; + i_dim::iterator it_offs; + k_dim::iterator it_struct; + int cnt_read=0; + + for (rit=sr.rbegin(); rit!=sr.rend(); ++rit) { //process map in reverse order (by decreasing scores). + //unsigned long score=rit->first; + for (it_offs=rit->second.begin();it_offs!=rit->second.end();it_offs++) { + for (it_struct=it_offs->second.begin();it_struct!=it_offs->second.end();it_struct++) { + cnt_read++; + } + } + } + assert(cnt_read==nb_expected_reads); + if (nb_expected_reads==6) { + assert(compareFilesLileByLine(fq_1_test_undef,fq_1_expected_undef_100)==0); + assert(compareFilesLileByLine(fq_2_test_undef,fq_2_expected_undef_100)==0); + } else { + assert(compareFilesLileByLine(fq_1_test_undef,fq_1_expected_undef_180)==0); + assert(compareFilesLileByLine(fq_2_test_undef,fq_2_expected_undef_180)==0); + } + assert(remove((char *) "../test/data/unit/09-4607_S43_R1.undef.fastq")==0); + assert(remove((char *) "../test/data/unit/09-4607_S43_R2.undef.fastq")==0); +} + + +void test_MimicBigPEFilesWithMQOptions() { + int bufsize; + FasqQualThreshold qual_thres; + qual_thres.k=20; + qual_thres.nucl_score_threshold=14+k_phred_32; + qual_thres.min_correct_k_mers_in_read=100; + + bufsize=500; + Aux_MimicBigPEFilesWithMQOptions(qual_thres,bufsize,6); + bufsize=400; + Aux_MimicBigPEFilesWithMQOptions(qual_thres,bufsize,6); + bufsize=800; + Aux_MimicBigPEFilesWithMQOptions(qual_thres,bufsize,6); + + qual_thres.min_correct_k_mers_in_read=180; + bufsize=500; + Aux_MimicBigPEFilesWithMQOptions(qual_thres,bufsize,3); + bufsize=400; + Aux_MimicBigPEFilesWithMQOptions(qual_thres,bufsize,3); + bufsize=800; + Aux_MimicBigPEFilesWithMQOptions(qual_thres,bufsize,3); +} int main(int argc, char **argv) { cout<<"test for single file"<<endl; @@ -279,8 +659,16 @@ int main(int argc, char **argv) { cout<<"test the case of records that contain @ character in quality score"<<endl; test_processPEfilesWithA(); cout<<"test for both single and PE files"<<endl; - test_processAllFiles(); /* mix PE together with single; nearly as in real life.*/ + test_processAllFiles(); // mix PE together with single; nearly as in real life. cout<<"testing higher level function processInputFiles"<<endl; test_processInputFiles(); + cout<<"test processing single files with thresholds (nucleotide quality score and minimum number of valid k_mers in read). "<<endl; + test_processBufSingle(); + test_processSingleFileWithMQOption(); + cout<<"test processing PE files with thresholds (nucleotide quality score and minimum number of valid k_mers in read). "<<endl; + test_processBufPE(); + test_processPEFilesWithMQOptions(); + cout<<"test processing files with thresholds (nucleotide quality score and minimum number of valid k_mers in read) mimic big files that need several read operations. "<<endl; + test_MimicBigPEFilesWithMQOptions(); cout<<"done"<<endl; } diff --git a/src/unit_test_fqwriter.cpp b/src/unit_test_fqwriter.cpp index 69d107f8a67d44f5d7a6cfed6628440729436ffa..62d88d1ef963c0cac3b0a0d6472158f5b136b678 100644 --- a/src/unit_test_fqwriter.cpp +++ b/src/unit_test_fqwriter.cpp @@ -83,7 +83,7 @@ void test_write_PE() { // step2 : create the backends FqBaseBackend * map_id_backend[k_max_input_files]; FqMainBackend * be_fq1=new FqMainBackend(&sr); - unsigned char f_id1=1; + //unsigned char f_id1=1; FqAuxBackend * be_fq2=new FqAuxBackend(); map_id_backend[0]=be_fq1; map_id_backend[1]=be_fq2; diff --git a/src/unit_test_read_utils.cpp b/src/unit_test_read_utils.cpp index f01e5f5dd2267c86ecd387359bec5d7067cd04da..3380743b2baeb63c4267fad197faede2c2776ce9 100644 --- a/src/unit_test_read_utils.cpp +++ b/src/unit_test_read_utils.cpp @@ -29,7 +29,7 @@ void test_getReadSingle() { assert(my_struct1.read_a1==639); assert(my_struct2.read_a1==1228); srp io_sr; - unsigned int j=0; + // unsigned int j=0; DnaSeqStr a_seqs; char dna_read[MAX_READ_LENGTH]; @@ -166,7 +166,7 @@ void test_getReadPEWithNQST() { assert(my_struct2.read_a1==13647); assert(my_struct2.read_a2==6); - int min_score_qual=14; + int min_score_qual=14+k_phred_32; getDnaStr(fic_map,my_struct1,a_seqs,dna_read,min_score_qual); std::cout<<"PE dna read:"<<endl; std::cout<<dna_read<<endl; @@ -288,8 +288,8 @@ void testDNAToNumberMoreComplex() { // TODO in main, check that k<=32. int nb_expected_k_mers=strlen(dnaStr)+1; nb_expected_k_mers-=k; nbrKmerDecompo.reserve(nb_expected_k_mers); - unsigned long expected=read_p.kMerToNumber((char *) "GCTACCATAACACCAACTGTTTTCACCATA",NULL); - unsigned long expected_rev=read_p.kMerToNumber((char *) "TATGGTGAAAACAGTTGGTGTTATGGTAGC",NULL); + // unsigned long expected=read_p.kMerToNumber((char *) "GCTACCATAACACCAACTGTTTTCACCATA",NULL); + // unsigned long expected_rev=read_p.kMerToNumber((char *) "TATGGTGAAAACAGTTGGTGTTATGGTAGC",NULL); read_p.getKMerNumbers((char *) dnaStr,strlen(dnaStr),nbrKmerDecompo); assert(nbrKmerDecompo.size()==nb_expected_k_mers); // for each k_mer, we also expect to have a number for its reverse complement. @@ -381,7 +381,7 @@ void test_getReadWithNQST() { assert(my_struct1.read_a1==639); assert(my_struct2.read_a1==1228); srp io_sr; - unsigned int j=0; + //unsigned int j=0; DnaSeqStr a_seqs; char dna_read[MAX_READ_LENGTH]; @@ -394,7 +394,7 @@ void test_getReadWithNQST() { fic_map[3]=&be_fq; init_DnaSeqStr(&a_seqs); - nucl_qual_score_thres=32; + nucl_qual_score_thres=32+k_phred_32; getDNASeqstr(fic_map, my_struct1, 0, &a_seqs,nucl_qual_score_thres); char * tmp=(char *) a_seqs.fq_record_buf; tmp+=a_seqs.start_idx; @@ -405,7 +405,7 @@ void test_getReadWithNQST() { assert(strcmp(dna_read,"NNNNNAGGTGCTACCATAACACCAACTGTTTTCACNATAATTTTAAAATCAAGCATTAGAGACGCNTCTCTAATGTATTGCAAATCTAGTTCTACCATTTGATGAAAATCTAATTNATTTCTTCCACTANCCTGCCATAATCCAGTACAACCTGGTATAACGGNCAANCGCTTTTTATCATAGGANCTGTATTCTCCTACCTCACGTGGCAAAGGAGGNCTTGGACCAACAATTGCCATGTCTCCTTTAACCACATTCCAAAGCTGNNNNN")==0); init_DnaSeqStr(&a_seqs); - nucl_qual_score_thres=50; + nucl_qual_score_thres=50+k_phred_32; getDNASeqstr(fic_map, my_struct1, 0, &a_seqs,nucl_qual_score_thres); tmp=(char *) a_seqs.fq_record_buf; tmp+=a_seqs.start_idx; diff --git a/test/Makefile.am b/test/Makefile.am index b9f8f5f9590b0a58e8e4ea5624723c4cfde8547b..a11dfe4daef2692d7fffe32479b1cb39ca079278 100755 --- a/test/Makefile.am +++ b/test/Makefile.am @@ -4,6 +4,6 @@ LOG_COMPILER = $(SHELL) TESTS = rock.sh -EXTRA_DIST = $(TESTS) rock_mem.sh data/iofiles.args/extract_klebsiella_long_reads_100.txt data/iofiles.args/extract_klebsiella_long_reads_100_filtered.txt data/fastq.raw/klebsiella_100_1.fq data/fastq.raw/klebsiella_100_2.fq data/unit/klebsiella_PE1.fq data/unit/klebsiella_PE2.fq data/unit/test_PE1.fq data/unit/test_PE2.fq data/unit/test_PE1_2.fq data/unit/test_PE2_2.fq data/unit/test_single2.fq data/unit/test_single.fq data/unit/list_input1.txt data/unit/list_output1.txt data/unit/list_input2.txt data/unit/list_output2.txt data/unit/list_input3.txt data/unit/list_output3.txt data/iofiles.args/output_files_noNQ_Thres.txt data/iofiles.args/output_files_NQ_Thres_very_low.txt data/iofiles.args/output_files_NQ_Thres_12.txt data/iofiles.args/output_files_NQ_Thres_13.txt +EXTRA_DIST = $(TESTS) rock_mem.sh data/iofiles.args/extract_klebsiella_long_reads_100.txt data/iofiles.args/extract_klebsiella_long_reads_100_filtered.txt data/fastq.raw/klebsiella_100_1.fq data/fastq.raw/klebsiella_100_2.fq data/unit/klebsiella_PE1.fq data/unit/klebsiella_PE2.fq data/unit/test_PE1.fq data/unit/test_PE2.fq data/unit/test_PE1_2.fq data/unit/test_PE2_2.fq data/unit/test_single2.fq data/unit/test_single.fq data/unit/list_input1.txt data/unit/list_output1.txt data/unit/list_input2.txt data/unit/list_output2.txt data/unit/list_input3.txt data/unit/list_output3.txt data/iofiles.args/output_files_noNQ_Thres.txt data/iofiles.args/output_files_NQ_Thres_very_low.txt data/iofiles.args/output_files_NQ_Thres_12.txt data/iofiles.args/output_files_NQ_Thres_13.txt data/unit/09-4607_S43_R1.fastq data/unit/09-4607_S43_R2.fastq data/unit/09-4607_S43_R1_big.fastq data/unit/09-4607_S43_R2_big.fastq data/unit/thres_100_09-4607_S43_R1.undef.fastq data/unit/thres_100_09-4607_S43_R2.undef.fastq data/unit/thres_180_09-4607_S43_R1.undef.fastq data/unit/thres_180_09-4607_S43_R2.undef.fastq diff --git a/test/data/unit/09-4607_S43_R1.fastq b/test/data/unit/09-4607_S43_R1.fastq new file mode 100644 index 0000000000000000000000000000000000000000..d492ec62c9087ce98b6f6c15419d03358c9938ab --- /dev/null +++ b/test/data/unit/09-4607_S43_R1.fastq @@ -0,0 +1,16 @@ +@NS500443:65:H573HAFXX:1:11101:6972:1045/1 +GACGGTGGTTCTGGTGCTGCGTAACGAGACCGTTGTTGGCGTCCTCGCCCTCCAGGATACGTTGCGTGACGATGCGCGCGACGCCATCCGCGATCTGCATCAGTTAGGCGTTAACGGCGTGATTCTGACTGGCGATAACCCACGGGCGGCG ++ +AAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEAEEEEEEEEEAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEEEA<< +@NS500443:65:H573HAFXX:1:11101:23631:1047/1 +CATATAAAGGGTATCTATTTCCCGGGAGGTGACTATGATAGCCAGCAAATTCGGTATCGGCCAACAGGTCCGCCATTCCCTGTTAGGTTACCTCGGAGTGGTCGTCGATATCGACCCGGAATATTCGCTTGATGAGCCGTCGCCTGATGAA ++ +AAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEEEEEAEEEEEAEEEEAEAEEEAEEAEEEEEEAEEAE<<EEEE<AEAEEEEAEEE<AE</ +@NS500443:65:H573HAFXX:1:11101:20964:1048/1 +ACATTTCACTGGTCATGAGCTTGGTGAGAGAAGCGATACGAATGACCGAATCAAGCTGTGGACGAACATTATTGCCCGGTCTGGTGTCACCGAAACTACGAAACACGCGTTGATTGCCGTCGATCACCACCAGCGCCATGCCCGTGGCGC ++ +AAAAAEEAEEEEEEEEE6EE/EEEEEEEEAEEAEEEEEEEEEEEEEEAEEEA/A/EEEAEEEEEE/EE</EAEEEEEE/EEE/<//AEEE<E/E/EEEEEEEEE/AAA6EAE<<EE<EEEEEA6E<EAEEAA<EEEAEAEEA<<<<EEAA +@NS500443:65:H573HAFXX:1:11101:3151:1053/1 +GGCAAAGCGCTAAAAGCAGCGAGCGTAGCGCGGGAAGAGCTGTTTATCACCACCAAGTTGTGGAATGACGTTGCCATCCTGTAGCCTGATAATGGTTGGATTAGCCATGATGTGTTCCTCTTTTAATTGGCTCGCCGGAGTCGGGTCCGG ++ +AAAAAAEEEEEEEEEEEAEEE/EEEEAEEEEEEEEEEEEEAEEEAEEEAEEEEEEEEEEEEEEEEEEAEAAEEE//<EEAE/EEEEE/EAEEEEEAAEEE/A<EEEEEE/EEE/<66EEEA6E6EEEEEEEEAE<AA<EEAEEEEEEEAE diff --git a/test/data/unit/09-4607_S43_R1_big.fastq b/test/data/unit/09-4607_S43_R1_big.fastq new file mode 100644 index 0000000000000000000000000000000000000000..40191a45c14ab880a405da878011e9ccb625a61e --- /dev/null +++ b/test/data/unit/09-4607_S43_R1_big.fastq @@ -0,0 +1,44 @@ +@NS500443:65:H573HAFXX:1:11101:6972:1045/1 +GACGGTGGTTCTGGTGCTGCGTAACGAGACCGTTGTTGGCGTCCTCGCCCTCCAGGATACGTTGCGTGACGATGCGCGCGACGCCATCCGCGATCTGCATCAGTTAGGCGTTAACGGCGTGATTCTGACTGGCGATAACCCACGGGCGGCG ++ +AAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEAEEEEEEEEEAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEEEA<< +@NS500443:65:H573HAFXX:1:11101:23631:1047/1 +CATATAAAGGGTATCTATTTCCCGGGAGGTGACTATGATAGCCAGCAAATTCGGTATCGGCCAACAGGTCCGCCATTCCCTGTTAGGTTACCTCGGAGTGGTCGTCGATATCGACCCGGAATATTCGCTTGATGAGCCGTCGCCTGATGAA ++ +AAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEEEEEAEEEEEAEEEEAEAEEEAEEAEEEEEEAEEAE<<EEEE<AEAEEEEAEEE<AE</ +@NS500443:65:H573HAFXX:1:11101:20964:1048/1 +ACATTTCACTGGTCATGAGCTTGGTGAGAGAAGCGATACGAATGACCGAATCAAGCTGTGGACGAACATTATTGCCCGGTCTGGTGTCACCGAAACTACGAAACACGCGTTGATTGCCGTCGATCACCACCAGCGCCATGCCCGTGGCGC ++ +AAAAAEEAEEEEEEEEE6EE/EEEEEEEEAEEAEEEEEEEEEEEEEEAEEEA/A/EEEAEEEEEE/EE</EAEEEEEE/EEE/<//AEEE<E/E/EEEEEEEEE/AAA6EAE<<EE<EEEEEA6E<EAEEAA<EEEAEAEEA<<<<EEAA +@NS500443:65:H573HAFXX:1:11101:3151:1053/1 +GGCAAAGCGCTAAAAGCAGCGAGCGTAGCGCGGGAAGAGCTGTTTATCACCACCAAGTTGTGGAATGACGTTGCCATCCTGTAGCCTGATAATGGTTGGATTAGCCATGATGTGTTCCTCTTTTAATTGGCTCGCCGGAGTCGGGTCCGG ++ +AAAAAAEEEEEEEEEEEAEEE/EEEEAEEEEEEEEEEEEEAEEEAEEEAEEEEEEEEEEEEEEEEEEAEAAEEE//<EEAE/EEEEE/EAEEEEEAAEEE/A<EEEEEE/EEE/<66EEEA6E6EEEEEEEEAE<AA<EEAEEEEEEEAE +@NS500443:66:H573HAFXX:1:11101:3151:1053/1 +GGCAAAGCGCTAAAAGCAGCGAGCGTAGCGCGGGAAGAGCTGTTTATCACCACCAAGTTGTGGAATGACGTTGCCATCCTGTAGCCTGATAATGGTTGGATTAGCCATGATGTGTTCCTCTTTTAATTGGCTCGCCGGAGTCGGGTCCGG ++ +AAAAAAEEEEEEEEEEEAEEE/EEEEAEEEEEEEEEEEEEAEEEAEEEAEEEEEEEEEEEEEEEEEEAEAAEEE//<EEAE/EEEEE/EAEEEEEAAEEE/A<EEEEEE/EEE/<66EEEA6E6EEEEEEEEAE<AA<EEAEEEEEEEAE +@NS500443:67:H573HAFXX:1:11101:3151:1053/1 +GGCAAAGCGCTAAAAGCAGCGAGCGTAGCGCGGGAAGAGCTGTTTATCACCACCAAGTTGTGGAATGACGTTGCCATCCTGTAGCCTGATAATGGTTGGATTAGCCATGATGTGTTCCTCTTTTAATTGGCTCGCCGGAGTCGGGTCCGG ++ +AAAAAAEEEEEEEEEEEAEEE/EEEEAEEEEEEEEEEEEEAEEEAEEEAEEEEEEEEEEEEEEEEEEAEAAEEE//<EEAE/EEEEE/EAEEEEEAAEEE/A<EEEEEE/EEE/<66EEEA6E6EEEEEEEEAE<AA<EEAEEEEEEEAE +@NS500443:68:H573HAFXX:1:11101:3151:1053/1 +GGCAAAGCGCTAAAAGCAGCGAGCGTAGCGCGGGAAGAGCTGTTTATCACCACCAAGTTGTGGAATGACGTTGCCATCCTGTAGCCTGATAATGGTTGGATTAGCCATGATGTGTTCCTCTTTTAATTGGCTCGCCGGAGTCGGGTCCGG ++ +AAAAAAEEEEEEEEEEEAEEE/EEEEAEEEEEEEEEEEEEAEEEAEEEAEEEEEEEEEEEEEEEEEEAEAAEEE//<EEAE/EEEEE/EAEEEEEAAEEE/A<EEEEEE/EEE/<66EEEA6E6EEEEEEEEAE<AA<EEAEEEEEEEAE +@NS500443:69:H573HAFXX:1:11101:3151:1053/1 +GGCAAAGCGCTAAAAGCAGCGAGCGTAGCGCGGGAAGAGCTGTTTATCACCACCAAGTTGTGGAATGACGTTGCCATCCTGTAGCCTGATAATGGTTGGATTAGCCATGATGTGTTCCTCTTTTAATTGGCTCGCCGGAGTCGGGTCCGG ++ +AAAAAAEEEEEEEEEEEAEEE/EEEEAEEEEEEEEEEEEEAEEEAEEEAEEEEEEEEEEEEEEEEEEAEAAEEE//<EEAE/EEEEE/EAEEEEEAAEEE/A<EEEEEE/EEE/<66EEEA6E6EEEEEEEEAE<AA<EEAEEEEEEEAE +@NS500443:70:H573HAFXX:1:11101:6972:1045/1 +GACGGTGGTTCTGGTGCTGCGTAACGAGACCGTTGTTGGCGTCCTCGCCCTCCAGGATACGTTGCGTGACGATGCGCGCGACGCCATCCGCGATCTGCATCAGTTAGGCGTTAACGGCGTGATTCTGACTGGCGATAACCCACGGGCGGCG ++ +AAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEAEEEEEEEEEAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEEEA<< +@NS500443:71:H573HAFXX:1:11101:20964:1048/1 +ACATTTCACTGGTCATGAGCTTGGTGAGAGAAGCGATACGAATGACCGAATCAAGCTGTGGACGAACATTATTGCCCGGTCTGGTGTCACCGAAACTACGAAACACGCGTTGATTGCCGTCGATCACCACCAGCGCCATGCCCGTGGCGC ++ +AAAAAEEAEEEEEEEEE6EE/EEEEEEEEAEEAEEEEEEEEEEEEEEAEEEA/A/EEEAEEEEEE/EE</EAEEEEEE/EEE/<//AEEE<E/E/EEEEEEEEE/AAA6EAE<<EE<EEEEEA6E<EAEEAA<EEEAEAEEA<<<<EEAA +@NS500443:72:H573HAFXX:1:11101:20964:1048/1 +ACATTTCACTGGTCATGAGCTTGGTGAGAGAAGCGATACGAATGACCGAATCAAGCTGTGGACGAACATTATTGCCCGGTCTGGTGTCACCGAAACTACGAAACACGCGTTGATTGCCGTCGATCACCACCAGCGCCATGCCCGTGGCGC ++ +AAAAAEEAEEEEEEEEE6EE/EEEEEEEEAEEAEEEEEEEEEEEEEEAEEEA/A/EEEAEEEEEE/EE</EAEEEEEE/EEE/<//AEEE<E/E/EEEEEEEEE/AAA6EAE<<EE<EEEEEA6E<EAEEAA<EEEAEAEEA<<<<EEAA diff --git a/test/data/unit/09-4607_S43_R2.fastq b/test/data/unit/09-4607_S43_R2.fastq new file mode 100644 index 0000000000000000000000000000000000000000..f3bd38142700ef09c2838855344146485157eccf --- /dev/null +++ b/test/data/unit/09-4607_S43_R2.fastq @@ -0,0 +1,16 @@ +@NS500443:65:H573HAFXX:1:11101:6972:1045/2 +CGCCGGCAATCGCTGCCGCCGCCCGTGGGTTATCGCCAGTCAGAATCACGCCGTTAACGCCTAACTGATGCAGATCGCGGATGGCGTCGCGCGCATCGTCACGCAACGTATCCTGGAGGGCGAGGACGCCAACAACGGTCTCGTTACGCAG ++ +AAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE/EEEEEEEEEEEEEEEEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEEE/EEEEEEEEEE<EEEE/ +@NS500443:65:H573HAFXX:1:11101:23631:1047/2 +ACCACGGAGCGGCGCGAAGTTCGTCGTTAACCGCCAGTTCATCAGGCGACGGCTCATCAAGCGAATATTCCGGGTCGATATCGACGACCACTCCGAGGTAACCTAACAGGGAATGGCGGACCTGTTGGCCGATACCGAATTTGCTGGCTAT ++ +AAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEEEEAEEEEEEEEEEAEEEEEEEEEEEEEEEEE/EEEEE<EE/EEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEE<EEEEEEEEEAA</EEEAEA<AEEEEEAEEEE +@NS500443:65:H573HAFXX:1:11101:20964:1048/2 +CTGATATCGTTGATCGTTATGCCGAGCATATCTTTTATGGTAGCGGCGCCACGGGCATGGCGCTGGTGGTGATCGACGGCAATCAACGCGTGTTTCGTAGTTTCGGTGACACCAGACCGGGCAATAATGTTCGTCCACAGCTTGATTCGGT ++ +AAAAAEEEEEEEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEAEEEEEEEEEEEEEEAEEEEEEEEEEEEEAEEE/EEEEAE<EEEEEEEEEEAEEA<<E<EAEAAEE<EEEEEEEEEEEEEAE</A<AEEEE<EAAA<AA +@NS500443:65:H573HAFXX:1:11101:3151:1053/2 +CTGACTTTCGTTTTTGGCCATTTCGTCCAGACTTAAGTCACTTAGTCTCGCCGGACCCGACTCCGGCGAGCCAATTAAAAGAGGAACACATCATGGCTAATCCCACCATTATCAGGCTACAGGATGGCAACGTCATTCCACAACTTGGTGG ++ +AAAAAE/EEEAEEEEE/EEEEAEEEEEEEEAAAEEEEEEEEEEEEEEEEEEEEEEEEEE/EEEEEAEEAEEEEEEEEEEAE/EEE/EEE/EE/AAEE<A<EEE//EEEA6E<E/A<E/AE<E/<EEEE<<EE/E</AEE/E/<A/<A<AE6 diff --git a/test/data/unit/09-4607_S43_R2_big.fastq b/test/data/unit/09-4607_S43_R2_big.fastq new file mode 100644 index 0000000000000000000000000000000000000000..c036509490f588f218219de8b740ba2f384d0bdc --- /dev/null +++ b/test/data/unit/09-4607_S43_R2_big.fastq @@ -0,0 +1,44 @@ +@NS500443:65:H573HAFXX:1:11101:6972:1045/2 +CGCCGGCAATCGCTGCCGCCGCCCGTGGGTTATCGCCAGTCAGAATCACGCCGTTAACGCCTAACTGATGCAGATCGCGGATGGCGTCGCGCGCATCGTCACGCAACGTATCCTGGAGGGCGAGGACGCCAACAACGGTCTCGTTACGCAG ++ +AAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE/EEEEEEEEEEEEEEEEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEEE/EEEEEEEEEE<EEEE/ +@NS500443:65:H573HAFXX:1:11101:23631:1047/2 +ACCACGGAGCGGCGCGAAGTTCGTCGTTAACCGCCAGTTCATCAGGCGACGGCTCATCAAGCGAATATTCCGGGTCGATATCGACGACCACTCCGAGGTAACCTAACAGGGAATGGCGGACCTGTTGGCCGATACCGAATTTGCTGGCTAT ++ +AAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEEEEAEEEEEEEEEEAEEEEEEEEEEEEEEEEE/EEEEE<EE/EEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEE<EEEEEEEEEAA</EEEAEA<AEEEEEAEEEE +@NS500443:65:H573HAFXX:1:11101:20964:1048/2 +CTGATATCGTTGATCGTTATGCCGAGCATATCTTTTATGGTAGCGGCGCCACGGGCATGGCGCTGGTGGTGATCGACGGCAATCAACGCGTGTTTCGTAGTTTCGGTGACACCAGACCGGGCAATAATGTTCGTCCACAGCTTGATTCGGT ++ +AAAAAEEEEEEEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEAEEEEEEEEEEEEEEAEEEEEEEEEEEEEAEEE/EEEEAE<EEEEEEEEEEAEEA<<E<EAEAAEE<EEEEEEEEEEEEEAE</A<AEEEE<EAAA<AA +@NS500443:65:H573HAFXX:1:11101:3151:1053/2 +CTGACTTTCGTTTTTGGCCATTTCGTCCAGACTTAAGTCACTTAGTCTCGCCGGACCCGACTCCGGCGAGCCAATTAAAAGAGGAACACATCATGGCTAATCCCACCATTATCAGGCTACAGGATGGCAACGTCATTCCACAACTTGGTGG ++ +AAAAAE/EEEAEEEEE/EEEEAEEEEEEEEAAAEEEEEEEEEEEEEEEEEEEEEEEEEE/EEEEEAEEAEEEEEEEEEEAE/EEE/EEE/EE/AAEE<A<EEE//EEEA6E<E/A<E/AE<E/<EEEE<<EE/E</AEE/E/<A/<A<AE6 +@NS500443:66:H573HAFXX:1:11101:3151:1053/2 +CTGACTTTCGTTTTTGGCCATTTCGTCCAGACTTAAGTCACTTAGTCTCGCCGGACCCGACTCCGGCGAGCCAATTAAAAGAGGAACACATCATGGCTAATCCCACCATTATCAGGCTACAGGATGGCAACGTCATTCCACAACTTGGTGG ++ +AAAAAE/EEEAEEEEE/EEEEAEEEEEEEEAAAEEEEEEEEEEEEEEEEEEEEEEEEEE/EEEEEAEEAEEEEEEEEEEAE/EEE/EEE/EE/AAEE<A<EEE//EEEA6E<E/A<E/AE<E/<EEEE<<EE/E</AEE/E/<A/<A<AE6 +@NS500443:67:H573HAFXX:1:11101:3151:1053/2 +CTGACTTTCGTTTTTGGCCATTTCGTCCAGACTTAAGTCACTTAGTCTCGCCGGACCCGACTCCGGCGAGCCAATTAAAAGAGGAACACATCATGGCTAATCCCACCATTATCAGGCTACAGGATGGCAACGTCATTCCACAACTTGGTGG ++ +AAAAAE/EEEAEEEEE/EEEEAEEEEEEEEAAAEEEEEEEEEEEEEEEEEEEEEEEEEE/EEEEEAEEAEEEEEEEEEEAE/EEE/EEE/EE/AAEE<A<EEE//EEEA6E<E/A<E/AE<E/<EEEE<<EE/E</AEE/E/<A/<A<AE6 +@NS500443:68:H573HAFXX:1:11101:3151:1053/2 +CTGACTTTCGTTTTTGGCCATTTCGTCCAGACTTAAGTCACTTAGTCTCGCCGGACCCGACTCCGGCGAGCCAATTAAAAGAGGAACACATCATGGCTAATCCCACCATTATCAGGCTACAGGATGGCAACGTCATTCCACAACTTGGTGG ++ +AAAAAE/EEEAEEEEE/EEEEAEEEEEEEEAAAEEEEEEEEEEEEEEEEEEEEEEEEEE/EEEEEAEEAEEEEEEEEEEAE/EEE/EEE/EE/AAEE<A<EEE//EEEA6E<E/A<E/AE<E/<EEEE<<EE/E</AEE/E/<A/<A<AE6 +@NS500443:69:H573HAFXX:1:11101:3151:1053/2 +CTGACTTTCGTTTTTGGCCATTTCGTCCAGACTTAAGTCACTTAGTCTCGCCGGACCCGACTCCGGCGAGCCAATTAAAAGAGGAACACATCATGGCTAATCCCACCATTATCAGGCTACAGGATGGCAACGTCATTCCACAACTTGGTGG ++ +AAAAAE/EEEAEEEEE/EEEEAEEEEEEEEAAAEEEEEEEEEEEEEEEEEEEEEEEEEE/EEEEEAEEAEEEEEEEEEEAE/EEE/EEE/EE/AAEE<A<EEE//EEEA6E<E/A<E/AE<E/<EEEE<<EE/E</AEE/E/<A/<A<AE6 +@NS500443:70:H573HAFXX:1:11101:6972:1045/2 +CGCCGGCAATCGCTGCCGCCGCCCGTGGGTTATCGCCAGTCAGAATCACGCCGTTAACGCCTAACTGATGCAGATCGCGGATGGCGTCGCGCGCATCGTCACGCAACGTATCCTGGAGGGCGAGGACGCCAACAACGGTCTCGTTACGCAG ++ +AAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE/EEEEEEEEEEEEEEEEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEEE/EEEEEEEEEE<EEEE/ +@NS500443:71:H573HAFXX:1:11101:20964:1048/2 +CTGATATCGTTGATCGTTATGCCGAGCATATCTTTTATGGTAGCGGCGCCACGGGCATGGCGCTGGTGGTGATCGACGGCAATCAACGCGTGTTTCGTAGTTTCGGTGACACCAGACCGGGCAATAATGTTCGTCCACAGCTTGATTCGGT ++ +AAAAAEEEEEEEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEAEEEEEEEEEEEEEEAEEEEEEEEEEEEEAEEE/EEEEAE<EEEEEEEEEEAEEA<<E<EAEAAEE<EEEEEEEEEEEEEAE</A<AEEEE<EAAA<AA +@NS500443:72:H573HAFXX:1:11101:20964:1048/2 +CTGATATCGTTGATCGTTATGCCGAGCATATCTTTTATGGTAGCGGCGCCACGGGCATGGCGCTGGTGGTGATCGACGGCAATCAACGCGTGTTTCGTAGTTTCGGTGACACCAGACCGGGCAATAATGTTCGTCCACAGCTTGATTCGGT ++ +AAAAAEEEEEEEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEAEEEEEEEEEEEEEEAEEEEEEEEEEEEEAEEE/EEEEAE<EEEEEEEEEEAEEA<<E<EAEAAEE<EEEEEEEEEEEEEAE</A<AEEEE<EAAA<AA diff --git a/test/data/unit/thres_100_09-4607_S43_R1.undef.fastq b/test/data/unit/thres_100_09-4607_S43_R1.undef.fastq new file mode 100644 index 0000000000000000000000000000000000000000..69b9baa4d6c331275a5b41379d192a7c7912031f --- /dev/null +++ b/test/data/unit/thres_100_09-4607_S43_R1.undef.fastq @@ -0,0 +1,20 @@ +@NS500443:65:H573HAFXX:1:11101:3151:1053/1 +GGCAAAGCGCTAAAAGCAGCGAGCGTAGCGCGGGAAGAGCTGTTTATCACCACCAAGTTGTGGAATGACGTTGCCATCCTGTAGCCTGATAATGGTTGGATTAGCCATGATGTGTTCCTCTTTTAATTGGCTCGCCGGAGTCGGGTCCGG ++ +AAAAAAEEEEEEEEEEEAEEE/EEEEAEEEEEEEEEEEEEAEEEAEEEAEEEEEEEEEEEEEEEEEEAEAAEEE//<EEAE/EEEEE/EAEEEEEAAEEE/A<EEEEEE/EEE/<66EEEA6E6EEEEEEEEAE<AA<EEAEEEEEEEAE +@NS500443:66:H573HAFXX:1:11101:3151:1053/1 +GGCAAAGCGCTAAAAGCAGCGAGCGTAGCGCGGGAAGAGCTGTTTATCACCACCAAGTTGTGGAATGACGTTGCCATCCTGTAGCCTGATAATGGTTGGATTAGCCATGATGTGTTCCTCTTTTAATTGGCTCGCCGGAGTCGGGTCCGG ++ +AAAAAAEEEEEEEEEEEAEEE/EEEEAEEEEEEEEEEEEEAEEEAEEEAEEEEEEEEEEEEEEEEEEAEAAEEE//<EEAE/EEEEE/EAEEEEEAAEEE/A<EEEEEE/EEE/<66EEEA6E6EEEEEEEEAE<AA<EEAEEEEEEEAE +@NS500443:67:H573HAFXX:1:11101:3151:1053/1 +GGCAAAGCGCTAAAAGCAGCGAGCGTAGCGCGGGAAGAGCTGTTTATCACCACCAAGTTGTGGAATGACGTTGCCATCCTGTAGCCTGATAATGGTTGGATTAGCCATGATGTGTTCCTCTTTTAATTGGCTCGCCGGAGTCGGGTCCGG ++ +AAAAAAEEEEEEEEEEEAEEE/EEEEAEEEEEEEEEEEEEAEEEAEEEAEEEEEEEEEEEEEEEEEEAEAAEEE//<EEAE/EEEEE/EAEEEEEAAEEE/A<EEEEEE/EEE/<66EEEA6E6EEEEEEEEAE<AA<EEAEEEEEEEAE +@NS500443:68:H573HAFXX:1:11101:3151:1053/1 +GGCAAAGCGCTAAAAGCAGCGAGCGTAGCGCGGGAAGAGCTGTTTATCACCACCAAGTTGTGGAATGACGTTGCCATCCTGTAGCCTGATAATGGTTGGATTAGCCATGATGTGTTCCTCTTTTAATTGGCTCGCCGGAGTCGGGTCCGG ++ +AAAAAAEEEEEEEEEEEAEEE/EEEEAEEEEEEEEEEEEEAEEEAEEEAEEEEEEEEEEEEEEEEEEAEAAEEE//<EEAE/EEEEE/EAEEEEEAAEEE/A<EEEEEE/EEE/<66EEEA6E6EEEEEEEEAE<AA<EEAEEEEEEEAE +@NS500443:69:H573HAFXX:1:11101:3151:1053/1 +GGCAAAGCGCTAAAAGCAGCGAGCGTAGCGCGGGAAGAGCTGTTTATCACCACCAAGTTGTGGAATGACGTTGCCATCCTGTAGCCTGATAATGGTTGGATTAGCCATGATGTGTTCCTCTTTTAATTGGCTCGCCGGAGTCGGGTCCGG ++ +AAAAAAEEEEEEEEEEEAEEE/EEEEAEEEEEEEEEEEEEAEEEAEEEAEEEEEEEEEEEEEEEEEEAEAAEEE//<EEAE/EEEEE/EAEEEEEAAEEE/A<EEEEEE/EEE/<66EEEA6E6EEEEEEEEAE<AA<EEAEEEEEEEAE diff --git a/test/data/unit/thres_100_09-4607_S43_R2.undef.fastq b/test/data/unit/thres_100_09-4607_S43_R2.undef.fastq new file mode 100644 index 0000000000000000000000000000000000000000..42a913a61d0b754a00ba887657d6dd16abc8ccce --- /dev/null +++ b/test/data/unit/thres_100_09-4607_S43_R2.undef.fastq @@ -0,0 +1,20 @@ +@NS500443:65:H573HAFXX:1:11101:3151:1053/2 +CTGACTTTCGTTTTTGGCCATTTCGTCCAGACTTAAGTCACTTAGTCTCGCCGGACCCGACTCCGGCGAGCCAATTAAAAGAGGAACACATCATGGCTAATCCCACCATTATCAGGCTACAGGATGGCAACGTCATTCCACAACTTGGTGG ++ +AAAAAE/EEEAEEEEE/EEEEAEEEEEEEEAAAEEEEEEEEEEEEEEEEEEEEEEEEEE/EEEEEAEEAEEEEEEEEEEAE/EEE/EEE/EE/AAEE<A<EEE//EEEA6E<E/A<E/AE<E/<EEEE<<EE/E</AEE/E/<A/<A<AE6 +@NS500443:66:H573HAFXX:1:11101:3151:1053/2 +CTGACTTTCGTTTTTGGCCATTTCGTCCAGACTTAAGTCACTTAGTCTCGCCGGACCCGACTCCGGCGAGCCAATTAAAAGAGGAACACATCATGGCTAATCCCACCATTATCAGGCTACAGGATGGCAACGTCATTCCACAACTTGGTGG ++ +AAAAAE/EEEAEEEEE/EEEEAEEEEEEEEAAAEEEEEEEEEEEEEEEEEEEEEEEEEE/EEEEEAEEAEEEEEEEEEEAE/EEE/EEE/EE/AAEE<A<EEE//EEEA6E<E/A<E/AE<E/<EEEE<<EE/E</AEE/E/<A/<A<AE6 +@NS500443:67:H573HAFXX:1:11101:3151:1053/2 +CTGACTTTCGTTTTTGGCCATTTCGTCCAGACTTAAGTCACTTAGTCTCGCCGGACCCGACTCCGGCGAGCCAATTAAAAGAGGAACACATCATGGCTAATCCCACCATTATCAGGCTACAGGATGGCAACGTCATTCCACAACTTGGTGG ++ +AAAAAE/EEEAEEEEE/EEEEAEEEEEEEEAAAEEEEEEEEEEEEEEEEEEEEEEEEEE/EEEEEAEEAEEEEEEEEEEAE/EEE/EEE/EE/AAEE<A<EEE//EEEA6E<E/A<E/AE<E/<EEEE<<EE/E</AEE/E/<A/<A<AE6 +@NS500443:68:H573HAFXX:1:11101:3151:1053/2 +CTGACTTTCGTTTTTGGCCATTTCGTCCAGACTTAAGTCACTTAGTCTCGCCGGACCCGACTCCGGCGAGCCAATTAAAAGAGGAACACATCATGGCTAATCCCACCATTATCAGGCTACAGGATGGCAACGTCATTCCACAACTTGGTGG ++ +AAAAAE/EEEAEEEEE/EEEEAEEEEEEEEAAAEEEEEEEEEEEEEEEEEEEEEEEEEE/EEEEEAEEAEEEEEEEEEEAE/EEE/EEE/EE/AAEE<A<EEE//EEEA6E<E/A<E/AE<E/<EEEE<<EE/E</AEE/E/<A/<A<AE6 +@NS500443:69:H573HAFXX:1:11101:3151:1053/2 +CTGACTTTCGTTTTTGGCCATTTCGTCCAGACTTAAGTCACTTAGTCTCGCCGGACCCGACTCCGGCGAGCCAATTAAAAGAGGAACACATCATGGCTAATCCCACCATTATCAGGCTACAGGATGGCAACGTCATTCCACAACTTGGTGG ++ +AAAAAE/EEEAEEEEE/EEEEAEEEEEEEEAAAEEEEEEEEEEEEEEEEEEEEEEEEEE/EEEEEAEEAEEEEEEEEEEAE/EEE/EEE/EE/AAEE<A<EEE//EEEA6E<E/A<E/AE<E/<EEEE<<EE/E</AEE/E/<A/<A<AE6 diff --git a/test/data/unit/thres_180_09-4607_S43_R1.undef.fastq b/test/data/unit/thres_180_09-4607_S43_R1.undef.fastq new file mode 100644 index 0000000000000000000000000000000000000000..4fde1ccad52b41bde26c1288428ea7baf121f8d1 --- /dev/null +++ b/test/data/unit/thres_180_09-4607_S43_R1.undef.fastq @@ -0,0 +1,32 @@ +@NS500443:65:H573HAFXX:1:11101:20964:1048/1 +ACATTTCACTGGTCATGAGCTTGGTGAGAGAAGCGATACGAATGACCGAATCAAGCTGTGGACGAACATTATTGCCCGGTCTGGTGTCACCGAAACTACGAAACACGCGTTGATTGCCGTCGATCACCACCAGCGCCATGCCCGTGGCGC ++ +AAAAAEEAEEEEEEEEE6EE/EEEEEEEEAEEAEEEEEEEEEEEEEEAEEEA/A/EEEAEEEEEE/EE</EAEEEEEE/EEE/<//AEEE<E/E/EEEEEEEEE/AAA6EAE<<EE<EEEEEA6E<EAEEAA<EEEAEAEEA<<<<EEAA +@NS500443:65:H573HAFXX:1:11101:3151:1053/1 +GGCAAAGCGCTAAAAGCAGCGAGCGTAGCGCGGGAAGAGCTGTTTATCACCACCAAGTTGTGGAATGACGTTGCCATCCTGTAGCCTGATAATGGTTGGATTAGCCATGATGTGTTCCTCTTTTAATTGGCTCGCCGGAGTCGGGTCCGG ++ +AAAAAAEEEEEEEEEEEAEEE/EEEEAEEEEEEEEEEEEEAEEEAEEEAEEEEEEEEEEEEEEEEEEAEAAEEE//<EEAE/EEEEE/EAEEEEEAAEEE/A<EEEEEE/EEE/<66EEEA6E6EEEEEEEEAE<AA<EEAEEEEEEEAE +@NS500443:66:H573HAFXX:1:11101:3151:1053/1 +GGCAAAGCGCTAAAAGCAGCGAGCGTAGCGCGGGAAGAGCTGTTTATCACCACCAAGTTGTGGAATGACGTTGCCATCCTGTAGCCTGATAATGGTTGGATTAGCCATGATGTGTTCCTCTTTTAATTGGCTCGCCGGAGTCGGGTCCGG ++ +AAAAAAEEEEEEEEEEEAEEE/EEEEAEEEEEEEEEEEEEAEEEAEEEAEEEEEEEEEEEEEEEEEEAEAAEEE//<EEAE/EEEEE/EAEEEEEAAEEE/A<EEEEEE/EEE/<66EEEA6E6EEEEEEEEAE<AA<EEAEEEEEEEAE +@NS500443:67:H573HAFXX:1:11101:3151:1053/1 +GGCAAAGCGCTAAAAGCAGCGAGCGTAGCGCGGGAAGAGCTGTTTATCACCACCAAGTTGTGGAATGACGTTGCCATCCTGTAGCCTGATAATGGTTGGATTAGCCATGATGTGTTCCTCTTTTAATTGGCTCGCCGGAGTCGGGTCCGG ++ +AAAAAAEEEEEEEEEEEAEEE/EEEEAEEEEEEEEEEEEEAEEEAEEEAEEEEEEEEEEEEEEEEEEAEAAEEE//<EEAE/EEEEE/EAEEEEEAAEEE/A<EEEEEE/EEE/<66EEEA6E6EEEEEEEEAE<AA<EEAEEEEEEEAE +@NS500443:68:H573HAFXX:1:11101:3151:1053/1 +GGCAAAGCGCTAAAAGCAGCGAGCGTAGCGCGGGAAGAGCTGTTTATCACCACCAAGTTGTGGAATGACGTTGCCATCCTGTAGCCTGATAATGGTTGGATTAGCCATGATGTGTTCCTCTTTTAATTGGCTCGCCGGAGTCGGGTCCGG ++ +AAAAAAEEEEEEEEEEEAEEE/EEEEAEEEEEEEEEEEEEAEEEAEEEAEEEEEEEEEEEEEEEEEEAEAAEEE//<EEAE/EEEEE/EAEEEEEAAEEE/A<EEEEEE/EEE/<66EEEA6E6EEEEEEEEAE<AA<EEAEEEEEEEAE +@NS500443:69:H573HAFXX:1:11101:3151:1053/1 +GGCAAAGCGCTAAAAGCAGCGAGCGTAGCGCGGGAAGAGCTGTTTATCACCACCAAGTTGTGGAATGACGTTGCCATCCTGTAGCCTGATAATGGTTGGATTAGCCATGATGTGTTCCTCTTTTAATTGGCTCGCCGGAGTCGGGTCCGG ++ +AAAAAAEEEEEEEEEEEAEEE/EEEEAEEEEEEEEEEEEEAEEEAEEEAEEEEEEEEEEEEEEEEEEAEAAEEE//<EEAE/EEEEE/EAEEEEEAAEEE/A<EEEEEE/EEE/<66EEEA6E6EEEEEEEEAE<AA<EEAEEEEEEEAE +@NS500443:71:H573HAFXX:1:11101:20964:1048/1 +ACATTTCACTGGTCATGAGCTTGGTGAGAGAAGCGATACGAATGACCGAATCAAGCTGTGGACGAACATTATTGCCCGGTCTGGTGTCACCGAAACTACGAAACACGCGTTGATTGCCGTCGATCACCACCAGCGCCATGCCCGTGGCGC ++ +AAAAAEEAEEEEEEEEE6EE/EEEEEEEEAEEAEEEEEEEEEEEEEEAEEEA/A/EEEAEEEEEE/EE</EAEEEEEE/EEE/<//AEEE<E/E/EEEEEEEEE/AAA6EAE<<EE<EEEEEA6E<EAEEAA<EEEAEAEEA<<<<EEAA +@NS500443:72:H573HAFXX:1:11101:20964:1048/1 +ACATTTCACTGGTCATGAGCTTGGTGAGAGAAGCGATACGAATGACCGAATCAAGCTGTGGACGAACATTATTGCCCGGTCTGGTGTCACCGAAACTACGAAACACGCGTTGATTGCCGTCGATCACCACCAGCGCCATGCCCGTGGCGC ++ +AAAAAEEAEEEEEEEEE6EE/EEEEEEEEAEEAEEEEEEEEEEEEEEAEEEA/A/EEEAEEEEEE/EE</EAEEEEEE/EEE/<//AEEE<E/E/EEEEEEEEE/AAA6EAE<<EE<EEEEEA6E<EAEEAA<EEEAEAEEA<<<<EEAA diff --git a/test/data/unit/thres_180_09-4607_S43_R2.undef.fastq b/test/data/unit/thres_180_09-4607_S43_R2.undef.fastq new file mode 100644 index 0000000000000000000000000000000000000000..03fe6589d839a6787eb2adb5e06003f378e77f07 --- /dev/null +++ b/test/data/unit/thres_180_09-4607_S43_R2.undef.fastq @@ -0,0 +1,32 @@ +@NS500443:65:H573HAFXX:1:11101:20964:1048/2 +CTGATATCGTTGATCGTTATGCCGAGCATATCTTTTATGGTAGCGGCGCCACGGGCATGGCGCTGGTGGTGATCGACGGCAATCAACGCGTGTTTCGTAGTTTCGGTGACACCAGACCGGGCAATAATGTTCGTCCACAGCTTGATTCGGT ++ +AAAAAEEEEEEEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEAEEEEEEEEEEEEEEAEEEEEEEEEEEEEAEEE/EEEEAE<EEEEEEEEEEAEEA<<E<EAEAAEE<EEEEEEEEEEEEEAE</A<AEEEE<EAAA<AA +@NS500443:65:H573HAFXX:1:11101:3151:1053/2 +CTGACTTTCGTTTTTGGCCATTTCGTCCAGACTTAAGTCACTTAGTCTCGCCGGACCCGACTCCGGCGAGCCAATTAAAAGAGGAACACATCATGGCTAATCCCACCATTATCAGGCTACAGGATGGCAACGTCATTCCACAACTTGGTGG ++ +AAAAAE/EEEAEEEEE/EEEEAEEEEEEEEAAAEEEEEEEEEEEEEEEEEEEEEEEEEE/EEEEEAEEAEEEEEEEEEEAE/EEE/EEE/EE/AAEE<A<EEE//EEEA6E<E/A<E/AE<E/<EEEE<<EE/E</AEE/E/<A/<A<AE6 +@NS500443:66:H573HAFXX:1:11101:3151:1053/2 +CTGACTTTCGTTTTTGGCCATTTCGTCCAGACTTAAGTCACTTAGTCTCGCCGGACCCGACTCCGGCGAGCCAATTAAAAGAGGAACACATCATGGCTAATCCCACCATTATCAGGCTACAGGATGGCAACGTCATTCCACAACTTGGTGG ++ +AAAAAE/EEEAEEEEE/EEEEAEEEEEEEEAAAEEEEEEEEEEEEEEEEEEEEEEEEEE/EEEEEAEEAEEEEEEEEEEAE/EEE/EEE/EE/AAEE<A<EEE//EEEA6E<E/A<E/AE<E/<EEEE<<EE/E</AEE/E/<A/<A<AE6 +@NS500443:67:H573HAFXX:1:11101:3151:1053/2 +CTGACTTTCGTTTTTGGCCATTTCGTCCAGACTTAAGTCACTTAGTCTCGCCGGACCCGACTCCGGCGAGCCAATTAAAAGAGGAACACATCATGGCTAATCCCACCATTATCAGGCTACAGGATGGCAACGTCATTCCACAACTTGGTGG ++ +AAAAAE/EEEAEEEEE/EEEEAEEEEEEEEAAAEEEEEEEEEEEEEEEEEEEEEEEEEE/EEEEEAEEAEEEEEEEEEEAE/EEE/EEE/EE/AAEE<A<EEE//EEEA6E<E/A<E/AE<E/<EEEE<<EE/E</AEE/E/<A/<A<AE6 +@NS500443:68:H573HAFXX:1:11101:3151:1053/2 +CTGACTTTCGTTTTTGGCCATTTCGTCCAGACTTAAGTCACTTAGTCTCGCCGGACCCGACTCCGGCGAGCCAATTAAAAGAGGAACACATCATGGCTAATCCCACCATTATCAGGCTACAGGATGGCAACGTCATTCCACAACTTGGTGG ++ +AAAAAE/EEEAEEEEE/EEEEAEEEEEEEEAAAEEEEEEEEEEEEEEEEEEEEEEEEEE/EEEEEAEEAEEEEEEEEEEAE/EEE/EEE/EE/AAEE<A<EEE//EEEA6E<E/A<E/AE<E/<EEEE<<EE/E</AEE/E/<A/<A<AE6 +@NS500443:69:H573HAFXX:1:11101:3151:1053/2 +CTGACTTTCGTTTTTGGCCATTTCGTCCAGACTTAAGTCACTTAGTCTCGCCGGACCCGACTCCGGCGAGCCAATTAAAAGAGGAACACATCATGGCTAATCCCACCATTATCAGGCTACAGGATGGCAACGTCATTCCACAACTTGGTGG ++ +AAAAAE/EEEAEEEEE/EEEEAEEEEEEEEAAAEEEEEEEEEEEEEEEEEEEEEEEEEE/EEEEEAEEAEEEEEEEEEEAE/EEE/EEE/EE/AAEE<A<EEE//EEEA6E<E/A<E/AE<E/<EEEE<<EE/E</AEE/E/<A/<A<AE6 +@NS500443:71:H573HAFXX:1:11101:20964:1048/2 +CTGATATCGTTGATCGTTATGCCGAGCATATCTTTTATGGTAGCGGCGCCACGGGCATGGCGCTGGTGGTGATCGACGGCAATCAACGCGTGTTTCGTAGTTTCGGTGACACCAGACCGGGCAATAATGTTCGTCCACAGCTTGATTCGGT ++ +AAAAAEEEEEEEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEAEEEEEEEEEEEEEEAEEEEEEEEEEEEEAEEE/EEEEAE<EEEEEEEEEEAEEA<<E<EAEAAEE<EEEEEEEEEEEEEAE</A<AEEEE<EAAA<AA +@NS500443:72:H573HAFXX:1:11101:20964:1048/2 +CTGATATCGTTGATCGTTATGCCGAGCATATCTTTTATGGTAGCGGCGCCACGGGCATGGCGCTGGTGGTGATCGACGGCAATCAACGCGTGTTTCGTAGTTTCGGTGACACCAGACCGGGCAATAATGTTCGTCCACAGCTTGATTCGGT ++ +AAAAAEEEEEEEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEAEEEEEEEEEEEEEEAEEEEEEEEEEEEEAEEE/EEEEAE<EEEEEEEEEEAEEA<<E<EAEAAEE<EEEEEEEEEEEEEAE</A<AEEEE<EAAA<AA diff --git a/test/rock.sh b/test/rock.sh index bce3eedb1861e19060da513ad81daeada39ab421..877bc956f814b8d93582481e62817d9cb0ba9f37 100755 --- a/test/rock.sh +++ b/test/rock.sh @@ -63,6 +63,8 @@ echo "doing more checks on options and error messages" ../src/rock -C 500 -c 400 -k 10 -l 12 -g 25 -i ${srcdir}/data/iofiles.args/extract_klebsiella_long_reads_100.txt -o ${srcdir}/data/iofiles.args/extract_klebsiella_long_reads_100_filtered.txt|grep "options are mutually exclusive" >/dev/null|| exit 12 ../src/rock -C 500 -c 400 -k 10 -g 10000 -i ${srcdir}/data/iofiles.args/extract_klebsiella_long_reads_100.txt -o ${srcdir}/data/iofiles.args/extract_klebsiella_long_reads_100_filtered.txt|grep "This machine only has" >/dev/null || exit 13 ../src/rock -C 500 -c 400 -k 10 -l 12 -n 85000000 -i ${srcdir}/data/iofiles.args/extract_klebsiella_long_reads_100.txt -o ${srcdir}/data/iofiles.args/extract_klebsiella_long_reads_100_filtered.txt|grep "options are mutually exclusive">/dev/null || exit 14 +../src/rock -C 500 -c 400 -k 10 -q 3 -m 0 -i ${srcdir}/test/data/iofiles.args/extract_klebsiella_long_reads_100.txt|grep "minimum number of valid k-mer for keeping a read must be an integer">/dev/null || exit 15 +../src/rock -C 500 -c 400 -k 10 -q -1 -m 2 -i ${srcdir}/test/data/iofiles.args/extract_klebsiella_long_reads_100.txt|grep "q must be">/dev/null || exit 16 # here check that we have enough memory for running the tests. diff --git a/test/rock_mem.sh b/test/rock_mem.sh index 54849ebd8ec5339c3737c8a1fbe416449b912fc2..19114d74ab6c0b8247e879d6efe99685a091db95 100755 --- a/test/rock_mem.sh +++ b/test/rock_mem.sh @@ -110,12 +110,6 @@ test $ret2=0 || exit 1922 ../src/rock -C 100 -c 1 -l 2 -q 13 -o ${srcdir}/data/iofiles.args/output_files_NQ_Thres_13.txt ${srcdir}/data/fastq.raw/klebsiella_100_1.fq,${srcdir}/data/fastq.raw/klebsiella_100_2.fq||exit 194 -#diff_PE1=`diff tmp/klebsiella_100_1_13_qual_thres.fq tmp/klebsiella_100_1_12_qual_thres.fq` -#diff_PE2=`diff tmp/klebsiella_100_2_13_qual_thres.fq tmp/klebsiella_100_2_12_qual_thres.fq` - -#test "$diff_PE1" = "$expected_diff1" || exit 1931 # problem : even if strings are identical test returns false. TO DO solve that later -#test "$diff_PE2" = "$expected_diff2" || exit 1932 - ret1=`diff tmp/klebsiella_100_1_13_qual_thres.fq tmp/klebsiella_100_1_12_qual_thres.fq|grep -c "@SRR1222430.37"` test $ret1 = 1 || exit 1931 @@ -130,6 +124,55 @@ test $ret2 = 2 || exit 1934 rm -fr tmp +echo " " +echo "##################################################################################" +echo "testing ROCK with a quality score threshold for nucleotides and minimum number of valid k-mer to keep a read." + +mkdir tmp +../src/rock -C 100 -c 1 -l 2 -o ${srcdir}/data/iofiles.args/output_files_noNQ_Thres.txt ${srcdir}/data/fastq.raw/klebsiella_100_1.fq,${srcdir}/data/fastq.raw/klebsiella_100_2.fq || exit 191 + +../src/rock -C 100 -c 1 -l 2 -q 2 -m 5 -o ${srcdir}/data/iofiles.args/output_files_NQ_Thres_very_low.txt ${srcdir}/data/fastq.raw/klebsiella_100_1.fq,${srcdir}/data/fastq.raw/klebsiella_100_2.fq || exit 192 + +ret1=`diff tmp/klebsiella_100_1_very_low_qual_thres.fq tmp/klebsiella_100_1_no_qual_thres.fq` +test $ret1=0 || exit 19211 +ret2=`diff tmp/klebsiella_100_2_very_low_qual_thres.fq tmp/klebsiella_100_2_no_qual_thres.fq` +test $ret2=0 || exit 19212 + + +[ -f "tmp/klebsiella_100_1.undefined.fq" ] || exit 19213 +[ -f "tmp/klebsiella_100_2.undefined.fq" ] || exit 19214 + +ret1=`cat tmp/klebsiella_100_1.undefined.fq|wc -l` +ret2=`cat tmp/klebsiella_100_2.undefined.fq|wc -l` + +test $ret1=0 || exit 19215 +test $ret1=0 || exit 19216 + +../src/rock -C 100 -c 1 -l 2 -q 13 -m 500 -o ${srcdir}/data/iofiles.args/output_files_NQ_Thres_13.txt ${srcdir}/data/fastq.raw/klebsiella_100_1.fq,${srcdir}/data/fastq.raw/klebsiella_100_2.fq || exit 192161 + +[ -f "tmp/klebsiella_100_1.undefined.fq" ] || exit 19215 +[ -f "tmp/klebsiella_100_2.undefined.fq" ] || exit 19216 + +ret1=`cat tmp/klebsiella_100_1.undefined.fq|wc -l` +ret2=`cat tmp/klebsiella_100_2.undefined.fq|wc -l` + +test $ret1 = 400 || exit 19217 +test $ret1 = 400 || exit 19218 + +../src/rock -C 100 -c 1 -l 2 -q 13 -m 300 -o ${srcdir}/data/iofiles.args/output_files_NQ_Thres_13.txt ${srcdir}/data/fastq.raw/klebsiella_100_1.fq,${srcdir}/data/fastq.raw/klebsiella_100_2.fq || exit 192162 + +[ -f "tmp/klebsiella_100_1.undefined.fq" ] || exit 192191 +[ -f "tmp/klebsiella_100_2.undefined.fq" ] || exit 192192 + +ret1=`cat tmp/klebsiella_100_1.undefined.fq|wc -l` +ret2=`cat tmp/klebsiella_100_2.undefined.fq|wc -l` + +test $ret1 = 136 || exit 192193 +test $ret1 = 136 || exit 192194 + +rm -fr tmp + + echo " " echo "##################################################################################" echo "unit testing for cms component"