diff --git a/NEWS b/NEWS index abf10de6f2613f12d4e735083d75c578367c08e9..3cae91bc477c1adc3363dd30316e3904b23ee895 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,7 @@ +New in version 1.5 + * --separate_PE option for taking into account the 2 ends of a PE separately in the CMS. Default mode is concatenating the 2 ends and treat them as a single read in the CMS. +New in version 1.4 + * -m option for required minimum number of correct k-mer to keep a read. New in version 1.3 * -q option for nucleotique quality score threshold. New in version 1.2 diff --git a/configure.ac b/configure.ac index b3950b388c19e5e013c0273b679eb11fd3542a88..8b259ff7492711677522ccde6f4f592d974ca097 100644 --- a/configure.ac +++ b/configure.ac @@ -1,7 +1,7 @@ dnl Process this file with autoconf to produce a configure script. AC_PREREQ(2.59) -AC_INIT(rock, 1.4) +AC_INIT(rock, 1.5) AC_CANONICAL_SYSTEM diff --git a/doc/rock.pod b/doc/rock.pod index a48a3dea1b42959815db7ea9770e3926c6d434e2..30de1fde6617334222139981c70de298fc8d9ce6 100644 --- a/doc/rock.pod +++ b/doc/rock.pod @@ -49,6 +49,13 @@ Specify minimum number of correct k-mers required to keep a read for CMS filter. Indicate only a integer in version 1.4. Default is 1. +=item --separate_PE + +Use this option to indicate ROCK that you want the CMS to handle pair end reads separately. Default behavior is to concatenate the 2 ends of the PE (once their k-mers are converted into number) and process them as a single read. +No argument. +Cannot be used together with -m. +When --separate_PE is enabled, -m defaults to 1 which means that as there is at least 1 valid k-mer in each part of the PE it is kept. + =item -C Specify maximum coverage wanted. Default is 50. Maximum is 65535. diff --git a/src/CountMinSketch.hpp b/src/CountMinSketch.hpp index c913ffc99a40de2e11be8ffa99e17c54bb40ab87..6b00880884a145154eec7a6815c295e3a036cdb0 100644 --- a/src/CountMinSketch.hpp +++ b/src/CountMinSketch.hpp @@ -13,6 +13,8 @@ #include <string.h> #include "rock_commons.h" + + #define BAD_TYPE -1 // Store the non mersenne prime numbers for modulo hashing in this array. @@ -73,7 +75,8 @@ typedef struct { int kappa; int kappa_prime; int filter_size; // max amount of RAM wanted for the CMS. - int nucl_score_threshold; // minimum quality score below which nucleotides are considered in error. 0 by default. + int filter_PE_as_single; // indicates whether PE reads must be treated as single by the cms. Indeed it may appear that 1 end contains useful k-mer but that other end contains k-mer such that "global" median is below threshold> + // In this case, read is rejected by the CMS (default behavior). We want to do an experimentation to see if keeping such reads wold lead to better results in assembling. } CMSparams; template <typename T> @@ -93,8 +96,6 @@ struct get_mask<unsigned short> { template<typename T> class CountMinSketch { -public: - int nucl_score_threshold; private: static const unsigned long mask1=1; // used only for hash64to32bs @@ -104,6 +105,7 @@ private: int lambda; int kappa; int kappa_prime; + int filter_PE_as_single; T ** cms_lambda_array; T mask; @@ -133,9 +135,10 @@ private: - inline int isRCovBelowThres(const readNumericValues& read_val, const int& threshold); + inline int isRCovBelowThres(const T_read_numericValues& read_val, const int& threshold); + //inline int isRCovBelowThresAux(const readNumericValues& read_val, const int& threshold); - void init(int glambda,int gkappa,int gkappa_prime,int gnucl_score_threshold); + void init(int glambda,int gkappa,int gkappa_prime,int filter_PE_as_single=1); // for unit tests. friend void test_CMS(int lambda,int kappa,int kappa_prime); @@ -143,12 +146,12 @@ private: public: - CountMinSketch(int glambda,int gkappa,int gkappa_prime,int gnucl_score_threshold=0) { - init(glambda,gkappa,gkappa_prime,gnucl_score_threshold); + CountMinSketch(int glambda,int gkappa,int gkappa_prime,int filter_PE_as_single=1) { + init(glambda,gkappa,gkappa_prime,filter_PE_as_single); } CountMinSketch(CMSparams parms) { - init(parms.lambda,parms.kappa,parms.kappa_prime,parms.nucl_score_threshold); + init(parms.lambda,parms.kappa,parms.kappa_prime,parms.filter_PE_as_single); } ~CountMinSketch() { @@ -175,20 +178,32 @@ public: // keep that just for unit testing purpose T getEstimatedNbOcc(const unsigned long& val); - int addRead(const readNumericValues& read_val); + //int addRead(const readNumericValues& read_val); + int addRead(const T_read_numericValues& read_val); - int isBeneathMinKappa(const readNumericValues&read_val); + int isBeneathMinKappa(const T_read_numericValues&read_val); unsigned long getNbDistinctKMers(); -}; + int arePEFilteredAsSingle() { + return filter_PE_as_single; + } +}; -template<typename T> inline int CountMinSketch<T>::isRCovBelowThres(const readNumericValues& read_val, const int& threshold) { +/* + * Computes median of occurences for all the k_mers in a single read or for a part of a PE read. + */ +template<typename T> inline int CountMinSketch<T>::isRCovBelowThres(const T_read_numericValues& read_val, const int& threshold) { + int PE1_below_thres; + int PE2_below_thres=0; int a=0,b=0; int j,h; T min; readNumericValues::const_iterator it; - for (it=read_val.begin();it!=read_val.end();++it) { + readNumericValues::const_iterator it_aux; + if (read_val.idx_start_PE2) it_aux=read_val.single_or_PE_val.begin()+read_val.idx_start_PE2; + else it_aux=read_val.single_or_PE_val.end(); + for (it=read_val.single_or_PE_val.begin();it!=it_aux;++it) { // idx_start_PE2 is 0 if PE are treated as single or in the case of single reads. j=lambda; min=mask; while (--j>=0 && min>threshold) { @@ -198,18 +213,30 @@ template<typename T> inline int CountMinSketch<T>::isRCovBelowThres(const readNu (min<threshold)? a+=1:a; ++b; } - return (2*a>b); + PE1_below_thres=(2*a>b); + if (read_val.idx_start_PE2 && !filter_PE_as_single) { + for (it=it_aux;it!=read_val.single_or_PE_val.end();++it) { // idx_start_PE2 is 0 if PE are treated as single or in the case of single reads. + j=lambda; + min=mask; + while (--j>=0 && min>threshold) { + h=hash64to32(*it,j); + min=cms_lambda_array[j] [h]; + } + (min<threshold)? a+=1:a; + ++b; + } + PE2_below_thres=(2*a>b); + } + return PE1_below_thres||PE2_below_thres; } -template<typename T> void CountMinSketch<T>::init(int glambda,int gkappa,int gkappa_prime,int gnucl_score_threshold) { +template<typename T> void CountMinSketch<T>::init(int glambda,int gkappa,int gkappa_prime,int gfilter_PE_as_single) { lambda=glambda; kappa=gkappa; kappa_prime=gkappa_prime; - nucl_score_threshold=gnucl_score_threshold; + filter_PE_as_single=gfilter_PE_as_single; int j; mask=get_mask<T>::value; - /*if (lambda<ubytemask) mask=ubytemask; // byte implementation - else mask=ushortmask; // short implementation*/ cms_lambda_array= (T**) malloc(lambda*sizeof(T*)); for (j=0;j<lambda;j++) { cms_lambda_array[j]=(T *) malloc(sizeof(T)*INT_MAX); @@ -230,18 +257,18 @@ template<typename T> T CountMinSketch<T>::getEstimatedNbOcc(const unsigned long& return min; } -template<typename T> int CountMinSketch<T>::addRead(const readNumericValues& read_val) { +template<typename T> int CountMinSketch<T>::addRead(const T_read_numericValues& read_val) { int keep_r=isRCovBelowThres(read_val,kappa); - if (keep_r) { - readNumericValues::const_iterator it; - for (it=read_val.begin();it!=read_val.end();it++) { - this->addKMer(*it); - } + if (keep_r) { + readNumericValues::const_iterator it; + for (it=read_val.single_or_PE_val.begin();it!=read_val.single_or_PE_val.end();it++) { + this->addKMer(*it); } - return keep_r; + } + return keep_r; } -template<typename T> int CountMinSketch<T>::isBeneathMinKappa(const readNumericValues&read_val) { +template<typename T> int CountMinSketch<T>::isBeneathMinKappa(const T_read_numericValues& read_val) { int res=isRCovBelowThres(read_val,kappa_prime); return res; } diff --git a/src/Filter.hpp b/src/Filter.hpp index 176d284a7fd9350176eb0cc99d94754819be40d2..87b4e0e95a75cc48e542234e4b7200a3d4f70f3e 100644 --- a/src/Filter.hpp +++ b/src/Filter.hpp @@ -28,7 +28,7 @@ class Filter { ByteCountMinSketch * pByteCMS; ShortCountMinSketch * pShortCMS; - // long CMS_size_in_Bytes; + FasqQualThreshold qual_thres; long nb_bytes_before_fill_CMS,nb_bytes_after_fill_CMS; void getRSS(int before_fill=0); @@ -39,14 +39,14 @@ class Filter { public: - Filter(const CMSparams& parms) { + Filter(const CMSparams& parms,const FasqQualThreshold& the_qual_thres) { pByteCMS=NULL; pShortCMS=NULL; getRSS(); if (parms.kappa<ubytemask) pByteCMS=new ByteCountMinSketch(parms); else pShortCMS=new ShortCountMinSketch(parms); - //CMS_size_in_Bytes=0; - + qual_thres=the_qual_thres; + nb_bytes_after_fill_CMS=0; } @@ -63,7 +63,7 @@ template <typename T> void Filter::underlyingfillCMS(FqBaseBackend* map_id_backe k_dim::iterator it_struct; // const unsigned char masque=0x0F; // to retrieve 2nd file identifier. ReadProcessor read_p(k); - readNumericValues nbrKmerDecompo; + T_read_numericValues nbrKmerDecompo; // 1rst step, open files before fetching reads in them int i; @@ -72,8 +72,6 @@ template <typename T> void Filter::underlyingfillCMS(FqBaseBackend* map_id_backe } for (rit=io_sr->rbegin(); rit!=io_sr->rend(); ++rit) { //process map in reverse order (by decreasing scores). - //cout << "score="<<rit->first<<endl; - // unsigned long score=rit->first; for (it_offs=rit->second.begin();it_offs!=rit->second.end();it_offs++) { unsigned long j=it_offs->first; for (it_struct=it_offs->second.begin();it_struct!=it_offs->second.end();it_struct++) { @@ -81,20 +79,16 @@ template <typename T> void Filter::underlyingfillCMS(FqBaseBackend* map_id_backe DnaSeqStr a_seqs[2]; init_DnaSeqStr(&a_seqs[0]); init_DnaSeqStr(&a_seqs[1]); - getDNASeqstr(map_id_backend,*it_struct,j,a_seqs,cms->nucl_score_threshold); - // debug stuff only. - /*if (strstr(a_seqs->fq_record_buf,"@SRR1222430.37")!=NULL) { - printf("coucou"); - }*/ + getDNASeqstr(map_id_backend,*it_struct,j,a_seqs,qual_thres.nucl_score_threshold); // decompose dna string into k-mers and turn k_mers into numbers. - decomposeReadInKMerNums(read_p, nbrKmerDecompo,k,a_seqs); + decomposeReadInKMerNums(read_p, nbrKmerDecompo,k,cms->arePEFilteredAsSingle(),a_seqs); // insert k-mers into CMS if overall read satisfies kappa condition. ret=cms->addRead(nbrKmerDecompo); if (!ret) { // read is rejected, update rpos structure accordingly. it_struct->fileid=0; // TODO Benchmark: Wouldn't rit->second.erase(++it_struct) be more efficient (reduce size of CMS in memory then 2nd parsing for kappa_prime filtering and third parsing for writing output files would be faster? } - nbrKmerDecompo.clear(); + nbrKmerDecompo.single_or_PE_val.clear(); } } } @@ -104,13 +98,13 @@ template <typename T> void Filter::underlyingfillCMS(FqBaseBackend* map_id_backe } } -template <typename T> void Filter::underlyinglowFilterCMS(FqBaseBackend* map_id_backend[],int nb_be,int k,srp* io_sr, CountMinSketch<T>* pcms) { // TODO refactor get k from CMS. +template <typename T> void Filter::underlyinglowFilterCMS(FqBaseBackend* map_id_backend[],int nb_be,int k,srp* io_sr, CountMinSketch<T>* cms) { // TODO refactor get k from CMS. srp::iterator it; i_dim::iterator it_offs; k_dim::iterator it_struct; int ret; ReadProcessor read_p(k); - readNumericValues nbrKmerDecompo; + T_read_numericValues nbrKmerDecompo; // 1rst step, open files before fetching reads in them int i; @@ -129,10 +123,10 @@ template <typename T> void Filter::underlyinglowFilterCMS(FqBaseBackend* map_id_ init_DnaSeqStr(&a_seqs[1]); getDNASeqstr(map_id_backend,*it_struct,j,a_seqs); // decompose dna string into k-mers and turn k_mers into numbers. - decomposeReadInKMerNums(read_p, nbrKmerDecompo,k,a_seqs); - ret=pcms->isBeneathMinKappa(nbrKmerDecompo); + decomposeReadInKMerNums(read_p, nbrKmerDecompo,k,cms->arePEFilteredAsSingle(),a_seqs); + ret=cms->isBeneathMinKappa(nbrKmerDecompo); if (ret) it_struct->fileid=0; - nbrKmerDecompo.clear(); + nbrKmerDecompo.single_or_PE_val.clear(); } } } diff --git a/src/FqAuxBackend.cpp b/src/FqAuxBackend.cpp index 8cc77dad13933c101c14ed941e188b1cdc2d567f..a11d8d8504717ae0973c9ed4e53cffbc222f4acc 100644 --- a/src/FqAuxBackend.cpp +++ b/src/FqAuxBackend.cpp @@ -148,7 +148,7 @@ void FqAuxBackend::writeToUndefFile() { // here duplicate code for the moment si writeStrToUndefFile(buf_info.p_start_cur_rec,buf_info.pchar-buf_info.p_start_cur_rec); } else { memcpy(cur_fq_record+l,buf_info.p_start_cur_rec,buf_info.cnt); - char tmp=buf_info.buf[buf_info.cnt]; + // char tmp=buf_info.buf[buf_info.cnt]; l+=buf_info.cnt; cur_fq_record[l]='\0'; writeStrToUndefFile(cur_fq_record,strlen(cur_fq_record)); // Try that, if there is performance problem due to resolving inheritance, will find a different way diff --git a/src/FqBaseBackend.cpp b/src/FqBaseBackend.cpp index 90a52ac6851d39cd7d41589560980a5cbc2411f4..f80088d7abc845d89170ad06922d1131190f59e5 100644 --- a/src/FqBaseBackend.cpp +++ b/src/FqBaseBackend.cpp @@ -91,7 +91,7 @@ void FqBaseBackend::openOutputFile(){ void FqBaseBackend::openUndefFile(){ //std::cout<<qual_thres.min_correct_k_mers_in_read<<endl; - if (qual_thres.min_correct_k_mers_in_read>1) { + if (qual_thres.min_correct_k_mers_in_read>=1) { if (undef_filename==NULL) throw std::runtime_error("No file currently associated to this backend for undefined reads."); openFile4Output(undef_filename,&undef_f_desc); } @@ -144,7 +144,7 @@ void FqBaseBackend::writeToUndefFile(const T_buf_info& buf_info) { cout<<cur_fq_record<<endl; cout<<"going to add to it :"<<buf_info.cnt<<" char from buf"<<endl;*/ memcpy(cur_fq_record+l,buf_info.p_start_cur_rec,buf_info.cnt+1); - char tmp=buf_info.buf[buf_info.cnt]; + // char tmp=buf_info.buf[buf_info.cnt]; l+=buf_info.cnt; cur_fq_record[l+1]='\0'; writeStrToUndefFile(cur_fq_record,strlen(cur_fq_record)); // Try that, if there is performance problem due to resolving inheritance, will find a different way diff --git a/src/FqBaseBackend.h b/src/FqBaseBackend.h index bd9d36a90b9c13269304f1e73197488bae783398..bccd1028bdf3e9c8eca573f265a947cdad55b54e 100644 --- a/src/FqBaseBackend.h +++ b/src/FqBaseBackend.h @@ -39,10 +39,11 @@ typedef struct { */ typedef struct { unsigned long rstart_offset; // fq record start offset in file. - int nb_k_mers_in_error; // number of k-mers that contain nucleoties whose quality score is below given threshold. + int nb_k_mers_in_error; // number of k-mers that contain nucleotides whose quality score is below given threshold in PE1 or single read. + int nb_k_mers_in_error_in_PE2; // number k-mers that contain nucleotides whose quality score is below given threshold in PE2. unsigned int nb_nucleotides_in_read; // number of nucleotides in read (single or PE1) unsigned int nb_nucleotides_in_read_PE2; // number of nucleotides in PE2. - unsigned int st; // totalread score (sum of the nucleotides quality score). + unsigned int st; // total read score (sum of the nucleotides quality score). unsigned int idx_nucl_in_read; }T_fq_rec_info; diff --git a/src/FqMainBackend.cpp b/src/FqMainBackend.cpp index 784012342b59ccfb9b89ee98330bbc115fc3b36d..d4f72998729808a6dbe150e30ad1962e77b98a56 100644 --- a/src/FqMainBackend.cpp +++ b/src/FqMainBackend.cpp @@ -86,11 +86,12 @@ void FqMainBackend::processFile(char * filename,unsigned char f_id) { void FqMainBackend::onEndFastqRecord(T_fq_rec_info& rec_info,const T_buf_info& bufinfo) { rpos rp=init_rpos(f_id,rec_info.rstart_offset); - if (p_auxFqProcessor!=NULL) { - rinfo pe2info; + rinfo pe2info; + int nb_k_mer_PE2; + if (p_auxFqProcessor!=NULL) { int eof=p_auxFqProcessor->getNextRead(&pe2info); assert(!eof); // There should be the same number of reads in both PE files. - rec_info.nb_k_mers_in_error+=pe2info.nb_k_mers_in_error; + rec_info.nb_k_mers_in_error_in_PE2=pe2info.nb_k_mers_in_error; rec_info.nb_nucleotides_in_read_PE2=pe2info.nb_nucleotides_in_read; rec_info.st+=pe2info.score; unsigned long j=rec_info.rstart_offset/UINT_MAX; @@ -101,8 +102,13 @@ void FqMainBackend::onEndFastqRecord(T_fq_rec_info& rec_info,const T_buf_info& b k_dim& ref_k_dim=ref_i_dim[rec_info.rstart_offset/UINT_MAX]; debug_processBuf(on_record_end,bufinfo,rec_info.rstart_offset); int nb_k_mer=rec_info.nb_nucleotides_in_read+1-qual_thres.k; - (rec_info.nb_nucleotides_in_read_PE2>0)?nb_k_mer+=rec_info.nb_nucleotides_in_read_PE2+1-qual_thres.k:nb_k_mer; - (nb_k_mer-rec_info.nb_k_mers_in_error>=qual_thres.min_correct_k_mers_in_read)?ref_k_dim.push_back(rp):writeToUndefFile(bufinfo); + rec_info.nb_nucleotides_in_read_PE2>0?nb_k_mer_PE2=rec_info.nb_nucleotides_in_read_PE2+1-qual_thres.k:nb_k_mer_PE2=0; + if (treat_PE_as_single) { + nb_k_mer+=nb_k_mer_PE2; + (nb_k_mer-rec_info.nb_k_mers_in_error-rec_info.nb_k_mers_in_error_in_PE2>=qual_thres.min_correct_k_mers_in_read)?ref_k_dim.push_back(rp):writeToUndefFile(bufinfo); + } else { + ((nb_k_mer_PE2-rec_info.nb_k_mers_in_error_in_PE2>=1) && (nb_k_mer-rec_info.nb_k_mers_in_error>=1))?ref_k_dim.push_back(rp):writeToUndefFile(bufinfo); + } // empty buffer keeping current record cur_fq_record[0]='\0'; if (p_auxFqProcessor!=NULL) p_auxFqProcessor->resetCurFqRecord(); @@ -136,6 +142,7 @@ void FqMainBackend::processBuf(T_buf_info& buf_info,unsigned char f_id,unsigned case k_read_qual_start: qual_score=1; fq_rec_info.nb_k_mers_in_error=0; + fq_rec_info.nb_k_mers_in_error_in_PE2=0; fq_rec_info.st=0; fq_rec_info.idx_nucl_in_read=0; n=0; // counter for k-mers in error diff --git a/src/FqMainBackend.h b/src/FqMainBackend.h index 6f42599fbe47aa776e1d968b3c796fd13d52b60a..d7bfb1bcb9ddb19120d8d6bb946a3cbb2f86e1d1 100644 --- a/src/FqMainBackend.h +++ b/src/FqMainBackend.h @@ -17,6 +17,7 @@ class FqMainBackend : public FqBaseBackend { + static int treat_PE_as_single; FqAuxBackend * p_auxFqProcessor; /* Another fastq processor component is necessary for handling the case of PE reads.*/ srp * p_scoreReadStruct; /* Where we store information about the reads. */ char current_id[50]; @@ -38,6 +39,10 @@ public: void processFile(char * filename,unsigned char f_id); void processBuf(T_buf_info&,unsigned char f_id,unsigned long cur_offset); + + static void setTreatPEAsSingle(const int& treat_PE){ + FqMainBackend::treat_PE_as_single=treat_PE; + } }; #endif /* FQMAINBACKEND_H_ */ diff --git a/src/Makefile.am b/src/Makefile.am index 8176ea400dbac74fb73c19791aef892be93c1516..9a87fb480901152739dc7409b5ee6e0be55360da 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -29,5 +29,5 @@ unit_test_main_utils_LDADD=librock.a librock_a_SOURCES = $(SRC) -SRC = fqreader.cpp FqBaseBackend.cpp FqAuxBackend.cpp FqMainBackend.cpp read_utils.cpp ReadProcessor.cpp fqwriter.cpp main_utils.cpp ROCKparams.cpp -HDR = srp.h fqreader.h FqConstants.h FqBaseBackend.h FqAuxBackend.h FqMainBackend.h ReadProcessor.h rock_commons.h read_utils.h CountMinSketch.hpp Filter.hpp fqwriter.h main_utils.h ROCKparams.h +SRC = fqreader.cpp FqBaseBackend.cpp FqAuxBackend.cpp FqMainBackend.cpp read_utils.cpp ReadProcessor.cpp fqwriter.cpp main_utils.cpp ROCKparams.cpp unit_tests_tools.cpp +HDR = srp.h fqreader.h FqConstants.h FqBaseBackend.h FqAuxBackend.h FqMainBackend.h ReadProcessor.h rock_commons.h read_utils.h CountMinSketch.hpp Filter.hpp fqwriter.h main_utils.h ROCKparams.h unit_tests_tools.h diff --git a/src/ROCKparams.cpp b/src/ROCKparams.cpp index ddad4f103e7e8648f96de203a90996948f9e4d2d..7776cae71639a7f7d88aa60fa1afb9df093c6dc2 100644 --- a/src/ROCKparams.cpp +++ b/src/ROCKparams.cpp @@ -69,7 +69,7 @@ void ROCKparams::optArgConsistency(const string& input_file,const string& output cout<<"ERROR lower filter is higher than high filter!"<<endl; exit(EXIT_FAILURE); } - if (parms.nucl_score_threshold<0) { + if (qual_thres.nucl_score_threshold<0) { cout<<"ERROR nucleotide score threshold must be positive."<<endl; exit(EXIT_FAILURE); } @@ -262,10 +262,21 @@ void ROCKparams::initFromMainOptsArgs(int argc,char ** argv) { int i,q,m; std::vector<string> v_input_lines; std::vector<string> v_output_lines; + static int PE_as_single=1; - while((i = getopt(argc, argv, "i:o:l:k:c:C:g:n:vq:m:")) != -1) { + static struct option long_options[] = + { + {"separate_PE", no_argument, &PE_as_single,0}, + {0,0,0,0} + }; + + int option_index=0; + + while((i = getopt_long(argc, argv, "i:o:l:k:c:C:g:n:vq:m:",long_options,&option_index)) != -1) { //printf("found option: %c\n",i); switch(i) { + case 0: + break; case 'i': input_file=optarg;break; case 'c': @@ -310,8 +321,7 @@ void ROCKparams::initFromMainOptsArgs(int argc,char ** argv) { cout<<"q must be >=0"<<endl; usage(EXIT_FAILURE); } - parms.nucl_score_threshold=atoi(optarg)+k_phred_32; - qual_thres.nucl_score_threshold=parms.nucl_score_threshold; + qual_thres.nucl_score_threshold=atoi(optarg)+k_phred_32; break; case 'm': m=atoi(optarg); @@ -319,11 +329,18 @@ void ROCKparams::initFromMainOptsArgs(int argc,char ** argv) { cout<<"minimum number of valid k-mer for keeping a read must be an integer >=1"<<endl; usage(EXIT_FAILURE); } + qual_thres.min_correct_k_mers_in_read=atoi(optarg); break; default: usage(EXIT_FAILURE); break; } } + if (!PE_as_single && qual_thres.min_correct_k_mers_in_read!=1) { + cout<<"Incompatible options: when PE are processed independently, minimum number of correct k-mers in each end is 1."<<endl; + usage(EXIT_FAILURE); + } + parms.filter_PE_as_single=PE_as_single; + //cout<<PE_as_single<<endl; processMainArgs(optind, argc, argv,v_input_lines); optArgConsistency(input_file,output_file,g,parms,nb_k_mers,v_input_lines); if (nb_k_mers) { diff --git a/src/ROCKparams.h b/src/ROCKparams.h index 7d899465989c9546f7657790a9b085f65800e026..3a30f925675c0e01bed5ae93d6c695706a5f442b 100644 --- a/src/ROCKparams.h +++ b/src/ROCKparams.h @@ -53,7 +53,6 @@ class ROCKparams{ int f_id; unsigned long cms_size; float expected_collision_proba; - int min_correct_k_mers_in_read; void computeLambda(); @@ -88,13 +87,13 @@ public: parms.kappa=50; parms.kappa_prime=0; parms.lambda=0; - parms.nucl_score_threshold=0; + parms.filter_PE_as_single=1; qual_thres.min_correct_k_mers_in_read=1; qual_thres.nucl_score_threshold=0; verbose_mode=0; cms_size=0; expected_collision_proba=0.0; - min_correct_k_mers_in_read=0; + //min_correct_k_mers_in_read=0; parms.filter_size=getNodePhysMemory()/100.0*90-defaultGRPMAXSize; // Prefer not to use all the machine's memory if (parms.filter_size==0) throw EXIT_FAILURE; } @@ -107,6 +106,11 @@ public: return parms; } + FasqQualThreshold getQualThresholds() { + return qual_thres; + } + + int get_f_id() { return f_id; } diff --git a/src/fqreader.cpp b/src/fqreader.cpp index 5dcccf6bb504c75d106ce566b4b64a782ac14fde..cad0bbe46a6effdbe0049fdc9cfe9214d1793c6e 100644 --- a/src/fqreader.cpp +++ b/src/fqreader.cpp @@ -21,6 +21,7 @@ FasqQualThreshold FqBaseBackend::qual_thres; +int FqMainBackend::treat_PE_as_single; /* * Processes 1 file containing single reads. @@ -55,9 +56,10 @@ void processPEFiles(char * fq_1, unsigned char f_id1,char * fq_2, unsigned char * pointers to Fqbackend objects. * This function ALLOCATES MEMORY with new. */ -void processInputFiles(const std::vector<IO_fq_files>& single_files,const vector<PE_files>& v_PE_files,FqBaseBackend * array_be[],const FasqQualThreshold& a_qual_thres, srp* io_sr) { +void processInputFiles(const std::vector<IO_fq_files>& single_files,const vector<PE_files>& v_PE_files,FqBaseBackend * array_be[],const FasqQualThreshold& a_qual_thres, srp* io_sr,const int& PE_as_single) { unsigned char f_id=1; FqBaseBackend::setQualThreshold(a_qual_thres); + FqMainBackend::setTreatPEAsSingle(PE_as_single); std::vector<IO_fq_files>::const_iterator it_single; for (it_single=single_files.begin();it_single!=single_files.end();it_single++) { FqMainBackend * be_fq=new FqMainBackend(io_sr); diff --git a/src/fqreader.h b/src/fqreader.h index a8c95ca4432d7ef40a08d96b50ab0e06f203bc15..8d1036ddb0890eec7b39bee2e7a0aad54845ba22 100644 --- a/src/fqreader.h +++ b/src/fqreader.h @@ -19,5 +19,5 @@ using namespace std; void processSingleFile(char *, unsigned char, srp*); void processPEFiles(char * fq_1, unsigned char f_id1,char * gq_2, unsigned char f_id2,srp *io_sr,char * fq_1_test_undef=NULL,char * fq_2_test_undef=NULL,size_t test_bufsize=0); -void processInputFiles(const std::vector<IO_fq_files>& ,const vector<PE_files>& ,FqBaseBackend * [], const FasqQualThreshold&,srp* ); +void processInputFiles(const std::vector<IO_fq_files>& ,const vector<PE_files>& ,FqBaseBackend * [], const FasqQualThreshold&,srp*,const int& PE_as_single=1 ); #endif diff --git a/src/main_utils.cpp b/src/main_utils.cpp index d7dfd51e3a49373841291e456459056f342b0891..a2da48d4ae83bdd3a71a7049044c228c552a9d25 100644 --- a/src/main_utils.cpp +++ b/src/main_utils.cpp @@ -117,6 +117,7 @@ void usage(int status) { cout<<" -g <int> .... maximum RAM to use (in Gb) for the CMS."<<endl; cout<<" -q <int> .... minimum quality threshold for nucleotides. Default is 0."<<endl; cout<<" -m <int> .... minimum of valid k-mer threshold for reads. Default is 1."<<endl; + cout<<" --separate_PE .... Filter criterion (-c and -C) apply to each end of a PE independly."<<endl; cout<<" -v .... verbose"<<endl; exit(status); } diff --git a/src/read_utils.cpp b/src/read_utils.cpp index 2a73709b4b9e2446b52d3ef834ce77f34218979b..8108db64220a112086a2b304d10a50f3f6287f2e 100644 --- a/src/read_utils.cpp +++ b/src/read_utils.cpp @@ -137,15 +137,21 @@ void getDNASeqstr(FqBaseBackend* fq_files_be [], } -void decomposeReadInKMerNums(ReadProcessor& read_p, readNumericValues& nbrKmerDecompo,int k,DnaSeqStr a_seqs[2]) { - int nb_expected_k_mers=a_seqs[0].length+a_seqs[1].length+1; - nb_expected_k_mers-=k; - nbrKmerDecompo.reserve(nb_expected_k_mers); +void decomposeReadInKMerNums(ReadProcessor& read_p, T_read_numericValues& nbrKmerDecompo,int k,const int& filter_PE_as_single,DnaSeqStr a_seqs[2]) { + int nb_expected_k_mers_PE1; + int nb_expected_k_mers_PE2=0; + nbrKmerDecompo.idx_start_PE2=0; + nb_expected_k_mers_PE1=a_seqs[0].length+1-k; + if (a_seqs[1].length) nb_expected_k_mers_PE2=a_seqs[1].length+1-k; + nbrKmerDecompo.single_or_PE_val.reserve(nb_expected_k_mers_PE1+nb_expected_k_mers_PE2); + char * start_dna_str=a_seqs[0].fq_record_buf+a_seqs[0].start_idx; - read_p.getKMerNumbers(start_dna_str,a_seqs[0].length,nbrKmerDecompo); + read_p.getKMerNumbers(start_dna_str,a_seqs[0].length,nbrKmerDecompo.single_or_PE_val); + if (a_seqs[1].length) { // case of PE reads + if (!filter_PE_as_single) nbrKmerDecompo.idx_start_PE2=nbrKmerDecompo.single_or_PE_val.size(); start_dna_str=a_seqs[1].fq_record_buf+a_seqs[1].start_idx; - read_p.getKMerNumbers(start_dna_str,a_seqs[1].length,nbrKmerDecompo); + read_p.getKMerNumbers(start_dna_str,a_seqs[1].length,nbrKmerDecompo.single_or_PE_val); } } diff --git a/src/read_utils.h b/src/read_utils.h index b6a643cc2af3f84d41960f26d0ab8305bf43ebda..48c12c0c19d06803139f07423ed1998e096e3766 100644 --- a/src/read_utils.h +++ b/src/read_utils.h @@ -36,7 +36,8 @@ void getDNASeqstr(FqBaseBackend* [], void init_DnaSeqStr(DnaSeqStr * dna_seq); -void decomposeReadInKMerNums(ReadProcessor& read_p, readNumericValues& nbrKmerDecompo,int k,DnaSeqStr a_seqs[2]); +void decomposeReadInKMerNums(ReadProcessor& read_p, T_read_numericValues& nbrKmerDecompo,int k,const int &,DnaSeqStr a_seqs[2]); +// void decomposeReadInKMerNums(ReadProcessor& read_p, T_read_numericValues& nbrKmerDecompo,int k,int DnaSeqStr a_seqs[2] ); #endif /* READ_UTILS_H_ */ diff --git a/src/rock.cpp b/src/rock.cpp index b74bba1a70e67e5d31605478cf2f03e4cc7119a5..a375b0c57cdfe23e103fea10390e1f6c845091d5 100644 --- a/src/rock.cpp +++ b/src/rock.cpp @@ -58,11 +58,12 @@ int main(int argc,char * argv[]) { main_parms.initFromMainOptsArgs(argc,argv); int f_id=main_parms.get_f_id(); CMSparams parms=main_parms.getCMSparams(); + FasqQualThreshold qual_thres=main_parms.getQualThresholds(); std::vector<IO_fq_files> single_files=main_parms.get_single_files(); vector<PE_files> v_PE_files=main_parms.get_PE_files(); FqBaseBackend * map_id_backend[k_max_input_files]; - Filter the_filter(parms); + Filter the_filter(parms,qual_thres); #ifdef BENCHMARK cout<<"processed input args; going to start reading fastq files"<<endl; diff --git a/src/rock_commons.h b/src/rock_commons.h index dbd0a7f62c68c8ff8fd7684b7305af470b3d36d1..72ecb5aad6d5d63b006a3ab73dc3e7f6a7e1de2e 100644 --- a/src/rock_commons.h +++ b/src/rock_commons.h @@ -40,6 +40,12 @@ typedef struct { typedef std::vector<unsigned long> readNumericValues; +typedef struct { + readNumericValues single_or_PE_val; + int idx_start_PE2; // In he case where PE reads are not treated as single; + // indicate at which index the single_or_PE_val array starts containing values for PE2 k-mers. +}T_read_numericValues; + diff --git a/src/unit_test_cms.cpp b/src/unit_test_cms.cpp index 3ae712c4be706ee712901c417486c1f29b46e68a..0e0f9dd0d1f3d3f477b073d26258739ba89cac11 100644 --- a/src/unit_test_cms.cpp +++ b/src/unit_test_cms.cpp @@ -34,10 +34,11 @@ void test_CMS(int lambda,int kappa,int kappa_prime) { assert(nb_distinct_k_mers==1); // mimic a read (as given by part 2 output). - readNumericValues read_values; + T_read_numericValues read_values; for (i=1;i<=400;i++) { - read_values.push_back(i*1000-1); + read_values.single_or_PE_val.push_back(i*1000-1); } + read_values.idx_start_PE2=0; ret=cms.addRead(read_values); // read should be accepted since none of the values that it contains is equal to those I inserted previously. assert(ret!=rej_expected); @@ -45,12 +46,13 @@ void test_CMS(int lambda,int kappa,int kappa_prime) { assert(nb_distinct_k_mers==401); // mimic a vector that contains a lot of k_mers already present in the CMS and check that it is rejected. - readNumericValues read_values2; + T_read_numericValues read_values2; + read_values2.idx_start_PE2=0; for (i=1;i<=199;i++) { - read_values2.push_back(i*1000-1); + read_values2.single_or_PE_val.push_back(i*1000-1); } for (i=200;i<=400;i++) { - read_values2.push_back(num); + read_values2.single_or_PE_val.push_back(num); } ret=cms.addRead(read_values2); assert(ret==rej_expected); @@ -65,6 +67,60 @@ void test_CMS(int lambda,int kappa,int kappa_prime) { +void testCMS_with_PE_NotasSingle(int lambda,int kappa,int kappa_prime) { + int filter_PE_as_single=0; + CountMinSketch<unsigned char>* cms=new CountMinSketch<unsigned char>(lambda,kappa,kappa_prime,filter_PE_as_single); + int i; + cout<<"size of the CMS component: "<<sizeof(cms)<<endl; + int num=100*lambda; + //int rej_expected=0; + int ret; + for (i=0;i<kappa;i++) { + cms->addKMer(num); + } + int min=cms->getEstimatedNbOcc(num); + assert(min==kappa); + // mimic a PE read (as given by part 2 output). + T_read_numericValues read_values; + + // mimic a PE read in which PE1 would cntain a majority of k_mers that are not already in the CMS and PE2 would contain a majority of k_mers that are already in the CMS. + for (i=1;i<=200;i++) { + read_values.single_or_PE_val.push_back(i*1000-1); + } + read_values.idx_start_PE2=199; + for (i=1;i<=239;i++) { + read_values.single_or_PE_val.push_back(num); + } + read_values.single_or_PE_val.push_back(50); + read_values.single_or_PE_val.push_back(55); + ret=cms->addRead(read_values); // read should be accepted since none of the values that it contains is equal to those I inserted previously. + assert(ret>0); + + // mimic a PE read (as given by part 2 output). + T_read_numericValues read_values2; + + // mimic a PE read that will be inserted in the CMS kappa_prime+1 times to check low filter. + for (i=1;i<=150;i++) { + read_values2.single_or_PE_val.push_back(i*1500-1); + } + read_values2.idx_start_PE2=150; + for (i=1;i<=151;i++) { + read_values2.single_or_PE_val.push_back(i*1600-1); + } + for (i=0;i<kappa_prime+1;i++) { + ret=cms->addRead(read_values2); + assert(ret>0); + } + ret=cms->isBeneathMinKappa(read_values); + assert(ret>0); + + ret=cms->isBeneathMinKappa(read_values2); + assert(ret==0); + + delete cms; +} + + int main(int argc, char **argv) { int lambda=2; int kappa=50; @@ -90,10 +146,12 @@ int main(int argc, char **argv) { return -1; } - if (argc==1) { - cout<<"testing CMS with lambda="<<lambda<<endl; - test_CMS(lambda,kappa,kappa_prime); // Finally using C arrays (maps implied storing hash keys : 4 Bytes per k_mer overhead) but each array is of size INT_MAX... - cout<<"done"<<endl; - } + + cout<<"testing CMS with lambda="<<lambda<<endl; + test_CMS(lambda,kappa,kappa_prime); // Finally using C arrays (maps implied storing hash keys : 4 Bytes per k_mer overhead) but each array is of size INT_MAX... + cout<<"testing CMS with PE not as single"<<endl; + testCMS_with_PE_NotasSingle(lambda,kappa,kappa_prime); + cout<<"done"<<endl; + return 0; } diff --git a/src/unit_test_fqreader.cpp b/src/unit_test_fqreader.cpp index 49fa8df565cd5ffce60e29356b1317c0fd2be696..bc662d949003d7824066c230f3931d60bdfa2122 100644 --- a/src/unit_test_fqreader.cpp +++ b/src/unit_test_fqreader.cpp @@ -20,13 +20,17 @@ #include "srp.h" #include "FqMainBackend.h" #include "fqreader.h" +#include "unit_tests_tools.h" using namespace std; + + void test_processSingleFile() { srp sr; unsigned char f_id=1; + FqMainBackend::setTreatPEAsSingle(1); processSingleFile((char *) "../test/data/unit/test_single.fq",f_id,&sr); srp::reverse_iterator rit; i_dim::iterator it_offs; @@ -71,6 +75,7 @@ void test_processSingleFileWithMQOption() { qual_thres.nucl_score_threshold=14+k_phred_32; qual_thres.min_correct_k_mers_in_read=100; FqBaseBackend::setQualThreshold(qual_thres); + FqMainBackend::setTreatPEAsSingle(1); FqMainBackend be_fq=FqMainBackend(&sr); be_fq.setUndefFile((char *) "../test/data/unit/test_single.undef.fq"); @@ -148,6 +153,7 @@ void test_processPEfilesWithA() { unsigned char f_id4=4; srp sr; + FqMainBackend::setTreatPEAsSingle(1); processPEFiles(fq_3_test, f_id3,fq_4_test, f_id4,&sr); srp::reverse_iterator rit; i_dim::iterator it_offs; @@ -204,7 +210,7 @@ void test_processPEFiles() { unsigned char f_id2=2; srp sr; - + FqMainBackend::setTreatPEAsSingle(1); processPEFiles(fq_1_test, f_id1,fq_2_test, f_id2,&sr); srp::reverse_iterator rit; i_dim::iterator it_offs; @@ -263,6 +269,7 @@ void aux_testPEFilesMQ(FasqQualThreshold qual_thres,int nb_expected_reads) { unsigned char f_id1=1; unsigned char f_id2=2; + FqMainBackend::setTreatPEAsSingle(1); FqBaseBackend::setQualThreshold(qual_thres); processPEFiles(fq_1_test, f_id1,fq_2_test, f_id2,&sr,fq_1_test_undef,fq_2_test_undef); @@ -291,6 +298,7 @@ void test_processPEFilesWithMQOptions() { qual_thres.k=20; qual_thres.nucl_score_threshold=14+k_phred_32; qual_thres.min_correct_k_mers_in_read=76; + FqMainBackend::setTreatPEAsSingle(1); aux_testPEFilesMQ(qual_thres,4); // last fq records contains only 77 correct k-mers. @@ -308,7 +316,7 @@ void test_processPEFilesWithMQOptions() { } -void check_processAIFilesResults(srp& sr) { +void check_processAllFilesResults(srp& sr) { srp::reverse_iterator rit; i_dim::iterator it_offs; k_dim::iterator it_struct; @@ -343,13 +351,58 @@ void test_processAllFiles() { unsigned char f_single=3; srp sr; - + FqMainBackend::setTreatPEAsSingle(1); processPEFiles(fq_1_test, f_id1,fq_2_test, f_id2,&sr); processSingleFile(fq_single,f_single,&sr); - check_processAIFilesResults(sr); + check_processAllFilesResults(sr); } +void test_processPE_not_as_single() { + char * fq_1_test=(char *) "../test/data/unit/test_PE1_PE_not_as_single.fq"; + char * fq_2_test=(char *) "../test/data/unit/test_PE2_PE_not_as_single.fq"; + + char * fq_1_test_undef=(char *) "../test/data/unit/test_PE1_PE_not_as_single.undef.fastq"; + char * fq_2_test_undef=(char *) "../test/data/unit/test_PE2_PE_not_as_single.undef.fastq"; + + char * fq_1_expected_undef=(char *) "../test/data/unit/expected_PE1_PE_not_as_single.undef.fastq"; + char * fq_2_expected_undef=(char *) "../test/data/unit/expected_PE2_PE_not_as_single.undef.fastq"; + + unsigned char f_id1=1; + unsigned char f_id2=2; + srp sr; + + FasqQualThreshold qual_thres; + qual_thres.k=10; + qual_thres.min_correct_k_mers_in_read=1; + qual_thres.nucl_score_threshold=2+k_phred_32; + + FqBaseBackend::setQualThreshold(qual_thres); + FqMainBackend::setTreatPEAsSingle(0); + processPEFiles(fq_1_test, f_id1,fq_2_test, f_id2,&sr,fq_1_test_undef,fq_2_test_undef,1000); + + srp::reverse_iterator rit; + i_dim::iterator it_offs; + k_dim::iterator it_struct; + int cnt_read=0; + + for (rit=sr.rbegin(); rit!=sr.rend(); ++rit) { //process map in reverse order (by decreasing scores). + //unsigned long score=rit->first; + for (it_offs=rit->second.begin();it_offs!=rit->second.end();it_offs++) { + for (it_struct=it_offs->second.begin();it_struct!=it_offs->second.end();it_struct++) { + cnt_read++; + } + } + } + assert(cnt_read==1); + + assert(compareFilesLileByLine(fq_1_test_undef,fq_1_expected_undef)==0); + assert(compareFilesLileByLine(fq_2_test_undef,fq_2_expected_undef)==0); + + assert(remove((char *) "../test/data/unit/test_PE1_PE_not_as_single.undef.fastq")==0); + assert(remove((char *) "../test/data/unit/test_PE2_PE_not_as_single.undef.fastq")==0); + +} void test_processInputFiles() { @@ -372,14 +425,16 @@ void test_processInputFiles() { srp sr; FasqQualThreshold default_qual_thres; default_qual_thres.k=30; - default_qual_thres.min_correct_k_mers_in_read=1; + default_qual_thres.min_correct_k_mers_in_read=0; // aim of that test is not to check undef file creation. Disable it by putting 0 here default_qual_thres.nucl_score_threshold=0; // leave default values for that test + FqMainBackend::setTreatPEAsSingle(1); + FqBaseBackend * array_be[k_max_input_files]; processInputFiles(v_single,v_pe,array_be,default_qual_thres,&sr); // check that result is correct in sr. - check_processAIFilesResults(sr); + check_processAllFilesResults(sr); // check that the 3 backends are correct FqMainBackend * pbe=(FqMainBackend *) array_be[0]; // TODO see if one can use check_case, static_cast or one of them if they are not in boost. @@ -400,8 +455,6 @@ void test_processInputFiles() { int i; for (i=0;i<3;i++) delete array_be[i]; - //free(array_be); - } void test_processBufSingle() { @@ -421,6 +474,7 @@ AAAAAEEAEEEEEEEEE6EE/EEEEEEEEAEEAEEEEEEEEEEEEEEAEEEA/A/EEEAEEEEEE/EE</EAEEEEEE/E qual_thres.min_correct_k_mers_in_read=78; T_buf_info buf_info; FqBaseBackend::setQualThreshold(qual_thres); + FqMainBackend::setTreatPEAsSingle(1); FqMainBackend be(&sr); be.setUndefFile((char *) "../test/data/unit/test_processBuf.undef.fq"); buf_info.buf=buf; @@ -478,6 +532,7 @@ AAAAAEEEEEEEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEAEEEEEEEEEEEEEEAEEEEEEEEEEEE T_buf_info buf_info; T_buf_info PE2_buf_info; FqBaseBackend::setQualThreshold(qual_thres); + FqMainBackend::setTreatPEAsSingle(1); FqMainBackend be(&sr); be.setUndefFile((char *) "../test/data/unit/test_processBuf_PE1.undef.fq"); @@ -527,60 +582,6 @@ AAAAAEEEEEEEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEAEEEEEEEEEEEEEEAEEEEEEEEEEEE } -/* - * Auxilliary method to compare 2 files and make ure that they are identical. - */ -int compareFilesLileByLine(char * filename1,char* filename2) { - ifstream file1,file2; - file1.open(filename1,ios::binary); - file2.open(filename2,ios::binary); -//---------- compare number of lines in both files ------------------ - int c1,c2; - c1 = 0; c2 = 0; - string str; - while(!file1.eof()) - { - getline(file1,str); - c1++; - } - file1.clear(); // sets a new value for the error control state - file1.seekg(0,ios::beg); - while(!file2.eof()) - { - getline(file2,str); - c2++; - } - file2.clear(); - file2.seekg(0,ios::beg); - - if(c1 != c2) - { - cout << "Different number of lines in files!" << "\n"; - cout << "file1 has " << c1 << " lines and file2 has " - << c2 << " lines" << "\n"; - return -1; - } -//---------- compare two files line by line ------------------ - char string1[1000], string2[1000]; // assume a read in afastq record is never longer than that. - int j = 0; - while(!file1.eof()) - { - file1.getline(string1,256); - file2.getline(string2,256); - j++; - if(strcmp(string1,string2) != 0) - { - cout << j << "-th strings are not equal" << "\n"; - cout << " " << string1 << "\n"; - cout << " " << string2 << "\n"; - return -1; - } - } - - return 0; -} - - void Aux_MimicBigPEFilesWithMQOptions(const FasqQualThreshold& qual_thres,const int bufsize,const int nb_expected_reads) { srp sr; char * fq_1_test=(char *) "../test/data/unit/09-4607_S43_R1_big.fastq"; @@ -599,6 +600,7 @@ void Aux_MimicBigPEFilesWithMQOptions(const FasqQualThreshold& qual_thres,const // FqBaseBackend::bufsize=bufsize; // choose a very small buf size to mimic behavior on big files and reun the test in a short time FqBaseBackend::setQualThreshold(qual_thres); + FqMainBackend::setTreatPEAsSingle(1); processPEFiles(fq_1_test, f_id1,fq_2_test, f_id2,&sr,fq_1_test_undef,fq_2_test_undef,bufsize); @@ -670,5 +672,7 @@ int main(int argc, char **argv) { test_processPEFilesWithMQOptions(); cout<<"test processing files with thresholds (nucleotide quality score and minimum number of valid k_mers in read) mimic big files that need several read operations. "<<endl; test_MimicBigPEFilesWithMQOptions(); + cout<<"test processing PE files with nucleotide quality threshold and PE not treated as single option."<<endl; + test_processPE_not_as_single(); cout<<"done"<<endl; } diff --git a/src/unit_test_read_utils.cpp b/src/unit_test_read_utils.cpp index 3380743b2baeb63c4267fad197faede2c2776ce9..da9d830afc0095f1e182353bcaa0a2ef7ed836c9 100644 --- a/src/unit_test_read_utils.cpp +++ b/src/unit_test_read_utils.cpp @@ -183,31 +183,36 @@ void test_getReadPEWithNQST() { void test_decomposeReadInkMerNums() { int k=5; ReadProcessor read_p(k); - readNumericValues nbrKmerDecompo; + T_read_numericValues nbrKmerDecompo; DnaSeqStr a_seqs[2]; init_DnaSeqStr(&a_seqs[0]); init_DnaSeqStr(&a_seqs[1]); DnaSeqStr * seq1=a_seqs; - //cout<<strlen("AAAAAAAAAA")<<endl; + strcpy(seq1->fq_record_buf,"@fake_stuff\nAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\n+\n....@\n"); seq1->start_idx=12; seq1->length=75; - decomposeReadInKMerNums(read_p,nbrKmerDecompo,k,a_seqs); - assert(nbrKmerDecompo.size()==71); + decomposeReadInKMerNums(read_p,nbrKmerDecompo,k,1,a_seqs); + assert(nbrKmerDecompo.single_or_PE_val.size()==71); + assert(nbrKmerDecompo.idx_start_PE2==0); readNumericValues::iterator it; - for (it=nbrKmerDecompo.begin();it!=nbrKmerDecompo.end();it++) { - // cout<<*it<<endl; + for (it=nbrKmerDecompo.single_or_PE_val.begin();it!=nbrKmerDecompo.single_or_PE_val.end();it++) { assert(*it==1023); } DnaSeqStr * seq2=&a_seqs[1]; strcpy(seq2->fq_record_buf,"@another_fake_stuff\nATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCG\n+\n....@\n"); seq2->start_idx=20; seq2->length=80; - nbrKmerDecompo.clear(); - decomposeReadInKMerNums(read_p,nbrKmerDecompo,k,a_seqs); - //cout<<nbrKmerDecompo.size()<<endl; - assert(nbrKmerDecompo.size()==147); + nbrKmerDecompo.single_or_PE_val.clear(); + decomposeReadInKMerNums(read_p,nbrKmerDecompo,k,1,a_seqs); + assert(nbrKmerDecompo.single_or_PE_val.size()==147); + assert(nbrKmerDecompo.idx_start_PE2==0); + + T_read_numericValues nbrKmerDecompo2; + decomposeReadInKMerNums(read_p,nbrKmerDecompo2,k,0,a_seqs); + assert(nbrKmerDecompo2.single_or_PE_val.size()==147); + assert(nbrKmerDecompo2.idx_start_PE2==71); } diff --git a/src/unit_tests_tools.cpp b/src/unit_tests_tools.cpp new file mode 100644 index 0000000000000000000000000000000000000000..445ac3db15ec5e291b5e91964269f0ce3c4e0d61 --- /dev/null +++ b/src/unit_tests_tools.cpp @@ -0,0 +1,70 @@ +/* + * unit_tests_tools.cpp + * + * Created on: Jan 4, 2017 + * Author: vlegrand + */ +#include <iostream> +#include <fstream> +#include <assert.h> + +using namespace std; + + +/* + * Auxilliary method to compare 2 files and make ure that they are identical. + */ +int compareFilesLileByLine(char * filename1,char* filename2) { + ifstream file1,file2; + file1.open(filename1,ios::binary|ios::in); + assert(file1.good()); + file2.open(filename2,ios::binary|ios::in); + assert(file2.good()); +//---------- compare number of lines in both files ------------------ + int c1,c2; + c1 = 0; c2 = 0; + string str; + while(!file1.eof()) + { + getline(file1,str); + c1++; + } + file1.clear(); // sets a new value for the error control state + file1.seekg(0,ios::beg); + while(!file2.eof()) + { + getline(file2,str); + c2++; + } + file2.clear(); + file2.seekg(0,ios::beg); + + if(c1 != c2) + { + cout << "Different number of lines in files!" << "\n"; + cout << "file1 has " << c1 << " lines and file2 has " + << c2 << " lines" << "\n"; + return -1; + } +//---------- compare two files line by line ------------------ + char string1[1000], string2[1000]; // assume a read in afastq record is never longer than that. + int j = 0; + while(!file1.eof()) + { + file1.getline(string1,256); + file2.getline(string2,256); + j++; + if(strcmp(string1,string2) != 0) + { + cout << j << "-th strings are not equal" << "\n"; + cout << " " << string1 << "\n"; + cout << " " << string2 << "\n"; + return -1; + } + } + + return 0; +} + + + diff --git a/src/unit_tests_tools.h b/src/unit_tests_tools.h new file mode 100644 index 0000000000000000000000000000000000000000..e62b2c012bf356bc520f3b774472ed1d1e8a46b2 --- /dev/null +++ b/src/unit_tests_tools.h @@ -0,0 +1,15 @@ +/* + * unit_tests_tools.h + * + * Created on: Jan 4, 2017 + * Author: vlegrand + */ + +#ifndef UNIT_TESTS_TOOLS_H_ +#define UNIT_TESTS_TOOLS_H_ + +int compareFilesLileByLine(char * filename1,char* filename2); + + + +#endif /* UNIT_TESTS_TOOLS_H_ */ diff --git a/test/Makefile.am b/test/Makefile.am index a11dfe4daef2692d7fffe32479b1cb39ca079278..66846399a34e243b4785fb46d0fac0596f20f8c4 100755 --- a/test/Makefile.am +++ b/test/Makefile.am @@ -4,6 +4,20 @@ LOG_COMPILER = $(SHELL) TESTS = rock.sh -EXTRA_DIST = $(TESTS) rock_mem.sh data/iofiles.args/extract_klebsiella_long_reads_100.txt data/iofiles.args/extract_klebsiella_long_reads_100_filtered.txt data/fastq.raw/klebsiella_100_1.fq data/fastq.raw/klebsiella_100_2.fq data/unit/klebsiella_PE1.fq data/unit/klebsiella_PE2.fq data/unit/test_PE1.fq data/unit/test_PE2.fq data/unit/test_PE1_2.fq data/unit/test_PE2_2.fq data/unit/test_single2.fq data/unit/test_single.fq data/unit/list_input1.txt data/unit/list_output1.txt data/unit/list_input2.txt data/unit/list_output2.txt data/unit/list_input3.txt data/unit/list_output3.txt data/iofiles.args/output_files_noNQ_Thres.txt data/iofiles.args/output_files_NQ_Thres_very_low.txt data/iofiles.args/output_files_NQ_Thres_12.txt data/iofiles.args/output_files_NQ_Thres_13.txt data/unit/09-4607_S43_R1.fastq data/unit/09-4607_S43_R2.fastq data/unit/09-4607_S43_R1_big.fastq data/unit/09-4607_S43_R2_big.fastq data/unit/thres_100_09-4607_S43_R1.undef.fastq data/unit/thres_100_09-4607_S43_R2.undef.fastq data/unit/thres_180_09-4607_S43_R1.undef.fastq data/unit/thres_180_09-4607_S43_R2.undef.fastq +EXTRA_DIST = $(TESTS) rock_mem.sh data/iofiles.args/extract_klebsiella_long_reads_100.txt \ +data/iofiles.args/extract_klebsiella_long_reads_100_filtered.txt data/fastq.raw/klebsiella_100_1.fq \ +data/fastq.raw/klebsiella_100_2.fq data/unit/klebsiella_PE1.fq data/unit/klebsiella_PE2.fq data/unit/test_PE1.fq \ +data/unit/test_PE2.fq data/unit/test_PE1_2.fq data/unit/test_PE2_2.fq data/unit/test_single2.fq \ +data/unit/test_single.fq data/unit/list_input1.txt data/unit/list_output1.txt data/unit/list_input2.txt \ +data/unit/list_output2.txt data/unit/list_input3.txt data/unit/list_output3.txt \ +data/iofiles.args/output_files_noNQ_Thres.txt data/iofiles.args/output_files_NQ_Thres_very_low.txt \ +data/iofiles.args/output_files_NQ_Thres_12.txt data/iofiles.args/output_files_NQ_Thres_13.txt \ +data/unit/09-4607_S43_R1.fastq data/unit/09-4607_S43_R2.fastq data/unit/09-4607_S43_R1_big.fastq \ +data/unit/09-4607_S43_R2_big.fastq data/unit/thres_100_09-4607_S43_R1.undef.fastq \ +data/unit/thres_100_09-4607_S43_R2.undef.fastq data/unit/thres_180_09-4607_S43_R1.undef.fastq \ +data/unit/thres_180_09-4607_S43_R2.undef.fastq data/fastq.raw/klebsiella_5_1_bad_scores.fq \ +data/fastq.raw/klebsiella_5_2_bad_scores.fq data/iofiles.args/output_files_2_PE_separated.txt \ +data/unit/test_PE1_PE_not_as_single.fq data/unit/test_PE2_PE_not_as_single.fq data/unit/expected_PE1_PE_not_as_single.undef.fastq\ +data/unit/expected_PE2_PE_not_as_single.undef.fastq diff --git a/test/data/fastq.raw/klebsiella_5_1_bad_scores.fq b/test/data/fastq.raw/klebsiella_5_1_bad_scores.fq new file mode 100644 index 0000000000000000000000000000000000000000..f1b49b1dcd177e62170fc3e952ef6d4db6713803 --- /dev/null +++ b/test/data/fastq.raw/klebsiella_5_1_bad_scores.fq @@ -0,0 +1,20 @@ +@SRR1222430.1 1 length=251 +GCCCGCGAAGCGGAGCTGGCCGCCTGCAAAGGCCGTTCGCGCTCGCTGTCGCTGGATCTGTACGCCCAGTGGCGCTGCATGGAGGACAACCACGGCAAGTGGCGCTTCACCTCGCCGACCCATACCGTGCTGGCCTTCGCCCAGGCGCTGAAAGAGCTGGCGCAGGAGGGCGGCGTCAGCGCTCGCCATCAGCGCTACCGCAACAATCAGCGCCGTCTGGTGGCAGGGATGCGCGCGCTCGGCTTCCGGCC ++SRR1222430.1 1 length=251 +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +@SRR1222430.2 2 length=250 +AGAAATTCGCCATCAGAATAAAAACCTCATATGCACATTTTCTTGTTATTGCACAGCCTGTGCCACTTTAGCGCCAGCCTCTCCGGCAATCGTGGAGAAATTAAGGAGATAGTGTAATTTATCATGTTGCTTTTGCCGTATCGTAAAGAAACCTCGAGCTTTCCTGCCAGCAGGTAGCGAGTCTGCTTCGTCACCGCAGACCGGCGCATTATCCCTTGCCGGTGTGAAACCTCATTTCATTTAAGTCAAA ++SRR1222430.2 2 length=250 +BBBBBFFFBBBBFGGGGGGGGGHHGGHHHHHHHHGFGHHHHHHHHGHHHFFFHHFHHHHHHEHGFHHHFGFGFGGGEGGFHHFHGCGGGHHGHHGGHFAFHHHEGFHHGGHHFHFHHHHHEHHHHHHHHHHHGHHHGGGGHHGHFGGFDGFGFHGEGFCDHGGHHHHHHHGHHEHHGHGGGGHGFHHEHGHGGGGDFGGGHHGADA?DBGGGGGGFFFFBBFFFFFFFFFFFFFFFFFBFFFFFFFFF/B +@SRR1222430.3 3 length=236 +CAAACACCTGACGCGGTTCCAGCAGGTACTCCTGCACGCCAATTTCCGGGCGGGCAGTAAAGCGCTGTTTGCAGCCCGTCTGGTGCAGGCGCCCCAGATAGCGGCCAACCCATTCCATCTGATCAAGGTTATCCGCTTCGAACTGACGACCGCCAAGGCTTGGGAAAACGGCAAAGTAGAATCCCTGATGCTGATGAAGCGTGCTGTCATTAAATAAGAGCGGCGCAGCAACGGGC ++SRR1222430.3 3 length=236 +CCCCCFFCFFFFGGGGGGGGGGHHHHHFGHHHHHHHHGGGGGHHHHHGGGGGGGGGGHHHHHHGGGGGHHHHHHHHGGGGHGHGHHGGHGGGGGGGGHHHHHGGGGGHGGGGHHHHGHHHHHHHHHHHHH!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +@SRR1222430.4 4 length=249 +GATCGGAAGAGCACACGTCTGAACTCCAGTCACCTTGTAATCTCGTATGCCGTCTTCTGCTTGAAAAAAAACCATGGCTCGGCGCCCAGGGCATCGTCCCGCAATCCGATGACTGGATGGTCGTCGCCAAAGGCACGATCAACGTCCAGCCTGCGGTGGTGATTGCCATCACTGGCACCTTCCAGGGCGGCAGTATTGGCGAAGTGTCCGGCCTCAAAATTGCCGCCGCCATGGTGCTGGCGGCGGATG ++SRR1222430.4 4 length=249 +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!CAFGFDA1GAEG2FGFAFFCEFGCA/GFD!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +@SRR1222430.5 5 length=208 +CGCTTATGGAAATGTGACGATCGTCACCGTTCCGCCCCGGGAGAACGGGGCGGAAAAAGAGGGCGATTTTAGTGCCAGCAGAAGTGATGAACCACCTGGCTAATCAGCTCCCGGGTCGGCTTGATAAAGCGCGTCTCCAGATACTCGTCAGGCTGATGGGCCTGATTGATGGAGCCCGGGCCGAGAACCAGCGTCGGACACAGCGTCT ++SRR1222430.5 5 length=208 +CCDDCCFFFFFFGGGGGGGGGGGHHHGHGHGHHGGGGGGGGGGGHHGGGGGGGGGGHHGGHHGGGGGHGHHHHHHHHHGHHHHHHHHHHHHHHHGGHHHHHHHHHHHHHHHGGGGGGGGGGGGHHHHHHHGGGGGGGGGGGGGGGGGGGGGGGGGGFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF \ No newline at end of file diff --git a/test/data/fastq.raw/klebsiella_5_2_bad_scores.fq b/test/data/fastq.raw/klebsiella_5_2_bad_scores.fq new file mode 100644 index 0000000000000000000000000000000000000000..4f6f0d938eca9b58e726dc1358df59ef2dbf15c9 --- /dev/null +++ b/test/data/fastq.raw/klebsiella_5_2_bad_scores.fq @@ -0,0 +1,20 @@ +@SRR1222430.1 1 length=250 +CGATGCTGCGCTGCATCAATAAGCTGGAAGAGATCACCAGCGGCGATCTGATTGTCGATGGTCTGAAGGTCAACGACCCGAAGGTGGACGAACGTCTGATTCGTCAGGAAGCCGGGATGGTTTTCCAGCAGTTTTATCTGTTCCCGCACCTCACGGCGCTGGAAAACGTGATGTTTGGCCCGCTGCGGGTACGCGGCGCCAGCAAGCAGGCGGCGAGTGCAGGCTATCGTCGAGCAGCGGCCGGAAGCCG ++SRR1222430.1 1 length=250 +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +@SRR1222430.2 2 length=251 +AGAAATTCGCCAGGGTGATCAACGTCTCATCGTCGGCGAGGGTCAGCGCCTGCGCGGGACAGTTTTCCACACAGGCCGGCCCCGCCTCGCGGCCCTGGCATAGATCGCACTTGTGCGCGCTGGCTTTCACCAGGCCTGCGGCCTGCGGCGTCACCACCACCTGCATCACCCCGAACGGGCAGGCCACCATGCAGGATTTACAGCCAATGCATTTCTCCTGACGGACCTGAACGCTGTCGCCGGACTGGGCG ++SRR1222430.2 2 length=251 +>1>AAFFF?11AEGGFFCGGGGGFGHFH2FF1F00EEE/AAEE0FGGFGEGGHGGCGC?EEFHEFEEHDF1EECHEFE/@@/BCCCFGAC@CC@C.CEGFHFHGHFHCEC?FH;CC?CG@@?-AF.BB0BFGF?E./EF;@?;AFF@<-@@??BFF?F-:A?BF999BBBF@@?@@@F-;@B@FF-A-9FF/BFFE/F//B/BBEFBFFFFF/BFFFFFFFEB?-@=B-/BBF--:;/A-B>--;>?EFE9 +@SRR1222430.3 3 length=236 +GCCCGTTGCTGCGCCGCTCTTATTTAATGACAGCACGCTTCATCAGCATCAGGGATTCTACTTTGCCGTTTTCCCAAGCCTTGGCGGTCGTCAGTTCGAAGCGGATAACCTTGATCAGATGGAATGGGTTGGCCGCTATCTGGGGCGCCTGCACCAGACGGGCTGCAAACAGCGCTTTACTGCCCGCCCGGAAATTGGCGTGCAGGAGTACCTGCTGGAACCGCGTCAGGTGTTTG ++SRR1222430.3 3 length=236 +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!HHHEFGGCFGHHHHGGFGGGGGGGGGGGGGGGGGFFFEFFFFFFFBFFFFF=;ABFF/FA:DB;BF.9.BFFFFFFF/AFFFFFABDFFB/9BD.; +@SRR1222430.4 4 length=250 +GCATGGAAAAGGGGTGTGGTGGGGAAAAGGGGAGATCCCTGCTGGAGCCCTACCCCTTAAAAAAAAAAACACAGCACCGGCTGCGTCGGGATACCGTAGCGTATCTCTACCGCCGCCATCACCCGCGCGCGTGCCATTTGGTCACCCAACAATGTGCCCATATGTCCTCCCACAGATGAGTACGTGATGCCAATCCTCATCGCAGAATAGCCTCTCAGTGGCCCCTTTGTAACCCACATACCCTACTTGG ++SRR1222430.4 4 length=250 +>>11111111111A100A0AAEA00A0A0//////011B11/1B10B0A/0B000/B0BB111/>/E////0?0/<0<////</<///////0??///.>-...0=1<=1D----::-::00/.----------;/:;;//-:/;/--9---;/9//////;/99/////-----9///;///----///9;9//-/9////-;--///////-//////;---9--/////:/----////;/--//// +@SRR1222430.5 5 length=208 +AGACGCTGTGTCCGACGCTGGTTCTCGGCCCGGGCTCCATCAATCAGGCCCATCAGCCTGACGAGTATCTGGAGACGCGCTTTATCAAGCCGACCCGGGAGCTGATTAGCCAGGTGGTTCATCACTTCTGCTGGCACTAAAATCGCCCTCTTTTTCCGCCCCGTTCTCCCGGGGCGGAACGGTGACGATCGTCACATTTCCATAAGCG ++SRR1222430.5 5 length=208 +DDDDDDCDDFFFGGGGGGGGGGHHHHGGGGGGGGGGHGHHHHHHHHHHHGGGHHHHHHHHHHGGGGHHHHHHHHHGGGGGGGGHHHHHHHGGGGGGGGGGGGHHHHHHHHHHHHGHGGGHHHHHHHHHHHHHHHGHHHHHHHHHGGGGGHHHHHHHHGGGGGGGGGGGGGGFGGFFFFFFFFDFFFFFFFFFFEFFFFFFFFFFFFFF diff --git a/test/data/iofiles.args/output_files_2_PE_separated.txt b/test/data/iofiles.args/output_files_2_PE_separated.txt new file mode 100644 index 0000000000000000000000000000000000000000..402148ddcc99d1524bd5775740ca9e03669a4e58 --- /dev/null +++ b/test/data/iofiles.args/output_files_2_PE_separated.txt @@ -0,0 +1 @@ +tmp/klebsiella_5_1_2_qual_thres.fq,tmp/klebsiella_5_2_2_qual_thres.fq diff --git a/test/data/unit/expected_PE1_PE_not_as_single.undef.fastq b/test/data/unit/expected_PE1_PE_not_as_single.undef.fastq new file mode 100644 index 0000000000000000000000000000000000000000..48792d4b724c998937716237fc0f6d57ca822061 --- /dev/null +++ b/test/data/unit/expected_PE1_PE_not_as_single.undef.fastq @@ -0,0 +1,8 @@ +@NS500443:42:H3MH2AFXX:1:11101:1162:1066/1 +AGATCAAACTACTTGCCTCGCTTGAAAAAAGCATCGAGATTCATAATGACGCTGGTGTTGTAACGGCAGATTTGCTGCTTGCTCGGGTTTTACGGTATGATTTTTCAAGTGATGTATTTGACGAAGAAAAGGAGTATATTTTACCAGAATT ++ +!!!!!!A!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!A!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!A!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!EE!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +@NS500443:42:H3MH2AFXX:1:11101:17849:1071/1 +ACAGTAACGCGCGCATGGTAATCCCCGTATTGTGCAAGACGTTCAGCAAACTCATTTCCAGACATAACACCTTCAGCAACAACAATGATGCTGTGCTTTTTACCACGTTCACGTCCCTTATTAAGACGCCCTACAATATCATCCATGTTAA ++ +AAAAAE6EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE/EEEEEAEEEEEEEEEE/EEAEE/EEAEAEAEEEEE</E//EE/EEEEEEEEEA<EEAAAEEEEEAEEEEAA<AE/EA<AEEEAEEE/EAEAEEEEEEEEEAEEEEEEEEEA/ diff --git a/test/data/unit/expected_PE2_PE_not_as_single.undef.fastq b/test/data/unit/expected_PE2_PE_not_as_single.undef.fastq new file mode 100644 index 0000000000000000000000000000000000000000..268c3a1d7a0e11eade7ee31e1cb7df1db8cd956f --- /dev/null +++ b/test/data/unit/expected_PE2_PE_not_as_single.undef.fastq @@ -0,0 +1,8 @@ +@NS500443:42:H3MH2AFXX:1:11101:1162:1066/2 +CCCTAGAATTATAGTTCAGTTTAGTTCCAAATAGGGTCACAAAATGTGATAGACAGGTCCCGTTCCATACCAAAAAAACTTGGTACTCCGCACTATTCAATTAGCGGTTAACATTCACTTTGGAACGAAGTCTATACAGCCACATAATTTG ++ +AAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEEEEEAEEEEEEEEEEEEEEEEAEEEAEEEEE<EEEEEAEEEEEA/EEEAAEE<EAAEEAEEEE<EEEEEEEEE/EAE<EAEEEEEEE +@NS500443:42:H3MH2AFXX:1:11101:17849:1071/2 +GTGGAGACGGTTCTTATCACGGTGCTGAGGCTCTTACTAAACGTGGTTTCCCAACAATTGGAATTCCGGGAACAATCGATAATGATATTTCAGGAACAGACTTCACAATAGGTTTCGATACAGCGCTAAATACAGTTTTAGACGCACTTGA ++ +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! diff --git a/test/data/unit/test_PE1_PE_not_as_single.fq b/test/data/unit/test_PE1_PE_not_as_single.fq new file mode 100644 index 0000000000000000000000000000000000000000..8329c73822c9d37840b17a2f334124b9ed4ad2c2 --- /dev/null +++ b/test/data/unit/test_PE1_PE_not_as_single.fq @@ -0,0 +1,12 @@ +@NS500443:42:H3MH2AFXX:1:11101:1162:1066/1 +AGATCAAACTACTTGCCTCGCTTGAAAAAAGCATCGAGATTCATAATGACGCTGGTGTTGTAACGGCAGATTTGCTGCTTGCTCGGGTTTTACGGTATGATTTTTCAAGTGATGTATTTGACGAAGAAAAGGAGTATATTTTACCAGAATT ++ +!!!!!!A!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!A!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!A!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!EE!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +@NS500443:42:H3MH2AFXX:1:11101:7372:1069/1 +TTCCAGAAGCGGATTATTTGCAGGAACTGCGGCGGTTTTTCTATCAAGACGAACGGCAAACTTATTCGAATGAAGCTGTTGCTAACCGTTTTTATGAGCATTTTGATGTCGTCTATGAAAAACGAATAACGAAACAGAAAACGTTACAAAA ++ +AAAAAEEEEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEEEAEEEEEEEEEEEEEEEEEEEEEEE<EE<EEEEEEEEEEEE/EEEE<EEEEEEEEEEEEEEEEEEE/AEEAAAEEEEEEEE/EEEEEEEE +@NS500443:42:H3MH2AFXX:1:11101:17849:1071/1 +ACAGTAACGCGCGCATGGTAATCCCCGTATTGTGCAAGACGTTCAGCAAACTCATTTCCAGACATAACACCTTCAGCAACAACAATGATGCTGTGCTTTTTACCACGTTCACGTCCCTTATTAAGACGCCCTACAATATCATCCATGTTAA ++ +AAAAAE6EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE/EEEEEAEEEEEEEEEE/EEAEE/EEAEAEAEEEEE</E//EE/EEEEEEEEEA<EEAAAEEEEEAEEEEAA<AE/EA<AEEEAEEE/EAEAEEEEEEEEEAEEEEEEEEEA/ diff --git a/test/data/unit/test_PE2_PE_not_as_single.fq b/test/data/unit/test_PE2_PE_not_as_single.fq new file mode 100644 index 0000000000000000000000000000000000000000..658d9ef650501dfa15405486ade050bc5df5c26d --- /dev/null +++ b/test/data/unit/test_PE2_PE_not_as_single.fq @@ -0,0 +1,12 @@ +@NS500443:42:H3MH2AFXX:1:11101:1162:1066/2 +CCCTAGAATTATAGTTCAGTTTAGTTCCAAATAGGGTCACAAAATGTGATAGACAGGTCCCGTTCCATACCAAAAAAACTTGGTACTCCGCACTATTCAATTAGCGGTTAACATTCACTTTGGAACGAAGTCTATACAGCCACATAATTTG ++ +AAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEEEEEAEEEEEEEEEEEEEEEEAEEEAEEEEE<EEEEEAEEEEEA/EEEAAEE<EAAEEAEEEE<EEEEEEEEE/EAE<EAEEEEEEE +@NS500443:42:H3MH2AFXX:1:11101:7372:1069/2 +GGTGTTGTTGCAAGTTCCGCATCTAAAATGTCGACGCCACCTGCTTTTAAAACCGACACCAGTTTAGCCTTGATCCGCTCAGCAGAAAGCCTACTATCATGTCCGATAGCGACCACTATTTTTTCTTTTGGAGTTTTTTTCTTTTGCAAGA ++ +AAAAAEEEEEEEEEEEEEEEEEEEEAEEEEEEAEEEEE/EAEEAEEEEEEEEEEE/EAEEEEEEEEEEE<EE<EEEAEEE/EEEEEEEEEEEAEEEEEEEAEEEE/EEEEE/EEEEEEEEEAEEE</E<EEE/EAEEEE<<A/EEAE<EEA +@NS500443:42:H3MH2AFXX:1:11101:17849:1071/2 +GTGGAGACGGTTCTTATCACGGTGCTGAGGCTCTTACTAAACGTGGTTTCCCAACAATTGGAATTCCGGGAACAATCGATAATGATATTTCAGGAACAGACTTCACAATAGGTTTCGATACAGCGCTAAATACAGTTTTAGACGCACTTGA ++ +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! diff --git a/test/rock.sh b/test/rock.sh index 877bc956f814b8d93582481e62817d9cb0ba9f37..0244161a74a3dc084d74069da40afb0e7852ae69 100755 --- a/test/rock.sh +++ b/test/rock.sh @@ -65,7 +65,7 @@ echo "doing more checks on options and error messages" ../src/rock -C 500 -c 400 -k 10 -l 12 -n 85000000 -i ${srcdir}/data/iofiles.args/extract_klebsiella_long_reads_100.txt -o ${srcdir}/data/iofiles.args/extract_klebsiella_long_reads_100_filtered.txt|grep "options are mutually exclusive">/dev/null || exit 14 ../src/rock -C 500 -c 400 -k 10 -q 3 -m 0 -i ${srcdir}/test/data/iofiles.args/extract_klebsiella_long_reads_100.txt|grep "minimum number of valid k-mer for keeping a read must be an integer">/dev/null || exit 15 ../src/rock -C 500 -c 400 -k 10 -q -1 -m 2 -i ${srcdir}/test/data/iofiles.args/extract_klebsiella_long_reads_100.txt|grep "q must be">/dev/null || exit 16 - +../src/rock -C 500 -c 400 -k 10 -q 2 -m 2 --separate_PE -i ${srcdir}/test/data/iofiles.args/extract_klebsiella_long_reads_100.txt|grep "Incompatible options">/dev/null || exit 161 # here check that we have enough memory for running the tests. ../src/unit_test_cms CHECK_MEM diff --git a/test/rock_mem.sh b/test/rock_mem.sh index 19114d74ab6c0b8247e879d6efe99685a091db95..a06afe25e37f3ef448f7d8c307d2b359848eb283 100755 --- a/test/rock_mem.sh +++ b/test/rock_mem.sh @@ -9,8 +9,7 @@ echo " " echo "##################################################################################" echo "testing high filter" -#echo "I want to check if I can write to a file ">../test/data/fastq.filtered/SRR1222430_1.filtered.fastq -#cat ../test/data/fastq.filtered/SRR1222430_1.filtered.fastq + ../src/rock -C 100 -l 2 -i ${srcdir}/data/iofiles.args/extract_klebsiella_long_reads_100.txt -o ${srcdir}/data/iofiles.args/extract_klebsiella_long_reads_100_filtered.txt #>/dev/null || exit 15 # output files should be the same size, contain the same elements but not in the same order. @@ -57,15 +56,23 @@ echo "testing that input fastq file names can be provided in command line." [ -f "test_single.rock.fq" ] || exit 1813 [ -f "test_single2.rock.fq" ] || exit 1814 + + # checking that output files were sorted in decreasing order of quality score. For that expect to have SRR122430.1.1 as 1rst record in filtered file. ret=`head -4 "klebsiella_100_1.rock.fq"|grep "SRR122430.1.1"` test $ret=2 || exit 1815 echo "erasing test result files" -rm -fr "klebsiella_100_1.rock.fq" || exit 1816 -rm -fr "klebsiella_100_2.rock.fq" || exit 1817 -rm -fr "test_single.rock.fq"|| exit 1818 -rm -fr "test_single2.rock.fq"|| exit 1819 +rm -f "klebsiella_100_1.rock.fq" || exit 1816 +rm -f "klebsiella_100_2.rock.fq" || exit 1817 +rm -f "test_single.rock.fq"|| exit 1818 +rm -f "test_single2.rock.fq"|| exit 1819 +rm -f "klebsiella_100_1.undefined.fq" || exit 18191 +rm -f "klebsiella_100_2.undefined.fq" || exit 18192 +rm -f "test_single.undefined.fq" || exit 18193 +rm -f "test_single2.undefined.fq" || exit 19194 + + # test the -v option echo " " @@ -76,10 +83,14 @@ echo "testing verbose mode" ../src/rock -C 100 -c 1 -v -n 1000 ${srcdir}/data/fastq.raw/klebsiella_100_1.fq,${srcdir}/data/fastq.raw/klebsiella_100_2.fq ${srcdir}/data/unit/test_single.fq ${srcdir}/data/unit/test_single2.fq|grep "expected probability of collision was:" >/dev/null || exit 183 echo "erasing test result files" -rm -fr "klebsiella_100_1.rock.fq" || exit 1816 -rm -fr "klebsiella_100_2.rock.fq" || exit 1817 -rm -fr "test_single.rock.fq"|| exit 1818 -rm -fr "test_single2.rock.fq"|| exit 1819 +rm -f "klebsiella_100_1.rock.fq" || exit 1816 +rm -f "klebsiella_100_2.rock.fq" || exit 1817 +rm -f "test_single.rock.fq"|| exit 1818 +rm -f "test_single2.rock.fq"|| exit 1819 +rm -f "klebsiella_100_1.undefined.fq" || exit 18191 +rm -f "klebsiella_100_2.undefined.fq" || exit 18192 +rm -f "test_single.undefined.fq" || exit 18193 +rm -f "test_single2.undefined.fq" || exit 19194 echo " " echo "##################################################################################" @@ -170,6 +181,133 @@ ret2=`cat tmp/klebsiella_100_2.undefined.fq|wc -l` test $ret1 = 136 || exit 192193 test $ret1 = 136 || exit 192194 +echo " " +echo "##################################################################################" +echo "testing ROCK with a quality score threshold for nucleotides and PE processed separately" +../src/rock -C 100 -c 1 -l 2 -q 2 --separate_PE -o ${srcdir}/data/iofiles.args/output_files_2_PE_separated.txt ${srcdir}/data/fastq.raw/klebsiella_5_1_bad_scores.fq,${srcdir}/data/fastq.raw/klebsiella_5_2_bad_scores.fq || exit 201 + +[ -f "tmp/klebsiella_5_1_bad_scores.undefined.fq" ] || exit 20100 +[ -f "tmp/klebsiella_5_2_bad_scores.undefined.fq" ] || exit 20101 + +ret1=`cat tmp/klebsiella_5_1_bad_scores.undefined.fq|wc -l` +ret2=`cat tmp/klebsiella_5_2_bad_scores.undefined.fq|wc -l` + +test $ret1=8 || exit 201100 +test $ret1=8 || exit 201101 + +ret1=`grep -c SRR1222430.1 tmp/klebsiella_5_1_bad_scores.undefined.fq` +ret2=`grep -c SRR1222430.1 tmp/klebsiella_5_2_bad_scores.undefined.fq` + +test $ret1=2 || exit 201200 +test $ret1=2 || exit 201201 + +ret1=`grep -c SRR1222430.5 tmp/klebsiella_5_1_bad_scores.undefined.fq` +ret2=`grep -c SRR1222430.5 tmp/klebsiella_5_2_bad_scores.undefined.fq` + +test $ret1=2 || exit 201300 +test $ret1=2 || exit 201301 + +ret1=`grep -c SRR1222430.2 tmp/klebsiella_5_1_bad_scores.undefined.fq` +ret2=`grep -c SRR1222430.2 tmp/klebsiella_5_2_bad_scores.undefined.fq` + +test $ret1=0 || exit 201400 +test $ret1=0 || exit 201401 + +ret1=`grep -c SRR1222430.3 tmp/klebsiella_5_1_bad_scores.undefined.fq` +ret2=`grep -c SRR1222430.3 tmp/klebsiella_5_2_bad_scores.undefined.fq` + +test $ret1=0 || exit 201500 +test $ret1=0 || exit 201501 + +ret1=`grep -c SRR1222430.4 tmp/klebsiella_5_1_bad_scores.undefined.fq` +ret2=`grep -c SRR1222430.4 tmp/klebsiella_5_2_bad_scores.undefined.fq` + +test $ret1=0 || exit 201600 +test $ret1=0 || exit 201601 + +ret1=`cat tmp/klebsiella_5_1_2_qual_thres.fq|wc -l` +ret2=`cat tmp/klebsiella_5_1_2_qual_thres.fq|wc -l` + +test $ret1=12 || exit 201700 +test $ret1=12 || exit 201701 + +ret1=`grep -c SRR1222430.2 tmp/klebsiella_5_1_2_qual_thres.fq` +ret2=`grep -c SRR1222430.2 tmp/klebsiella_5_2_2_qual_thres.fq` + +test $ret1=2 || exit 201800 +test $ret1=2 || exit 201801 + +ret1=`grep -c SRR1222430.3 tmp/klebsiella_5_1_2_qual_thres.fq` +ret2=`grep -c SRR1222430.3 tmp/klebsiella_5_2_2_qual_thres.fq` + +test $ret1=2 || exit 201802 +test $ret1=2 || exit 201803 + +ret1=`grep -c SRR1222430.4 tmp/klebsiella_5_1_2_qual_thres.fq` +ret2=`grep -c SRR1222430.4 tmp/klebsiella_5_2_2_qual_thres.fq` + +test $ret1=2 || exit 201804 +test $ret1=2 || exit 201805 + +rm -f tmp/klebsiella_5_1_bad_scores.undefined.fq +rm -f tmp/klebsiella_5_2_bad_scores.undefined.fq +rm -f tmp/klebsiella_5_1_2_qual_thres.fq +rm -f tmp/klebsiella_5_2_2_qual_thres.fq + + +echo " " +echo "##################################################################################" +echo "testing ROCK with no quality score threshold for nucleotides and PE processed separately" + +../src/rock -C 100 -c 1 -l 2 --separate_PE -o ${srcdir}/data/iofiles.args/output_files_2_PE_separated.txt ${srcdir}/data/fastq.raw/klebsiella_5_1_bad_scores.fq,${srcdir}/data/fastq.raw/klebsiella_5_2_bad_scores.fq || exit 201 + +[ -f "tmp/klebsiella_5_1_bad_scores.undefined.fq" ] || exit 30100 +[ -f "tmp/klebsiella_5_2_bad_scores.undefined.fq" ] || exit 30101 + +ret1=`cat tmp/klebsiella_5_1_bad_scores.undefined.fq|wc -l` +ret2=`cat tmp/klebsiella_5_2_bad_scores.undefined.fq|wc -l` + +test $ret1=0 || exit 301100 +test $ret1=0 || exit 301101 + + + +ret1=`cat tmp/klebsiella_5_1_2_qual_thres.fq|wc -l` +ret2=`cat tmp/klebsiella_5_1_2_qual_thres.fq|wc -l` + +test $ret1=20 || exit 201700 +test $ret1=20 || exit 201701 + +ret1=`grep -c SRR1222430.2 tmp/klebsiella_5_1_2_qual_thres.fq` +ret2=`grep -c SRR1222430.2 tmp/klebsiella_5_2_2_qual_thres.fq` + +test $ret1=2 || exit 301800 +test $ret1=2 || exit 301801 + +ret1=`grep -c SRR1222430.3 tmp/klebsiella_5_1_2_qual_thres.fq` +ret2=`grep -c SRR1222430.3 tmp/klebsiella_5_2_2_qual_thres.fq` + +test $ret1=2 || exit 301802 +test $ret1=2 || exit 301803 + +ret1=`grep -c SRR1222430.4 tmp/klebsiella_5_1_2_qual_thres.fq` +ret2=`grep -c SRR1222430.4 tmp/klebsiella_5_2_2_qual_thres.fq` + +test $ret1=2 || exit 301804 +test $ret1=2 || exit 301805 + +ret1=`grep -c SRR1222430.1 tmp/klebsiella_5_1_2_qual_thres.fq` +ret2=`grep -c SRR1222430.1 tmp/klebsiella_5_2_2_qual_thres.fq` + +test $ret1=2 || exit 301806 +test $ret1=2 || exit 301807 + +ret1=`grep -c SRR1222430.5 tmp/klebsiella_5_1_2_qual_thres.fq` +ret2=`grep -c SRR1222430.5 tmp/klebsiella_5_2_2_qual_thres.fq` + +test $ret1=2 || exit 301808 +test $ret1=2 || exit 301809 + rm -fr tmp