From b073c48b81d1ee45804e16e478323a89d5bad844 Mon Sep 17 00:00:00 2001 From: Veronique Legrand <vlegrand@pasteur.fr> Date: Tue, 11 Apr 2017 11:51:37 +0200 Subject: [PATCH] version 1.6; changed options; refactored code. --- NEWS | 14 ++++ configure.ac | 2 +- doc/rock.pod | 44 +++++++----- src/CountMinSketch.hpp | 158 ++++++++++++++++++++++++++++++----------- src/Filter.hpp | 15 ++-- src/FqAuxBackend.cpp | 37 ++-------- src/FqAuxBackend.h | 6 +- src/FqBaseBackend.cpp | 48 +++++-------- src/FqBaseBackend.h | 27 ++++--- src/FqMainBackend.cpp | 17 +++-- src/FqMainBackend.h | 2 +- src/ROCKparams.cpp | 15 ++-- src/ROCKparams.h | 8 +-- src/ReadProcessor.cpp | 64 +++-------------- src/ReadProcessor.h | 12 ++-- src/main_utils.cpp | 8 +-- src/unit_test_cms.cpp | 37 ++++++++++ test/rock.sh | 22 +++--- test/rock_mem.sh | 30 ++++---- 19 files changed, 306 insertions(+), 260 deletions(-) diff --git a/NEWS b/NEWS index 3cae91b..c40aa43 100644 --- a/NEWS +++ b/NEWS @@ -1,9 +1,23 @@ +New in version 1.6 + * code refactoring (no impact for the user) + * Default values for options have changed: + * --PE_separated has been removed because it is now the default behavior. + * -p has been introduced for the case where you want to process PE as single. + * see man for other option new values. + +New in version 1.5.1 + * bug fix for --separate_PE behavior is now as specified: keep read if R1 or R2 have coverage below kappa. +Remove read if R1 and R2 have coverage below kappa_prime. + New in version 1.5 * --separate_PE option for taking into account the 2 ends of a PE separately in the CMS. Default mode is concatenating the 2 ends and treat them as a single read in the CMS. + New in version 1.4 * -m option for required minimum number of correct k-mer to keep a read. + New in version 1.3 * -q option for nucleotique quality score threshold. + New in version 1.2 * -v option for verbose mode. diff --git a/configure.ac b/configure.ac index 0f719e5..904dfad 100644 --- a/configure.ac +++ b/configure.ac @@ -1,7 +1,7 @@ dnl Process this file with autoconf to produce a configure script. AC_PREREQ(2.59) -AC_INIT(rock, 1.5.1) +AC_INIT(rock, 1.6) AC_CANONICAL_SYSTEM diff --git a/doc/rock.pod b/doc/rock.pod index 30de1fd..327633d 100644 --- a/doc/rock.pod +++ b/doc/rock.pod @@ -11,7 +11,7 @@ =over 4 -=item B<rock> [B<-h>] [B<-i> F<file>] [B<-o> F<file>] [B<-k> F<k_mer_size>] [B<-q> F<nucl_qual_score_threshold>] [B<-C> F<kappa>] [B<-c> F<kappa_prime>] [B<-l> F<lambda>] [B<-n> F<nb_distinct_k_mer>] [B<-g> F<CMS size in GB>] [Args] +=item B<rock> [B<-h>] [B<-i> F<file>] [B<-o> F<file>] [B<-k> F<k_mer_size>] [B<-p>] [B<-q> F<nucl_qual_score_threshold>] [B<-C> F<kappa>] [B<-c> F<kappa_prime>] [B<-l> F<lambda>] [B<-n> F<nb_distinct_k_mer>] [B<-m> F<min_valid_k_mer_per_read>] [B<-g> F<CMS size in GB>] [Args] =back @@ -40,7 +40,7 @@ Specify wanted k-mer size. Default is 30; maximum is 32. =item -q Specify minimum threshold for nucleotides quality score. Nucleotides that have a score below that threshold will be considered as errors (just like N nucleotides). -Default is 0. +Default is 10. =item -m @@ -49,20 +49,20 @@ Specify minimum number of correct k-mers required to keep a read for CMS filter. Indicate only a integer in version 1.4. Default is 1. -=item --separate_PE +=item -p -Use this option to indicate ROCK that you want the CMS to handle pair end reads separately. Default behavior is to concatenate the 2 ends of the PE (once their k-mers are converted into number) and process them as a single read. 
+Use this option to tell ROCK that you want the CMS to handle paired-end reads as a single read. The 2 ends of the PE are then concatenated (once their k-mers are converted into numbers) and processed as a single read. No argument. -Cannot be used together with -m. -When --separate_PE is enabled, -m defaults to 1 which means that as there is at least 1 valid k-mer in each part of the PE it is kept. +By default, PE reads are processed independently (i.e. a read is kept in the CMS if at least PE1 or PE2 has coverage <=C and >=c). +As -m defaults to 1, this means that as long as there is at least 1 valid k-mer in each part of the PE, it is kept. =item -C -Specify maximum coverage wanted. Default is 50. Maximum is 65535. +Specify the maximum coverage wanted. Default is 70. Maximum is 65535. =item -c -Specify lower threshold for coverage. Default is 0 (no lower threshold). +Specify the lower threshold for coverage. Default is 5. =item -l @@ -86,7 +86,7 @@ Default is to use the minimum between =item -v -verbose mode. At the end of execution displays a recap ofall ROCK parameters (name of input files, values of, kappa and happa_prime) plus, CMS size in Bytes, plus if -n was provided, the expected probability of collision. +verbose mode. At the end of execution, displays a recap of all ROCK parameters (names of input files, values of kappa and kappa_prime), plus the CMS size in bytes and, if -n was provided, the expected probability of collision. =item -h @@ -97,14 +97,21 @@ Usage display. =head1 DESCRIPTION B<ROCK> is a filter of fastq records. -Its aim is to filter fastq files on 2 criteria: +Its aim is to filter fastq files on different criteria: 1rst the quality score: records with the highest quality score are processed first and thus are sure to be selected. 2nd median coverage of the read: If more than half of the k-mers in the read have occurred less than kappa times; then the read is selected. - - A 3rd criteria can be added via the -c option. It is a lower filter for removing reads that have a median coverage below a certain threshold. + 3rd median coverage of the read: if more than half of the k-mers in the read have occurred less than kappa_prime times, then the read is removed. This is very useful for removing reads from "polluted" samples. -B<ROCK> is highly parameterizable via its options. It is possible to specify the size of the count min sketch by: + +B<ROCK> is highly parameterizable via its options. +It is possible to refine filtering by adding more criteria: + 1) nucleotide score threshold: k-mers containing at least 1 nucleotide below the specified threshold will not be taken into account. + 2) minimum number of correct k-mers: let X be this number. Reads containing fewer than X k-mers without nucleotides below the threshold will not be processed. + They will be put in files named with the .undefined extension. + 3) For paired-end reads, it is possible to indicate that you want them to be processed as a single read. By default, they are processed separately. + +It is possible to specify the size of the count min sketch by: 1) indicating the number of arrays that you want in it (via the -l option), 2) indicating the number of different k-mers in your input files (via the -n option). Thus B<ROCK> will compute the best lambda value by minimizing the probability of collision and taking into account the amount of RAM on your system. 3) indicating (in GigaBytes) the size you want it to have. B<ROCK> will compute the number of arrays in it depending on kappa value. 
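The selection rule described above can be illustrated with a small stand-alone C++ sketch. This is not ROCK's implementation: median_below and keep_read are illustrative names, and counts is assumed to already hold the CMS occurrence estimate for each k-mer of the read.

#include <cstddef>
#include <vector>

// true when more than half of the k-mer counts are strictly below 'threshold',
// i.e. the median k-mer coverage of the read is below the threshold
static bool median_below(const std::vector<unsigned>& counts, unsigned threshold) {
    std::size_t below = 0;
    for (std::size_t i = 0; i < counts.size(); ++i)
        if (counts[i] < threshold) ++below;
    return 2 * below > counts.size();
}

// a read is selected when its median coverage is below kappa (high filter)
// and not below kappa_prime (low filter)
static bool keep_read(const std::vector<unsigned>& counts,
                      unsigned kappa, unsigned kappa_prime) {
    return median_below(counts, kappa) && !median_below(counts, kappa_prime);
}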
@@ -130,14 +137,19 @@ Reads for median coverage < 20 (kappa_prime) will be removed. =item rock my_PE1.fq,myPE2.fq my_single.fq Rock will filter the 3 files in input with default parameters: -high filter=50, -no low filter +high filter=70, +low filter=5 +nucleotide quality score threshold=10 +minimum number of correct k-mer in a read=1 +pair end reads are processed separately lambda=min(4,possible value according to RAM amount on the execution machine). -And it will produce 3 output files in the execution directory. +And it will produce 6 output files in the execution directory. These files will be named: my_PE1.rock.fq,myPE2.rock.fq my_single.rock.fq +my_PE1.undefined.fq,myPE2.undefined.fq +my_single.undefined.fq =back diff --git a/src/CountMinSketch.hpp b/src/CountMinSketch.hpp index df1c8a1..c42cc97 100644 --- a/src/CountMinSketch.hpp +++ b/src/CountMinSketch.hpp @@ -137,7 +137,7 @@ private: inline int isRCovBelowThres(const T_read_numericValues& read_val, const int& threshold); - //inline int isRCovBelowThresAux(const readNumericValues& read_val, const int& threshold); + inline int isRCovBelowThresPartial(const readNumericValues& read_val, const int& threshold, const int& start=0,const int& stop=0); void init(int glambda,int gkappa,int gkappa_prime,int filter_PE_as_single=1); @@ -180,7 +180,6 @@ public: // keep that just for unit testing purpose T getEstimatedNbOcc(const unsigned long& val); - //int addRead(const readNumericValues& read_val); int addRead(const T_read_numericValues& read_val); int isBeneathMinKappa(const T_read_numericValues&read_val); @@ -194,86 +193,162 @@ public: + + +/* + * Determines if read (single or PE) has a median coverage below a given threshold + */ +// time returns 4m26s for this method. +/* +template<typename T> inline int CountMinSketch<T>::isRCovBelowThres(const T_read_numericValues& read_val, const int& threshold) { + int PE1_below_thres; + int PE2_below_thres=0; + PE1_below_thres=isRCovBelowThresPartial(read_val.single_or_PE_val,threshold,0,read_val.idx_start_PE2); + if (read_val.idx_start_PE2 && !filter_PE_as_single) { + PE2_below_thres=isRCovBelowThresPartial(read_val.single_or_PE_val,threshold,read_val.idx_start_PE2,0); + if (threshold==kappa) return (PE1_below_thres || PE2_below_thres); + else return(PE1_below_thres && PE2_below_thres); + } + return PE1_below_thres; +}*/ + /* * Computes median of occurences for all the k_mers in a single read or for a part of a PE read. */ +/* template<typename T> inline int CountMinSketch<T>::isRCovBelowThresPartial(const readNumericValues& read_val, const int& threshold, const int& start,const int& stop) { - int a=0,b=0; + int b; + int a=0; int j,h; T min; readNumericValues::const_iterator it; readNumericValues::const_iterator it_aux; - if (stop) it_aux=read_val.begin()+stop; - else it_aux=read_val.end(); - for (it=read_val.begin()+start;it!=it_aux;++it) { - j=lambda; - min=mask; - while (--j>=0 && min>threshold) { - h=hash64to32(*it,j); - min=cms_lambda_array[j] [h]; - } - (min<threshold)? a+=1:a; - ++b; + if (stop) { + it_aux=read_val.begin()+stop; + b=stop-1; + } else { + it_aux=read_val.end(); + b=read_val.size(); + } + it=read_val.begin()+start; + while (it!=it_aux and 2*a<b) { + j=lambda; + min=mask; + while (--j>=0 && min>threshold) { + h=hash64to32(*it,j); + min=cms_lambda_array[j] [h]; + } + (min<threshold)? 
a+=1:a; + ++it; } return (2*a>b); } - +*/ /* - * Determines if read (single or PE) has a median coverage below a given threshold + * Determines whether median of k-mer coverage is below threshold or not. */ +// this is a few seconds faster template<typename T> inline int CountMinSketch<T>::isRCovBelowThres(const T_read_numericValues& read_val, const int& threshold) { int PE1_below_thres; int PE2_below_thres=0; - PE1_below_thres=isRCovBelowThresPartial(read_val.single_or_PE_val,threshold,0,read_val.idx_start_PE2); - if (read_val.idx_start_PE2 && !filter_PE_as_single) { - PE2_below_thres=isRCovBelowThresPartial(read_val.single_or_PE_val,threshold,read_val.idx_start_PE2,0); + int a1=0,a2=0; + int b1,b2=0; + int j,h; + T min; + + int num=read_val.single_or_PE_val.size(); + + read_val.idx_start_PE2?b1=read_val.idx_start_PE2:b1=num; + read_val.idx_start_PE2?b2=num-b1:b2; + + // this proves faster than using iterators on these data (short) + long const * p_start=(long const * )&(read_val.single_or_PE_val[0]); // TODO: when we'll use C++11: use the data() method of the vector class to access the underlying C array. + int nb=0; + while(nb<b1 && 2*a1<=b1) { + j=lambda; + min=mask; + while (--j>=0 && min>threshold) { + // printf("nb=%d k_mer=%ld\n",nb,*(p_start+nb)); + h=hash64to32(*(p_start+nb),j); + min=cms_lambda_array[j] [h]; + } + (min<threshold)? a1+=1:a1; + nb++; + } + PE1_below_thres=2*a1>b1; + if (b2) { + nb=b1; + while(nb<num && 2*a2<=b2) { + j=lambda; + min=mask; + while (--j>=0 && min>threshold) { + h=hash64to32(*(p_start+nb),j); + min=cms_lambda_array[j] [h]; + } + (min<threshold)? a2+=1:a2; + nb++; + } + PE2_below_thres=2*a2>b2; if (threshold==kappa) return (PE1_below_thres || PE2_below_thres); else return(PE1_below_thres && PE2_below_thres); } return PE1_below_thres; } +// C++11 style. It is equivalent to the C style optim in terms of performance. /* template<typename T> inline int CountMinSketch<T>::isRCovBelowThres(const T_read_numericValues& read_val, const int& threshold) { int PE1_below_thres; int PE2_below_thres=0; - int a=0,b=0; + int a1=0,a2=0; + int b1,b2=0; int j,h; T min; - readNumericValues::const_iterator it; - readNumericValues::const_iterator it_aux; - if (read_val.idx_start_PE2) it_aux=read_val.single_or_PE_val.begin()+read_val.idx_start_PE2; - else it_aux=read_val.single_or_PE_val.end(); - for (it=read_val.single_or_PE_val.begin();it!=it_aux;++it) { // idx_start_PE2 is 0 if PE are treated as single or in the case of single reads. + + int num=read_val.single_or_PE_val.size(); + + read_val.idx_start_PE2?b1=read_val.idx_start_PE2:b1=num; + read_val.idx_start_PE2?b2=num-b1:b2; + + // this proves faster than using iterators on these data (short) + // long const * p_start=(long const * )&(read_val.single_or_PE_val[0]); // TODO: when we'll use C++11: use the data() method of the vector class to access the underlying C array. + + long const * ptr = (num > 0) ? (long const * ) read_val.single_or_PE_val.data() : nullptr; + + int nb=0; + while(nb<b1 && 2*a1<=b1) { j=lambda; min=mask; while (--j>=0 && min>threshold) { - h=hash64to32(*it,j); + // printf("nb=%d k_mer=%ld\n",nb,*(p_start+nb)); + h=hash64to32(ptr[nb],j); min=cms_lambda_array[j] [h]; } - (min<threshold)? a+=1:a; - ++b; + (min<threshold)? a1+=1:a1; + nb++; } - PE1_below_thres=(2*a>b); - if (read_val.idx_start_PE2 && !filter_PE_as_single) { - a=0; - b=0; - for (it=it_aux;it!=read_val.single_or_PE_val.end();++it) { // idx_start_PE2 is 0 if PE are treated as single or in the case of single reads. 
+ PE1_below_thres=2*a1>b1; + if (b2) { + nb=b1; + while(nb<num && 2*a2<=b2) { j=lambda; min=mask; while (--j>=0 && min>threshold) { - h=hash64to32(*it,j); + h=hash64to32(ptr[nb],j); min=cms_lambda_array[j] [h]; } - (min<threshold)? a+=1:a; - ++b; + (min<threshold)? a2+=1:a2; + nb++; } - PE2_below_thres=(2*a>b); + PE2_below_thres=2*a2>b2; + if (threshold==kappa) return (PE1_below_thres || PE2_below_thres); + else return(PE1_below_thres && PE2_below_thres); } - return PE1_below_thres||PE2_below_thres; -} -*/ + return PE1_below_thres; +}*/ + + template<typename T> void CountMinSketch<T>::init(int glambda,int gkappa,int gkappa_prime,int gfilter_PE_as_single) { lambda=glambda; @@ -293,7 +368,6 @@ template<typename T> void CountMinSketch<T>::init(int glambda,int gkappa,int gka template<typename T> T CountMinSketch<T>::getEstimatedNbOcc(const unsigned long& val) { int j,h; T min=mask; - // unsigned char min=ubytemask; j=lambda; while(--j>=0 && min>kappa) { h=hash64to32(val,j); @@ -321,7 +395,6 @@ template<typename T> int CountMinSketch<T>::isBeneathMinKappa(const T_read_numer /* * Go through the arrays of the CMS and returns the number of distinct k_mers (biggest nbr of non-zero counters amongst all arrays) */ - template<typename T> unsigned long CountMinSketch<T>::getNbDistinctKMers() { int j,h; unsigned long min=INT_MAX; @@ -329,7 +402,6 @@ template<typename T> unsigned long CountMinSketch<T>::getNbDistinctKMers() { for (j=lambda-1;j>=0;--j) { unsigned long nb_k_mers=0; for (h=INT_MAX-1;h>=0;--h) { - //(min<threshold)? a+=1: NULL; (cms_lambda_array[j] [h]>0)? nb_k_mers+=1: nb_k_mers; } (nb_k_mers<min) ?min=nb_k_mers: min; diff --git a/src/Filter.hpp b/src/Filter.hpp index 87b4e0e..9b2eaf5 100644 --- a/src/Filter.hpp +++ b/src/Filter.hpp @@ -23,7 +23,7 @@ typedef CountMinSketch<unsigned short> ShortCountMinSketch; -// created this class to hide implementation detail (byte versys short) from main program. TODO: refactor it later if I have time. +// created this class to hide implementation detail (byte versys short) from main program. class Filter { ByteCountMinSketch * pByteCMS; ShortCountMinSketch * pShortCMS; @@ -61,7 +61,6 @@ template <typename T> void Filter::underlyingfillCMS(FqBaseBackend* map_id_backe srp::reverse_iterator rit; i_dim::iterator it_offs; k_dim::iterator it_struct; - // const unsigned char masque=0x0F; // to retrieve 2nd file identifier. ReadProcessor read_p(k); T_read_numericValues nbrKmerDecompo; @@ -86,19 +85,19 @@ template <typename T> void Filter::underlyingfillCMS(FqBaseBackend* map_id_backe ret=cms->addRead(nbrKmerDecompo); if (!ret) { // read is rejected, update rpos structure accordingly. - it_struct->fileid=0; // TODO Benchmark: Wouldn't rit->second.erase(++it_struct) be more efficient (reduce size of CMS in memory then 2nd parsing for kappa_prime filtering and third parsing for writing output files would be faster? + it_struct->fileid=0; } nbrKmerDecompo.single_or_PE_val.clear(); } } } - // last step, close fq files. TODO this step may be moved somewhere else for optimization. - for (i=0;i<nb_be;i++){ - map_id_backend[i]->closeInputFile(); - } + // last step, close fq files. + for (i=0;i<nb_be;i++){ + map_id_backend[i]->closeInputFile(); + } } -template <typename T> void Filter::underlyinglowFilterCMS(FqBaseBackend* map_id_backend[],int nb_be,int k,srp* io_sr, CountMinSketch<T>* cms) { // TODO refactor get k from CMS. 
+template <typename T> void Filter::underlyinglowFilterCMS(FqBaseBackend* map_id_backend[],int nb_be,int k,srp* io_sr, CountMinSketch<T>* cms) { // possible refactor: get k from CMS. srp::iterator it; i_dim::iterator it_offs; k_dim::iterator it_struct; diff --git a/src/FqAuxBackend.cpp b/src/FqAuxBackend.cpp index a11d8d8..c1dcf26 100644 --- a/src/FqAuxBackend.cpp +++ b/src/FqAuxBackend.cpp @@ -20,14 +20,13 @@ #include <iostream> using namespace std; -// #define DEBUG +// TODO : implement some clean logger mechanism (removed debug function for that). int FqAuxBackend::getNextRead(rinfo * p_nr) { int rfound=0; int eof=0; while (!rfound) { if (buf_info.real_bufsize==0 || (buf_info.cnt>=buf_info.real_bufsize)) { - // if (pos_in_buf==nread-1) cout<<"BE2 : last char of buf before reading new buffer: "<<buf[pos_in_buf]<<endl; readBuffer(); } if (buf_info.real_bufsize==0) { eof=1; break;} @@ -53,7 +52,7 @@ int FqAuxBackend::processBuffer(rinfo * p_nr) { case k_read_id_start: if (qual_score) goto inc_score; else { - debug_processBuf(on_record_new,buf_info,fq_rec_info.rstart_offset); + //debug_processBuf(on_record_new,buf_info,fq_rec_info.rstart_offset); rstart_offset=cur_offset-buf_info.real_bufsize+buf_info.cnt; start_rec_in_buf=buf_info.cnt; buf_info.p_start_cur_rec=buf_info.pchar; @@ -68,13 +67,10 @@ int FqAuxBackend::processBuffer(rinfo * p_nr) { n=0; break; case '\n': - debug_processBuf(on_line_end,buf_info,fq_rec_info.rstart_offset); + // debug_processBuf(on_line_end,buf_info,fq_rec_info.rstart_offset); (qual_score==1)?qual_score++:qual_score; num_l_in_rec+=1; if (num_l_in_rec==5) { qual_score=0;/* end of fastq record */ -#ifdef DEBUG - nb_stop++; -#endif p_nr->f_id=f_id; p_nr->score=fq_rec_info.st; p_nr->rstart_offset=rstart_offset; @@ -84,7 +80,7 @@ int FqAuxBackend::processBuffer(rinfo * p_nr) { } break; default: - debug_processBuf(on_store_read_id,buf_info,fq_rec_info.rstart_offset); + //debug_processBuf(on_store_read_id,buf_info,fq_rec_info.rstart_offset); (num_l_in_rec==2)?fq_rec_info.nb_nucleotides_in_read++:fq_rec_info.nb_nucleotides_in_read; inc_score: { if (qual_score==2) onIncScore(fq_rec_info,buf_info,n); @@ -94,15 +90,6 @@ int FqAuxBackend::processBuffer(rinfo * p_nr) { buf_info.cnt++; } if (!rfound) keepCurFastqRecord(buf_info.buf,start_rec_in_buf,buf_info.real_bufsize); -#ifdef DEBUG - std::cout<<"BE2, position in buffer: "<<pos_in_buf<<endl; - if (pos_in_buf==nread) { - cout<<"BE2 Found "<<nb_start<<" and "<<nb_stop<<" start/stop of record in this buffer."<<endl; - tot_nb_start+=nb_start; - tot_nb_stop+=nb_stop; - cout<<"BE2 Total cumulated number of start char found: "<<tot_nb_start<< " Total number of stop: "<<tot_nb_stop<<endl; - } -#endif return rfound; } @@ -140,21 +127,11 @@ void FqAuxBackend::closeFile() { } } +void FqAuxBackend::writeToUndefFile() { + FqBaseBackend::writeToUndefFile(buf_info); +} -void FqAuxBackend::writeToUndefFile() { // here duplicate code for the moment since in the case of Auxbackend, we have processed 1 more char from buffer than in the case of MainBackend. TODO refactor this. 
- // FqBaseBackend::writeToUndefFile(buf_info); - int l=strlen(cur_fq_record); - if (l==0) { - writeStrToUndefFile(buf_info.p_start_cur_rec,buf_info.pchar-buf_info.p_start_cur_rec); - } else { - memcpy(cur_fq_record+l,buf_info.p_start_cur_rec,buf_info.cnt); - // char tmp=buf_info.buf[buf_info.cnt]; - l+=buf_info.cnt; - cur_fq_record[l]='\0'; - writeStrToUndefFile(cur_fq_record,strlen(cur_fq_record)); // Try that, if there is performance problem due to resolving inheritance, will find a different way - } -} void FqAuxBackend::resetCurFqRecord() { cur_fq_record[0]='\0'; diff --git a/src/FqAuxBackend.h b/src/FqAuxBackend.h index 6e093ea..1f0b863 100644 --- a/src/FqAuxBackend.h +++ b/src/FqAuxBackend.h @@ -12,12 +12,9 @@ #include "FqBaseBackend.h" -// const size_t bufsize=500000; //try that in valgrind class FqAuxBackend:public FqBaseBackend { - //size_t nread; unsigned long cur_offset; - // char * buf; T_buf_info buf_info; void readBuffer(); @@ -27,8 +24,7 @@ class FqAuxBackend:public FqBaseBackend { public: - //size_t pos_in_buf; - char current_id[50]; + // char current_id[50]; FqAuxBackend() { cur_offset=0; diff --git a/src/FqBaseBackend.cpp b/src/FqBaseBackend.cpp index f80088d..0891334 100644 --- a/src/FqBaseBackend.cpp +++ b/src/FqBaseBackend.cpp @@ -35,8 +35,6 @@ T_buf_info init_buf_info(int& nread,char * buf) { } void FqBaseBackend::openInputFile(char * ficname, unsigned char id) { - // int st,s; - // unsigned long cur_offset; mode_t mode = S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH; closeInputFile(); @@ -47,19 +45,12 @@ void FqBaseBackend::openInputFile(char * ficname, unsigned char id) { if (i_f_desc==-1) { err(errno,"cannot open file: %s.",i_filename); } - - f_stream=fdopen(i_f_desc,"r"); - if (f_stream==NULL) { - err(errno,"cannot open file: %s.",i_filename); - } } void FqBaseBackend::closeInputFile() { if (i_f_desc!=-1) { close(i_f_desc); - // filename=NULL; i_f_desc=-1; - f_stream=NULL; } } @@ -135,18 +126,23 @@ void FqBaseBackend::writeStrToUndefFile(char * start_in_buf, int len) { if (write(undef_f_desc,start_in_buf,len)==-1) err(errno,"cannot write in file: %s.",undef_filename); } -void FqBaseBackend::writeToUndefFile(const T_buf_info& buf_info) { +/* + * Writes a fq record to the file containing "undefined" reads. + * Undefined reads are those who don't contain the minimum number of valid k-mer. + * The thing is that: + * - when called from mainBackend, the \n terminating the fastq record has not been counted yet. + * - when called fromAuxBackend, the final \n has already been counted + * So, values for buf_info.pchar and buf_info.cnt may or may not contain the final \n. + * This is why I use the addCR parameter to add it or not. 
+ */ +void FqBaseBackend::writeToUndefFile(const T_buf_info& buf_info,const int& addCR) { int l=strlen(cur_fq_record); if (l==0) { - writeStrToUndefFile(buf_info.p_start_cur_rec,buf_info.pchar-buf_info.p_start_cur_rec+1); + writeStrToUndefFile(buf_info.p_start_cur_rec,buf_info.pchar-buf_info.p_start_cur_rec+addCR); } else { - /*cout<<"cur_fq_record actually contains: "<<l<<"chars"<<endl; - cout<<cur_fq_record<<endl; - cout<<"going to add to it :"<<buf_info.cnt<<" char from buf"<<endl;*/ - memcpy(cur_fq_record+l,buf_info.p_start_cur_rec,buf_info.cnt+1); - // char tmp=buf_info.buf[buf_info.cnt]; + memcpy(cur_fq_record+l,buf_info.p_start_cur_rec,buf_info.cnt+addCR); l+=buf_info.cnt; - cur_fq_record[l+1]='\0'; + cur_fq_record[l+addCR]='\0'; writeStrToUndefFile(cur_fq_record,strlen(cur_fq_record)); // Try that, if there is performance problem due to resolving inheritance, will find a different way } } @@ -154,7 +150,6 @@ void FqBaseBackend::writeToUndefFile(const T_buf_info& buf_info) { void FqBaseBackend::writeToOutput(const unsigned long& offset) { - // static char * pos_in_w_buf; if (o_buf==NULL) { o_buf=(char *) malloc(FqBaseBackend::bufsize*sizeof(char)); pos_in_w_buf=o_buf; @@ -237,20 +232,18 @@ void FqBaseBackend::keepCurFastqRecord(char * buf,const int& start_rec_in_buf,co if (len+1>=MAX_FQ_RECORD_LENGTH) errx(1,"Buffer for current record is too small given record length."); memcpy(cur_fq_record,&buf[start_rec_in_buf],len); cur_fq_record[len]='\0'; - /*cout<<"keeping in memory "<<len+1<<" char from previous read"<<endl; - cout<<cur_fq_record<<endl;*/ } void FqBaseBackend::onIncScore(T_fq_rec_info& rec_info,T_buf_info& buf_info,int& n) { unsigned int s=(int)*buf_info.pchar; unsigned int remaining_nucl=rec_info.nb_nucleotides_in_read-rec_info.idx_nucl_in_read; - if (s<=qual_thres.nucl_score_threshold) { // TODO rewrite this with chained ternary operators once it is clear to see if it improves performances. - if (rec_info.idx_nucl_in_read<=qual_thres.k-1) { + if (s<=qual_thres.nucl_score_threshold) { // maybe TODO rewrite this with chained ternary operators once it is clear to see if it improves performances.Not useful: performance bottleneck is not here but in median calculation (42% of time approximatively for both filters). + if (rec_info.idx_nucl_in_read<=qual_thres.k-1) { // error is found in the first k nucleotides rec_info.nb_k_mers_in_error=rec_info.idx_nucl_in_read+1; } else if (n>0 && remaining_nucl>n) { (qual_thres.k-n<remaining_nucl-n)?rec_info.nb_k_mers_in_error+=qual_thres.k-n:rec_info.nb_k_mers_in_error+=remaining_nucl-n; - } else if (n<=0) { // there should be a way to make it simplier by making sure n is never negative. + } else if (n<=0) { // no error already encountered in the previous k-1 nucleotides. (qual_thres.k<remaining_nucl)?rec_info.nb_k_mers_in_error+=qual_thres.k:rec_info.nb_k_mers_in_error+=remaining_nucl; } n=qual_thres.k; @@ -260,6 +253,7 @@ void FqBaseBackend::onIncScore(T_fq_rec_info& rec_info,T_buf_info& buf_info,int& rec_info.idx_nucl_in_read++; } +/* void FqBaseBackend::debug_processBuf(int evt,const T_buf_info& buf_info,const unsigned long& rstart_offset) { // put all the bebug stuff in a separate method so that processBuf is shorter and more "lisible". #ifdef DEBUG static int nb_start; // number of fq record start and stop found in current buffer. 
@@ -312,10 +306,6 @@ void FqBaseBackend::debug_processBuf(int evt,const T_buf_info& buf_info,const un break; } #endif -} - -/*static void FqBaseBackend::setQualThreshold(const FasqQualThreshold& a_qual_thres) { - qual_thres.min_correct_k_mers_in_read=a_qual_thres.min_correct_k_mers_in_read; - qual_thres.nucl_score_threshold=a_qual_thres.nucl_score_threshold; - qual_thres.k=a_qual_thres.k; }*/ + + diff --git a/src/FqBaseBackend.h b/src/FqBaseBackend.h index bccd102..1cd30fe 100644 --- a/src/FqBaseBackend.h +++ b/src/FqBaseBackend.h @@ -16,12 +16,13 @@ #include "ROCKparams.h" //#define init_debug 1 +/* #define on_record_new 2 #define on_record_end 3 #define on_buf_end 4 #define on_line_end 5 #define on_store_read_id 6 -#define on_record_end_pe 7 +#define on_record_end_pe 7*/ // keep that for later use (implementing the logger). /* * Auxilliary structure for buffer processing. @@ -51,18 +52,15 @@ T_buf_info init_buf_info(int& nread,char * buf); class FqBaseBackend { - // TODO: this class will also be associated to writing output "filtered" fastq files. - // TODO Refactor FqBackends. Surely there is no need to keep f_stream member since I don't call ftell anymore + should even be able to get rid of lseek system call to get position in file. - // I could compute offset myself since I am reading caracters from a text file... + // I could compute offset myself since I am reading caracters from a text file... Wouldn't gain much; performance bottleneck is not here. protected: - static const size_t bufsize=6048000; // It seems that I can't have a much bigger value than that or else my objects construction throws exception. Strange. - + static const size_t bufsize=6048000; + // handling input char * i_filename; unsigned char f_id; - int i_f_desc; // for calling read - FILE * f_stream; // for calling ftell + int i_f_desc; // for writing output (filtered reads) char * o_filename; @@ -72,8 +70,7 @@ protected: // for writing undefined (reads that don't contain a sufficient number of correct k-mers) // correct k-mers are k-mers that contain only nucleotides with a quality score greater than a given threshold (default is 0). - // expected minimum number of correct k-mers is provided by the user. - + // expected minimum number of correct k-mers is provided by the user. 
Default is 1 char * undef_filename; int undef_f_desc; @@ -82,10 +79,8 @@ protected: char cur_fq_record[MAX_FQ_RECORD_LENGTH]; void onIncScore(T_fq_rec_info& rec_info,T_buf_info& buf_info,int& n); - void debug_processBuf(int evt,const T_buf_info& buf_info,const unsigned long& rstart_offset); + // void debug_processBuf(int evt,const T_buf_info& buf_info,const unsigned long& rstart_offset); - friend void test_processInputFiles(); - friend void test_write_PE(); friend void processPEFiles(char *, unsigned char,char * , unsigned char,srp *,char *,char *, size_t); /* for testing only */ @@ -97,12 +92,14 @@ protected: test_bufsize=new_buf_size; } + friend void test_processInputFiles(); + friend void test_write_PE(); + public: FqBaseBackend() { i_filename=NULL; i_f_desc=-1; - f_stream=NULL; f_id=0; o_f_desc=-1; o_filename=NULL; @@ -134,7 +131,7 @@ public: void setUndefFile(char * ficname); void openUndefFile(); void writeStrToUndefFile(char * start_in_buf, int len); - void writeToUndefFile(const T_buf_info& buf_info); + void writeToUndefFile(const T_buf_info& buf_info,const int& addCR=0); void closeUndefFile(); static void setQualThreshold(const FasqQualThreshold& a_qual_thres){ diff --git a/src/FqMainBackend.cpp b/src/FqMainBackend.cpp index d4f7299..240e96d 100644 --- a/src/FqMainBackend.cpp +++ b/src/FqMainBackend.cpp @@ -73,7 +73,6 @@ void FqMainBackend::processFile(char * filename,unsigned char f_id) { cur_offset = lseek(f_single, 0, SEEK_CUR); T_buf_info buf_info=init_buf_info(nread,buf); processBuf(buf_info,f_id,cur_offset); - //cout<<"going to read new buffer." <<endl; } close(f_single); this->closeUndefFile(); @@ -96,11 +95,11 @@ void FqMainBackend::onEndFastqRecord(T_fq_rec_info& rec_info,const T_buf_info& b rec_info.st+=pe2info.score; unsigned long j=rec_info.rstart_offset/UINT_MAX; update_rpos(pe2info.f_id,pe2info.rstart_offset,j,&rp); - debug_processBuf(on_record_end_pe,bufinfo,rec_info.rstart_offset); + //debug_processBuf(on_record_end_pe,bufinfo,rec_info.rstart_offset); } i_dim& ref_i_dim=(*p_scoreReadStruct)[rec_info.st/K_SCORE_NORM_FACTOR]; k_dim& ref_k_dim=ref_i_dim[rec_info.rstart_offset/UINT_MAX]; - debug_processBuf(on_record_end,bufinfo,rec_info.rstart_offset); + //debug_processBuf(on_record_end,bufinfo,rec_info.rstart_offset); int nb_k_mer=rec_info.nb_nucleotides_in_read+1-qual_thres.k; rec_info.nb_nucleotides_in_read_PE2>0?nb_k_mer_PE2=rec_info.nb_nucleotides_in_read_PE2+1-qual_thres.k:nb_k_mer_PE2=0; if (treat_PE_as_single) { @@ -133,7 +132,7 @@ void FqMainBackend::processBuf(T_buf_info& buf_info,unsigned char f_id,unsigned buf_info.p_start_cur_rec=buf_info.pchar; fq_rec_info.rstart_offset=cur_offset-buf_info.real_bufsize+buf_info.cnt; start_rec_in_buf=buf_info.cnt; - debug_processBuf(on_record_new,buf_info,fq_rec_info.rstart_offset); + //debug_processBuf(on_record_new,buf_info,fq_rec_info.rstart_offset); num_l_in_rec=1; fq_rec_info.nb_nucleotides_in_read=0; fq_rec_info.nb_nucleotides_in_read_PE2=0; @@ -148,7 +147,7 @@ void FqMainBackend::processBuf(T_buf_info& buf_info,unsigned char f_id,unsigned n=0; // counter for k-mers in error break; case '\n': - debug_processBuf(on_line_end,buf_info,fq_rec_info.rstart_offset); + //debug_processBuf(on_line_end,buf_info,fq_rec_info.rstart_offset); (qual_score==1)?qual_score++:qual_score; num_l_in_rec+=1; if (num_l_in_rec==5) { @@ -157,7 +156,7 @@ void FqMainBackend::processBuf(T_buf_info& buf_info,unsigned char f_id,unsigned } break; default: - debug_processBuf(on_store_read_id,buf_info,fq_rec_info.rstart_offset); + 
//debug_processBuf(on_store_read_id,buf_info,fq_rec_info.rstart_offset); (num_l_in_rec==2)?fq_rec_info.nb_nucleotides_in_read++:fq_rec_info.nb_nucleotides_in_read; inc_score: { if (qual_score==2) onIncScore(fq_rec_info,buf_info,n); @@ -168,17 +167,17 @@ void FqMainBackend::processBuf(T_buf_info& buf_info,unsigned char f_id,unsigned buf_info.cnt++; } keepCurFastqRecord(buf_info.buf,start_rec_in_buf,buf_info.real_bufsize); - debug_processBuf(on_buf_end,buf_info,fq_rec_info.rstart_offset); + //debug_processBuf(on_buf_end,buf_info,fq_rec_info.rstart_offset); } /* - * 2 cases when writin records to undef file. + * 2 cases when writing records to undef file. * 1- record was all inside current buffer => write it as is. * 2- beginning of record was at the end of one buffer and the end of the record was processed in a 2nd buffer; so, record was stored in memory. In that case, write cur_fq_record to undef file. */ void FqMainBackend::writeToUndefFile(const T_buf_info& buf_info) { - FqBaseBackend::writeToUndefFile(buf_info); // TODO : if this is slow due to inheritance resolution, use template method or other solution. + FqBaseBackend::writeToUndefFile(buf_info,1); // if this is slow due to inheritance resolution, use template method or other solution. This doesn't semm to slow the perfs according to profiling. So I leave it as is; it's clearer than templates. if (p_auxFqProcessor!=NULL) { p_auxFqProcessor->writeToUndefFile(); } diff --git a/src/FqMainBackend.h b/src/FqMainBackend.h index d7bfb1b..2f65204 100644 --- a/src/FqMainBackend.h +++ b/src/FqMainBackend.h @@ -20,7 +20,7 @@ class FqMainBackend : public FqBaseBackend { static int treat_PE_as_single; FqAuxBackend * p_auxFqProcessor; /* Another fastq processor component is necessary for handling the case of PE reads.*/ srp * p_scoreReadStruct; /* Where we store information about the reads. 
*/ - char current_id[50]; + // char current_id[50]; used only for debug // void debug_processBuf(int evt,const T_buf_info&, const unsigned long &); void writeToUndefFile(const T_buf_info&); diff --git a/src/ROCKparams.cpp b/src/ROCKparams.cpp index b0121dd..3ee9910 100644 --- a/src/ROCKparams.cpp +++ b/src/ROCKparams.cpp @@ -263,18 +263,18 @@ void ROCKparams::initFromMainOptsArgs(int argc,char ** argv) { int i,q,m; std::vector<string> v_input_lines; std::vector<string> v_output_lines; - static int PE_as_single=1; - + static int PE_as_single=0; +/* static struct option long_options[] = { {"separate_PE", no_argument, &PE_as_single,0}, {0,0,0,0} - }; + };*/ - int option_index=0; + //int option_index=0; - while((i = getopt_long(argc, argv, "i:o:l:k:c:C:g:n:vq:m:",long_options,&option_index)) != -1) { - //printf("found option: %c\n",i); + // while((i = getopt_long(argc, argv, "i:o:l:k:c:C:g:n:vq:m:",long_options,&option_index)) != -1) { + while((i = getopt(argc, argv, "i:o:l:k:c:C:g:n:vq:m:p")) != -1) { switch(i) { case 0: break; @@ -316,6 +316,9 @@ void ROCKparams::initFromMainOptsArgs(int argc,char ** argv) { case 'v': verbose_mode=1; break; + case 'p': + PE_as_single=1; + break; case 'q': q=atoi(optarg); if (q<=0) { diff --git a/src/ROCKparams.h b/src/ROCKparams.h index 3a30f92..a7f1bd5 100644 --- a/src/ROCKparams.h +++ b/src/ROCKparams.h @@ -84,12 +84,12 @@ public: g=0; k=30; nb_k_mers=0; - parms.kappa=50; - parms.kappa_prime=0; + parms.kappa=70; + parms.kappa_prime=5; parms.lambda=0; - parms.filter_PE_as_single=1; + parms.filter_PE_as_single=0; qual_thres.min_correct_k_mers_in_read=1; - qual_thres.nucl_score_threshold=0; + qual_thres.nucl_score_threshold=10; verbose_mode=0; cms_size=0; expected_collision_proba=0.0; diff --git a/src/ReadProcessor.cpp b/src/ReadProcessor.cpp index 85f761b..69350b3 100644 --- a/src/ReadProcessor.cpp +++ b/src/ReadProcessor.cpp @@ -8,22 +8,7 @@ #include <stdlib.h> #include "ReadProcessor.h" -/* -inline void ReadProcessor::init_nucleo_rev_cpl () { - nucleo_rev_cpl['A']='T'; - nucleo_rev_cpl['T']='A'; - nucleo_rev_cpl['C']='G'; - nucleo_rev_cpl['G']='C'; - nucleo_rev_cpl['N']='N'; -}*/ - -/* -inline void ReadProcessor::init_mask(int k) { - int nb_bits=2*k; - mask_kMer=pow(2,nb_bits); - mask_kMer=mask_kMer-1; -}*/ unsigned long ReadProcessor::nucleoToNumberWrap(char s) { if (s=='N') n=k; @@ -31,34 +16,7 @@ unsigned long ReadProcessor::nucleoToNumberWrap(char s) { return nucleoToNumber(s); } -/* -inline unsigned long ReadProcessor::nucleoToNumber(char s) { - unsigned long nbr; - switch(s) - { - case 'A': - nbr=0; - break; - case 'C': - nbr=1; - break; - case 'G': - nbr=2; - break; - case 'T': - nbr=3; - break; - case 'N': - nbr=0; - // n=2*k; // do that because n will also be decremented for reverse complement - break; - default: - throw -1; // TODO Benchmark this inside try catch statement to see if try catch+exception really costs so long. - // throw an integer for the moment. An exception object may not be the most optimal choice in terms of performance. - } - // n--; - return nbr; -}*/ + #include <iostream> unsigned long ReadProcessor::kMerToNumber(char * k_m,unsigned long * p_prev) { @@ -67,12 +25,17 @@ unsigned long ReadProcessor::kMerToNumber(char * k_m,unsigned long * p_prev) { if (p_prev==NULL) { // first k_mer conversion int i; for (i=0;i<k;i++) { - //std::cout<<k_m[i]; - c=nucleoToNumberWrap(k_m[i]); // do not catch exception for the moment (not until I have checked it doesn't slow down execution). 
If nucleoToNumber returns -1, program will simply crash + /*try { + c=nucleoToNumberWrap(k_m[i]); // do not catch exception for the moment (not until I have checked it doesn't slow down execution). If nucleoToNumber returns -1, program will simply crash + } + catch (int e) { + std::cout<<"couldn't convert nucleotide: "<<k_m[i]<<"to number.\n"; + exit(-1); + } */ // this slightly decreases the performances + c=nucleoToNumberWrap(k_m[i]); // do not catch exception for the moment. If nucleoToNumber throws -1 and there is no catch clause for it, the terminate() function will be called automatically. Anyway, there's not munch we can do if it returns -1. nbr=nbr<<2; nbr=nbr|c; } - //std::cout<<std::endl; } else { nbr=*p_prev; c=nucleoToNumberWrap(k_m[k-1]); @@ -83,14 +46,6 @@ unsigned long ReadProcessor::kMerToNumber(char * k_m,unsigned long * p_prev) { return nbr; } -/* -inline unsigned long ReadProcessor::nucleoToNumberReverse(char s,int i) { - unsigned long nbr=0; - char cpl=nucleo_rev_cpl[s]; - nbr=nucleoToNumber(cpl); - nbr=nbr<<2*(i-1); - return nbr; -}*/ unsigned long ReadProcessor::kMerToNumberReverse(char * k_m,unsigned long * p_prev) { unsigned long nbr=0; @@ -99,7 +54,6 @@ unsigned long ReadProcessor::kMerToNumberReverse(char * k_m,unsigned long * p_pr int i; for (i=k;i>0;i--) { c=nucleoToNumberReverse(k_m[i-1],i); - // nbr=nbr>>2; nbr=nbr|c; } } else { diff --git a/src/ReadProcessor.h b/src/ReadProcessor.h index be4d4de..71fa480 100644 --- a/src/ReadProcessor.h +++ b/src/ReadProcessor.h @@ -8,11 +8,10 @@ #include <vector> #include <math.h> -//#define DEBUG -#ifdef DEBUG + #include <iostream> using namespace std; -#endif + class ReadProcessor { @@ -58,11 +57,8 @@ class ReadProcessor { nbr=0; break; default: -#ifdef DEBUG -cout<<"found unexpected character in DNA string: "<<s<<endl; -#endif - throw -1; // TODO Benchmark this inside try catch statement to see if try catch+exception really costs so long. - // throw an integer for the moment. An exception object may not be the most optimal choice in terms of performance. + cout<<"found unexpected character in DNA string: "<<s<<endl; + throw -1; } return nbr; } diff --git a/src/main_utils.cpp b/src/main_utils.cpp index ccfd370..73c9339 100644 --- a/src/main_utils.cpp +++ b/src/main_utils.cpp @@ -112,14 +112,14 @@ void usage(int status) { cout<<" -o <file> .... file name containing the list of the output FASTQ file names."<<endl; cout<<" -h .... Print this message and exit."<<endl; cout<<" -k <int> .... k-mer size. (default 30)."<<endl; - cout<<" -c <int> .... lower coverage threshold (default: 0)."<<endl; - cout<<" -C <int> .... upper coverage threshold (default: 50; max: 65535)."<<endl; + cout<<" -c <int> .... lower coverage threshold (default: 5)."<<endl; + cout<<" -C <int> .... upper coverage threshold (default: 70; max: 65535)."<<endl; cout<<" -l <int> .... size of the count min sketch (default: at most 4, depending on the available RAM)"<<endl; cout<<" -n <int> .... (expected) number of distinct k-mers within the processed reads."<<endl; cout<<" -g <int> .... maximum RAM to use (in Gb) for the CMS."<<endl; - cout<<" -q <int> .... minimum quality threshold for nucleotides. Default is 0."<<endl; + cout<<" -p .... treat PE as single. Default is NO."<<endl; + cout<<" -q <int> .... minimum quality threshold for nucleotides. Default is 10."<<endl; cout<<" -m <int> .... minimum of valid k-mer threshold for reads. Default is 1."<<endl; - cout<<" --separate_PE .... 
Filter criterion (-c and -C) apply to each end of a PE independly."<<endl; cout<<" -v .... verbose"<<endl; exit(status); } diff --git a/src/unit_test_cms.cpp b/src/unit_test_cms.cpp index a93ed50..8f71278 100644 --- a/src/unit_test_cms.cpp +++ b/src/unit_test_cms.cpp @@ -178,6 +178,42 @@ void testCMS_with_PE_NotasSingle(int lambda,int kappa,int kappa_prime) { delete cms; } +/* + * TODO: move this somewhere else. + * I use this test because it is in the calculation of the median that we spend the most time (approx: 50%) . I want to know what in it takes the longest to see if I can improve. + */ +/* +#include <time.h> +#include <stdlib.h> +#include <err.h> +#include "Filter.hpp" +void profiling_CMS_fill() { + ByteCountMinSketch * pByteCMS; + CMSparams parms; + parms.lambda=4; + parms.filter_PE_as_single=0; + parms.kappa=50; + parms.kappa_prime=20; + parms.filter_size=getNodePhysMemory()/100.0*90-1; + pByteCMS=new ByteCountMinSketch(parms); + // mimic SRR dataset: approx 3 million PE of 2*250 nucleotides (2*281 k-mers if k=30). + T_read_numericValues nbrKmerDecompo; + nbrKmerDecompo.idx_start_PE2=281; + srand (time(NULL)); + nbrKmerDecompo.single_or_PE_val.reserve(502); + + for (int i=0; i<3000000;i++) { + //printf("x=%d\n",x); + for (int j=0;j<502;j++) { + int x=rand() % 1000000 + 1; + nbrKmerDecompo.single_or_PE_val.push_back(x); + } + int keep_r=pByteCMS->addRead(nbrKmerDecompo); + nbrKmerDecompo.single_or_PE_val.clear(); + } + delete(pByteCMS); +} +*/ int main(int argc, char **argv) { int lambda=2; @@ -211,5 +247,6 @@ int main(int argc, char **argv) { testCMS_with_PE_NotasSingle(lambda,kappa,kappa_prime); cout<<"done"<<endl; + // profiling_CMS_fill(); return 0; } diff --git a/test/rock.sh b/test/rock.sh index 0244161..59a3ba5 100755 --- a/test/rock.sh +++ b/test/rock.sh @@ -55,17 +55,17 @@ mkdir data/fastq.filtered || exit 6 echo " " echo "##################################################################################" echo "doing more checks on options and error messages" -../src/rock -C 500000 -i ${srcdir}/data/iofiles.args/extract_klebsiella_long_reads_100.txt -o ${srcdir}/data/iofiles.args/extract_klebsiella_long_reads_100_filtered.txt|grep "Bad value for kappa" >/dev/null || exit 7 -../src/rock -C 500 -c 600 -i ${srcdir}/data/iofiles.args/extract_klebsiella_long_reads_100.txt -o ${srcdir}/data/iofiles.args/extract_klebsiella_long_reads_100_filtered.txt|grep "ERROR lower filter is higher than high filter" >/dev/null || exit 8 -../src/rock -C 500 -c 400 -k 60 -i ${srcdir}/data/iofiles.args/extract_klebsiella_long_reads_100.txt -o ${srcdir}/data/iofiles.args/extract_klebsiella_long_reads_100_filtered.txt|grep "Bad value for k" >/dev/null || exit 9 -../src/rock -C 500 -c 400 -k 10 -l 0 -i ${srcdir}/data/iofiles.args/extract_klebsiella_long_reads_100.txt -o ${srcdir}/data/iofiles.args/extract_klebsiella_long_reads_100_filtered.txt|grep "Bad value for lambda" >/dev/null || exit 10 -../src/rock -C 500 -c 400 -k 10 -l 500 -i ${srcdir}/data/iofiles.args/extract_klebsiella_long_reads_100.txt -o ${srcdir}/data/iofiles.args/extract_klebsiella_long_reads_100_filtered.txt|grep "Not enough RAM on the machine" >/dev/null || exit 11 -../src/rock -C 500 -c 400 -k 10 -l 12 -g 25 -i ${srcdir}/data/iofiles.args/extract_klebsiella_long_reads_100.txt -o ${srcdir}/data/iofiles.args/extract_klebsiella_long_reads_100_filtered.txt|grep "options are mutually exclusive" >/dev/null|| exit 12 -../src/rock -C 500 -c 400 -k 10 -g 10000 -i ${srcdir}/data/iofiles.args/extract_klebsiella_long_reads_100.txt -o 
${srcdir}/data/iofiles.args/extract_klebsiella_long_reads_100_filtered.txt|grep "This machine only has" >/dev/null || exit 13 -../src/rock -C 500 -c 400 -k 10 -l 12 -n 85000000 -i ${srcdir}/data/iofiles.args/extract_klebsiella_long_reads_100.txt -o ${srcdir}/data/iofiles.args/extract_klebsiella_long_reads_100_filtered.txt|grep "options are mutually exclusive">/dev/null || exit 14 -../src/rock -C 500 -c 400 -k 10 -q 3 -m 0 -i ${srcdir}/test/data/iofiles.args/extract_klebsiella_long_reads_100.txt|grep "minimum number of valid k-mer for keeping a read must be an integer">/dev/null || exit 15 -../src/rock -C 500 -c 400 -k 10 -q -1 -m 2 -i ${srcdir}/test/data/iofiles.args/extract_klebsiella_long_reads_100.txt|grep "q must be">/dev/null || exit 16 -../src/rock -C 500 -c 400 -k 10 -q 2 -m 2 --separate_PE -i ${srcdir}/test/data/iofiles.args/extract_klebsiella_long_reads_100.txt|grep "Incompatible options">/dev/null || exit 161 +../src/rock -C 500000 -p -i ${srcdir}/data/iofiles.args/extract_klebsiella_long_reads_100.txt -o ${srcdir}/data/iofiles.args/extract_klebsiella_long_reads_100_filtered.txt|grep "Bad value for kappa" >/dev/null || exit 7 +../src/rock -C 500 -c 600 -p -i ${srcdir}/data/iofiles.args/extract_klebsiella_long_reads_100.txt -o ${srcdir}/data/iofiles.args/extract_klebsiella_long_reads_100_filtered.txt|grep "ERROR lower filter is higher than high filter" >/dev/null || exit 8 +../src/rock -C 500 -c 400 -k 60 -p -i ${srcdir}/data/iofiles.args/extract_klebsiella_long_reads_100.txt -o ${srcdir}/data/iofiles.args/extract_klebsiella_long_reads_100_filtered.txt|grep "Bad value for k" >/dev/null || exit 9 +../src/rock -C 500 -c 400 -k 10 -l 0 -p -i ${srcdir}/data/iofiles.args/extract_klebsiella_long_reads_100.txt -o ${srcdir}/data/iofiles.args/extract_klebsiella_long_reads_100_filtered.txt|grep "Bad value for lambda" >/dev/null || exit 10 +../src/rock -C 500 -c 400 -k 10 -l 500 -p -i ${srcdir}/data/iofiles.args/extract_klebsiella_long_reads_100.txt -o ${srcdir}/data/iofiles.args/extract_klebsiella_long_reads_100_filtered.txt|grep "Not enough RAM on the machine" >/dev/null || exit 11 +../src/rock -C 500 -c 400 -k 10 -l 12 -g 25 -p -i ${srcdir}/data/iofiles.args/extract_klebsiella_long_reads_100.txt -o ${srcdir}/data/iofiles.args/extract_klebsiella_long_reads_100_filtered.txt|grep "options are mutually exclusive" >/dev/null|| exit 12 +../src/rock -C 500 -c 400 -k 10 -g 10000 -p -i ${srcdir}/data/iofiles.args/extract_klebsiella_long_reads_100.txt -o ${srcdir}/data/iofiles.args/extract_klebsiella_long_reads_100_filtered.txt|grep "This machine only has" >/dev/null || exit 13 +../src/rock -C 500 -c 400 -k 10 -l 12 -n 85000000 -p -i ${srcdir}/data/iofiles.args/extract_klebsiella_long_reads_100.txt -o ${srcdir}/data/iofiles.args/extract_klebsiella_long_reads_100_filtered.txt|grep "options are mutually exclusive">/dev/null || exit 14 +../src/rock -C 500 -c 400 -k 10 -q 3 -m 0 -p -i ${srcdir}/test/data/iofiles.args/extract_klebsiella_long_reads_100.txt|grep "minimum number of valid k-mer for keeping a read must be an integer">/dev/null || exit 15 +../src/rock -C 500 -c 400 -k 10 -q -1 -m 2 -p -i ${srcdir}/test/data/iofiles.args/extract_klebsiella_long_reads_100.txt|grep "q must be">/dev/null || exit 16 +../src/rock -C 500 -c 400 -k 10 -q 2 -m 2 -i ${srcdir}/test/data/iofiles.args/extract_klebsiella_long_reads_100.txt|grep "Incompatible options">/dev/null || exit 161 # here check that we have enough memory for running the tests. 
../src/unit_test_cms CHECK_MEM diff --git a/test/rock_mem.sh b/test/rock_mem.sh index a06afe2..02455ec 100755 --- a/test/rock_mem.sh +++ b/test/rock_mem.sh @@ -10,7 +10,7 @@ echo "########################################################################## echo "testing high filter" -../src/rock -C 100 -l 2 -i ${srcdir}/data/iofiles.args/extract_klebsiella_long_reads_100.txt -o ${srcdir}/data/iofiles.args/extract_klebsiella_long_reads_100_filtered.txt #>/dev/null || exit 15 +../src/rock -C 100 -l 2 -p -i ${srcdir}/data/iofiles.args/extract_klebsiella_long_reads_100.txt -o ${srcdir}/data/iofiles.args/extract_klebsiella_long_reads_100_filtered.txt #>/dev/null || exit 15 # output files should be the same size, contain the same elements but not in the same order. nb_PE1=`grep -c "@" ${srcdir}/data/fastq.raw/klebsiella_100_1.fq` @@ -34,7 +34,7 @@ test $nb_PE2=$nb_PE2_filtered || exit 17 echo " " echo "##################################################################################" echo "testing low filter" -../src/rock -C 100 -c 99 -l 2 -i ${srcdir}/data/iofiles.args/extract_klebsiella_long_reads_100.txt -o ${srcdir}/data/iofiles.args/extract_klebsiella_long_reads_100_filtered.txt >/dev/null || exit 18 +../src/rock -C 100 -c 99 -l 2 -p -i ${srcdir}/data/iofiles.args/extract_klebsiella_long_reads_100.txt -o ${srcdir}/data/iofiles.args/extract_klebsiella_long_reads_100_filtered.txt >/dev/null || exit 18 [ -f "data/fastq.filtered/SRR1222430_1.filtered.fastq" ] || exit 181 [ -f "data/fastq.filtered/SRR1222430_2.filtered.fastq" ] || exit 182 @@ -49,7 +49,7 @@ test $nb_PE2_filtered=0 || exit 20 echo " " echo "##################################################################################" echo "testing that input fastq file names can be provided in command line." 
-../src/rock -C 100 -c 1 -l 2 ${srcdir}/data/fastq.raw/klebsiella_100_1.fq,${srcdir}/data/fastq.raw/klebsiella_100_2.fq ${srcdir}/data/unit/test_single.fq ${srcdir}/data/unit/test_single2.fq >/dev/null || exit 181 +../src/rock -C 100 -c 1 -l 2 -p ${srcdir}/data/fastq.raw/klebsiella_100_1.fq,${srcdir}/data/fastq.raw/klebsiella_100_2.fq ${srcdir}/data/unit/test_single.fq ${srcdir}/data/unit/test_single2.fq >/dev/null || exit 181 [ -f "klebsiella_100_1.rock.fq" ] || exit 1811 [ -f "klebsiella_100_2.rock.fq" ] || exit 1812 @@ -78,9 +78,9 @@ rm -f "test_single2.undefined.fq" || exit 19194 echo " " echo "##################################################################################" echo "testing verbose mode" -../src/rock -C 100 -c 1 -l 2 -v ${srcdir}/data/fastq.raw/klebsiella_100_1.fq,${srcdir}/data/fastq.raw/klebsiella_100_2.fq ${srcdir}/data/unit/test_single.fq ${srcdir}/data/unit/test_single2.fq|grep "CMS size=" >/dev/null || exit 182 +../src/rock -C 100 -c 1 -l 2 -p -v ${srcdir}/data/fastq.raw/klebsiella_100_1.fq,${srcdir}/data/fastq.raw/klebsiella_100_2.fq ${srcdir}/data/unit/test_single.fq ${srcdir}/data/unit/test_single2.fq|grep "CMS size=" >/dev/null || exit 182 -../src/rock -C 100 -c 1 -v -n 1000 ${srcdir}/data/fastq.raw/klebsiella_100_1.fq,${srcdir}/data/fastq.raw/klebsiella_100_2.fq ${srcdir}/data/unit/test_single.fq ${srcdir}/data/unit/test_single2.fq|grep "expected probability of collision was:" >/dev/null || exit 183 +../src/rock -C 100 -c 1 -v -p -n 1000 ${srcdir}/data/fastq.raw/klebsiella_100_1.fq,${srcdir}/data/fastq.raw/klebsiella_100_2.fq ${srcdir}/data/unit/test_single.fq ${srcdir}/data/unit/test_single2.fq|grep "expected probability of collision was:" >/dev/null || exit 183 echo "erasing test result files" rm -f "klebsiella_100_1.rock.fq" || exit 1816 @@ -108,18 +108,18 @@ expected_diff2="> @SRR1222430.37 37 length=251 \ > A3>333BFA2FF4GBFFDGGCGED?FGEGGHGFFFEEG?AF13@50>///11B13@@1/>//B0?>////<//B/00??@/--:--.;:C000;:0/0009.-9:.00:-.;9/9...-;.--9@--9:////-9-9..////9/;//;9///.9-..--------..99.9.//////;-;--9-.////://9/9.;.-;-99-.//.;////-;?9;...9-9-----9;-.;.;/.-9.;/99=--;" mkdir tmp -../src/rock -C 100 -c 1 -l 2 -o ${srcdir}/data/iofiles.args/output_files_noNQ_Thres.txt ${srcdir}/data/fastq.raw/klebsiella_100_1.fq,${srcdir}/data/fastq.raw/klebsiella_100_2.fq || exit 191 +../src/rock -C 100 -c 1 -l 2 -p -o ${srcdir}/data/iofiles.args/output_files_noNQ_Thres.txt ${srcdir}/data/fastq.raw/klebsiella_100_1.fq,${srcdir}/data/fastq.raw/klebsiella_100_2.fq || exit 191 -../src/rock -C 100 -c 1 -l 2 -q 2 -o ${srcdir}/data/iofiles.args/output_files_NQ_Thres_very_low.txt ${srcdir}/data/fastq.raw/klebsiella_100_1.fq,${srcdir}/data/fastq.raw/klebsiella_100_2.fq || exit 192 +../src/rock -C 100 -c 1 -l 2 -q 2 -p -o ${srcdir}/data/iofiles.args/output_files_NQ_Thres_very_low.txt ${srcdir}/data/fastq.raw/klebsiella_100_1.fq,${srcdir}/data/fastq.raw/klebsiella_100_2.fq || exit 192 ret1=`diff tmp/klebsiella_100_1_very_low_qual_thres.fq tmp/klebsiella_100_1_no_qual_thres.fq` test $ret1=0 || exit 1921 ret2=`diff tmp/klebsiella_100_2_very_low_qual_thres.fq tmp/klebsiella_100_2_no_qual_thres.fq` test $ret2=0 || exit 1922 -../src/rock -C 100 -c 1 -l 2 -q 12 -o ${srcdir}/data/iofiles.args/output_files_NQ_Thres_12.txt ${srcdir}/data/fastq.raw/klebsiella_100_1.fq,${srcdir}/data/fastq.raw/klebsiella_100_2.fq||exit 193 +../src/rock -C 100 -c 1 -l 2 -q 12 -p -o ${srcdir}/data/iofiles.args/output_files_NQ_Thres_12.txt 
${srcdir}/data/fastq.raw/klebsiella_100_1.fq,${srcdir}/data/fastq.raw/klebsiella_100_2.fq||exit 193 -../src/rock -C 100 -c 1 -l 2 -q 13 -o ${srcdir}/data/iofiles.args/output_files_NQ_Thres_13.txt ${srcdir}/data/fastq.raw/klebsiella_100_1.fq,${srcdir}/data/fastq.raw/klebsiella_100_2.fq||exit 194 +../src/rock -C 100 -c 1 -l 2 -q 13 -p -o ${srcdir}/data/iofiles.args/output_files_NQ_Thres_13.txt ${srcdir}/data/fastq.raw/klebsiella_100_1.fq,${srcdir}/data/fastq.raw/klebsiella_100_2.fq||exit 194 ret1=`diff tmp/klebsiella_100_1_13_qual_thres.fq tmp/klebsiella_100_1_12_qual_thres.fq|grep -c "@SRR1222430.37"` test $ret1 = 1 || exit 1931 @@ -140,9 +140,9 @@ echo "########################################################################## echo "testing ROCK with a quality score threshold for nucleotides and minimum number of valid k-mer to keep a read." mkdir tmp -../src/rock -C 100 -c 1 -l 2 -o ${srcdir}/data/iofiles.args/output_files_noNQ_Thres.txt ${srcdir}/data/fastq.raw/klebsiella_100_1.fq,${srcdir}/data/fastq.raw/klebsiella_100_2.fq || exit 191 +../src/rock -C 100 -c 1 -l 2 -p -o ${srcdir}/data/iofiles.args/output_files_noNQ_Thres.txt ${srcdir}/data/fastq.raw/klebsiella_100_1.fq,${srcdir}/data/fastq.raw/klebsiella_100_2.fq || exit 191 -../src/rock -C 100 -c 1 -l 2 -q 2 -m 5 -o ${srcdir}/data/iofiles.args/output_files_NQ_Thres_very_low.txt ${srcdir}/data/fastq.raw/klebsiella_100_1.fq,${srcdir}/data/fastq.raw/klebsiella_100_2.fq || exit 192 +../src/rock -C 100 -c 1 -l 2 -q 2 -m 5 -p -o ${srcdir}/data/iofiles.args/output_files_NQ_Thres_very_low.txt ${srcdir}/data/fastq.raw/klebsiella_100_1.fq,${srcdir}/data/fastq.raw/klebsiella_100_2.fq || exit 192 ret1=`diff tmp/klebsiella_100_1_very_low_qual_thres.fq tmp/klebsiella_100_1_no_qual_thres.fq` test $ret1=0 || exit 19211 @@ -159,7 +159,7 @@ ret2=`cat tmp/klebsiella_100_2.undefined.fq|wc -l` test $ret1=0 || exit 19215 test $ret1=0 || exit 19216 -../src/rock -C 100 -c 1 -l 2 -q 13 -m 500 -o ${srcdir}/data/iofiles.args/output_files_NQ_Thres_13.txt ${srcdir}/data/fastq.raw/klebsiella_100_1.fq,${srcdir}/data/fastq.raw/klebsiella_100_2.fq || exit 192161 +../src/rock -C 100 -c 1 -l 2 -q 13 -m 500 -p -o ${srcdir}/data/iofiles.args/output_files_NQ_Thres_13.txt ${srcdir}/data/fastq.raw/klebsiella_100_1.fq,${srcdir}/data/fastq.raw/klebsiella_100_2.fq || exit 192161 [ -f "tmp/klebsiella_100_1.undefined.fq" ] || exit 19215 [ -f "tmp/klebsiella_100_2.undefined.fq" ] || exit 19216 @@ -170,7 +170,7 @@ ret2=`cat tmp/klebsiella_100_2.undefined.fq|wc -l` test $ret1 = 400 || exit 19217 test $ret1 = 400 || exit 19218 -../src/rock -C 100 -c 1 -l 2 -q 13 -m 300 -o ${srcdir}/data/iofiles.args/output_files_NQ_Thres_13.txt ${srcdir}/data/fastq.raw/klebsiella_100_1.fq,${srcdir}/data/fastq.raw/klebsiella_100_2.fq || exit 192162 +../src/rock -C 100 -c 1 -l 2 -q 13 -m 300 -p -o ${srcdir}/data/iofiles.args/output_files_NQ_Thres_13.txt ${srcdir}/data/fastq.raw/klebsiella_100_1.fq,${srcdir}/data/fastq.raw/klebsiella_100_2.fq || exit 192162 [ -f "tmp/klebsiella_100_1.undefined.fq" ] || exit 192191 [ -f "tmp/klebsiella_100_2.undefined.fq" ] || exit 192192 @@ -184,7 +184,7 @@ test $ret1 = 136 || exit 192194 echo " " echo "##################################################################################" echo "testing ROCK with a quality score threshold for nucleotides and PE processed separately" -../src/rock -C 100 -c 1 -l 2 -q 2 --separate_PE -o ${srcdir}/data/iofiles.args/output_files_2_PE_separated.txt 
${srcdir}/data/fastq.raw/klebsiella_5_1_bad_scores.fq,${srcdir}/data/fastq.raw/klebsiella_5_2_bad_scores.fq || exit 201 +../src/rock -C 100 -c 1 -l 2 -q 2 -o ${srcdir}/data/iofiles.args/output_files_2_PE_separated.txt ${srcdir}/data/fastq.raw/klebsiella_5_1_bad_scores.fq,${srcdir}/data/fastq.raw/klebsiella_5_2_bad_scores.fq || exit 201 [ -f "tmp/klebsiella_5_1_bad_scores.undefined.fq" ] || exit 20100 [ -f "tmp/klebsiella_5_2_bad_scores.undefined.fq" ] || exit 20101 @@ -259,7 +259,7 @@ echo " " echo "##################################################################################" echo "testing ROCK with no quality score threshold for nucleotides and PE processed separately" -../src/rock -C 100 -c 1 -l 2 --separate_PE -o ${srcdir}/data/iofiles.args/output_files_2_PE_separated.txt ${srcdir}/data/fastq.raw/klebsiella_5_1_bad_scores.fq,${srcdir}/data/fastq.raw/klebsiella_5_2_bad_scores.fq || exit 201 +../src/rock -C 100 -c 1 -l 2 -o ${srcdir}/data/iofiles.args/output_files_2_PE_separated.txt ${srcdir}/data/fastq.raw/klebsiella_5_1_bad_scores.fq,${srcdir}/data/fastq.raw/klebsiella_5_2_bad_scores.fq || exit 201 [ -f "tmp/klebsiella_5_1_bad_scores.undefined.fq" ] || exit 30100 [ -f "tmp/klebsiella_5_2_bad_scores.undefined.fq" ] || exit 30101 -- GitLab
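As an illustration of the 2-bit encoding performed by ReadProcessor::nucleoToNumber and kMerToNumber in the patch above, here is a minimal stand-alone sketch. It only covers the forward strand of the first k-mer and, unlike ROCK, ignores 'N' handling, reverse complements and the rolling update of subsequent k-mers; nucleo_to_number and kmer_to_number are illustrative names.

#include <stdexcept>

// A=0, C=1, G=2, T=3, as in ReadProcessor::nucleoToNumber
static unsigned long nucleo_to_number(char s) {
    switch (s) {
        case 'A': return 0;
        case 'C': return 1;
        case 'G': return 2;
        case 'T': return 3;
        default: throw std::runtime_error("unexpected character in DNA string");
    }
}

// packs a k-mer (k <= 32) into an unsigned long (64-bit on the target platform),
// 2 bits per nucleotide
static unsigned long kmer_to_number(const char* k_m, int k) {
    unsigned long nbr = 0;
    for (int i = 0; i < k; ++i) {
        nbr = nbr << 2;                       // make room for the next nucleotide
        nbr = nbr | nucleo_to_number(k_m[i]);
    }
    return nbr;
}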
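The early-exit count-min-sketch lookup that getEstimatedNbOcc and isRCovBelowThres rely on can also be sketched in isolation. The class below is only a simplified illustration under assumed parameters: the hash is a toy mixer standing in for hash64to32, the row width is arbitrary, and counters saturate instead of using ROCK's mask logic.

#include <cstdint>
#include <vector>

class MiniCMS {
    int lambda;                                      // number of counter arrays
    std::uint32_t width;                             // counters per array
    std::vector< std::vector<std::uint16_t> > rows;

    std::uint32_t hash(std::uint64_t kmer, int j) const {
        std::uint64_t h = (kmer + (std::uint64_t)j) * 0x9E3779B97F4A7C15ULL; // toy mixing only
        return (std::uint32_t)(h >> 32) % width;
    }

public:
    MiniCMS(int l, std::uint32_t w)
        : lambda(l), width(w), rows(l, std::vector<std::uint16_t>(w, 0)) {}

    // increment the k-mer's counter in every array, saturating at the maximum
    void add(std::uint64_t kmer) {
        for (int j = 0; j < lambda; ++j) {
            std::uint16_t& c = rows[j][hash(kmer, j)];
            if (c < UINT16_MAX) ++c;
        }
    }

    // estimated number of occurrences: minimum over the arrays, stopping early
    // once the running minimum is no longer above 'threshold'
    std::uint16_t estimate(std::uint64_t kmer, std::uint16_t threshold) const {
        std::uint16_t min = UINT16_MAX;
        for (int j = 0; j < lambda && min > threshold; ++j) {
            std::uint16_t c = rows[j][hash(kmer, j)];
            if (c < min) min = c;
        }
        return min;
    }
};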