diff --git a/src/CountMinSketch.hpp b/src/CountMinSketch.hpp index 55c205837d748c238b342606e55e84ca797ed817..a8f07020e2925ebddddee1df94da858d6db77750 100644 --- a/src/CountMinSketch.hpp +++ b/src/CountMinSketch.hpp @@ -41,9 +41,6 @@ typedef struct { int kappa; int kappa_prime; int max_filter_size; // max amount of RAM wanted for the CMS. - int filter_PE_separately; // indicates whether PE reads must be treated as single by the cms. Indeed it may appear that 1 end contains useful k-mer but that other end contains k-mer such that "global" median is below threshold. - // In this case, read is rejected by the CMS (default behavior). We want to do an experimentation to see if keeping such reads wold lead to better results in assembling. - // Values can be 0 (process PE as single),1 (process PE separately with strict implementation for kappa_prime) or 2 (process PE separately with lenient implementation for kappa prime). float wanted_max_collision_proba; } CMSparams; @@ -73,7 +70,6 @@ private: int lambda; int kappa; int kappa_prime; - int filter_PE_separately; T ** cms_lambda_array; T mask; @@ -84,11 +80,11 @@ private: } - inline int isRCovBelowThres(const T_read_numericValues& read_val, const int& threshold); + inline int isRCovBelowThres(const readNumericValues& read_val, const int& threshold); inline int isRCovBelowThresPartial(const readNumericValues& read_val, const int& threshold, const int& start=0,const int& stop=0); - void init(int glambda,int gkappa,int gkappa_prime,int filter_PE_separately=0); + void init(int glambda,int gkappa,int gkappa_prime); // for unit tests. friend void test_CMS(int lambda,int kappa,int kappa_prime); @@ -96,12 +92,12 @@ private: public: - CountMinSketch(int glambda,int gkappa,int gkappa_prime,int filter_PE_separately=0) { - init(glambda,gkappa,gkappa_prime,filter_PE_separately); + CountMinSketch(int glambda,int gkappa,int gkappa_prime) { + init(glambda,gkappa,gkappa_prime); } CountMinSketch(CMSparams parms) { - init(parms.lambda,parms.kappa,parms.kappa_prime,parms.filter_PE_separately); + init(parms.lambda,parms.kappa,parms.kappa_prime); } ~CountMinSketch() { @@ -134,15 +130,12 @@ public: // keep that just for unit testing purpose T getEstimatedNbOcc(const unsigned long& val); - int addRead(const T_read_numericValues& read_val); + int addRead(const readNumericValues& read_val); - int isBeneathMinKappa(const T_read_numericValues&read_val); + int isBeneathMinKappa(const readNumericValues&read_val); unsigned long getNbDistinctKMers(); - int arePEFilteredAsSingle() { - return !filter_PE_separately; - } }; @@ -151,24 +144,20 @@ public: /* * Determines whether median of k-mer coverage is below threshold or not. */ -//#include <stdio.h> // this is a few seconds faster -template<typename T> inline int CountMinSketch<T>::isRCovBelowThres(const T_read_numericValues& read_val, const int& threshold) { +template<typename T> inline int CountMinSketch<T>::isRCovBelowThres(const readNumericValues& read_val, const int& threshold) { int PE1_below_thres; - int PE2_below_thres=0; int a1=0,a2=0; - int b1,b2=0; + int b1; int j; unsigned int h; T min; - int num=read_val.single_or_PE_val.size(); - - read_val.idx_start_PE2?b1=read_val.idx_start_PE2:b1=num; - read_val.idx_start_PE2?b2=num-b1:b2; + int num=read_val.size(); + b1=num; // this proves faster than using iterators on these data (short) - long const * p_start=(long const * )&(read_val.single_or_PE_val[0]); // TODO: when we'll use C++11: use the data() method of the vector class to access the underlying C array. + long const * p_start=(long const * )&(read_val[0]); // TODO: when we'll use C++11: use the data() method of the vector class to access the underlying C array. int nb=0; while(nb<b1 && 2*a1<=b1) { j=lambda; @@ -181,25 +170,6 @@ template<typename T> inline int CountMinSketch<T>::isRCovBelowThres(const T_read nb++; } PE1_below_thres=2*a1>b1; - if (b2) { - nb=b1; - while(nb<num && 2*a2<=b2) { - j=lambda; - min=mask; - while (--j>=0 && min>threshold) { - h=hash64to32(*(p_start+nb),j); - min=cms_lambda_array[j] [h]; - } - (min<threshold)? a2+=1:a2; - nb++; - } - PE2_below_thres=2*a2>b2; - if (threshold==kappa) return (PE1_below_thres || PE2_below_thres); - else { - if (filter_PE_separately==1) return(PE1_below_thres && PE2_below_thres); - return(PE1_below_thres || PE2_below_thres); - } - } return PE1_below_thres; } @@ -257,11 +227,10 @@ template<typename T> inline int CountMinSketch<T>::isRCovBelowThres(const T_read -template<typename T> void CountMinSketch<T>::init(int glambda,int gkappa,int gkappa_prime,int gfilter_PE_separately) { +template<typename T> void CountMinSketch<T>::init(int glambda,int gkappa,int gkappa_prime) { lambda=glambda; kappa=gkappa; kappa_prime=gkappa_prime; - filter_PE_separately=gfilter_PE_separately; int j; mask=get_mask<T>::value; cms_lambda_array= (T**) malloc(lambda*sizeof(T*)); @@ -283,28 +252,23 @@ template<typename T> T CountMinSketch<T>::getEstimatedNbOcc(const unsigned long& return min; } -template<typename T> int CountMinSketch<T>::addRead(const T_read_numericValues& read_val) { +template<typename T> int CountMinSketch<T>::addRead(const readNumericValues& read_val) { int keep_r=isRCovBelowThres(read_val,kappa); if (keep_r) { readNumericValues::const_iterator it; // according to the intel profiler, we spend a lot of time in this loop. Try to use something slightly faster. - long const * p=(long const *) &read_val.single_or_PE_val[0]; + long const * p=(long const *) &read_val[0]; int cnt; - int stop=read_val.single_or_PE_val.size(); + int stop=read_val.size(); for (cnt=0;cnt<stop;cnt++) { this->addKMer(*p); p=p+1; } -/* - for (it=read_val.single_or_PE_val.begin();it!=read_val.single_or_PE_val.end();it++) { - this->addKMer(*it); - } -*/ } return keep_r; } -template<typename T> int CountMinSketch<T>::isBeneathMinKappa(const T_read_numericValues& read_val) { +template<typename T> int CountMinSketch<T>::isBeneathMinKappa(const readNumericValues& read_val) { int res=isRCovBelowThres(read_val,kappa_prime); return res; } diff --git a/src/Filter.hpp b/src/Filter.hpp index 45e3de85f772dbe03871f0736f20ab7510f84307..8537fa225ef2b7ce8afbf50e030516b1578691f3 100644 --- a/src/Filter.hpp +++ b/src/Filter.hpp @@ -85,7 +85,7 @@ template <typename T> void Filter::underlyingfillCMS(FqBaseBackend* map_id_backe i_dim::iterator it_offs; k_dim::iterator it_struct; ReadProcessor read_p(k); - T_read_numericValues nbrKmerDecompo; + readNumericValues nbrKmerDecompo; // 1rst step, open files before fetching reads in them int i; @@ -103,14 +103,14 @@ template <typename T> void Filter::underlyingfillCMS(FqBaseBackend* map_id_backe init_DnaSeqStr(&a_seqs[1]); getDNASeqstr(map_id_backend,*it_struct,j,a_seqs,qual_thres.nucl_score_threshold); // decompose dna string into k-mers and turn k_mers into numbers. - decomposeReadInKMerNums(read_p, nbrKmerDecompo,k,cms->arePEFilteredAsSingle(),a_seqs); + decomposeReadInKMerNums(read_p, nbrKmerDecompo,k,a_seqs); // insert k-mers into CMS if overall read satisfies kappa condition. ret=cms->addRead(nbrKmerDecompo); if (!ret) { // read is rejected, update rpos structure accordingly. it_struct->fileid=0; } - nbrKmerDecompo.single_or_PE_val.clear(); + nbrKmerDecompo.clear(); } } } @@ -126,7 +126,7 @@ template <typename T> void Filter::underlyinglowFilterCMS(FqBaseBackend* map_id_ k_dim::iterator it_struct; int ret; ReadProcessor read_p(k); - T_read_numericValues nbrKmerDecompo; + readNumericValues nbrKmerDecompo; // 1rst step, open files before fetching reads in them int i; @@ -145,10 +145,10 @@ template <typename T> void Filter::underlyinglowFilterCMS(FqBaseBackend* map_id_ init_DnaSeqStr(&a_seqs[1]); getDNASeqstr(map_id_backend,*it_struct,j,a_seqs); // decompose dna string into k-mers and turn k_mers into numbers. - decomposeReadInKMerNums(read_p, nbrKmerDecompo,k,cms->arePEFilteredAsSingle(),a_seqs); + decomposeReadInKMerNums(read_p, nbrKmerDecompo,k,a_seqs); ret=cms->isBeneathMinKappa(nbrKmerDecompo); if (ret) it_struct->fileid=0; - nbrKmerDecompo.single_or_PE_val.clear(); + nbrKmerDecompo.clear(); } } } diff --git a/src/FqAuxBackend.cpp b/src/FqAuxBackend.cpp index 2f6d1a6d2262291fab07be69608c332845c684ad..311e5e35cc91e57123d6ac0728aaa8ed68064e51 100644 --- a/src/FqAuxBackend.cpp +++ b/src/FqAuxBackend.cpp @@ -23,7 +23,6 @@ #include "FqConstants.h" #include <stdio.h> -//#include <stdlib.h> #include <fcntl.h> #include <unistd.h> #include <errno.h> diff --git a/src/ROCKparams.h b/src/ROCKparams.h index 2ff7bb45c8443d539fb098bd21f17f36d91403fd..bea282b6eb4f6fe9b215ba7898f0b6daf9a30f9f 100755 --- a/src/ROCKparams.h +++ b/src/ROCKparams.h @@ -76,11 +76,9 @@ class ROCKparams{ vector<PE_files> v_PE_files; int f_id; unsigned long cms_size; - //unsigned long total_machine_RAM; float expected_collision_proba; - //void computeLambda(); void processMainArgs(int optind, int argc, char ** argv,std::vector<string>& v_input_lines); int loadInOutFileArgs(const std::string& input_file,const std::string& output_file,std::vector<string>& v_input_lines,std::vector<string>& v_output_lines); int loadFileArgs(const std::string& afile,std::vector<string>& v_lines); @@ -107,7 +105,6 @@ class ROCKparams{ public: FasqQualThreshold getQualThreshold(); - //int getfilterPEMode(); ROCKparams() { f_id=0; // to generate id of input/output fastq files. +1=total number of files. @@ -117,7 +114,6 @@ public: parms.kappa=70; parms.kappa_prime=0; parms.lambda=0; - parms.filter_PE_separately=0; parms.wanted_max_collision_proba=k_max_collision_proba; qual_thres.min_correct_k_mers_in_read=1; qual_thres.nucl_score_threshold=k_phred_32; @@ -125,14 +121,11 @@ public: verbose_mode=0; cms_size=0; expected_collision_proba=0.0; //! collision probability that is computed at the beginning of ROCK from the expected number of distinct k-mers provided by the user. - //min_correct_k_mers_in_read=0; - //total_machine_RAM=getNodePhysMemory(); parms.max_filter_size=getNodePhysMemory()/100.0*90-defaultGRPMAXSize; // Prefer not to use all the machine's memory if (parms.max_filter_size==0) throw EXIT_FAILURE; } - void initFromMainOptsArgs(int argc,char ** argv); @@ -162,16 +155,8 @@ public: return k; } - // cms_size is computed when ROCK starts. - /*void setFilterSize(unsigned long fsize) { - cms_size=fsize; - }*/ unsigned long getFilterSize() const { - /*unsigned long cms_size_in_GB=cms_size/1024; - cms_size_in_GB/=1024; - cms_size_in_GB/=1024; - return cms_size_in_GB;*/ return cms_size; } diff --git a/src/read_utils.cpp b/src/read_utils.cpp index 6fcf98c9fe92283e6e0ca4ecfcff361900a629ee..d223fca11d76d04df4c131365d4c942b71ad112f 100644 --- a/src/read_utils.cpp +++ b/src/read_utils.cpp @@ -149,21 +149,19 @@ void getDNASeqstr(FqBaseBackend* fq_files_be [], } -void decomposeReadInKMerNums(ReadProcessor& read_p, T_read_numericValues& nbrKmerDecompo,int k,const int& filter_PE_as_single,DnaSeqStr a_seqs[2]) { - int nb_expected_k_mers_PE1; - int nb_expected_k_mers_PE2=0; - nbrKmerDecompo.idx_start_PE2=0; +void decomposeReadInKMerNums(ReadProcessor& read_p, readNumericValues& nbrKmerDecompo,int k,DnaSeqStr a_seqs[2]) { + int nb_expected_k_mers,nb_expected_k_mers_PE1,nb_expected_k_mers_PE2=0; nb_expected_k_mers_PE1=a_seqs[0].length+1-k; if (a_seqs[1].length) nb_expected_k_mers_PE2=a_seqs[1].length+1-k; - nbrKmerDecompo.single_or_PE_val.reserve(nb_expected_k_mers_PE1+nb_expected_k_mers_PE2); + nb_expected_k_mers=nb_expected_k_mers_PE1+nb_expected_k_mers_PE2; + nbrKmerDecompo.reserve(nb_expected_k_mers); char * start_dna_str=a_seqs[0].fq_record_buf+a_seqs[0].start_idx; - read_p.getKMerNumbers(start_dna_str,a_seqs[0].length,nbrKmerDecompo.single_or_PE_val); + read_p.getKMerNumbers(start_dna_str,a_seqs[0].length,nbrKmerDecompo); if (a_seqs[1].length) { // case of PE reads - if (!filter_PE_as_single) nbrKmerDecompo.idx_start_PE2=nbrKmerDecompo.single_or_PE_val.size(); start_dna_str=a_seqs[1].fq_record_buf+a_seqs[1].start_idx; - read_p.getKMerNumbers(start_dna_str,a_seqs[1].length,nbrKmerDecompo.single_or_PE_val); + read_p.getKMerNumbers(start_dna_str,a_seqs[1].length,nbrKmerDecompo); } } diff --git a/src/read_utils.h b/src/read_utils.h index 48c12c0c19d06803139f07423ed1998e096e3766..5bb504cbef4b7eefbb0006acae918977eaff4fb9 100644 --- a/src/read_utils.h +++ b/src/read_utils.h @@ -36,8 +36,6 @@ void getDNASeqstr(FqBaseBackend* [], void init_DnaSeqStr(DnaSeqStr * dna_seq); -void decomposeReadInKMerNums(ReadProcessor& read_p, T_read_numericValues& nbrKmerDecompo,int k,const int &,DnaSeqStr a_seqs[2]); -// void decomposeReadInKMerNums(ReadProcessor& read_p, T_read_numericValues& nbrKmerDecompo,int k,int DnaSeqStr a_seqs[2] ); - +void decomposeReadInKMerNums(ReadProcessor& read_p, readNumericValues& nbrKmerDecompo,int k,DnaSeqStr a_seqs[2]); #endif /* READ_UTILS_H_ */ diff --git a/src/rock.cpp b/src/rock.cpp index 6fce394572d18a114a50c74089abc9ebcd00a9c4..55673a73dd384e48788f28b22d8c8fa912c1600a 100755 --- a/src/rock.cpp +++ b/src/rock.cpp @@ -74,7 +74,6 @@ void printCMSparams(const CMSparams& p) { cout<<"kappa="<<p.kappa<<endl; cout<<"kappa_prime="<<p.kappa_prime<<endl; cout<<"filter_size="<<p.max_filter_size<<endl; - cout<<"filter_PE_separately="<<p.filter_PE_separately<<endl; } const void printVerboseInfo(const ROCKparams& Rparms,const Filter& filter,const T_read_counters& rc,\ @@ -142,9 +141,7 @@ int main(int argc,char * argv[]) { main_parms.initFromMainOptsArgs(argc,argv); int f_id=main_parms.get_f_id(); CMSparams parms=main_parms.getCMSparams(); - //printCMSparams(parms); FasqQualThreshold qual_thres=main_parms.getQualThresholds(); - //printFastqQualThreshold(qual_thres); std::vector<IO_fq_files> single_files=main_parms.get_single_files(); vector<PE_files> v_PE_files=main_parms.get_PE_files(); diff --git a/src/rock_commons.h b/src/rock_commons.h index 78e5ae192f57a8ab11ed84e5c7997fd43947c0aa..a6fc52d58132393ed2e1b94c666cdefcadb2caa9 100644 --- a/src/rock_commons.h +++ b/src/rock_commons.h @@ -72,11 +72,9 @@ typedef struct { typedef std::vector<unsigned long> readNumericValues; -typedef struct { +/*typedef struct { readNumericValues single_or_PE_val; - int idx_start_PE2; // In he case where PE reads are not treated as single; - // indicate at which index the single_or_PE_val array starts containing values for PE2 k-mers. -}T_read_numericValues; +}T_read_numericValues;*/ typedef struct { unsigned int nb_input_reads; diff --git a/src/unit_test_cms.cpp b/src/unit_test_cms.cpp index 60601743348f317aa4703e0c1f7a3fdb7fd18c97..21bee4ac7e459ce2e3c383679565f29c91d62c2c 100644 --- a/src/unit_test_cms.cpp +++ b/src/unit_test_cms.cpp @@ -47,10 +47,8 @@ void test_getBestLambdaForN() { assert(best==1); nb_k_mer=10000000000; best=Rparms.getBestLambdaForN(nb_k_mer,5,UINT_MAX,0.1); - //cout<<best<<endl; assert(best==1); best=Rparms.getBestLambdaForN(nb_k_mer,1,UINT_MAX,0.05); - //cout<<best<<endl; assert(best==8); nb_k_mer=50000000000; @@ -137,19 +135,16 @@ void test_CMS(int lambda,int kappa,int kappa_prime) { } // Now, check that our k-mer is present in the CMS. int min=cms.getEstimatedNbOcc(num); - //std::cout<<"min="<<min<<endl; assert((min==num) || (min==num-get_mask<unsigned char>::value-1)); // addKmer doesn't check kappa. unsigned long nb_distinct_k_mers=cms.getNbDistinctKMers(); - //cout<<nb_distinct_k_mers<<endl; assert(nb_distinct_k_mers==1); // mimic a read (as given by part 2 output). - T_read_numericValues read_values; + readNumericValues read_values; for (i=1;i<=400;i++) { - read_values.single_or_PE_val.push_back(i*1000-1); + read_values.push_back(i*1000-1); } - read_values.idx_start_PE2=0; ret=cms.addRead(read_values); // read should be accepted since none of the values that it contains is equal to those I inserted previously. assert(ret!=rej_expected); @@ -157,13 +152,12 @@ void test_CMS(int lambda,int kappa,int kappa_prime) { assert(nb_distinct_k_mers==401); // mimic a vector that contains a lot of k_mers already present in the CMS and check that it is rejected. - T_read_numericValues read_values2; - read_values2.idx_start_PE2=0; + readNumericValues read_values2; for (i=1;i<=199;i++) { - read_values2.single_or_PE_val.push_back(i*1000-1); + read_values2.push_back(i*1000-1); } for (i=200;i<=400;i++) { - read_values2.single_or_PE_val.push_back(num); + read_values2.push_back(num); } ret=cms.addRead(read_values2); assert(ret==rej_expected); @@ -178,231 +172,7 @@ void test_CMS(int lambda,int kappa,int kappa_prime) { -void testCMS_with_PE_NotasSingle(int lambda,int kappa,int kappa_prime) { - // int filter_PE_as_single=0; - int filter_PE_separately=1; - CountMinSketch<unsigned char>* cms=new CountMinSketch<unsigned char>(lambda,kappa,kappa_prime,filter_PE_separately); - int i; - cout<<"size of the CMS component: "<<sizeof(cms)<<endl; - int num=200; - //int rej_expected=0; - int ret; - for (i=0;i<kappa;i++) { - cms->addKMer(num); - } - int min=cms->getEstimatedNbOcc(num); - assert(min==kappa); - // mimic a PE read (as given by part 2 output). - T_read_numericValues read_values; - - // mimic a PE read in which PE1 contains k-mer that appear rarely in the CMS and PE2 contains k_mers that already appear kappa times in the CMS. - for (i=1;i<=200;i++) { - read_values.single_or_PE_val.push_back(i*1000-1); - } - read_values.idx_start_PE2=200; - for (i=1;i<=239;i++) { - read_values.single_or_PE_val.push_back(num); - } - read_values.single_or_PE_val.push_back(50); - read_values.single_or_PE_val.push_back(55); - ret=cms->addRead(read_values); // read should be accepted since PE1 is nor over covered. - assert(ret>0); - - ret=cms->isBeneathMinKappa(read_values); // read should not be considered under covered since only PE1 is under covered. - assert(ret==0); - - // Now, change PE1 a little and insert it kappa times in the CMS - for (int j=1;j<=kappa;j++) { - read_values.single_or_PE_val[2]=j; - ret=cms->addRead(read_values); - if (j<=kappa-1) assert(ret>0); // read should be accepted as long as we haven't inserted PE1 kappa times. - else assert(ret==0); //otherwise it is rejected. - } - - ret=cms->isBeneathMinKappa(read_values); // read should not be considered under covered since PE1 was inserted kappa times. - assert(ret==0); - - // mimic a PE read (as given by part 2 output). - T_read_numericValues read_values2; - - // mimic a PE read that will be inserted in the CMS kappa_prime+1 times to check low filter. - for (i=1;i<=150;i++) { - read_values2.single_or_PE_val.push_back(i*1500-1); - } - read_values2.idx_start_PE2=150; - for (i=1;i<=151;i++) { - read_values2.single_or_PE_val.push_back(i*1600-1); - } - for (i=0;i<kappa_prime+1;i++) { - ret=cms->addRead(read_values2); - assert(ret>0); - } - - ret=cms->isBeneathMinKappa(read_values2); - assert(ret==0); - //mimic a read for which PE1 is under kappa_prime and PE2 too. Such a read is expected to be removed by low filter - T_read_numericValues read_values3; - - for (i=1;i<=200;i++) { - read_values3.single_or_PE_val.push_back(i); - } - read_values3.idx_start_PE2=201; - for (i=1;i<=205;i++) { - read_values3.single_or_PE_val.push_back(i); - } - ret=cms->addRead(read_values3); - assert(ret>0); - - ret=cms->isBeneathMinKappa(read_values3); - assert(ret>0); - - //mimic a read for which PE1 is over kappa_prime but not PE2 - T_read_numericValues read_values4; - for (i=201;i<=399;i++) { - read_values4.single_or_PE_val.push_back(200); - } - read_values4.idx_start_PE2=199; - for (i=201;i<=399;i++) { - read_values4.single_or_PE_val.push_back(i); - } - ret=cms->addRead(read_values4); - assert(ret>0); - - ret=cms->isBeneathMinKappa(read_values4); - assert(ret==0); - - //mimic a read for which PE2 is over kappa_prime but not PE1 - T_read_numericValues read_values5; - for (i=201;i<=399;i++) { - read_values5.single_or_PE_val.push_back(i); - } - read_values5.idx_start_PE2=199; - for (i=201;i<=399;i++) { - read_values5.single_or_PE_val.push_back(200); - } - ret=cms->addRead(read_values5); - assert(ret>0); - - ret=cms->isBeneathMinKappa(read_values5); - assert(ret==0); - - delete cms; -} - -/* Test new implementation of PE as single. - * In this implementation, we process PE separately and we keep a PE if med(cov(PE1)) or med((covPE2)) < C and - * med(cov(PE1)) or med(cov(PE2)<c .*/ -void testCMS_with_PE_NotasSingle2(int lambda,int kappa,int kappa_prime) { - int filter_PE_separately=2; - CountMinSketch<unsigned char>* cms=new CountMinSketch<unsigned char>(lambda,kappa,kappa_prime,filter_PE_separately); - int i; - cout<<"size of the CMS component: "<<sizeof(cms)<<endl; - int num=200; - //int rej_expected=0; - int ret; - for (i=0;i<kappa;i++) { - cms->addKMer(num); - } - int min=cms->getEstimatedNbOcc(num); - assert(min==kappa); - // mimic a PE read (as given by part 2 output). - T_read_numericValues read_values; - - // mimic a PE read in which PE1 contains k-mer that appear rarely in the CMS and PE2 contains k_mers that already appear kappa times in the CMS. - for (i=1;i<=200;i++) { - read_values.single_or_PE_val.push_back(i*1000-1); - } - read_values.idx_start_PE2=200; - for (i=1;i<=239;i++) { - read_values.single_or_PE_val.push_back(num); - } - read_values.single_or_PE_val.push_back(50); - read_values.single_or_PE_val.push_back(55); - ret=cms->addRead(read_values); // read should be accepted since PE1 is nor over covered. - assert(ret>0); - - ret=cms->isBeneathMinKappa(read_values); // read should be considered under covered since PE1 is under covered. - assert(ret>0); - - // Now, change PE1 a little and insert it kappa times in the CMS - for (int j=1;j<=kappa;j++) { - read_values.single_or_PE_val[2]=j; - ret=cms->addRead(read_values); - if (j<=kappa-1) assert(ret>0); // read should be accepted as long as we haven't inserted PE1 kappa times. - else assert(ret==0); //otherwise it is rejected. - } - - ret=cms->isBeneathMinKappa(read_values); // read should not be considered under covered since PE1 was inserted kappa times. - assert(ret==0); - - // mimic a PE read (as given by part 2 output). - T_read_numericValues read_values2; - - // mimic a PE read that will be inserted in the CMS kappa_prime+1 times to check low filter. - for (i=1;i<=150;i++) { - read_values2.single_or_PE_val.push_back(i*1500-1); - } - read_values2.idx_start_PE2=150; - for (i=1;i<=151;i++) { - read_values2.single_or_PE_val.push_back(i*1600-1); - } - for (i=0;i<kappa_prime+1;i++) { - ret=cms->addRead(read_values2); - assert(ret>0); - } - - ret=cms->isBeneathMinKappa(read_values2); - assert(ret==0); - - //mimic a read for which PE1 is under kappa_prime and PE2 too. Such a read is expected to be removed by low filter - T_read_numericValues read_values3; - - for (i=1;i<=200;i++) { - read_values3.single_or_PE_val.push_back(i); - } - read_values3.idx_start_PE2=201; - for (i=1;i<=205;i++) { - read_values3.single_or_PE_val.push_back(i); - } - ret=cms->addRead(read_values3); - assert(ret>0); - - ret=cms->isBeneathMinKappa(read_values3); - assert(ret>0); - - //mimic a read for which PE1 is over kappa_prime but not PE2 - T_read_numericValues read_values4; - for (i=201;i<=399;i++) { - read_values4.single_or_PE_val.push_back(200); - } - read_values4.idx_start_PE2=199; - for (i=201;i<=399;i++) { - read_values4.single_or_PE_val.push_back(i); - } - ret=cms->addRead(read_values4); - assert(ret>0); - - ret=cms->isBeneathMinKappa(read_values4); - assert(ret>0); - - //mimic a read for which PE2 is over kappa_prime but not PE1 - T_read_numericValues read_values5; - for (i=201;i<=399;i++) { - read_values5.single_or_PE_val.push_back(i); - } - read_values5.idx_start_PE2=199; - for (i=201;i<=399;i++) { - read_values5.single_or_PE_val.push_back(200); - } - ret=cms->addRead(read_values5); - assert(ret>0); - - ret=cms->isBeneathMinKappa(read_values5); - assert(ret>0); - - delete cms; -} /* * TODO: move this somewhere else. @@ -470,11 +240,6 @@ int main(int argc, char **argv) { test_ROCKparams(); cout<<"testing CMS with lambda="<<lambda<<endl; test_CMS(lambda,kappa,kappa_prime); // Finally using C arrays (maps implied storing hash keys : 4 Bytes per k_mer overhead) but each array is of size INT_MAX... - //cout<<"testing CMS with PE not as single strict kappa prime implementation"<<endl; - // VL comment these tests since AC says That the best option is to treat PE as single. - /*testCMS_with_PE_NotasSingle(lambda,kappa,kappa_prime); - cout<<"testing CMS with PE not as single lenient kappa prime implementation"<<endl; - testCMS_with_PE_NotasSingle2(lambda,kappa,kappa_prime);*/ cout<<"done"<<endl; // profiling_CMS_fill(); diff --git a/src/unit_test_read_utils.cpp b/src/unit_test_read_utils.cpp index 648af9aa9da03ef3026b8582ed10e0220722ee17..5b22586cf93491a9945726cc559d688412b9788c 100644 --- a/src/unit_test_read_utils.cpp +++ b/src/unit_test_read_utils.cpp @@ -202,7 +202,7 @@ void test_getReadPEWithNQST() { void test_decomposeReadInkMerNums() { int k=5; ReadProcessor read_p(k); - T_read_numericValues nbrKmerDecompo; + readNumericValues nbrKmerDecompo; DnaSeqStr a_seqs[2]; init_DnaSeqStr(&a_seqs[0]); @@ -212,26 +212,19 @@ void test_decomposeReadInkMerNums() { strcpy(seq1->fq_record_buf,"@fake_stuff\nAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\n+\n....@\n"); seq1->start_idx=12; seq1->length=75; - decomposeReadInKMerNums(read_p,nbrKmerDecompo,k,1,a_seqs); - assert(nbrKmerDecompo.single_or_PE_val.size()==71); - assert(nbrKmerDecompo.idx_start_PE2==0); + decomposeReadInKMerNums(read_p,nbrKmerDecompo,k,a_seqs); + assert(nbrKmerDecompo.size()==71); readNumericValues::iterator it; - for (it=nbrKmerDecompo.single_or_PE_val.begin();it!=nbrKmerDecompo.single_or_PE_val.end();it++) { + for (it=nbrKmerDecompo.begin();it!=nbrKmerDecompo.end();it++) { assert(*it==1023); } DnaSeqStr * seq2=&a_seqs[1]; strcpy(seq2->fq_record_buf,"@another_fake_stuff\nATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCG\n+\n....@\n"); seq2->start_idx=20; seq2->length=80; - nbrKmerDecompo.single_or_PE_val.clear(); - decomposeReadInKMerNums(read_p,nbrKmerDecompo,k,1,a_seqs); - assert(nbrKmerDecompo.single_or_PE_val.size()==147); - assert(nbrKmerDecompo.idx_start_PE2==0); - - T_read_numericValues nbrKmerDecompo2; - decomposeReadInKMerNums(read_p,nbrKmerDecompo2,k,0,a_seqs); - assert(nbrKmerDecompo2.single_or_PE_val.size()==147); - assert(nbrKmerDecompo2.idx_start_PE2==71); + nbrKmerDecompo.clear(); + decomposeReadInKMerNums(read_p,nbrKmerDecompo,k,a_seqs); + assert(nbrKmerDecompo.size()==147); } @@ -312,8 +305,6 @@ void testDNAToNumberMoreComplex() { // TODO in main, check that k<=32. int nb_expected_k_mers=strlen(dnaStr)+1; nb_expected_k_mers-=k; nbrKmerDecompo.reserve(nb_expected_k_mers); - // unsigned long expected=read_p.kMerToNumber((char *) "GCTACCATAACACCAACTGTTTTCACCATA",NULL); - // unsigned long expected_rev=read_p.kMerToNumber((char *) "TATGGTGAAAACAGTTGGTGTTATGGTAGC",NULL); read_p.getKMerNumbers((char *) dnaStr,strlen(dnaStr),nbrKmerDecompo); assert(nbrKmerDecompo.size()==nb_expected_k_mers); // for each k_mer, we also expect to have a number for its reverse complement. @@ -356,7 +347,6 @@ void testDNAToNumberMoreComplex() { // TODO in main, check that k<=32. } } -// NTTTTNGGTGCTACCATAACACCAACTGTTTTCACCATAATTTTAANATCAAGCATTAGAGACGCTTCTCTAATGTATTGCAAATCTAGTTCTACCATTTGATGAAAATCTAATTCATTTCTTCCACTAACCTGCCATAATCCAGTACAACCTGGTATAACGGTCAAACGCTTTTTATCATAGGAACTGTATTCTCCTACCTCACGTGGCAAAGGAGGTCTTGGACCAACAATTGCCATGTCTCCTTTAACCACATTCCAAAGCTGTGGTA /* * Tests the handling of N nucleotides. @@ -405,7 +395,6 @@ void test_getReadWithNQST() { assert(my_struct1.read_a1==639); assert(my_struct2.read_a1==1228); srp io_sr; - //unsigned int j=0; DnaSeqStr a_seqs; char dna_read[MAX_READ_LENGTH]; @@ -424,8 +413,6 @@ void test_getReadWithNQST() { tmp+=a_seqs.start_idx; memcpy(dna_read,tmp,a_seqs.length); dna_read[a_seqs.length]='\0'; - /*std::cout<<"modified dna read:"<<endl; - std::cout<<dna_read<<endl;*/ assert(strcmp(dna_read,"NNNNNAGGTGCTACCATAACACCAACTGTTTTCACNATAATTTTAAAATCAAGCATTAGAGACGCNTCTCTAATGTATTGCAAATCTAGTTCTACCATTTGATGAAAATCTAATTNATTTCTTCCACTANCCTGCCATAATCCAGTACAACCTGGTATAACGGNCAANCGCTTTTTATCATAGGANCTGTATTCTCCTACCTCACGTGGCAAAGGAGGNCTTGGACCAACAATTGCCATGTCTCCTTTAACCACATTCCAAAGCTGNNNNN")==0); init_DnaSeqStr(&a_seqs); @@ -434,7 +421,6 @@ void test_getReadWithNQST() { tmp=(char *) a_seqs.fq_record_buf; tmp+=a_seqs.start_idx; memcpy(dna_read,tmp,a_seqs.length); - // std::cout<<dna_read<<endl; assert(strcmp(dna_read,"NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN")==0); // now test the same thing with PE reads