From 7e557a77124e54011b43a30834c93c3814ff53f6 Mon Sep 17 00:00:00 2001 From: Veronique Legrand <vlegrand@pasteur.fr> Date: Mon, 31 May 2021 15:30:08 +0200 Subject: [PATCH] did some cleaning: removed all read processing taht didn't treat PE as single --- src/CountMinSketch.hpp | 70 +++------- src/Filter.hpp | 12 +- src/FqAuxBackend.cpp | 1 - src/ROCKparams.h | 15 --- src/read_utils.cpp | 14 +- src/read_utils.h | 4 +- src/rock.cpp | 3 - src/rock_commons.h | 6 +- src/unit_test_cms.cpp | 245 +---------------------------------- src/unit_test_read_utils.cpp | 28 +--- 10 files changed, 44 insertions(+), 354 deletions(-) diff --git a/src/CountMinSketch.hpp b/src/CountMinSketch.hpp index 55c2058..a8f0702 100644 --- a/src/CountMinSketch.hpp +++ b/src/CountMinSketch.hpp @@ -41,9 +41,6 @@ typedef struct { int kappa; int kappa_prime; int max_filter_size; // max amount of RAM wanted for the CMS. - int filter_PE_separately; // indicates whether PE reads must be treated as single by the cms. Indeed it may appear that 1 end contains useful k-mer but that other end contains k-mer such that "global" median is below threshold. - // In this case, read is rejected by the CMS (default behavior). We want to do an experimentation to see if keeping such reads wold lead to better results in assembling. - // Values can be 0 (process PE as single),1 (process PE separately with strict implementation for kappa_prime) or 2 (process PE separately with lenient implementation for kappa prime). float wanted_max_collision_proba; } CMSparams; @@ -73,7 +70,6 @@ private: int lambda; int kappa; int kappa_prime; - int filter_PE_separately; T ** cms_lambda_array; T mask; @@ -84,11 +80,11 @@ private: } - inline int isRCovBelowThres(const T_read_numericValues& read_val, const int& threshold); + inline int isRCovBelowThres(const readNumericValues& read_val, const int& threshold); inline int isRCovBelowThresPartial(const readNumericValues& read_val, const int& threshold, const int& start=0,const int& stop=0); - void init(int glambda,int gkappa,int gkappa_prime,int filter_PE_separately=0); + void init(int glambda,int gkappa,int gkappa_prime); // for unit tests. friend void test_CMS(int lambda,int kappa,int kappa_prime); @@ -96,12 +92,12 @@ private: public: - CountMinSketch(int glambda,int gkappa,int gkappa_prime,int filter_PE_separately=0) { - init(glambda,gkappa,gkappa_prime,filter_PE_separately); + CountMinSketch(int glambda,int gkappa,int gkappa_prime) { + init(glambda,gkappa,gkappa_prime); } CountMinSketch(CMSparams parms) { - init(parms.lambda,parms.kappa,parms.kappa_prime,parms.filter_PE_separately); + init(parms.lambda,parms.kappa,parms.kappa_prime); } ~CountMinSketch() { @@ -134,15 +130,12 @@ public: // keep that just for unit testing purpose T getEstimatedNbOcc(const unsigned long& val); - int addRead(const T_read_numericValues& read_val); + int addRead(const readNumericValues& read_val); - int isBeneathMinKappa(const T_read_numericValues&read_val); + int isBeneathMinKappa(const readNumericValues&read_val); unsigned long getNbDistinctKMers(); - int arePEFilteredAsSingle() { - return !filter_PE_separately; - } }; @@ -151,24 +144,20 @@ public: /* * Determines whether median of k-mer coverage is below threshold or not. */ -//#include <stdio.h> // this is a few seconds faster -template<typename T> inline int CountMinSketch<T>::isRCovBelowThres(const T_read_numericValues& read_val, const int& threshold) { +template<typename T> inline int CountMinSketch<T>::isRCovBelowThres(const readNumericValues& read_val, const int& threshold) { int PE1_below_thres; - int PE2_below_thres=0; int a1=0,a2=0; - int b1,b2=0; + int b1; int j; unsigned int h; T min; - int num=read_val.single_or_PE_val.size(); - - read_val.idx_start_PE2?b1=read_val.idx_start_PE2:b1=num; - read_val.idx_start_PE2?b2=num-b1:b2; + int num=read_val.size(); + b1=num; // this proves faster than using iterators on these data (short) - long const * p_start=(long const * )&(read_val.single_or_PE_val[0]); // TODO: when we'll use C++11: use the data() method of the vector class to access the underlying C array. + long const * p_start=(long const * )&(read_val[0]); // TODO: when we'll use C++11: use the data() method of the vector class to access the underlying C array. int nb=0; while(nb<b1 && 2*a1<=b1) { j=lambda; @@ -181,25 +170,6 @@ template<typename T> inline int CountMinSketch<T>::isRCovBelowThres(const T_read nb++; } PE1_below_thres=2*a1>b1; - if (b2) { - nb=b1; - while(nb<num && 2*a2<=b2) { - j=lambda; - min=mask; - while (--j>=0 && min>threshold) { - h=hash64to32(*(p_start+nb),j); - min=cms_lambda_array[j] [h]; - } - (min<threshold)? a2+=1:a2; - nb++; - } - PE2_below_thres=2*a2>b2; - if (threshold==kappa) return (PE1_below_thres || PE2_below_thres); - else { - if (filter_PE_separately==1) return(PE1_below_thres && PE2_below_thres); - return(PE1_below_thres || PE2_below_thres); - } - } return PE1_below_thres; } @@ -257,11 +227,10 @@ template<typename T> inline int CountMinSketch<T>::isRCovBelowThres(const T_read -template<typename T> void CountMinSketch<T>::init(int glambda,int gkappa,int gkappa_prime,int gfilter_PE_separately) { +template<typename T> void CountMinSketch<T>::init(int glambda,int gkappa,int gkappa_prime) { lambda=glambda; kappa=gkappa; kappa_prime=gkappa_prime; - filter_PE_separately=gfilter_PE_separately; int j; mask=get_mask<T>::value; cms_lambda_array= (T**) malloc(lambda*sizeof(T*)); @@ -283,28 +252,23 @@ template<typename T> T CountMinSketch<T>::getEstimatedNbOcc(const unsigned long& return min; } -template<typename T> int CountMinSketch<T>::addRead(const T_read_numericValues& read_val) { +template<typename T> int CountMinSketch<T>::addRead(const readNumericValues& read_val) { int keep_r=isRCovBelowThres(read_val,kappa); if (keep_r) { readNumericValues::const_iterator it; // according to the intel profiler, we spend a lot of time in this loop. Try to use something slightly faster. - long const * p=(long const *) &read_val.single_or_PE_val[0]; + long const * p=(long const *) &read_val[0]; int cnt; - int stop=read_val.single_or_PE_val.size(); + int stop=read_val.size(); for (cnt=0;cnt<stop;cnt++) { this->addKMer(*p); p=p+1; } -/* - for (it=read_val.single_or_PE_val.begin();it!=read_val.single_or_PE_val.end();it++) { - this->addKMer(*it); - } -*/ } return keep_r; } -template<typename T> int CountMinSketch<T>::isBeneathMinKappa(const T_read_numericValues& read_val) { +template<typename T> int CountMinSketch<T>::isBeneathMinKappa(const readNumericValues& read_val) { int res=isRCovBelowThres(read_val,kappa_prime); return res; } diff --git a/src/Filter.hpp b/src/Filter.hpp index 45e3de8..8537fa2 100644 --- a/src/Filter.hpp +++ b/src/Filter.hpp @@ -85,7 +85,7 @@ template <typename T> void Filter::underlyingfillCMS(FqBaseBackend* map_id_backe i_dim::iterator it_offs; k_dim::iterator it_struct; ReadProcessor read_p(k); - T_read_numericValues nbrKmerDecompo; + readNumericValues nbrKmerDecompo; // 1rst step, open files before fetching reads in them int i; @@ -103,14 +103,14 @@ template <typename T> void Filter::underlyingfillCMS(FqBaseBackend* map_id_backe init_DnaSeqStr(&a_seqs[1]); getDNASeqstr(map_id_backend,*it_struct,j,a_seqs,qual_thres.nucl_score_threshold); // decompose dna string into k-mers and turn k_mers into numbers. - decomposeReadInKMerNums(read_p, nbrKmerDecompo,k,cms->arePEFilteredAsSingle(),a_seqs); + decomposeReadInKMerNums(read_p, nbrKmerDecompo,k,a_seqs); // insert k-mers into CMS if overall read satisfies kappa condition. ret=cms->addRead(nbrKmerDecompo); if (!ret) { // read is rejected, update rpos structure accordingly. it_struct->fileid=0; } - nbrKmerDecompo.single_or_PE_val.clear(); + nbrKmerDecompo.clear(); } } } @@ -126,7 +126,7 @@ template <typename T> void Filter::underlyinglowFilterCMS(FqBaseBackend* map_id_ k_dim::iterator it_struct; int ret; ReadProcessor read_p(k); - T_read_numericValues nbrKmerDecompo; + readNumericValues nbrKmerDecompo; // 1rst step, open files before fetching reads in them int i; @@ -145,10 +145,10 @@ template <typename T> void Filter::underlyinglowFilterCMS(FqBaseBackend* map_id_ init_DnaSeqStr(&a_seqs[1]); getDNASeqstr(map_id_backend,*it_struct,j,a_seqs); // decompose dna string into k-mers and turn k_mers into numbers. - decomposeReadInKMerNums(read_p, nbrKmerDecompo,k,cms->arePEFilteredAsSingle(),a_seqs); + decomposeReadInKMerNums(read_p, nbrKmerDecompo,k,a_seqs); ret=cms->isBeneathMinKappa(nbrKmerDecompo); if (ret) it_struct->fileid=0; - nbrKmerDecompo.single_or_PE_val.clear(); + nbrKmerDecompo.clear(); } } } diff --git a/src/FqAuxBackend.cpp b/src/FqAuxBackend.cpp index 2f6d1a6..311e5e3 100644 --- a/src/FqAuxBackend.cpp +++ b/src/FqAuxBackend.cpp @@ -23,7 +23,6 @@ #include "FqConstants.h" #include <stdio.h> -//#include <stdlib.h> #include <fcntl.h> #include <unistd.h> #include <errno.h> diff --git a/src/ROCKparams.h b/src/ROCKparams.h index 2ff7bb4..bea282b 100755 --- a/src/ROCKparams.h +++ b/src/ROCKparams.h @@ -76,11 +76,9 @@ class ROCKparams{ vector<PE_files> v_PE_files; int f_id; unsigned long cms_size; - //unsigned long total_machine_RAM; float expected_collision_proba; - //void computeLambda(); void processMainArgs(int optind, int argc, char ** argv,std::vector<string>& v_input_lines); int loadInOutFileArgs(const std::string& input_file,const std::string& output_file,std::vector<string>& v_input_lines,std::vector<string>& v_output_lines); int loadFileArgs(const std::string& afile,std::vector<string>& v_lines); @@ -107,7 +105,6 @@ class ROCKparams{ public: FasqQualThreshold getQualThreshold(); - //int getfilterPEMode(); ROCKparams() { f_id=0; // to generate id of input/output fastq files. +1=total number of files. @@ -117,7 +114,6 @@ public: parms.kappa=70; parms.kappa_prime=0; parms.lambda=0; - parms.filter_PE_separately=0; parms.wanted_max_collision_proba=k_max_collision_proba; qual_thres.min_correct_k_mers_in_read=1; qual_thres.nucl_score_threshold=k_phred_32; @@ -125,14 +121,11 @@ public: verbose_mode=0; cms_size=0; expected_collision_proba=0.0; //! collision probability that is computed at the beginning of ROCK from the expected number of distinct k-mers provided by the user. - //min_correct_k_mers_in_read=0; - //total_machine_RAM=getNodePhysMemory(); parms.max_filter_size=getNodePhysMemory()/100.0*90-defaultGRPMAXSize; // Prefer not to use all the machine's memory if (parms.max_filter_size==0) throw EXIT_FAILURE; } - void initFromMainOptsArgs(int argc,char ** argv); @@ -162,16 +155,8 @@ public: return k; } - // cms_size is computed when ROCK starts. - /*void setFilterSize(unsigned long fsize) { - cms_size=fsize; - }*/ unsigned long getFilterSize() const { - /*unsigned long cms_size_in_GB=cms_size/1024; - cms_size_in_GB/=1024; - cms_size_in_GB/=1024; - return cms_size_in_GB;*/ return cms_size; } diff --git a/src/read_utils.cpp b/src/read_utils.cpp index 6fcf98c..d223fca 100644 --- a/src/read_utils.cpp +++ b/src/read_utils.cpp @@ -149,21 +149,19 @@ void getDNASeqstr(FqBaseBackend* fq_files_be [], } -void decomposeReadInKMerNums(ReadProcessor& read_p, T_read_numericValues& nbrKmerDecompo,int k,const int& filter_PE_as_single,DnaSeqStr a_seqs[2]) { - int nb_expected_k_mers_PE1; - int nb_expected_k_mers_PE2=0; - nbrKmerDecompo.idx_start_PE2=0; +void decomposeReadInKMerNums(ReadProcessor& read_p, readNumericValues& nbrKmerDecompo,int k,DnaSeqStr a_seqs[2]) { + int nb_expected_k_mers,nb_expected_k_mers_PE1,nb_expected_k_mers_PE2=0; nb_expected_k_mers_PE1=a_seqs[0].length+1-k; if (a_seqs[1].length) nb_expected_k_mers_PE2=a_seqs[1].length+1-k; - nbrKmerDecompo.single_or_PE_val.reserve(nb_expected_k_mers_PE1+nb_expected_k_mers_PE2); + nb_expected_k_mers=nb_expected_k_mers_PE1+nb_expected_k_mers_PE2; + nbrKmerDecompo.reserve(nb_expected_k_mers); char * start_dna_str=a_seqs[0].fq_record_buf+a_seqs[0].start_idx; - read_p.getKMerNumbers(start_dna_str,a_seqs[0].length,nbrKmerDecompo.single_or_PE_val); + read_p.getKMerNumbers(start_dna_str,a_seqs[0].length,nbrKmerDecompo); if (a_seqs[1].length) { // case of PE reads - if (!filter_PE_as_single) nbrKmerDecompo.idx_start_PE2=nbrKmerDecompo.single_or_PE_val.size(); start_dna_str=a_seqs[1].fq_record_buf+a_seqs[1].start_idx; - read_p.getKMerNumbers(start_dna_str,a_seqs[1].length,nbrKmerDecompo.single_or_PE_val); + read_p.getKMerNumbers(start_dna_str,a_seqs[1].length,nbrKmerDecompo); } } diff --git a/src/read_utils.h b/src/read_utils.h index 48c12c0..5bb504c 100644 --- a/src/read_utils.h +++ b/src/read_utils.h @@ -36,8 +36,6 @@ void getDNASeqstr(FqBaseBackend* [], void init_DnaSeqStr(DnaSeqStr * dna_seq); -void decomposeReadInKMerNums(ReadProcessor& read_p, T_read_numericValues& nbrKmerDecompo,int k,const int &,DnaSeqStr a_seqs[2]); -// void decomposeReadInKMerNums(ReadProcessor& read_p, T_read_numericValues& nbrKmerDecompo,int k,int DnaSeqStr a_seqs[2] ); - +void decomposeReadInKMerNums(ReadProcessor& read_p, readNumericValues& nbrKmerDecompo,int k,DnaSeqStr a_seqs[2]); #endif /* READ_UTILS_H_ */ diff --git a/src/rock.cpp b/src/rock.cpp index 6fce394..55673a7 100755 --- a/src/rock.cpp +++ b/src/rock.cpp @@ -74,7 +74,6 @@ void printCMSparams(const CMSparams& p) { cout<<"kappa="<<p.kappa<<endl; cout<<"kappa_prime="<<p.kappa_prime<<endl; cout<<"filter_size="<<p.max_filter_size<<endl; - cout<<"filter_PE_separately="<<p.filter_PE_separately<<endl; } const void printVerboseInfo(const ROCKparams& Rparms,const Filter& filter,const T_read_counters& rc,\ @@ -142,9 +141,7 @@ int main(int argc,char * argv[]) { main_parms.initFromMainOptsArgs(argc,argv); int f_id=main_parms.get_f_id(); CMSparams parms=main_parms.getCMSparams(); - //printCMSparams(parms); FasqQualThreshold qual_thres=main_parms.getQualThresholds(); - //printFastqQualThreshold(qual_thres); std::vector<IO_fq_files> single_files=main_parms.get_single_files(); vector<PE_files> v_PE_files=main_parms.get_PE_files(); diff --git a/src/rock_commons.h b/src/rock_commons.h index 78e5ae1..a6fc52d 100644 --- a/src/rock_commons.h +++ b/src/rock_commons.h @@ -72,11 +72,9 @@ typedef struct { typedef std::vector<unsigned long> readNumericValues; -typedef struct { +/*typedef struct { readNumericValues single_or_PE_val; - int idx_start_PE2; // In he case where PE reads are not treated as single; - // indicate at which index the single_or_PE_val array starts containing values for PE2 k-mers. -}T_read_numericValues; +}T_read_numericValues;*/ typedef struct { unsigned int nb_input_reads; diff --git a/src/unit_test_cms.cpp b/src/unit_test_cms.cpp index 6060174..21bee4a 100644 --- a/src/unit_test_cms.cpp +++ b/src/unit_test_cms.cpp @@ -47,10 +47,8 @@ void test_getBestLambdaForN() { assert(best==1); nb_k_mer=10000000000; best=Rparms.getBestLambdaForN(nb_k_mer,5,UINT_MAX,0.1); - //cout<<best<<endl; assert(best==1); best=Rparms.getBestLambdaForN(nb_k_mer,1,UINT_MAX,0.05); - //cout<<best<<endl; assert(best==8); nb_k_mer=50000000000; @@ -137,19 +135,16 @@ void test_CMS(int lambda,int kappa,int kappa_prime) { } // Now, check that our k-mer is present in the CMS. int min=cms.getEstimatedNbOcc(num); - //std::cout<<"min="<<min<<endl; assert((min==num) || (min==num-get_mask<unsigned char>::value-1)); // addKmer doesn't check kappa. unsigned long nb_distinct_k_mers=cms.getNbDistinctKMers(); - //cout<<nb_distinct_k_mers<<endl; assert(nb_distinct_k_mers==1); // mimic a read (as given by part 2 output). - T_read_numericValues read_values; + readNumericValues read_values; for (i=1;i<=400;i++) { - read_values.single_or_PE_val.push_back(i*1000-1); + read_values.push_back(i*1000-1); } - read_values.idx_start_PE2=0; ret=cms.addRead(read_values); // read should be accepted since none of the values that it contains is equal to those I inserted previously. assert(ret!=rej_expected); @@ -157,13 +152,12 @@ void test_CMS(int lambda,int kappa,int kappa_prime) { assert(nb_distinct_k_mers==401); // mimic a vector that contains a lot of k_mers already present in the CMS and check that it is rejected. - T_read_numericValues read_values2; - read_values2.idx_start_PE2=0; + readNumericValues read_values2; for (i=1;i<=199;i++) { - read_values2.single_or_PE_val.push_back(i*1000-1); + read_values2.push_back(i*1000-1); } for (i=200;i<=400;i++) { - read_values2.single_or_PE_val.push_back(num); + read_values2.push_back(num); } ret=cms.addRead(read_values2); assert(ret==rej_expected); @@ -178,231 +172,7 @@ void test_CMS(int lambda,int kappa,int kappa_prime) { -void testCMS_with_PE_NotasSingle(int lambda,int kappa,int kappa_prime) { - // int filter_PE_as_single=0; - int filter_PE_separately=1; - CountMinSketch<unsigned char>* cms=new CountMinSketch<unsigned char>(lambda,kappa,kappa_prime,filter_PE_separately); - int i; - cout<<"size of the CMS component: "<<sizeof(cms)<<endl; - int num=200; - //int rej_expected=0; - int ret; - for (i=0;i<kappa;i++) { - cms->addKMer(num); - } - int min=cms->getEstimatedNbOcc(num); - assert(min==kappa); - // mimic a PE read (as given by part 2 output). - T_read_numericValues read_values; - - // mimic a PE read in which PE1 contains k-mer that appear rarely in the CMS and PE2 contains k_mers that already appear kappa times in the CMS. - for (i=1;i<=200;i++) { - read_values.single_or_PE_val.push_back(i*1000-1); - } - read_values.idx_start_PE2=200; - for (i=1;i<=239;i++) { - read_values.single_or_PE_val.push_back(num); - } - read_values.single_or_PE_val.push_back(50); - read_values.single_or_PE_val.push_back(55); - ret=cms->addRead(read_values); // read should be accepted since PE1 is nor over covered. - assert(ret>0); - - ret=cms->isBeneathMinKappa(read_values); // read should not be considered under covered since only PE1 is under covered. - assert(ret==0); - - // Now, change PE1 a little and insert it kappa times in the CMS - for (int j=1;j<=kappa;j++) { - read_values.single_or_PE_val[2]=j; - ret=cms->addRead(read_values); - if (j<=kappa-1) assert(ret>0); // read should be accepted as long as we haven't inserted PE1 kappa times. - else assert(ret==0); //otherwise it is rejected. - } - - ret=cms->isBeneathMinKappa(read_values); // read should not be considered under covered since PE1 was inserted kappa times. - assert(ret==0); - - // mimic a PE read (as given by part 2 output). - T_read_numericValues read_values2; - - // mimic a PE read that will be inserted in the CMS kappa_prime+1 times to check low filter. - for (i=1;i<=150;i++) { - read_values2.single_or_PE_val.push_back(i*1500-1); - } - read_values2.idx_start_PE2=150; - for (i=1;i<=151;i++) { - read_values2.single_or_PE_val.push_back(i*1600-1); - } - for (i=0;i<kappa_prime+1;i++) { - ret=cms->addRead(read_values2); - assert(ret>0); - } - - ret=cms->isBeneathMinKappa(read_values2); - assert(ret==0); - //mimic a read for which PE1 is under kappa_prime and PE2 too. Such a read is expected to be removed by low filter - T_read_numericValues read_values3; - - for (i=1;i<=200;i++) { - read_values3.single_or_PE_val.push_back(i); - } - read_values3.idx_start_PE2=201; - for (i=1;i<=205;i++) { - read_values3.single_or_PE_val.push_back(i); - } - ret=cms->addRead(read_values3); - assert(ret>0); - - ret=cms->isBeneathMinKappa(read_values3); - assert(ret>0); - - //mimic a read for which PE1 is over kappa_prime but not PE2 - T_read_numericValues read_values4; - for (i=201;i<=399;i++) { - read_values4.single_or_PE_val.push_back(200); - } - read_values4.idx_start_PE2=199; - for (i=201;i<=399;i++) { - read_values4.single_or_PE_val.push_back(i); - } - ret=cms->addRead(read_values4); - assert(ret>0); - - ret=cms->isBeneathMinKappa(read_values4); - assert(ret==0); - - //mimic a read for which PE2 is over kappa_prime but not PE1 - T_read_numericValues read_values5; - for (i=201;i<=399;i++) { - read_values5.single_or_PE_val.push_back(i); - } - read_values5.idx_start_PE2=199; - for (i=201;i<=399;i++) { - read_values5.single_or_PE_val.push_back(200); - } - ret=cms->addRead(read_values5); - assert(ret>0); - - ret=cms->isBeneathMinKappa(read_values5); - assert(ret==0); - - delete cms; -} - -/* Test new implementation of PE as single. - * In this implementation, we process PE separately and we keep a PE if med(cov(PE1)) or med((covPE2)) < C and - * med(cov(PE1)) or med(cov(PE2)<c .*/ -void testCMS_with_PE_NotasSingle2(int lambda,int kappa,int kappa_prime) { - int filter_PE_separately=2; - CountMinSketch<unsigned char>* cms=new CountMinSketch<unsigned char>(lambda,kappa,kappa_prime,filter_PE_separately); - int i; - cout<<"size of the CMS component: "<<sizeof(cms)<<endl; - int num=200; - //int rej_expected=0; - int ret; - for (i=0;i<kappa;i++) { - cms->addKMer(num); - } - int min=cms->getEstimatedNbOcc(num); - assert(min==kappa); - // mimic a PE read (as given by part 2 output). - T_read_numericValues read_values; - - // mimic a PE read in which PE1 contains k-mer that appear rarely in the CMS and PE2 contains k_mers that already appear kappa times in the CMS. - for (i=1;i<=200;i++) { - read_values.single_or_PE_val.push_back(i*1000-1); - } - read_values.idx_start_PE2=200; - for (i=1;i<=239;i++) { - read_values.single_or_PE_val.push_back(num); - } - read_values.single_or_PE_val.push_back(50); - read_values.single_or_PE_val.push_back(55); - ret=cms->addRead(read_values); // read should be accepted since PE1 is nor over covered. - assert(ret>0); - - ret=cms->isBeneathMinKappa(read_values); // read should be considered under covered since PE1 is under covered. - assert(ret>0); - - // Now, change PE1 a little and insert it kappa times in the CMS - for (int j=1;j<=kappa;j++) { - read_values.single_or_PE_val[2]=j; - ret=cms->addRead(read_values); - if (j<=kappa-1) assert(ret>0); // read should be accepted as long as we haven't inserted PE1 kappa times. - else assert(ret==0); //otherwise it is rejected. - } - - ret=cms->isBeneathMinKappa(read_values); // read should not be considered under covered since PE1 was inserted kappa times. - assert(ret==0); - - // mimic a PE read (as given by part 2 output). - T_read_numericValues read_values2; - - // mimic a PE read that will be inserted in the CMS kappa_prime+1 times to check low filter. - for (i=1;i<=150;i++) { - read_values2.single_or_PE_val.push_back(i*1500-1); - } - read_values2.idx_start_PE2=150; - for (i=1;i<=151;i++) { - read_values2.single_or_PE_val.push_back(i*1600-1); - } - for (i=0;i<kappa_prime+1;i++) { - ret=cms->addRead(read_values2); - assert(ret>0); - } - - ret=cms->isBeneathMinKappa(read_values2); - assert(ret==0); - - //mimic a read for which PE1 is under kappa_prime and PE2 too. Such a read is expected to be removed by low filter - T_read_numericValues read_values3; - - for (i=1;i<=200;i++) { - read_values3.single_or_PE_val.push_back(i); - } - read_values3.idx_start_PE2=201; - for (i=1;i<=205;i++) { - read_values3.single_or_PE_val.push_back(i); - } - ret=cms->addRead(read_values3); - assert(ret>0); - - ret=cms->isBeneathMinKappa(read_values3); - assert(ret>0); - - //mimic a read for which PE1 is over kappa_prime but not PE2 - T_read_numericValues read_values4; - for (i=201;i<=399;i++) { - read_values4.single_or_PE_val.push_back(200); - } - read_values4.idx_start_PE2=199; - for (i=201;i<=399;i++) { - read_values4.single_or_PE_val.push_back(i); - } - ret=cms->addRead(read_values4); - assert(ret>0); - - ret=cms->isBeneathMinKappa(read_values4); - assert(ret>0); - - //mimic a read for which PE2 is over kappa_prime but not PE1 - T_read_numericValues read_values5; - for (i=201;i<=399;i++) { - read_values5.single_or_PE_val.push_back(i); - } - read_values5.idx_start_PE2=199; - for (i=201;i<=399;i++) { - read_values5.single_or_PE_val.push_back(200); - } - ret=cms->addRead(read_values5); - assert(ret>0); - - ret=cms->isBeneathMinKappa(read_values5); - assert(ret>0); - - delete cms; -} /* * TODO: move this somewhere else. @@ -470,11 +240,6 @@ int main(int argc, char **argv) { test_ROCKparams(); cout<<"testing CMS with lambda="<<lambda<<endl; test_CMS(lambda,kappa,kappa_prime); // Finally using C arrays (maps implied storing hash keys : 4 Bytes per k_mer overhead) but each array is of size INT_MAX... - //cout<<"testing CMS with PE not as single strict kappa prime implementation"<<endl; - // VL comment these tests since AC says That the best option is to treat PE as single. - /*testCMS_with_PE_NotasSingle(lambda,kappa,kappa_prime); - cout<<"testing CMS with PE not as single lenient kappa prime implementation"<<endl; - testCMS_with_PE_NotasSingle2(lambda,kappa,kappa_prime);*/ cout<<"done"<<endl; // profiling_CMS_fill(); diff --git a/src/unit_test_read_utils.cpp b/src/unit_test_read_utils.cpp index 648af9a..5b22586 100644 --- a/src/unit_test_read_utils.cpp +++ b/src/unit_test_read_utils.cpp @@ -202,7 +202,7 @@ void test_getReadPEWithNQST() { void test_decomposeReadInkMerNums() { int k=5; ReadProcessor read_p(k); - T_read_numericValues nbrKmerDecompo; + readNumericValues nbrKmerDecompo; DnaSeqStr a_seqs[2]; init_DnaSeqStr(&a_seqs[0]); @@ -212,26 +212,19 @@ void test_decomposeReadInkMerNums() { strcpy(seq1->fq_record_buf,"@fake_stuff\nAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\n+\n....@\n"); seq1->start_idx=12; seq1->length=75; - decomposeReadInKMerNums(read_p,nbrKmerDecompo,k,1,a_seqs); - assert(nbrKmerDecompo.single_or_PE_val.size()==71); - assert(nbrKmerDecompo.idx_start_PE2==0); + decomposeReadInKMerNums(read_p,nbrKmerDecompo,k,a_seqs); + assert(nbrKmerDecompo.size()==71); readNumericValues::iterator it; - for (it=nbrKmerDecompo.single_or_PE_val.begin();it!=nbrKmerDecompo.single_or_PE_val.end();it++) { + for (it=nbrKmerDecompo.begin();it!=nbrKmerDecompo.end();it++) { assert(*it==1023); } DnaSeqStr * seq2=&a_seqs[1]; strcpy(seq2->fq_record_buf,"@another_fake_stuff\nATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCG\n+\n....@\n"); seq2->start_idx=20; seq2->length=80; - nbrKmerDecompo.single_or_PE_val.clear(); - decomposeReadInKMerNums(read_p,nbrKmerDecompo,k,1,a_seqs); - assert(nbrKmerDecompo.single_or_PE_val.size()==147); - assert(nbrKmerDecompo.idx_start_PE2==0); - - T_read_numericValues nbrKmerDecompo2; - decomposeReadInKMerNums(read_p,nbrKmerDecompo2,k,0,a_seqs); - assert(nbrKmerDecompo2.single_or_PE_val.size()==147); - assert(nbrKmerDecompo2.idx_start_PE2==71); + nbrKmerDecompo.clear(); + decomposeReadInKMerNums(read_p,nbrKmerDecompo,k,a_seqs); + assert(nbrKmerDecompo.size()==147); } @@ -312,8 +305,6 @@ void testDNAToNumberMoreComplex() { // TODO in main, check that k<=32. int nb_expected_k_mers=strlen(dnaStr)+1; nb_expected_k_mers-=k; nbrKmerDecompo.reserve(nb_expected_k_mers); - // unsigned long expected=read_p.kMerToNumber((char *) "GCTACCATAACACCAACTGTTTTCACCATA",NULL); - // unsigned long expected_rev=read_p.kMerToNumber((char *) "TATGGTGAAAACAGTTGGTGTTATGGTAGC",NULL); read_p.getKMerNumbers((char *) dnaStr,strlen(dnaStr),nbrKmerDecompo); assert(nbrKmerDecompo.size()==nb_expected_k_mers); // for each k_mer, we also expect to have a number for its reverse complement. @@ -356,7 +347,6 @@ void testDNAToNumberMoreComplex() { // TODO in main, check that k<=32. } } -// NTTTTNGGTGCTACCATAACACCAACTGTTTTCACCATAATTTTAANATCAAGCATTAGAGACGCTTCTCTAATGTATTGCAAATCTAGTTCTACCATTTGATGAAAATCTAATTCATTTCTTCCACTAACCTGCCATAATCCAGTACAACCTGGTATAACGGTCAAACGCTTTTTATCATAGGAACTGTATTCTCCTACCTCACGTGGCAAAGGAGGTCTTGGACCAACAATTGCCATGTCTCCTTTAACCACATTCCAAAGCTGTGGTA /* * Tests the handling of N nucleotides. @@ -405,7 +395,6 @@ void test_getReadWithNQST() { assert(my_struct1.read_a1==639); assert(my_struct2.read_a1==1228); srp io_sr; - //unsigned int j=0; DnaSeqStr a_seqs; char dna_read[MAX_READ_LENGTH]; @@ -424,8 +413,6 @@ void test_getReadWithNQST() { tmp+=a_seqs.start_idx; memcpy(dna_read,tmp,a_seqs.length); dna_read[a_seqs.length]='\0'; - /*std::cout<<"modified dna read:"<<endl; - std::cout<<dna_read<<endl;*/ assert(strcmp(dna_read,"NNNNNAGGTGCTACCATAACACCAACTGTTTTCACNATAATTTTAAAATCAAGCATTAGAGACGCNTCTCTAATGTATTGCAAATCTAGTTCTACCATTTGATGAAAATCTAATTNATTTCTTCCACTANCCTGCCATAATCCAGTACAACCTGGTATAACGGNCAANCGCTTTTTATCATAGGANCTGTATTCTCCTACCTCACGTGGCAAAGGAGGNCTTGGACCAACAATTGCCATGTCTCCTTTAACCACATTCCAAAGCTGNNNNN")==0); init_DnaSeqStr(&a_seqs); @@ -434,7 +421,6 @@ void test_getReadWithNQST() { tmp=(char *) a_seqs.fq_record_buf; tmp+=a_seqs.start_idx; memcpy(dna_read,tmp,a_seqs.length); - // std::cout<<dna_read<<endl; assert(strcmp(dna_read,"NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN")==0); // now test the same thing with PE reads -- GitLab