From 91c02af3acbae520e4371b9a2819226fec36b6da Mon Sep 17 00:00:00 2001 From: Veronique Legrand <vlegrand@pasteur.fr> Date: Fri, 15 Apr 2022 11:28:09 +0200 Subject: [PATCH] version 2.0_beta where PE reads are processed separately and selected or removed from the cms if at least one of the PE has a median coverage below threshold. --- configure.ac | 2 +- src/CountMinSketch.hpp | 51 +++++++++++++---- src/Filter.hpp | 8 +-- src/FqMainBackend.cpp | 15 ++--- src/FqMainBackend.h | 6 +- src/ReadProcessor.cpp | 10 ++-- src/ReadProcessor.h | 3 +- src/fqreader.cpp | 4 +- src/read_utils.cpp | 11 ++-- src/read_utils.h | 2 +- src/rock_commons.h | 8 ++- src/unit_test_cms.cpp | 30 ++++++++-- src/unit_test_fqreader.cpp | 105 +++++++++-------------------------- src/unit_test_read_utils.cpp | 41 +++++--------- test/rock_mem.sh | 14 +++-- 15 files changed, 150 insertions(+), 160 deletions(-) diff --git a/configure.ac b/configure.ac index e2143e2..37c685a 100644 --- a/configure.ac +++ b/configure.ac @@ -1,7 +1,7 @@ dnl Process this file with autoconf to produce a configure script. AC_PREREQ(2.59) -AC_INIT(rock, 1.9.6) +AC_INIT(rock, 2.0_beta) AC_CANONICAL_SYSTEM diff --git a/src/CountMinSketch.hpp b/src/CountMinSketch.hpp index a8f0702..24d5d0d 100644 --- a/src/CountMinSketch.hpp +++ b/src/CountMinSketch.hpp @@ -80,7 +80,9 @@ private: } - inline int isRCovBelowThres(const readNumericValues& read_val, const int& threshold); + + + inline int isRCovBelowThres(const T_read_numericValues& read_val, const int& threshold); inline int isRCovBelowThresPartial(const readNumericValues& read_val, const int& threshold, const int& start=0,const int& stop=0); @@ -94,10 +96,12 @@ public: CountMinSketch(int glambda,int gkappa,int gkappa_prime) { init(glambda,gkappa,gkappa_prime); + unsigned int test=hash64to32(657922856560023,0); } CountMinSketch(CMSparams parms) { init(parms.lambda,parms.kappa,parms.kappa_prime); + unsigned int test=hash64to32(657922856560023,0); } ~CountMinSketch() { @@ -130,9 +134,9 @@ public: // keep that just for unit testing purpose T getEstimatedNbOcc(const unsigned long& val); - int addRead(const readNumericValues& read_val); + int addRead(const T_read_numericValues& read_val); - int isBeneathMinKappa(const readNumericValues&read_val); + int isBeneathMinKappa(const T_read_numericValues&read_val); unsigned long getNbDistinctKMers(); @@ -145,31 +149,54 @@ public: * Determines whether median of k-mer coverage is below threshold or not. */ // this is a few seconds faster -template<typename T> inline int CountMinSketch<T>::isRCovBelowThres(const readNumericValues& read_val, const int& threshold) { +template<typename T> inline int CountMinSketch<T>::isRCovBelowThres(const T_read_numericValues& read_val, const int& threshold) { int PE1_below_thres; + int PE2_below_thres=0; int a1=0,a2=0; - int b1; + int b1,b2=0; int j; unsigned int h; T min; - int num=read_val.size(); - b1=num; + int num=read_val.single_or_PE_val.size(); + + read_val.idx_start_PE2?b1=read_val.idx_start_PE2:b1=num; + read_val.idx_start_PE2?b2=num-b1:b2; // this proves faster than using iterators on these data (short) - long const * p_start=(long const * )&(read_val[0]); // TODO: when we'll use C++11: use the data() method of the vector class to access the underlying C array. + long const * p_start=(long const * )&(read_val.single_or_PE_val[0]); // TODO: when we'll use C++11: use the data() method of the vector class to access the underlying C array. int nb=0; while(nb<b1 && 2*a1<=b1) { j=lambda; min=mask; while (--j>=0 && min>threshold) { + //std::cout<<*(p_start+nb)<<std::endl; + //std::cout<<j<<std::endl; h=hash64to32(*(p_start+nb),j); + //std::cout<<h<<std::endl; + //unsigned int test=hash64to32(657922856560023,0); + min=cms_lambda_array[j] [h]; } (min<threshold)? a1+=1:a1; nb++; } PE1_below_thres=2*a1>b1; + if (b2) { + nb=b1; + while(nb<num && 2*a2<=b2) { + j=lambda; + min=mask; + while (--j>=0 && min>threshold) { + h=hash64to32(*(p_start+nb),j); + min=cms_lambda_array[j] [h]; + } + (min<threshold)? a2+=1:a2; + nb++; + } + PE2_below_thres=2*a2>b2; + return (PE1_below_thres || PE2_below_thres); + } return PE1_below_thres; } @@ -252,14 +279,14 @@ template<typename T> T CountMinSketch<T>::getEstimatedNbOcc(const unsigned long& return min; } -template<typename T> int CountMinSketch<T>::addRead(const readNumericValues& read_val) { +template<typename T> int CountMinSketch<T>::addRead(const T_read_numericValues& read_val) { int keep_r=isRCovBelowThres(read_val,kappa); if (keep_r) { readNumericValues::const_iterator it; // according to the intel profiler, we spend a lot of time in this loop. Try to use something slightly faster. - long const * p=(long const *) &read_val[0]; + long const * p=(long const *) &read_val.single_or_PE_val[0]; int cnt; - int stop=read_val.size(); + int stop=read_val.single_or_PE_val.size(); for (cnt=0;cnt<stop;cnt++) { this->addKMer(*p); p=p+1; @@ -268,7 +295,7 @@ template<typename T> int CountMinSketch<T>::addRead(const readNumericValues& rea return keep_r; } -template<typename T> int CountMinSketch<T>::isBeneathMinKappa(const readNumericValues& read_val) { +template<typename T> int CountMinSketch<T>::isBeneathMinKappa(const T_read_numericValues& read_val) { int res=isRCovBelowThres(read_val,kappa_prime); return res; } diff --git a/src/Filter.hpp b/src/Filter.hpp index 8537fa2..e439979 100644 --- a/src/Filter.hpp +++ b/src/Filter.hpp @@ -85,7 +85,7 @@ template <typename T> void Filter::underlyingfillCMS(FqBaseBackend* map_id_backe i_dim::iterator it_offs; k_dim::iterator it_struct; ReadProcessor read_p(k); - readNumericValues nbrKmerDecompo; + T_read_numericValues nbrKmerDecompo; // 1rst step, open files before fetching reads in them int i; @@ -110,7 +110,7 @@ template <typename T> void Filter::underlyingfillCMS(FqBaseBackend* map_id_backe // read is rejected, update rpos structure accordingly. it_struct->fileid=0; } - nbrKmerDecompo.clear(); + nbrKmerDecompo.single_or_PE_val.clear(); } } } @@ -126,7 +126,7 @@ template <typename T> void Filter::underlyinglowFilterCMS(FqBaseBackend* map_id_ k_dim::iterator it_struct; int ret; ReadProcessor read_p(k); - readNumericValues nbrKmerDecompo; + T_read_numericValues nbrKmerDecompo; // 1rst step, open files before fetching reads in them int i; @@ -148,7 +148,7 @@ template <typename T> void Filter::underlyinglowFilterCMS(FqBaseBackend* map_id_ decomposeReadInKMerNums(read_p, nbrKmerDecompo,k,a_seqs); ret=cms->isBeneathMinKappa(nbrKmerDecompo); if (ret) it_struct->fileid=0; - nbrKmerDecompo.clear(); + nbrKmerDecompo.single_or_PE_val.clear(); } } } diff --git a/src/FqMainBackend.cpp b/src/FqMainBackend.cpp index a259053..2e73578 100644 --- a/src/FqMainBackend.cpp +++ b/src/FqMainBackend.cpp @@ -71,9 +71,6 @@ void FqMainBackend::processFile(char * filename,unsigned char f_id) { size_t buffer_size; (test_mode)?buffer_size=test_bufsize:buffer_size=FqBaseBackend::bufsize; - //this->openUndefFile(); - //if (p_auxFqProcessor!=NULL) p_auxFqProcessor->openUndefFile(); - int f_single=open(filename,O_RDONLY,mode); if (f_single==-1) { err(errno,"cannot open file: %s.",filename); @@ -90,9 +87,6 @@ void FqMainBackend::processFile(char * filename,unsigned char f_id) { processBuf(buf_info,f_id,cur_offset); } close(f_single); - /*this->closeUndefFile(); - if (p_auxFqProcessor!=NULL) p_auxFqProcessor->closeUndefFile();*/ - free(buf); } @@ -117,14 +111,17 @@ void FqMainBackend::onEndFastqRecord(T_fq_rec_info& rec_info,const T_buf_info& b //debug_processBuf(on_record_end,bufinfo,rec_info.rstart_offset); int nb_k_mer=rec_info.nb_nucleotides_in_read+1-qual_thres.k; rec_info.nb_nucleotides_in_read_PE2>0?nb_k_mer_PE2=rec_info.nb_nucleotides_in_read_PE2+1-qual_thres.k:nb_k_mer_PE2=0; - if (!treat_PE_separately || (p_auxFqProcessor==NULL)) { + if (p_auxFqProcessor==NULL) { //case of single reads nb_k_mer+=nb_k_mer_PE2; //(nb_k_mer-rec_info.nb_k_mers_in_error-rec_info.nb_k_mers_in_error_in_PE2>=qual_thres.min_correct_k_mers_in_read)?ref_k_dim.push_back(rp):writeToUndefFile(bufinfo); if (nb_k_mer-rec_info.nb_k_mers_in_error-rec_info.nb_k_mers_in_error_in_PE2>=qual_thres.min_correct_k_mers_in_read) ref_k_dim.push_back(rp); - } else { + } else { // PE reads are processed separately by default // ((nb_k_mer_PE2-rec_info.nb_k_mers_in_error_in_PE2>=1) && (nb_k_mer-rec_info.nb_k_mers_in_error>=1))?ref_k_dim.push_back(rp):writeToUndefFile(bufinfo); // ((nb_k_mer_PE2-rec_info.nb_k_mers_in_error_in_PE2>=qual_thres.min_correct_k_mers_in_read) && (nb_k_mer-rec_info.nb_k_mers_in_error>=qual_thres.min_correct_k_mers_in_read))?ref_k_dim.push_back(rp):writeToUndefFile(bufinfo); - if ((nb_k_mer_PE2-rec_info.nb_k_mers_in_error_in_PE2>=qual_thres.min_correct_k_mers_in_read) && (nb_k_mer-rec_info.nb_k_mers_in_error>=qual_thres.min_correct_k_mers_in_read)) ref_k_dim.push_back(rp); + //if ((nb_k_mer_PE2-rec_info.nb_k_mers_in_error_in_PE2>=qual_thres.min_correct_k_mers_in_read) && (nb_k_mer-rec_info.nb_k_mers_in_error>=qual_thres.min_correct_k_mers_in_read)) ref_k_dim.push_back(rp); + cout<<"nb_k_mer_PE2="<<nb_k_mer_PE2<<" nb_k_mer="<<nb_k_mer<<endl; + cout<<" nb_k_mers_in_error_in_PE2="<<rec_info.nb_k_mers_in_error_in_PE2<<" nb_k_mers_in_error="<<rec_info.nb_k_mers_in_error<<endl; + if ((nb_k_mer_PE2-rec_info.nb_k_mers_in_error_in_PE2+nb_k_mer-rec_info.nb_k_mers_in_error>=qual_thres.min_correct_k_mers_in_read)) ref_k_dim.push_back(rp); } // empty buffer keeping current record cur_fq_record[0]='\0'; diff --git a/src/FqMainBackend.h b/src/FqMainBackend.h index aad846b..050f1c9 100644 --- a/src/FqMainBackend.h +++ b/src/FqMainBackend.h @@ -32,7 +32,7 @@ class FqMainBackend : public FqBaseBackend { // static int treat_PE_as_single; - static int treat_PE_separately; + //static int treat_PE_separately; unsigned long cnt_k_mers; //! counter for the total number of k-mers found in a single file or in a pair of PE files. //! This includes k-mers in error. FqAuxBackend * p_auxFqProcessor; /* Another fastq processor component is necessary for handling the case of PE reads.*/ @@ -56,9 +56,9 @@ public: void processBuf(T_buf_info&,unsigned char f_id,unsigned long cur_offset); - static void setTreatPEMode(const int& treat_PE){ + /*static void setTreatPEMode(const int& treat_PE){ FqMainBackend::treat_PE_separately=treat_PE; - } + }*/ unsigned long getCntKMers() { return cnt_k_mers; diff --git a/src/ReadProcessor.cpp b/src/ReadProcessor.cpp index dde6617..66bec29 100644 --- a/src/ReadProcessor.cpp +++ b/src/ReadProcessor.cpp @@ -59,17 +59,17 @@ void ReadProcessor::kMerAndRevToNumber(char * k_m,KmRevNumbers& numbers) { -void ReadProcessor::getKMerNumbers(char * dnaStr,int l,std::vector<unsigned long>& my_vect) { // See simple_test.cpp and results. benchmark showed that std::vector is very slightly faster than C array and doesn't require more memory in our case. So, I am using it since it makes code simpler. +void ReadProcessor::getKMerNumbers(char * dnaStr,int l,readNumericValues& all_values) { // See simple_test.cpp and results. benchmark showed that std::vector is very slightly faster than C array and doesn't require more memory in our case. So, I am using it since it makes code simpler. int i; KmRevNumbers numbers; initNumbers(numbers); char * p_char=dnaStr; int nb_k_m=l-k+1; for (i=0; i<nb_k_m;i++) { - kMerAndRevToNumber(p_char,numbers); - if (n<0) { - (numbers.nbr<numbers.nbr_rev)?my_vect.push_back(numbers.nbr_rev):my_vect.push_back(numbers.nbr); // finding a k-mer is equivalent to finding its reverse complement; as numeric values for k-mers are unique, just take into account the biggest of the 2. - } + kMerAndRevToNumber(p_char,numbers); + if (n<0) { + (numbers.nbr<numbers.nbr_rev)?all_values.push_back(numbers.nbr_rev):all_values.push_back(numbers.nbr); // finding a k-mer is equivalent to finding its reverse complement; as numeric values for k-mers are unique, just take into account the biggest of the 2. + } p_char++; } } diff --git a/src/ReadProcessor.h b/src/ReadProcessor.h index 8caa552..f3396cd 100644 --- a/src/ReadProcessor.h +++ b/src/ReadProcessor.h @@ -33,6 +33,7 @@ #include <iostream> +#include "rock_commons.h" using namespace std; @@ -132,7 +133,7 @@ public: void kMerAndRevToNumber(char * k_m,KmRevNumbers& numbers); unsigned long nucleoToNumberWrap(char s); - void getKMerNumbers(char * dnaStr,int l,std::vector<unsigned long>& my_vect); + void getKMerNumbers(char * dnaStr,int l,readNumericValues&); }; diff --git a/src/fqreader.cpp b/src/fqreader.cpp index 80f7fde..0a0a3a4 100644 --- a/src/fqreader.cpp +++ b/src/fqreader.cpp @@ -38,7 +38,7 @@ FasqQualThreshold FqBaseBackend::qual_thres; -int FqMainBackend::treat_PE_separately; +//int FqMainBackend::treat_PE_separately; /* * Processes 1 file containing single reads. @@ -80,7 +80,7 @@ unsigned long processInputFiles(const std::vector<IO_fq_files>& single_files,\ unsigned long tot_nb_kmers=0; unsigned char f_id=1; FqBaseBackend::setQualThreshold(a_qual_thres); - FqMainBackend::setTreatPEMode(PE_process_mode); + //FqMainBackend::setTreatPEMode(PE_process_mode); std::vector<IO_fq_files>::const_iterator it_single; for (it_single=single_files.begin();it_single!=single_files.end();it_single++) { FqMainBackend * be_fq=new FqMainBackend(io_sr); diff --git a/src/read_utils.cpp b/src/read_utils.cpp index d223fca..7f71a4f 100644 --- a/src/read_utils.cpp +++ b/src/read_utils.cpp @@ -149,19 +149,22 @@ void getDNASeqstr(FqBaseBackend* fq_files_be [], } -void decomposeReadInKMerNums(ReadProcessor& read_p, readNumericValues& nbrKmerDecompo,int k,DnaSeqStr a_seqs[2]) { +void decomposeReadInKMerNums(ReadProcessor& read_p, T_read_numericValues& nbrKmerDecompo,int k,DnaSeqStr a_seqs[2]) { int nb_expected_k_mers,nb_expected_k_mers_PE1,nb_expected_k_mers_PE2=0; nb_expected_k_mers_PE1=a_seqs[0].length+1-k; + nbrKmerDecompo.idx_start_PE2=0; if (a_seqs[1].length) nb_expected_k_mers_PE2=a_seqs[1].length+1-k; nb_expected_k_mers=nb_expected_k_mers_PE1+nb_expected_k_mers_PE2; - nbrKmerDecompo.reserve(nb_expected_k_mers); + nbrKmerDecompo.single_or_PE_val.reserve(nb_expected_k_mers); + //nbrKmerDecompo.nb_kmer_PE1=nb_expected_k_mers_PE1; char * start_dna_str=a_seqs[0].fq_record_buf+a_seqs[0].start_idx; - read_p.getKMerNumbers(start_dna_str,a_seqs[0].length,nbrKmerDecompo); + read_p.getKMerNumbers(start_dna_str,a_seqs[0].length,nbrKmerDecompo.single_or_PE_val); if (a_seqs[1].length) { // case of PE reads + nbrKmerDecompo.idx_start_PE2=nb_expected_k_mers_PE1; start_dna_str=a_seqs[1].fq_record_buf+a_seqs[1].start_idx; - read_p.getKMerNumbers(start_dna_str,a_seqs[1].length,nbrKmerDecompo); + read_p.getKMerNumbers(start_dna_str,a_seqs[1].length,nbrKmerDecompo.single_or_PE_val); } } diff --git a/src/read_utils.h b/src/read_utils.h index 5bb504c..aaf064f 100644 --- a/src/read_utils.h +++ b/src/read_utils.h @@ -36,6 +36,6 @@ void getDNASeqstr(FqBaseBackend* [], void init_DnaSeqStr(DnaSeqStr * dna_seq); -void decomposeReadInKMerNums(ReadProcessor& read_p, readNumericValues& nbrKmerDecompo,int k,DnaSeqStr a_seqs[2]); +void decomposeReadInKMerNums(ReadProcessor& read_p, T_read_numericValues& nbrKmerDecompo,int k,DnaSeqStr a_seqs[2]); #endif /* READ_UTILS_H_ */ diff --git a/src/rock_commons.h b/src/rock_commons.h index f4647d3..cfaf09c 100644 --- a/src/rock_commons.h +++ b/src/rock_commons.h @@ -70,11 +70,13 @@ typedef struct { }PE_files; -typedef std::vector<unsigned long> readNumericValues; +typedef struct std::vector<unsigned long> readNumericValues; -/*typedef struct { +typedef struct { readNumericValues single_or_PE_val; -}T_read_numericValues;*/ + int idx_start_PE2; // In he case where PE reads are not treated as single; + // indicate at which index the single_or_PE_val array starts containing values for PE2 k-mers. +}T_read_numericValues; typedef struct { unsigned int nb_input_reads; diff --git a/src/unit_test_cms.cpp b/src/unit_test_cms.cpp index 21bee4a..8a78c55 100644 --- a/src/unit_test_cms.cpp +++ b/src/unit_test_cms.cpp @@ -141,9 +141,10 @@ void test_CMS(int lambda,int kappa,int kappa_prime) { assert(nb_distinct_k_mers==1); // mimic a read (as given by part 2 output). - readNumericValues read_values; + T_read_numericValues read_values; + read_values.idx_start_PE2=0; for (i=1;i<=400;i++) { - read_values.push_back(i*1000-1); + read_values.single_or_PE_val.push_back(i*1000-1); } ret=cms.addRead(read_values); // read should be accepted since none of the values that it contains is equal to those I inserted previously. assert(ret!=rej_expected); @@ -151,15 +152,32 @@ void test_CMS(int lambda,int kappa,int kappa_prime) { nb_distinct_k_mers=cms.getNbDistinctKMers(); assert(nb_distinct_k_mers==401); - // mimic a vector that contains a lot of k_mers already present in the CMS and check that it is rejected. - readNumericValues read_values2; + // mimic a vector that contains a lot of k_mers already present in the CMS in its PE2 but not in its PE1 and check that it is kept. + T_read_numericValues read_values2; + read_values2.idx_start_PE2=200; for (i=1;i<=199;i++) { - read_values2.push_back(i*1000-1); + read_values2.single_or_PE_val.push_back(i*1000-1); } for (i=200;i<=400;i++) { - read_values2.push_back(num); + read_values2.single_or_PE_val.push_back(num); } ret=cms.addRead(read_values2); + assert(ret!=rej_expected); + + nb_distinct_k_mers=cms.getNbDistinctKMers(); + assert(nb_distinct_k_mers==401); + + //mimic a vector that contains in both its PE1 and PE2 parts a lot of kmers already in the CMS and check that it is rejected. + T_read_numericValues read_values3; + read_values3.idx_start_PE2=200; + for (i=1;i<=160;i++) { + read_values3.single_or_PE_val.push_back(num); + } + for (i=61;i<=99;i++) read_values3.single_or_PE_val.push_back(num-1); + for (i=200;i<=400;i++) { + read_values3.single_or_PE_val.push_back(num); + } + ret=cms.addRead(read_values3); assert(ret==rej_expected); nb_distinct_k_mers=cms.getNbDistinctKMers(); diff --git a/src/unit_test_fqreader.cpp b/src/unit_test_fqreader.cpp index 46c3df9..9756c32 100644 --- a/src/unit_test_fqreader.cpp +++ b/src/unit_test_fqreader.cpp @@ -47,7 +47,7 @@ const int treat_PE_as_single=0; void test_processSingleFile() { srp sr; unsigned char f_id=1; - FqMainBackend::setTreatPEMode(0); + //FqMainBackend::setTreatPEMode(0); processSingleFile((char *) "../test/data/unit/test_single.fq",f_id,&sr); srp::reverse_iterator rit; i_dim::iterator it_offs; @@ -92,7 +92,7 @@ void test_processSingleFileWithMQOption() { qual_thres.nucl_score_threshold=14+k_phred_32; qual_thres.min_correct_k_mers_in_read=100; FqBaseBackend::setQualThreshold(qual_thres); - FqMainBackend::setTreatPEMode(0); + //FqMainBackend::setTreatPEMode(0); FqMainBackend be_fq=FqMainBackend(&sr); //be_fq.setUndefFile((char *) "../test/data/unit/test_single.undef.fq"); @@ -170,7 +170,7 @@ void test_processPEfilesWithA() { unsigned char f_id4=4; srp sr; - FqMainBackend::setTreatPEMode(0); + //FqMainBackend::setTreatPEMode(0); processPEFiles(fq_3_test, f_id3,fq_4_test, f_id4,&sr); srp::reverse_iterator rit; i_dim::iterator it_offs; @@ -227,7 +227,7 @@ void test_processPEFiles() { unsigned char f_id2=2; srp sr; - FqMainBackend::setTreatPEMode(0); + //FqMainBackend::setTreatPEMode(0); processPEFiles(fq_1_test, f_id1,fq_2_test, f_id2,&sr); srp::reverse_iterator rit; i_dim::iterator it_offs; @@ -286,7 +286,7 @@ void aux_testPEFilesMQ(FasqQualThreshold qual_thres,int nb_expected_reads) { unsigned char f_id1=1; unsigned char f_id2=2; - FqMainBackend::setTreatPEMode(0); + //FqMainBackend::setTreatPEMode(0); FqBaseBackend::setQualThreshold(qual_thres); processPEFiles(fq_1_test, f_id1,fq_2_test, f_id2,&sr); @@ -315,7 +315,7 @@ void test_processPEFilesWithMQOptions() { qual_thres.k=20; qual_thres.nucl_score_threshold=14+k_phred_32; qual_thres.min_correct_k_mers_in_read=76; - FqMainBackend::setTreatPEMode(0); + //FqMainBackend::setTreatPEMode(0); aux_testPEFilesMQ(qual_thres,4); // last fq records contains only 77 correct k-mers. @@ -368,23 +368,17 @@ void test_processAllFiles() { unsigned char f_single=3; srp sr; - FqMainBackend::setTreatPEMode(0); + //FqMainBackend::setTreatPEMode(0); processPEFiles(fq_1_test, f_id1,fq_2_test, f_id2,&sr); processSingleFile(fq_single,f_single,&sr); check_processAllFilesResults(sr); } -void test_processPE_not_as_single() { +void test_processPE_nuclthreshold() { char * fq_1_test=(char *) "../test/data/unit/test_PE1_PE_not_as_single.fq"; char * fq_2_test=(char *) "../test/data/unit/test_PE2_PE_not_as_single.fq"; - char * fq_1_test_undef=(char *) "../test/data/unit/test_PE1_PE_not_as_single.undef.fastq"; - char * fq_2_test_undef=(char *) "../test/data/unit/test_PE2_PE_not_as_single.undef.fastq"; - - char * fq_1_expected_undef=(char *) "../test/data/unit/expected_PE1_PE_not_as_single.undef.fastq"; - char * fq_2_expected_undef=(char *) "../test/data/unit/expected_PE2_PE_not_as_single.undef.fastq"; - unsigned char f_id1=1; unsigned char f_id2=2; srp sr; @@ -395,7 +389,6 @@ void test_processPE_not_as_single() { qual_thres.nucl_score_threshold=2+k_phred_32; FqBaseBackend::setQualThreshold(qual_thres); - FqMainBackend::setTreatPEMode(1); processPEFiles(fq_1_test, f_id1,fq_2_test, f_id2,&sr,1000); srp::reverse_iterator rit; @@ -411,18 +404,13 @@ void test_processPE_not_as_single() { } } } - assert(cnt_read==1); - - /*assert(compareFilesLileByLine(fq_1_test_undef,fq_1_expected_undef)==0); - assert(compareFilesLileByLine(fq_2_test_undef,fq_2_expected_undef)==0); + assert(cnt_read==3); - assert(remove((char *) "../test/data/unit/test_PE1_PE_not_as_single.undef.fastq")==0); - assert(remove((char *) "../test/data/unit/test_PE2_PE_not_as_single.undef.fastq")==0);*/ } -void test_processPE_not_as_single2() { +void test_processPE_nuclthreshold2() { char * fq_1_test=(char *) "../test/data/unit/test_PE1_PE_not_as_single_pathological.fq"; char * fq_2_test=(char *) "../test/data/unit/test_PE2_PE_not_as_single_pathological.fq"; @@ -435,11 +423,10 @@ void test_processPE_not_as_single2() { FasqQualThreshold qual_thres; qual_thres.k=30; - qual_thres.min_correct_k_mers_in_read=1; + qual_thres.min_correct_k_mers_in_read=221; qual_thres.nucl_score_threshold=10+k_phred_32; FqBaseBackend::setQualThreshold(qual_thres); - FqMainBackend::setTreatPEMode(1); processPEFiles(fq_1_test, f_id1,fq_2_test, f_id2,&sr,1000); srp::reverse_iterator rit; @@ -448,20 +435,14 @@ void test_processPE_not_as_single2() { int cnt_read=0; for (rit=sr.rbegin(); rit!=sr.rend(); ++rit) { //process map in reverse order (by decreasing scores). - //unsigned long score=rit->first; for (it_offs=rit->second.begin();it_offs!=rit->second.end();it_offs++) { for (it_struct=it_offs->second.begin();it_struct!=it_offs->second.end();it_struct++) { cnt_read++; } } } - assert(cnt_read==0); - - /*assert(compareFilesLileByLine(fq_1_test_undef,fq_1_test)==0); - assert(compareFilesLileByLine(fq_2_test_undef,fq_2_test)==0); + assert(cnt_read==2); - assert(remove((char *) "../test/data/unit/test_PE1_PE_not_as_single_pathological.undef.fastq")==0); - assert(remove((char *) "../test/data/unit/test_PE2_PE_not_as_single_pathological.undef.fastq")==0);*/ } @@ -469,9 +450,6 @@ void test_processPE_not_as_singleWithMQ() { char * fq_1_test=(char *) "../test/data/unit/fake_PE1.fq"; char * fq_2_test=(char *) "../test/data/unit/fake_PE2.fq"; - char * fq_1_test_undef=(char *) "../test/data/unit/fake_PE1.undef.fastq"; - char * fq_2_test_undef=(char *) "../test/data/unit/fake_PE2.undef.fastq"; - unsigned char f_id1=1; unsigned char f_id2=2; srp sr; @@ -482,7 +460,6 @@ void test_processPE_not_as_singleWithMQ() { qual_thres.nucl_score_threshold=10+k_phred_32; FqBaseBackend::setQualThreshold(qual_thres); - FqMainBackend::setTreatPEMode(1); processPEFiles(fq_1_test, f_id1,fq_2_test, f_id2,&sr,1000); srp::reverse_iterator rit; @@ -491,7 +468,6 @@ void test_processPE_not_as_singleWithMQ() { int cnt_read=0; for (rit=sr.rbegin(); rit!=sr.rend(); ++rit) { //process map in reverse order (by decreasing scores). - //unsigned long score=rit->first; for (it_offs=rit->second.begin();it_offs!=rit->second.end();it_offs++) { for (it_struct=it_offs->second.begin();it_struct!=it_offs->second.end();it_struct++) { cnt_read++; @@ -499,27 +475,19 @@ void test_processPE_not_as_singleWithMQ() { } } assert(cnt_read==1); - /*struct stat fileStat; - stat(fq_1_test_undef, &fileStat); - assert(fileStat.st_size==0); - stat(fq_2_test_undef, &fileStat); - assert(fileStat.st_size==0);*/ sr.clear(); + cout<<"try with a different min_correct_k_mers_in_read=87"<<endl; qual_thres.k=30; - qual_thres.min_correct_k_mers_in_read=45; + qual_thres.min_correct_k_mers_in_read=87; qual_thres.nucl_score_threshold=15+k_phred_32; FqBaseBackend::setQualThreshold(qual_thres); - FqMainBackend::setTreatPEMode(1); processPEFiles(fq_1_test, f_id1,fq_2_test, f_id2,&sr,1000); - - cnt_read=0; for (rit=sr.rbegin(); rit!=sr.rend(); ++rit) { //process map in reverse order (by decreasing scores). - //unsigned long score=rit->first; for (it_offs=rit->second.begin();it_offs!=rit->second.end();it_offs++) { for (it_struct=it_offs->second.begin();it_struct!=it_offs->second.end();it_struct++) { cnt_read++; @@ -527,25 +495,21 @@ void test_processPE_not_as_singleWithMQ() { } } assert(cnt_read==0); - sr.clear(); - /*assert(compareFilesLileByLine(fq_1_test_undef,fq_1_test)==0); - assert(compareFilesLileByLine(fq_2_test_undef,fq_2_test)==0);*/ + cout<<"try with a different min_correct_k_mers_in_read=13"<<endl; qual_thres.k=30; qual_thres.min_correct_k_mers_in_read=13; qual_thres.nucl_score_threshold=18+k_phred_32; FqBaseBackend::setQualThreshold(qual_thres); - FqMainBackend::setTreatPEMode(1); processPEFiles(fq_1_test, f_id1,fq_2_test, f_id2,&sr,1000); cnt_read=0; for (rit=sr.rbegin(); rit!=sr.rend(); ++rit) { //process map in reverse order (by decreasing scores). - //unsigned long score=rit->first; for (it_offs=rit->second.begin();it_offs!=rit->second.end();it_offs++) { for (it_struct=it_offs->second.begin();it_struct!=it_offs->second.end();it_struct++) { cnt_read++; @@ -555,16 +519,6 @@ void test_processPE_not_as_singleWithMQ() { assert(cnt_read==1); sr.clear(); - - /*stat(fq_1_test_undef, &fileStat); - assert(fileStat.st_size==0); - stat(fq_2_test_undef, &fileStat); - assert(fileStat.st_size==0); - - - assert(remove((char *) fq_1_test_undef)==0); - assert(remove((char *) fq_2_test_undef)==0);*/ - } @@ -591,7 +545,7 @@ void test_processInputFiles() { default_qual_thres.min_correct_k_mers_in_read=0; // aim of that test is not to check undef file creation. Disable it by putting 0 here default_qual_thres.nucl_score_threshold=0; // leave default values for that test - FqMainBackend::setTreatPEMode(treat_PE_as_single); + //FqMainBackend::setTreatPEMode(treat_PE_as_single); FqBaseBackend * array_be[k_max_input_files]; processInputFiles(v_single,v_pe,array_be,default_qual_thres,&sr,1); @@ -638,7 +592,7 @@ AAAAAEEAEEEEEEEEE6EE/EEEEEEEEAEEAEEEEEEEEEEEEEEAEEEA/A/EEEAEEEEEE/EE</EAEEEEEE/E qual_thres.min_correct_k_mers_in_read=78; T_buf_info buf_info; FqBaseBackend::setQualThreshold(qual_thres); - FqMainBackend::setTreatPEMode(treat_PE_as_single); + //FqMainBackend::setTreatPEMode(treat_PE_as_single); FqMainBackend be(&sr); buf_info.buf=buf; buf_info.pchar=buf; @@ -680,7 +634,7 @@ AAAAAEEEEEEEEEEEEE/EEEEEEEEEEEEEEEEEEEEE<EEEAEEEEA<EAEEEEEEEAAAAE<EEEE/EEEEAAEEE qual_thres.nucl_score_threshold=k_phred_32; qual_thres.k=25; FqBaseBackend::setQualThreshold(qual_thres); - FqMainBackend::setTreatPEMode(treat_PE_as_single); + //FqMainBackend::setTreatPEMode(treat_PE_as_single); FqMainBackend be(&sr); buf_info.buf=buf; buf_info.pchar=buf; @@ -751,7 +705,7 @@ AAAAAEEEEEEEEEEEEE/EEEEEEEEEEEEEEEEEEEEE<EEEAEEEEA<EAEEEEEEEAAAAE<EEEE/EEEEAAEEE qual_thres.nucl_score_threshold=k_phred_32; qual_thres.k=25; FqBaseBackend::setQualThreshold(qual_thres); - FqMainBackend::setTreatPEMode(treat_PE_as_single); + //FqMainBackend::setTreatPEMode(treat_PE_as_single); FqMainBackend be(&sr); FqAuxBackend be2; @@ -820,7 +774,7 @@ AAAAAEEEEEEEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEAEEEEEEEEEEEEEEAEEEEEEEEEEEE T_buf_info buf_info; T_buf_info PE2_buf_info; FqBaseBackend::setQualThreshold(qual_thres); - FqMainBackend::setTreatPEMode(0); + //FqMainBackend::setTreatPEMode(0); FqMainBackend be(&sr); @@ -861,18 +815,11 @@ void Aux_MimicBigPEFilesWithMQOptions(const FasqQualThreshold& qual_thres,const char * fq_1_test_undef=(char *) "../test/data/unit/09-4607_S43_R1.undef.fastq"; char * fq_2_test_undef=(char *) "../test/data/unit/09-4607_S43_R2.undef.fastq"; - char * fq_1_expected_undef_100=(char *) "../test/data/unit/thres_100_09-4607_S43_R1.undef.fastq"; - char * fq_2_expected_undef_100=(char *) "../test/data/unit/thres_100_09-4607_S43_R2.undef.fastq"; - - char * fq_1_expected_undef_180=(char *) "../test/data/unit/thres_180_09-4607_S43_R1.undef.fastq"; - char * fq_2_expected_undef_180=(char *) "../test/data/unit/thres_180_09-4607_S43_R2.undef.fastq"; - unsigned char f_id1=1; unsigned char f_id2=2; - // FqBaseBackend::bufsize=bufsize; // choose a very small buf size to mimic behavior on big files and reun the test in a short time + FqBaseBackend::setQualThreshold(qual_thres); - FqMainBackend::setTreatPEMode(0); processPEFiles(fq_1_test, f_id1,fq_2_test, f_id2,&sr,bufsize); @@ -937,11 +884,11 @@ int main(int argc, char **argv) { test_processPEFilesWithMQOptions(); cout<<"test processing files with thresholds (nucleotide quality score and minimum number of valid k_mers in read) mimic big files that need several read operations. "<<endl; test_MimicBigPEFilesWithMQOptions(); - cout<<"test processing PE files with nucleotide quality threshold and PE not treated as single option."<<endl; - test_processPE_not_as_single(); - cout<<"test processing PE files with lots of errors in PE2; nucleotide quality threshold and PE not treated as single option."<<endl; - test_processPE_not_as_single2(); - cout<<"test processing PE files with PE not treated as single; quality score threshold and minimum number of valid k-mer."<<endl; + cout<<"test processing PE files with nucleotide quality threshold."<<endl; + test_processPE_nuclthreshold(); + cout<<"test processing PE files with lots of errors in PE2; nucleotide quality threshold"<<endl; + test_processPE_nuclthreshold2(); + cout<<"test processing PE files; quality score threshold and minimum number of valid k-mer."<<endl; test_processPE_not_as_singleWithMQ(); cout<<"done"<<endl; } diff --git a/src/unit_test_read_utils.cpp b/src/unit_test_read_utils.cpp index 44560b9..af81c08 100644 --- a/src/unit_test_read_utils.cpp +++ b/src/unit_test_read_utils.cpp @@ -42,8 +42,6 @@ void test_getReadSingle() { my_struct1=init_rpos(4,639); my_struct2=init_rpos(4,1228); - //std::cout<<my_struct1.read_a1<<endl; - assert(my_struct1.read_a1==639); assert(my_struct2.read_a1==1228); srp io_sr; @@ -147,7 +145,6 @@ void test_getReadPE() { be_fq1.closeInputFile(); be_fq2.closeFile(); - //cout<<"done"<<endl; } void test_getReadPEWithNQST() { @@ -191,19 +188,17 @@ void test_getReadPEWithNQST() { std::cout<<dna_read<<endl; assert(strcmp(dna_read,"GGNTCTGTTGGCTCAAACGCCNTCACNTCNTTGNTCAAAAGCTTATAATGCGNGCCAAANTCCGCCATCGNGACNNCTACGCCTTCCCCNGCTNTCCCGNCAAANNCNNGTCTTGCCGGANCTTCNCNNNCTCCNNTCGAAAGCGGCGAAATCTTAGNGGAAGGNGGANATAATGCNNTCNCNTCGNACNNTGAANNTNTANNCGGCANNCNGCAGNNTCNAGGNCTTNCNGNGNAACGNTTAANNGCAGATGGCTACGGNTTTGCNGGGGNANGAGACNGGANANCNNNGNCGNTCGNCCN")==0); - /*getDnaStr(fic_map,my_struct2,a_seqs,dna_read); - assert(strcmp(dna_read,"ATTGTGGGGTTCCTTTTTGTAGCATTGGAATGGAAATTAAAAATGGGGCTTCAGGATGCCCGCTCCATTATTTAATTCCAGAATGTAACGATGCTGTTTACCGGGGGGACTGGAAAGATGCACTTGAGCTTTTGATTAAAACAAATAACATGCCCAGAACCAATCACTGCAATTTTTTTATCCCACCGCACTATCGGTGGAGTCGGCATGAACCAACCTAAACCAAACCCACGGTCAATAATAGCCCGTTCAATCGAATTAATACCCACAGCAGGATCAGAAATTGCAACCGTACAACTTC")==0); -*/ + be_fq1.closeInputFile(); be_fq2.closeFile(); - //cout<<"done"<<endl; } void test_decomposeReadInkMerNums() { int k=5; ReadProcessor read_p(k); - readNumericValues nbrKmerDecompo; + T_read_numericValues nbrKmerDecompo; + DnaSeqStr a_seqs[2]; init_DnaSeqStr(&a_seqs[0]); init_DnaSeqStr(&a_seqs[1]); @@ -212,26 +207,23 @@ void test_decomposeReadInkMerNums() { strcpy(seq1->fq_record_buf,"@fake_stuff\nAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\n+\n....@\n"); seq1->start_idx=12; seq1->length=75; - decomposeReadInKMerNums(read_p,nbrKmerDecompo,k,a_seqs); - assert(nbrKmerDecompo.size()==71); - readNumericValues::iterator it; - for (it=nbrKmerDecompo.begin();it!=nbrKmerDecompo.end();it++) { - assert(*it==1023); - } + DnaSeqStr * seq2=&a_seqs[1]; strcpy(seq2->fq_record_buf,"@another_fake_stuff\nATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCG\n+\n....@\n"); seq2->start_idx=20; seq2->length=80; - nbrKmerDecompo.clear(); decomposeReadInKMerNums(read_p,nbrKmerDecompo,k,a_seqs); - assert(nbrKmerDecompo.size()==147); + assert(nbrKmerDecompo.single_or_PE_val.size()==71+76); + assert(nbrKmerDecompo.idx_start_PE2==71); + readNumericValues::iterator it; + readNumericValues::iterator it_end_PE1=nbrKmerDecompo.single_or_PE_val.begin()+71; + for (it=nbrKmerDecompo.single_or_PE_val.begin();it!=it_end_PE1;it++) { + assert(*it==1023); + } + + nbrKmerDecompo.single_or_PE_val.clear(); } -/* -void initNumbers(KmRevNumbers& numbers) { - numbers.first=1; - numbers.nbr=0; - numbers.nbr_rev=0; -}*/ + void savNumbers(ReadProcessor::KmRevNumbers& numbers,ReadProcessor::KmRevNumbers& numbers_sav) { numbers_sav.first=numbers.first; @@ -384,12 +376,9 @@ void testDNAToNumberMoreComplex() { // TODO in main, check that k<=32. // k-mer number 10 is GCTACCATAACACCAACTGTTTTCACCATA // its reverse complement is : TATGGTGAAAACAGTTGGTGTTATGGTAGC if (cnt_k_mer==10) { - //num=read_p.kMerToNumber((char *) "GCTACCATAACACCAACTGTTTTCACCATA",NULL); assert(*it==max(704021989794238796,930978566989888201)); } - /*if (cnt_k_mer==20) { - assert(*it==930978566989888201); - }*/ + } } diff --git a/test/rock_mem.sh b/test/rock_mem.sh index 7f3fcb0..1d9b0c0 100755 --- a/test/rock_mem.sh +++ b/test/rock_mem.sh @@ -139,17 +139,23 @@ test $ret2 -eq 0 || exit 143 ../src/rock -C 100 -k 30 -c 1 -l 2 -q 13 -o ${srcdir}/data/iofiles.args/output_files_NQ_Thres_13.txt ${srcdir}/data/fastq.raw/klebsiella_100_1.fq,${srcdir}/data/fastq.raw/klebsiella_100_2.fq >/dev/null ||exit 145 -ret1=`diff tmp/klebsiella_100_1_13_qual_thres.fq tmp/klebsiella_100_1_12_qual_thres.fq|grep -c "@SRR1222430.37"` +ret1=`diff tmp/klebsiella_100_1_13_qual_thres.fq tmp/klebsiella_100_1_12_qual_thres.fq|grep -c "@SRR1222430.14"` test $ret1 -eq 1 || exit 146 +ret1=`diff tmp/klebsiella_100_1_13_qual_thres.fq tmp/klebsiella_100_1_12_qual_thres.fq|grep -c "@SRR1222430.24"` +test $ret1 -eq 1 || exit 1461 + ret2=`diff tmp/klebsiella_100_1_13_qual_thres.fq tmp/klebsiella_100_1_12_qual_thres.fq|grep -c "length"` -test $ret2 -eq 2 || exit 147 +test $ret2 -eq 4 || exit 147 -ret1=`diff tmp/klebsiella_100_2_13_qual_thres.fq tmp/klebsiella_100_2_12_qual_thres.fq|grep -c "@SRR1222430.37"` +ret1=`diff tmp/klebsiella_100_2_13_qual_thres.fq tmp/klebsiella_100_2_12_qual_thres.fq|grep -c "@SRR1222430.14"` test $ret1 -eq 1 || exit 148 +ret1=`diff tmp/klebsiella_100_2_13_qual_thres.fq tmp/klebsiella_100_2_12_qual_thres.fq|grep -c "@SRR1222430.24"` +test $ret1 -eq 1 || exit 1481 + ret2=`diff tmp/klebsiella_100_2_13_qual_thres.fq tmp/klebsiella_100_2_12_qual_thres.fq|grep -c "length"` -test $ret2 -eq 2 || exit 149 +test $ret2 -eq 4 || exit 149 rm -fr tmp -- GitLab