diff --git a/NEWS b/NEWS index c40aa43128f56e394300b1708b558f9e13056300..b14abf9f20622328fdbfe3e54911649496d94fd4 100644 --- a/NEWS +++ b/NEWS @@ -4,6 +4,9 @@ New in version 1.6 * --PE_separated has been removed because it is now the default behavior. * -p has been introduced for the case where you want to process PE as single. * see man for other option new values. + * bugfix in the counting of k-mers in errors: for small reads (ie read length<2*k) containing a lot of errors; it was possible that the number of k-mers in error was greater than the total number of k-mers. +This resulted in PE reads being kept whereas PE2 was only containing nucleotides in error. + * code optimization on loops: a few seconds faster than 1.5.1. New in version 1.5.1 * bug fix for --separate_PE behavior is now as specified: keep read if R1 or R2 have coverage below kappa. diff --git a/src/CountMinSketch.hpp b/src/CountMinSketch.hpp index 00ac75710b3fcac989b424688f3a588e270e5684..f2a0ab37c6812a3973b4d172a68c31f404ad8c5f 100644 --- a/src/CountMinSketch.hpp +++ b/src/CountMinSketch.hpp @@ -194,60 +194,10 @@ public: - -/* - * Determines if read (single or PE) has a median coverage below a given threshold - */ -// time returns 4m26s for this method. -/* -template<typename T> inline int CountMinSketch<T>::isRCovBelowThres(const T_read_numericValues& read_val, const int& threshold) { - int PE1_below_thres; - int PE2_below_thres=0; - PE1_below_thres=isRCovBelowThresPartial(read_val.single_or_PE_val,threshold,0,read_val.idx_start_PE2); - if (read_val.idx_start_PE2 && !filter_PE_as_single) { - PE2_below_thres=isRCovBelowThresPartial(read_val.single_or_PE_val,threshold,read_val.idx_start_PE2,0); - if (threshold==kappa) return (PE1_below_thres || PE2_below_thres); - else return(PE1_below_thres && PE2_below_thres); - } - return PE1_below_thres; -}*/ - -/* - * Computes median of occurences for all the k_mers in a single read or for a part of a PE read. - */ -/* -template<typename T> inline int CountMinSketch<T>::isRCovBelowThresPartial(const readNumericValues& read_val, const int& threshold, const int& start,const int& stop) { - int b; - int a=0; - int j,h; - T min; - readNumericValues::const_iterator it; - readNumericValues::const_iterator it_aux; - if (stop) { - it_aux=read_val.begin()+stop; - b=stop-1; - } else { - it_aux=read_val.end(); - b=read_val.size(); - } - it=read_val.begin()+start; - while (it!=it_aux and 2*a<b) { - j=lambda; - min=mask; - while (--j>=0 && min>threshold) { - h=hash64to32(*it,j); - min=cms_lambda_array[j] [h]; - } - (min<threshold)? a+=1:a; - ++it; - } - return (2*a>b); -} -*/ - /* * Determines whether median of k-mer coverage is below threshold or not. */ +#include <stdio.h> // this is a few seconds faster template<typename T> inline int CountMinSketch<T>::isRCovBelowThres(const T_read_numericValues& read_val, const int& threshold) { int PE1_below_thres; @@ -290,9 +240,11 @@ template<typename T> inline int CountMinSketch<T>::isRCovBelowThres(const T_read nb++; } PE2_below_thres=2*a2>b2; + //printf("PE1_below_thres=%d, PE2_below_thres=%d \n",PE1_below_thres,PE2_below_thres); if (threshold==kappa) return (PE1_below_thres || PE2_below_thres); else return(PE1_below_thres && PE2_below_thres); } + //printf("PE1_below_thres=%d\n",PE1_below_thres); return PE1_below_thres; } diff --git a/src/FqBaseBackend.cpp b/src/FqBaseBackend.cpp index 4ba07458c5456fb87f0abfab2d1ad0965b68b1de..43eccfd5fb894829a4608a4390f6735aefda5ee6 100644 --- a/src/FqBaseBackend.cpp +++ b/src/FqBaseBackend.cpp @@ -199,6 +199,16 @@ int FqBaseBackend::getRead(const unsigned long& offset, char * fq_record) { throw errno; } nread=read(i_f_desc,fq_record,MAX_FQ_RECORD_LENGTH); + /*if (strstr(fq_record,"@SRR1222430.1378154 1378154 length=35")==fq_record) { + printf("found @SRR1222430.1378154 1378154 length=35\n"); + } + if (strstr(fq_record,"@SRR1222430.1527342 1527342 length=35")==fq_record) { + printf("found @SRR1222430.1527342 1527342 length=35\n"); + } + if (strstr(fq_record,"@SRR1222430.1918478 1918478 length=35")==fq_record) { + printf("found @SRR1222430.1918478 1918478 length=35\n"); + }*/ + #ifdef DEBUG assert(nread<=MAX_FQ_RECORD_LENGTH); assert(*(fq_record)=='@'); @@ -240,7 +250,8 @@ void FqBaseBackend::onIncScore(T_fq_rec_info& rec_info,T_buf_info& buf_info,int& unsigned int remaining_nucl=rec_info.nb_nucleotides_in_read-rec_info.idx_nucl_in_read; if (s<=qual_thres.nucl_score_threshold) { // maybe TODO rewrite this with chained ternary operators once it is clear to see if it improves performances.Not useful: performance bottleneck is not here but in median calculation (42% of time approximatively for both filters). if (rec_info.idx_nucl_in_read<=qual_thres.k-1) { // error is found in the first k nucleotides - rec_info.nb_k_mers_in_error=rec_info.idx_nucl_in_read+1; + int nb_k_mers=rec_info.nb_nucleotides_in_read-qual_thres.k+1; + (nb_k_mers>rec_info.idx_nucl_in_read+1)?rec_info.nb_k_mers_in_error=rec_info.idx_nucl_in_read+1:rec_info.nb_k_mers_in_error=nb_k_mers; } else if (n>0 && remaining_nucl>n) { (qual_thres.k-n<remaining_nucl-n)?rec_info.nb_k_mers_in_error+=qual_thres.k-n:rec_info.nb_k_mers_in_error+=remaining_nucl-n; diff --git a/src/unit_test_fqreader.cpp b/src/unit_test_fqreader.cpp index bc662d949003d7824066c230f3931d60bdfa2122..536ad498cf0750335cc36fa8b9e8a4714ebe75dc 100644 --- a/src/unit_test_fqreader.cpp +++ b/src/unit_test_fqreader.cpp @@ -405,6 +405,50 @@ void test_processPE_not_as_single() { } +void test_processPE_not_as_single2() { + char * fq_1_test=(char *) "../test/data/unit/test_PE1_PE_not_as_single_pathological.fq"; + char * fq_2_test=(char *) "../test/data/unit/test_PE2_PE_not_as_single_pathological.fq"; + + char * fq_1_test_undef=(char *) "../test/data/unit/test_PE1_PE_not_as_single_pathological.undef.fastq"; + char * fq_2_test_undef=(char *) "../test/data/unit/test_PE2_PE_not_as_single_pathological.undef.fastq"; + + unsigned char f_id1=1; + unsigned char f_id2=2; + srp sr; + + FasqQualThreshold qual_thres; + qual_thres.k=30; + qual_thres.min_correct_k_mers_in_read=1; + qual_thres.nucl_score_threshold=10+k_phred_32; + + FqBaseBackend::setQualThreshold(qual_thres); + FqMainBackend::setTreatPEAsSingle(0); + processPEFiles(fq_1_test, f_id1,fq_2_test, f_id2,&sr,fq_1_test_undef,fq_2_test_undef,1000); + + srp::reverse_iterator rit; + i_dim::iterator it_offs; + k_dim::iterator it_struct; + int cnt_read=0; + + for (rit=sr.rbegin(); rit!=sr.rend(); ++rit) { //process map in reverse order (by decreasing scores). + //unsigned long score=rit->first; + for (it_offs=rit->second.begin();it_offs!=rit->second.end();it_offs++) { + for (it_struct=it_offs->second.begin();it_struct!=it_offs->second.end();it_struct++) { + cnt_read++; + } + } + } + assert(cnt_read==0); + + assert(compareFilesLileByLine(fq_1_test_undef,fq_1_test)==0); + assert(compareFilesLileByLine(fq_2_test_undef,fq_2_test)==0); + + assert(remove((char *) "../test/data/unit/test_PE1_PE_not_as_single_pathological.undef.fastq")==0); + assert(remove((char *) "../test/data/unit/test_PE2_PE_not_as_single_pathological.undef.fastq")==0); + +} + + void test_processInputFiles() { char * fq_1_test=(char *) "../test/data/unit/test_PE1.fq"; char * fq_2_test=(char *) "../test/data/unit/test_PE2.fq"; @@ -463,10 +507,11 @@ void test_processBufSingle() { k_dim::iterator it_struct; int cnt_read=0; unsigned char f_id1=1; - char * buf="@NS500443:65:H573HAFXX:1:11101:20964:1048/1\n\ + std::string bufstr="@NS500443:65:H573HAFXX:1:11101:20964:1048/1\n\ ACATTTCACTGGTCATGAGCTTGGTGAGAGAAGCGATACGAATGACCGAATCAAGCTGTGGACGAACATTATTGCCCGGTCTGGTGTCACCGAAACTACGAAACACGCGTTGATTGCCGTCGATCACCACCAGCGCCATGCCCGTGGCGC\n\ +\n\ AAAAAEEAEEEEEEEEE6EE/EEEEEEEEAEEAEEEEEEEEEEEEEEAEEEA/A/EEEAEEEEEE/EE</EAEEEEEE/EEE/<//AEEE<E/E/EEEEEEEEE/AAA6EAE<<EE<EEEEEA6E<EAEEAA<EEEAEAEEA<<<<EEAA\n"; + char * buf=(char *) bufstr.c_str(); srp sr; FasqQualThreshold qual_thres; qual_thres.k=20; @@ -513,17 +558,18 @@ void test_processBufPE() { k_dim::iterator it_struct; int cnt_read=0; unsigned char f_id1=1; - char * buf_PE1="@NS500443:65:H573HAFXX:1:11101:20964:1048/1\n\ + std::string bufstr_PE1="@NS500443:65:H573HAFXX:1:11101:20964:1048/1\n\ ACATTTCACTGGTCATGAGCTTGGTGAGAGAAGCGATACGAATGACCGAATCAAGCTGTGGACGAACATTATTGCCCGGTCTGGTGTCACCGAAACTACGAAACACGCGTTGATTGCCGTCGATCACCACCAGCGCCATGCCCGTGGCGC\n\ +\n\ AAAAAEEAEEEEEEEEE6EE/EEEEEEEEAEEAEEEEEEEEEEEEEEAEEEA/A/EEEAEEEEEE/EE</EAEEEEEE/EEE/<//AEEE<E/E/EEEEEEEEE/AAA6EAE<<EE<EEEEEA6E<EAEEAA<EEEAEAEEA<<<<EEAA\n"; - char * buf_PE2="@NS500443:65:H573HAFXX:1:11101:20964:1048/2\n\ + std::string bufstr_PE2="@NS500443:65:H573HAFXX:1:11101:20964:1048/2\n\ CTGATATCGTTGATCGTTATGCCGAGCATATCTTTTATGGTAGCGGCGCCACGGGCATGGCGCTGGTGGTGATCGACGGCAATCAACGCGTGTTTCGTAGTTTCGGTGACACCAGACCGGGCAATAATGTTCGTCCACAGCTTGATTCGGT\n\ +\n\ AAAAAEEEEEEEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEAEEEEEEEEEEEEEEAEEEEEEEEEEEEEAEEE/EEEEAE<EEEEEEEEEEAEEA<<E<EAEAAEE<EEEEEEEEEEEEEAE</A<AEEEE<EAAA<AA\n"; - + char * buf_PE1=(char *) bufstr_PE1.c_str(); + char * buf_PE2=(char *) bufstr_PE2.c_str(); srp sr; FasqQualThreshold qual_thres; qual_thres.k=20; @@ -674,5 +720,7 @@ int main(int argc, char **argv) { test_MimicBigPEFilesWithMQOptions(); cout<<"test processing PE files with nucleotide quality threshold and PE not treated as single option."<<endl; test_processPE_not_as_single(); + cout<<"test processing PE files with lots of errors in PE2; nucleotide quality threshold and PE not treated as single option."<<endl; + test_processPE_not_as_single2(); cout<<"done"<<endl; } diff --git a/test/Makefile.am b/test/Makefile.am index 66846399a34e243b4785fb46d0fac0596f20f8c4..029c5f5c32360ecd57a73a303c38f4e4e36ac03a 100755 --- a/test/Makefile.am +++ b/test/Makefile.am @@ -18,6 +18,7 @@ data/unit/thres_100_09-4607_S43_R2.undef.fastq data/unit/thres_180_09-4607_S43_R data/unit/thres_180_09-4607_S43_R2.undef.fastq data/fastq.raw/klebsiella_5_1_bad_scores.fq \ data/fastq.raw/klebsiella_5_2_bad_scores.fq data/iofiles.args/output_files_2_PE_separated.txt \ data/unit/test_PE1_PE_not_as_single.fq data/unit/test_PE2_PE_not_as_single.fq data/unit/expected_PE1_PE_not_as_single.undef.fastq\ -data/unit/expected_PE2_PE_not_as_single.undef.fastq +data/unit/expected_PE2_PE_not_as_single.undef.fastq \ +data/unit/test_PE1_PE_not_as_single_pathological.fq data/unit/test_PE2_PE_not_as_single_pathological.fq diff --git a/test/data/unit/test_PE1_PE_not_as_single_pathological.fq b/test/data/unit/test_PE1_PE_not_as_single_pathological.fq new file mode 100644 index 0000000000000000000000000000000000000000..a46a2aa8b0f85cc7c69221a182b57622fcc9237e --- /dev/null +++ b/test/data/unit/test_PE1_PE_not_as_single_pathological.fq @@ -0,0 +1,12 @@ +@SRR1222430.1378154 1378154 length=251 +CAGGAACCGCCTTCTGGTGATTTGCAAGAACGCAGATCGGAAGAGCGGTTCAGCAGGAATGCCGAGACCGATCTCGTATGCCGTCTTCTGCTTGAAAAAAAAAAAAAAAACTGGTGTTGAAGGTTAAAATGATACATGCAGATAAACAATAATGTGTGATCTTGTACGAATCAAAAACTACAAACGCACGATAGAACTATGTTATAGACTAACAATACGCATCAGCGCGACAGTCTTGGTTACGTCTATAT ++SRR1222430.1378154 1378154 length=251 +CCCCDFFDCDDCGGGGGGGGGGHHHHHHHHHGEGGCHFGEBFEHHHGGEGGHHHHHHGHHHGHGGEGFGGGE/FGHGGHHHHGGGHGHGGHHHHHHHHHGGGG/</<<<-<00/0/...00000000:0:0000000:000000;;0;.;00;9000.009000000.....0000..000000..----...;00000000000000000000000:....-000.:-----:000000.00././.900 +@SRR1222430.1527342 1527342 length=251 +GATCGGAAGAGCACACGTCTGAACTCCAGTCACCTTGTAATCTCGTATGCCGTCTTCTGCTTGAAAAAAAAAAAAAAAAATGTAAAAATAGTCGTAGATGAGATTATCAATCGTTTGAGTAGTCAGATAACACTAGGAGCCAGTGTGAGCGTCAAAGCGGAAAGCACATAGGCACAGCGACATACGTAACGGTACTGGCGCTACATTAGACGTCGTAACAATGTGATGTTGTAGAGCAGGCTCATACACAC ++SRR1222430.1527342 1527342 length=251 +AAAAAAAAAFFFGFCC1FGG01GEFHHFFHGHHHHFHAAFDAGGFGGHHGHEEEFFEFHGFH10FHHHGGGGE@///>/<B1222B>10112B//0?/22>11111222211/<?F/1/11111111111111101010../.000000/-...:00/.--:9C////C000;09/;.----.00-//----;-;///-----9///9///----99--;////////9;/-/;B////----///;///- +@SRR1222430.1918478 1918478 length=249 +GATCGGAAGAGCACACGTCTGAACTCCATTCACCTTGTAATCTCGTATGCCGTCTTCTGCTTGAAAAAAAAAAAAAAAAACGAAGAAAGGAAAGGATGTAAGGAGGACAAATCAGAAAAAAGTAGGAACGCGAGCAGAAGGAGAATGAAGAGAAGGTCACGGTGAGAGAGTGAACGCGGGAAAAGAGTGAAGAGAGGTGAGGACTAAGATGAGTAGTGGGGAAATAGTAGATCAGATCGCGCGAGAACC ++SRR1222430.1918478 1918478 length=249 +BB@ABBABBFFBGGFGGFGFFFHHFHCGHHHGGGCFH5GFC5FEFFAGHHHE?EFFFFGFHHGHHF5CEE?EEF?/EE////////2220?2200222222222//?/000111111100<.1110<0...<----////:....000000.9//./0//......0/../00//..-9---9./.9////9/;.../;/.././///9//;///9//.9-...//////9///;///..----9;./. diff --git a/test/data/unit/test_PE2_PE_not_as_single_pathological.fq b/test/data/unit/test_PE2_PE_not_as_single_pathological.fq new file mode 100644 index 0000000000000000000000000000000000000000..f77ebaa782e6cab0d8bafd5ed2f885a2948ea988 --- /dev/null +++ b/test/data/unit/test_PE2_PE_not_as_single_pathological.fq @@ -0,0 +1,12 @@ +@SRR1222430.1378154 1378154 length=35 +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN ++SRR1222430.1378154 1378154 length=35 +################################### +@SRR1222430.1527342 1527342 length=35 +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN ++SRR1222430.1527342 1527342 length=35 +################################### +@SRR1222430.1918478 1918478 length=35 +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN ++SRR1222430.1918478 1918478 length=35 +###################################