diff --git a/NEWS b/NEWS index a1d3c41ec3febef608ed8eb128f07f1ecbad8314..abf10de6f2613f12d4e735083d75c578367c08e9 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,5 @@ +New in version 1.3 + * -q option for nucleotique quality score threshold. New in version 1.2 * -v option for verbose mode. diff --git a/configure.ac b/configure.ac index 2235b95494d21ba35729a1699deb188c72666cd3..cb0ff451e22f57c5074c502c22f75bf5bea26029 100644 --- a/configure.ac +++ b/configure.ac @@ -1,7 +1,7 @@ dnl Process this file with autoconf to produce a configure script. AC_PREREQ(2.59) -AC_INIT(rock, 1.2) +AC_INIT(rock, 1.3) AC_CANONICAL_SYSTEM diff --git a/doc/rock.pod b/doc/rock.pod index d7822cc41500ccd3010604e9e9238612041d9973..fd5ca12022ad7b57fed8747bec1ae9fa679732e5 100644 --- a/doc/rock.pod +++ b/doc/rock.pod @@ -11,7 +11,7 @@ =over 4 -=item B<rock> [B<-h>] [B<-i> F<file>] [B<-o> F<file>] [B<-k> F<k_mer_size>] [B<-C> F<kappa>] [B<-c> F<kappa_prime>] [B<-l> F<lambda>] [B<-n> F<nb_distinct_k_mer>] [B<-g> F<CMS size in GB>] [Args] +=item B<rock> [B<-h>] [B<-i> F<file>] [B<-o> F<file>] [B<-k> F<k_mer_size>] [B<-q> F<nucl_qual_score_threshold>] [B<-C> F<kappa>] [B<-c> F<kappa_prime>] [B<-l> F<lambda>] [B<-n> F<nb_distinct_k_mer>] [B<-g> F<CMS size in GB>] [Args] =back @@ -30,13 +30,17 @@ For more information on F<file> format, see the description section. =item -o F<file> List of output fastq file names is in F<file>. Note that names of output fastq files must be in the same order as names of input fastq. -Result of filtering 1rst fastq file will go in the first first fastq file mentioned in F<file> +Result of filtering 1rst fastq file will go in the first first fastq file mentionned in F<file> For more information on F<file> format, see the description section. =item -k Specify wanted k-mer size. Default is 30; maximum is 32. +=item -q + +Specify minimum threshold for nucleotides quality score. Nucleotides that have a score below thta threshold will be considered as errors (just like N nucleotides). + =item -C Specify maximum coverage wanted. Default is 50. Maximum is 65535. diff --git a/src/CountMinSketch.hpp b/src/CountMinSketch.hpp index 22b2776fdc5592515cd6823987b2fd13905456bf..c913ffc99a40de2e11be8ffa99e17c54bb40ab87 100644 --- a/src/CountMinSketch.hpp +++ b/src/CountMinSketch.hpp @@ -73,6 +73,7 @@ typedef struct { int kappa; int kappa_prime; int filter_size; // max amount of RAM wanted for the CMS. + int nucl_score_threshold; // minimum quality score below which nucleotides are considered in error. 0 by default. } CMSparams; template <typename T> @@ -92,6 +93,10 @@ struct get_mask<unsigned short> { template<typename T> class CountMinSketch { +public: + int nucl_score_threshold; + +private: static const unsigned long mask1=1; // used only for hash64to32bs static const unsigned long mask2=2095103; static const unsigned long mask3=1023; @@ -130,7 +135,7 @@ template<typename T> class CountMinSketch { inline int isRCovBelowThres(const readNumericValues& read_val, const int& threshold); - void init(int glambda,int gkappa,int gkappa_prime); + void init(int glambda,int gkappa,int gkappa_prime,int gnucl_score_threshold); // for unit tests. friend void test_CMS(int lambda,int kappa,int kappa_prime); @@ -138,12 +143,12 @@ template<typename T> class CountMinSketch { public: - CountMinSketch(int glambda,int gkappa,int gkappa_prime) { - init(glambda,gkappa,gkappa_prime); + CountMinSketch(int glambda,int gkappa,int gkappa_prime,int gnucl_score_threshold=0) { + init(glambda,gkappa,gkappa_prime,gnucl_score_threshold); } CountMinSketch(CMSparams parms) { - init(parms.lambda,parms.kappa,parms.kappa_prime); + init(parms.lambda,parms.kappa,parms.kappa_prime,parms.nucl_score_threshold); } ~CountMinSketch() { @@ -196,10 +201,11 @@ template<typename T> inline int CountMinSketch<T>::isRCovBelowThres(const readNu return (2*a>b); } -template<typename T> void CountMinSketch<T>::init(int glambda,int gkappa,int gkappa_prime) { +template<typename T> void CountMinSketch<T>::init(int glambda,int gkappa,int gkappa_prime,int gnucl_score_threshold) { lambda=glambda; kappa=gkappa; kappa_prime=gkappa_prime; + nucl_score_threshold=gnucl_score_threshold; int j; mask=get_mask<T>::value; /*if (lambda<ubytemask) mask=ubytemask; // byte implementation diff --git a/src/Filter.hpp b/src/Filter.hpp index 5e4bd3484a3e65c883ddd29c5ca38d710d71e4af..05e80df5c23f61eb794380f90b5cf66994a0deb6 100644 --- a/src/Filter.hpp +++ b/src/Filter.hpp @@ -81,10 +81,14 @@ template <typename T> void Filter::underlyingfillCMS(FqBaseBackend* map_id_backe DnaSeqStr a_seqs[2]; init_DnaSeqStr(&a_seqs[0]); init_DnaSeqStr(&a_seqs[1]); - getDNASeqstr(map_id_backend,*it_struct,j,a_seqs); + getDNASeqstr(map_id_backend,*it_struct,j,a_seqs,cms->nucl_score_threshold); + // debug stuff only. + /*if (strstr(a_seqs->fq_record_buf,"@SRR1222430.37")!=NULL) { + printf("coucou"); + }*/ // decompose dna string into k-mers and turn k_mers into numbers. decomposeReadInKMerNums(read_p, nbrKmerDecompo,k,a_seqs); - // insert k-mers into CMS if overall read satisjfies kappa condition. + // insert k-mers into CMS if overall read satisfies kappa condition. ret=cms->addRead(nbrKmerDecompo); if (!ret) { // read is rejected, update rpos structure accordingly. diff --git a/src/ROCKparams.cpp b/src/ROCKparams.cpp index 813374d640f0d279e4f549930c37e899fa8b5ff1..a863b2ee2481f73ee0ac5396913f9fac0e08b092 100644 --- a/src/ROCKparams.cpp +++ b/src/ROCKparams.cpp @@ -120,7 +120,9 @@ int ROCKparams::loadFileArgs(const std::string& afile,std::vector<string>& v_lin int ROCKparams::loadInOutFileArgs(const std::string& input_file,const std::string& output_file,std::vector<string>& v_input_lines,std::vector<string>& v_output_lines) { int reti=EXIT_SUCCESS; int reto=EXIT_SUCCESS; - reti=loadFileArgs(input_file,v_input_lines); + if (!input_file.empty()) { + reti=loadFileArgs(input_file,v_input_lines); + } if (!output_file.empty()) { reto=loadFileArgs(output_file,v_output_lines); } @@ -228,7 +230,8 @@ void ROCKparams::initFromMainOptsArgs(int argc,char ** argv) { std::vector<string> v_input_lines; std::vector<string> v_output_lines; - while((i = getopt(argc, argv, "i:o:l:k:c:C:g:n:v")) != -1) { + while((i = getopt(argc, argv, "i:o:l:k:c:C:g:n:vq:")) != -1) { + //printf("found option: %c\n",i); switch(i) { case 'i': input_file=optarg;break; @@ -268,6 +271,9 @@ void ROCKparams::initFromMainOptsArgs(int argc,char ** argv) { case 'v': verbose_mode=1; break; + case 'q': + parms.nucl_score_threshold=atoi(optarg); + break; default: usage(EXIT_FAILURE); break; } } @@ -276,6 +282,6 @@ void ROCKparams::initFromMainOptsArgs(int argc,char ** argv) { if (nb_k_mers) { expected_collision_proba=getCollisionProba(nb_k_mers,parms.lambda); } - if (v_input_lines.empty() && (loadInOutFileArgs(input_file,output_file,v_input_lines,v_output_lines)==EXIT_FAILURE)) throw EXIT_FAILURE; + if ((v_input_lines.empty() || v_output_lines.empty()) && (loadInOutFileArgs(input_file,output_file,v_input_lines,v_output_lines)==EXIT_FAILURE)) throw EXIT_FAILURE; if (processInOutFileArgs(v_input_lines,v_output_lines,single_files,v_PE_files,f_id)!=EXIT_SUCCESS) throw EXIT_FAILURE; } diff --git a/src/read_utils.cpp b/src/read_utils.cpp index 01c91ce95049512895634117785bfcc44c5c5300..23830ce5cecec30e3fc5d7989290f32c190eeb1b 100644 --- a/src/read_utils.cpp +++ b/src/read_utils.cpp @@ -22,12 +22,22 @@ #include <cassert> #endif +void UpdateReadForScoreThreshold(DnaSeqStr * dna_seq,const int & nucl_qual_score_thres,const int& idx_start_qual_score) { + char * read_data_start=dna_seq->fq_record_buf+dna_seq->start_idx; + char * qual_score_start=dna_seq->fq_record_buf+idx_start_qual_score; + int idx=0; + while (idx<dna_seq->length) { + if (*(qual_score_start+idx)-k_phred_32<=nucl_qual_score_thres) *(read_data_start+idx)=k_nucl_in_error; + idx++; + } +} void getFqRecord(FqBaseBackend* fq_files_be[],const unsigned char& f_id,const unsigned long& offset, char * fq_record) { FqBaseBackend * be=fq_files_be[f_id-1]; try { + int fq_rec_size=be->getRead(offset,fq_record); be->getRead(offset,fq_record); } catch (int e) { #ifdef DEBUG @@ -44,10 +54,11 @@ void init_DnaSeqStr(DnaSeqStr * dna_seq) { dna_seq->start_idx=0; } -void processFqRecord(DnaSeqStr * dna_seq) { +void processFqRecord(DnaSeqStr * dna_seq,const int & nucl_qual_score_thres) { char * pchar=dna_seq->fq_record_buf; int cnt=0; int nb_l=0; + int idx_start_qual_score; #ifdef DEBUG //assert(*pchar==k_read_id_start); if (*pchar!=k_read_id_start) throw -1; @@ -58,12 +69,19 @@ void processFqRecord(DnaSeqStr * dna_seq) { if (nb_l==1) dna_seq->start_idx=cnt+1; if (nb_l==2) { dna_seq->length=cnt-dna_seq->start_idx; + // break; + } + if (nb_l==3) { + idx_start_qual_score=cnt+1; break; } } pchar++; cnt++; } + // here, change read if there is a threshold for nucleotide quality score. + if (nucl_qual_score_thres) UpdateReadForScoreThreshold(dna_seq,nucl_qual_score_thres,idx_start_qual_score); + #ifdef DEBUG assert(nb_l>=2); #endif @@ -73,7 +91,7 @@ void processFqRecord(DnaSeqStr * dna_seq) { void getDNASeqstr(FqBaseBackend* fq_files_be [], const rpos& sr, unsigned long j, - DnaSeqStr * dna_seqs) + DnaSeqStr * dna_seqs,const int& nucl_qual_score_thres) { unsigned char f_id1; unsigned char f_id2; @@ -91,7 +109,7 @@ void getDNASeqstr(FqBaseBackend* fq_files_be [], #endif try { getFqRecord(fq_files_be,f_id1,offset1,dna_seqs[0].fq_record_buf); - processFqRecord(p_dna_seqs); + processFqRecord(p_dna_seqs,nucl_qual_score_thres); } catch (int e) { #ifdef DEBUG cout<<"j="<<j<<" sr content: a1="<<sr.read_a1<<" a2= "<<sr.read_a2<<endl; @@ -115,7 +133,7 @@ void getDNASeqstr(FqBaseBackend* fq_files_be [], #endif exit(e); } - processFqRecord(p_dna_seqs); + processFqRecord(p_dna_seqs,nucl_qual_score_thres); } } diff --git a/src/read_utils.h b/src/read_utils.h index e81ac316203e72f52ed09136cb4a661fa2d8e952..55119f16ecf85b729eec5359a8000fdefec6260c 100644 --- a/src/read_utils.h +++ b/src/read_utils.h @@ -12,6 +12,7 @@ #include "rock_commons.h" #include "ReadProcessor.h" +#define k_nucl_in_error 'N' /* * Here, for performance matters, do not want to have to do a memmove nor strcpy or strcat to extract only DNA sequence for all @@ -29,7 +30,7 @@ typedef struct { void getDNASeqstr(FqBaseBackend* [], const rpos&, unsigned long, - DnaSeqStr *); + DnaSeqStr *,const int& nucl_qual_score_thres=0); void init_DnaSeqStr(DnaSeqStr * dna_seq); diff --git a/src/unit_test_read_utils.cpp b/src/unit_test_read_utils.cpp index bf35ec317e9b70b4874c60040992f4fb0249e4c3..f01e5f5dd2267c86ecd387359bec5d7067cd04da 100644 --- a/src/unit_test_read_utils.cpp +++ b/src/unit_test_read_utils.cpp @@ -65,8 +65,8 @@ void test_getReadSingle() { be_fq.closeInputFile(); } -void getDnaStr(FqBaseBackend * fic_map[],rpos my_struct,DnaSeqStr* a_seqs,char * dna_read) { // Auxilliary method for test_getReadPE(). - getDNASeqstr(fic_map, my_struct, 0, a_seqs); +void getDnaStr(FqBaseBackend * fic_map[],rpos my_struct,DnaSeqStr* a_seqs,char * dna_read, int min_score_qual=0) { // Auxilliary method for test_getReadPE(). + getDNASeqstr(fic_map, my_struct, 0, a_seqs,min_score_qual); char * tmp=(char *) a_seqs[0].fq_record_buf; tmp+=a_seqs[0].start_idx; @@ -129,7 +129,55 @@ void test_getReadPE() { be_fq1.closeInputFile(); be_fq2.closeFile(); //cout<<"done"<<endl; +} + +void test_getReadPEWithNQST() { + rpos my_struct1,my_struct2; + char * fq_PE1=(char *) "../test/data/unit/test_PE1_2.fq"; + char * fq_PE2=(char *) "../test/data/unit/test_PE2_2.fq"; + srp io_sr; + unsigned int j=0; + char dna_read[MAX_READ_LENGTH]; + + FqMainBackend be_fq1=FqMainBackend(&io_sr); + FqAuxBackend be_fq2; + be_fq1.setAuxProcessor(&be_fq2); + + be_fq1.openInputFile(fq_PE1, 4); + be_fq2.openFile(fq_PE2, 5); + + FqBaseBackend * fic_map[k_max_input_files]; + fic_map[3]=&be_fq1; + fic_map[4]=&be_fq2; + + DnaSeqStr a_seqs[2]; + + init_DnaSeqStr(&a_seqs[0]); + init_DnaSeqStr(&a_seqs[1]); + + my_struct1=init_rpos(4,0); + update_rpos(5,0,j,&my_struct1); + + my_struct2=init_rpos(4,13647); + update_rpos(5,13653,j,&my_struct2); + assert(my_struct1.read_a1==0); + assert(my_struct1.read_a2==0); + assert(my_struct2.read_a1==13647); + assert(my_struct2.read_a2==6); + + int min_score_qual=14; + getDnaStr(fic_map,my_struct1,a_seqs,dna_read,min_score_qual); + std::cout<<"PE dna read:"<<endl; + std::cout<<dna_read<<endl; + assert(strcmp(dna_read,"GGNTCTGTTGGCTCAAACGCCNTCACNTCNTTGNTCAAAAGCTTATAATGCGNGCCAAANTCCGCCATCGNGACNNCTACGCCTTCCCCNGCTNTCCCGNCAAANNCNNGTCTTGCCGGANCTTCNCNNNCTCCNNTCGAAAGCGGCGAAATCTTAGNGGAAGGNGGANATAATGCNNTCNCNTCGNACNNTGAANNTNTANNCGGCANNCNGCAGNNTCNAGGNCTTNCNGNGNAACGNTTAANNGCAGATGGCTACGGNTTTGCNGGGGNANGAGACNGGANANCNNNGNCGNTCGNCCN")==0); + + /*getDnaStr(fic_map,my_struct2,a_seqs,dna_read); + assert(strcmp(dna_read,"ATTGTGGGGTTCCTTTTTGTAGCATTGGAATGGAAATTAAAAATGGGGCTTCAGGATGCCCGCTCCATTATTTAATTCCAGAATGTAACGATGCTGTTTACCGGGGGGACTGGAAAGATGCACTTGAGCTTTTGATTAAAACAAATAACATGCCCAGAACCAATCACTGCAATTTTTTTATCCCACCGCACTATCGGTGGAGTCGGCATGAACCAACCTAAACCAAACCCACGGTCAATAATAGCCCGTTCAATCGAATTAATACCCACAGCAGGATCAGAAATTGCAACCGTACAACTTC")==0); +*/ + be_fq1.closeInputFile(); + be_fq2.closeFile(); + //cout<<"done"<<endl; } void test_decomposeReadInkMerNums() { @@ -309,6 +357,64 @@ void testDNAToNumberWithN() { it+=11; // The 1rst expected k-mer is the one after the 3rd N char (starting at position 47 in dnaStr). ATCAAGCATTAGAGACGCTTCTCTAATGTA or its reverse complement depending assert(*it==max(234837138459816172,886965076742957027)); + + char dnaStr2[]="GCCTTTTCTTTTTCCAGGGAAAACCATCCAGGAGGAACTTTATTATGGCGATGTATGAAGTCGGTACCGTCACGGGTGCCGCGTCGCAGGCACGGGTGACAGGTGCGACAACAAAATGGTCACAGGAGGCGCTGNGGATACAGCCCGGGTCGATTCTGGTGGTCTACCGCAGCGGTAGTGCTGACCTGTATGCGATCAAATCCGTGGACAGCGACACGCAACTGACGCTGACCCGGAATATCACCACCGC"; + nb_expected_k_mers=strlen(dnaStr2)+1; + nb_expected_k_mers-=k; + + nbrKmerDecompo.clear(); + nbrKmerDecompo.reserve(nb_expected_k_mers); + read_p.getKMerNumbers((char *) dnaStr2,strlen(dnaStr2),nbrKmerDecompo); + assert(nbrKmerDecompo.size()==nb_expected_k_mers-k); // there is only one N in danStr2. + +} + + + +// testing getRead with a quality score threshold for nucleotides. +void test_getReadWithNQST() { + int nucl_qual_score_thres; + rpos my_struct1,my_struct2; + my_struct1=init_rpos(4,639); + my_struct2=init_rpos(4,1228); + + assert(my_struct1.read_a1==639); + assert(my_struct2.read_a1==1228); + srp io_sr; + unsigned int j=0; + DnaSeqStr a_seqs; + char dna_read[MAX_READ_LENGTH]; + + char * fq_single2=(char *) "../test/data/unit/test_single2.fq"; + FqMainBackend be_fq=FqMainBackend(&io_sr); // TODO, remove argument from constructor + be_fq.openInputFile(fq_single2, 4); + + + FqBaseBackend * fic_map[k_max_input_files]; + fic_map[3]=&be_fq; + + init_DnaSeqStr(&a_seqs); + nucl_qual_score_thres=32; + getDNASeqstr(fic_map, my_struct1, 0, &a_seqs,nucl_qual_score_thres); + char * tmp=(char *) a_seqs.fq_record_buf; + tmp+=a_seqs.start_idx; + memcpy(dna_read,tmp,a_seqs.length); + dna_read[a_seqs.length]='\0'; + /*std::cout<<"modified dna read:"<<endl; + std::cout<<dna_read<<endl;*/ + assert(strcmp(dna_read,"NNNNNAGGTGCTACCATAACACCAACTGTTTTCACNATAATTTTAAAATCAAGCATTAGAGACGCNTCTCTAATGTATTGCAAATCTAGTTCTACCATTTGATGAAAATCTAATTNATTTCTTCCACTANCCTGCCATAATCCAGTACAACCTGGTATAACGGNCAANCGCTTTTTATCATAGGANCTGTATTCTCCTACCTCACGTGGCAAAGGAGGNCTTGGACCAACAATTGCCATGTCTCCTTTAACCACATTCCAAAGCTGNNNNN")==0); + + init_DnaSeqStr(&a_seqs); + nucl_qual_score_thres=50; + getDNASeqstr(fic_map, my_struct1, 0, &a_seqs,nucl_qual_score_thres); + tmp=(char *) a_seqs.fq_record_buf; + tmp+=a_seqs.start_idx; + memcpy(dna_read,tmp,a_seqs.length); + // std::cout<<dna_read<<endl; + assert(strcmp(dna_read,"NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN")==0); + + // now test the same thing with PE reads + test_getReadPEWithNQST(); } int main(int argc, char **argv) { @@ -316,6 +422,8 @@ int main(int argc, char **argv) { test_getReadSingle(); cout<<"test getting PE reads."<<endl; test_getReadPE(); + cout<<"test getting reads with a quality score constraint on nucleotides."<<endl; + test_getReadWithNQST(); cout<<"test converting k_mers of different size into number"<<endl; testDNAToNumberSimple(); cout<<"test converting an entire read to a series of numbers"<<endl; diff --git a/test/Makefile.am b/test/Makefile.am index 234b7375d2b39915f6fa0c9f4c5e19c23f7650e0..b9f8f5f9590b0a58e8e4ea5624723c4cfde8547b 100755 --- a/test/Makefile.am +++ b/test/Makefile.am @@ -4,6 +4,6 @@ LOG_COMPILER = $(SHELL) TESTS = rock.sh -EXTRA_DIST = $(TESTS) rock_mem.sh data/extract_klebsiella_long_reads_100.txt data/extract_klebsiella_long_reads_100_filtered.txt data/fastq.raw/klebsiella_100_1.fq data/fastq.raw/klebsiella_100_2.fq data/unit/klebsiella_PE1.fq data/unit/klebsiella_PE2.fq data/unit/test_PE1.fq data/unit/test_PE2.fq data/unit/test_PE1_2.fq data/unit/test_PE2_2.fq data/unit/test_single2.fq data/unit/test_single.fq data/unit/list_input1.txt data/unit/list_output1.txt data/unit/list_input2.txt data/unit/list_output2.txt data/unit/list_input3.txt data/unit/list_output3.txt +EXTRA_DIST = $(TESTS) rock_mem.sh data/iofiles.args/extract_klebsiella_long_reads_100.txt data/iofiles.args/extract_klebsiella_long_reads_100_filtered.txt data/fastq.raw/klebsiella_100_1.fq data/fastq.raw/klebsiella_100_2.fq data/unit/klebsiella_PE1.fq data/unit/klebsiella_PE2.fq data/unit/test_PE1.fq data/unit/test_PE2.fq data/unit/test_PE1_2.fq data/unit/test_PE2_2.fq data/unit/test_single2.fq data/unit/test_single.fq data/unit/list_input1.txt data/unit/list_output1.txt data/unit/list_input2.txt data/unit/list_output2.txt data/unit/list_input3.txt data/unit/list_output3.txt data/iofiles.args/output_files_noNQ_Thres.txt data/iofiles.args/output_files_NQ_Thres_very_low.txt data/iofiles.args/output_files_NQ_Thres_12.txt data/iofiles.args/output_files_NQ_Thres_13.txt diff --git a/test/rock.sh b/test/rock.sh index c3b0f07a13c8ecbd5ee4e76b98db1189ef2639b4..bce3eedb1861e19060da513ad81daeada39ab421 100755 --- a/test/rock.sh +++ b/test/rock.sh @@ -15,10 +15,11 @@ echo "########################################################################## echo "checking rock options and error mesages" #../src/rock|grep "input file name is mandatory" >/dev/null || exit 1 ../src/rock|grep "usage" >/dev/null || exit 2 -../src/rock -i ${srcdir}/data/extract_klebsiella_long_reads_100.txt fic1.fq fic2.fq|grep "It cannot be both" >/dev/null ||exit 3 -#../src/rock -i ${srcdir}/data/extract_klebsiella_long_reads_100.txt|grep "usage" >/dev/null ||exit 4 -#../src/rock -i ${srcdir}/data/extract_klebsiella_long_reads_100.txt -o ${srcdir}/data/extract_klebsiella_long_reads_100_filtered.txt -../src/rock -i ${srcdir}/data/extract_klebsiella_long_reads_100.txt -o ${srcdir}/data/extract_klebsiella_long_reads_100_filtered.txt|grep "parent directory for output files:" >/dev/null ||exit 5 +../src/rock -i ${srcdir}/data/iofiles.args/extract_klebsiella_long_reads_100.txt fic1.fq fic2.fq|grep "It cannot be both" >/dev/null ||exit 3 +#../src/rock -i ${srcdir}/data/iofiles.args/extract_klebsiella_long_reads_100.txt|grep "usage" >/dev/null ||exit 4 +#../src/rock -i ${srcdir}/data/iofiles.args/extract_klebsiella_long_reads_100.txt -o ${srcdir}/data/iofiles.args/extract_klebsiella_long_reads_100_filtered.txt +../src/rock -i ${srcdir}/data/iofiles.args/extract_klebsiella_long_reads_100.txt -o ${srcdir}/data/iofiles.args/extract_klebsiella_long_reads_100_filtered.txt|grep "parent directory for output files:" >/dev/null ||exit 5 + echo " " echo "##################################################################################" echo "creating directory for test output files" @@ -36,23 +37,33 @@ then cp -R ${srcdir}/data/unit data/ chmod u+w data/unit fi +if [ ! -d "data/iofiles.args" ] +then + mkdir data/iofiles.args + cp -R ${srcdir}/data/iofiles.args data/ +fi echo "checking content of local data directory" ls -l data echo "checking content of local data/fastq.raw directory" ls -l data/fastq.raw/ +echo "checking content of local data/iofiles.args directory" +ls -l data/iofiles.args + mkdir data/fastq.filtered || exit 6 + echo " " echo "##################################################################################" echo "doing more checks on options and error messages" -../src/rock -C 500000 -i ${srcdir}/data/extract_klebsiella_long_reads_100.txt -o ${srcdir}/data/extract_klebsiella_long_reads_100_filtered.txt|grep "Bad value for kappa" >/dev/null || exit 7 -../src/rock -C 500 -c 600 -i ${srcdir}/data/extract_klebsiella_long_reads_100.txt -o ${srcdir}/data/extract_klebsiella_long_reads_100_filtered.txt|grep "ERROR lower filter is higher than high filter" >/dev/null || exit 8 -../src/rock -C 500 -c 400 -k 60 -i ${srcdir}/data/extract_klebsiella_long_reads_100.txt -o ${srcdir}/data/extract_klebsiella_long_reads_100_filtered.txt|grep "Bad value for k" >/dev/null || exit 9 -../src/rock -C 500 -c 400 -k 10 -l 0 -i ${srcdir}/data/extract_klebsiella_long_reads_100.txt -o ${srcdir}/data/extract_klebsiella_long_reads_100_filtered.txt|grep "Bad value for lambda" >/dev/null || exit 10 -../src/rock -C 500 -c 400 -k 10 -l 500 -i ${srcdir}/data/extract_klebsiella_long_reads_100.txt -o ${srcdir}/data/extract_klebsiella_long_reads_100_filtered.txt|grep "Not enough RAM on the machine" >/dev/null || exit 11 -../src/rock -C 500 -c 400 -k 10 -l 12 -g 25 -i ${srcdir}/data/extract_klebsiella_long_reads_100.txt -o ${srcdir}/data/extract_klebsiella_long_reads_100_filtered.txt|grep "options are mutually exclusive" >/dev/null|| exit 12 -../src/rock -C 500 -c 400 -k 10 -g 10000 -i ${srcdir}/data/extract_klebsiella_long_reads_100.txt -o ${srcdir}/data/extract_klebsiella_long_reads_100_filtered.txt|grep "This machine only has" >/dev/null || exit 13 -../src/rock -C 500 -c 400 -k 10 -l 12 -n 85000000 -i ${srcdir}/data/extract_klebsiella_long_reads_100.txt -o ${srcdir}/data/extract_klebsiella_long_reads_100_filtered.txt|grep "options are mutually exclusive">/dev/null || exit 14 +../src/rock -C 500000 -i ${srcdir}/data/iofiles.args/extract_klebsiella_long_reads_100.txt -o ${srcdir}/data/iofiles.args/extract_klebsiella_long_reads_100_filtered.txt|grep "Bad value for kappa" >/dev/null || exit 7 +../src/rock -C 500 -c 600 -i ${srcdir}/data/iofiles.args/extract_klebsiella_long_reads_100.txt -o ${srcdir}/data/iofiles.args/extract_klebsiella_long_reads_100_filtered.txt|grep "ERROR lower filter is higher than high filter" >/dev/null || exit 8 +../src/rock -C 500 -c 400 -k 60 -i ${srcdir}/data/iofiles.args/extract_klebsiella_long_reads_100.txt -o ${srcdir}/data/iofiles.args/extract_klebsiella_long_reads_100_filtered.txt|grep "Bad value for k" >/dev/null || exit 9 +../src/rock -C 500 -c 400 -k 10 -l 0 -i ${srcdir}/data/iofiles.args/extract_klebsiella_long_reads_100.txt -o ${srcdir}/data/iofiles.args/extract_klebsiella_long_reads_100_filtered.txt|grep "Bad value for lambda" >/dev/null || exit 10 +../src/rock -C 500 -c 400 -k 10 -l 500 -i ${srcdir}/data/iofiles.args/extract_klebsiella_long_reads_100.txt -o ${srcdir}/data/iofiles.args/extract_klebsiella_long_reads_100_filtered.txt|grep "Not enough RAM on the machine" >/dev/null || exit 11 +../src/rock -C 500 -c 400 -k 10 -l 12 -g 25 -i ${srcdir}/data/iofiles.args/extract_klebsiella_long_reads_100.txt -o ${srcdir}/data/iofiles.args/extract_klebsiella_long_reads_100_filtered.txt|grep "options are mutually exclusive" >/dev/null|| exit 12 +../src/rock -C 500 -c 400 -k 10 -g 10000 -i ${srcdir}/data/iofiles.args/extract_klebsiella_long_reads_100.txt -o ${srcdir}/data/iofiles.args/extract_klebsiella_long_reads_100_filtered.txt|grep "This machine only has" >/dev/null || exit 13 +../src/rock -C 500 -c 400 -k 10 -l 12 -n 85000000 -i ${srcdir}/data/iofiles.args/extract_klebsiella_long_reads_100.txt -o ${srcdir}/data/iofiles.args/extract_klebsiella_long_reads_100_filtered.txt|grep "options are mutually exclusive">/dev/null || exit 14 + # here check that we have enough memory for running the tests. ../src/unit_test_cms CHECK_MEM @@ -100,6 +111,11 @@ then echo "erasing data/unit" rm -fr data/unit || exit 212 fi +if [ "$erase_indata" = "true" ] +then + echo "erasing data/iofiles.args" + rm -fr data/iofiles.args || exit 212 +fi ## Normal end diff --git a/test/rock_mem.sh b/test/rock_mem.sh index b1e4023bfea696a6c5f2f660597a46e67fe9d65f..54849ebd8ec5339c3737c8a1fbe416449b912fc2 100755 --- a/test/rock_mem.sh +++ b/test/rock_mem.sh @@ -11,7 +11,8 @@ echo "testing high filter" #echo "I want to check if I can write to a file ">../test/data/fastq.filtered/SRR1222430_1.filtered.fastq #cat ../test/data/fastq.filtered/SRR1222430_1.filtered.fastq -../src/rock -C 100 -l 2 -i ${srcdir}/data/extract_klebsiella_long_reads_100.txt -o ${srcdir}/data/extract_klebsiella_long_reads_100_filtered.txt #>/dev/null || exit 15 +../src/rock -C 100 -l 2 -i ${srcdir}/data/iofiles.args/extract_klebsiella_long_reads_100.txt -o ${srcdir}/data/iofiles.args/extract_klebsiella_long_reads_100_filtered.txt #>/dev/null || exit 15 + # output files should be the same size, contain the same elements but not in the same order. nb_PE1=`grep -c "@" ${srcdir}/data/fastq.raw/klebsiella_100_1.fq` nb_PE2=`grep -c "@" ${srcdir}/data/fastq.raw/klebsiella_100_2.fq` @@ -34,7 +35,7 @@ test $nb_PE2=$nb_PE2_filtered || exit 17 echo " " echo "##################################################################################" echo "testing low filter" -../src/rock -C 100 -c 99 -l 2 -i ${srcdir}/data/extract_klebsiella_long_reads_100.txt -o ${srcdir}/data/extract_klebsiella_long_reads_100_filtered.txt >/dev/null || exit 18 +../src/rock -C 100 -c 99 -l 2 -i ${srcdir}/data/iofiles.args/extract_klebsiella_long_reads_100.txt -o ${srcdir}/data/iofiles.args/extract_klebsiella_long_reads_100_filtered.txt >/dev/null || exit 18 [ -f "data/fastq.filtered/SRR1222430_1.filtered.fastq" ] || exit 181 [ -f "data/fastq.filtered/SRR1222430_2.filtered.fastq" ] || exit 182 @@ -72,7 +73,7 @@ echo "########################################################################## echo "testing verbose mode" ../src/rock -C 100 -c 1 -l 2 -v ${srcdir}/data/fastq.raw/klebsiella_100_1.fq,${srcdir}/data/fastq.raw/klebsiella_100_2.fq ${srcdir}/data/unit/test_single.fq ${srcdir}/data/unit/test_single2.fq|grep "CMS size=" >/dev/null || exit 182 -../src/rock -C 100 -c 1 -v -n 1000 ${srcdir}/data/fastq.raw/klebsiella_100_1.fq,${srcdir}/data/fastq.raw/klebsiella_100_2.fq ${srcdir}/data/unit/test_single.fq ${srcdir}/data/unit/test_single2.fq|grep "expected probability of collision was:" >/dev/null || exit 182 +../src/rock -C 100 -c 1 -v -n 1000 ${srcdir}/data/fastq.raw/klebsiella_100_1.fq,${srcdir}/data/fastq.raw/klebsiella_100_2.fq ${srcdir}/data/unit/test_single.fq ${srcdir}/data/unit/test_single2.fq|grep "expected probability of collision was:" >/dev/null || exit 183 echo "erasing test result files" rm -fr "klebsiella_100_1.rock.fq" || exit 1816 @@ -80,6 +81,55 @@ rm -fr "klebsiella_100_2.rock.fq" || exit 1817 rm -fr "test_single.rock.fq"|| exit 1818 rm -fr "test_single2.rock.fq"|| exit 1819 +echo " " +echo "##################################################################################" +echo "testing rock with a quality score threshold for nucleotides." + +expected_diff1="304a305,308 \ + > @SRR1222430.37 37 length=250 \ +> GCCTTTTCTTTTTCCAGGGAAAACCATCCAGGAGGAACTTTATTATGGCGATGTATGAAGTCGGTACCGTCACGGGTGCCGCGTCGCAGGCACGGGTGACAGGTGCGACAACAAAATGGTCACAGGAGGCGCTGGGGATACAGCCCGGGTCGATTCTGGTGGTCTACCGCAGCGGTAGTGCTGACCTGTATGCGATCAAATCCGTGGACAGCGACACGCAACTGACGCTGACCCGGAATATCACCACCGC \ +> +SRR1222430.37 37 length=250 \ +> AAAABFFFFFFFGCGFGGEFFCHHEEFHHGHHAFEAGAGGGFGGFHGFHGEEEFF5BDHHBDEGCEEHGAEGGFFGFCFHCGGC@E/EEEEHFEC@BBFFFG/GFEFDDCDHHGCC<?FGDDGHFFFGFFCGGG-.AGDGBGGHHGGG?ACGCFGG99B9;?C99B0CDGGCFF;DDFFFFFFFB99:;BBF/A.:9DFFF/F?EEFFFFFF>ADAFDFFFFFFE/ADADDFFFFFA;EEFF/BFFEEFF" + +expected_diff2="> @SRR1222430.37 37 length=251 \ +> GCTGGCCAGCTGGTTAGCAAACGACGAGGTGCTGGCGGGTTTAGCGGGAAAAATTGCGGAAGAGGCGCCGGGAAAAGCGGGGGTGTTATTACAGGTCTACGGCAGATGCGTGTCGCTGGCCGCGGCTTTTAGCGCCTACAAGTCTGGACTTCCCCGTCGGGGGGCAACCACAATTCACCCCGGCTGTATTCCACTCGGCCCCGGTGACCCTTTTGGTGTCGCCCCCGGACACCGTGCCCGTGCCCCGGTAC \ +> +SRR1222430.37 37 length=251 \ +> A3>333BFA2FF4GBFFDGGCGED?FGEGGHGFFFEEG?AF13@50>///11B13@@1/>//B0?>////<//B/00??@/--:--.;:C000;:0/0009.-9:.00:-.;9/9...-;.--9@--9:////-9-9..////9/;//;9///.9-..--------..99.9.//////;-;--9-.////://9/9.;.-;-99-.//.;////-;?9;...9-9-----9;-.;.;/.-9.;/99=--;" + +mkdir tmp +../src/rock -C 100 -c 1 -l 2 -o ${srcdir}/data/iofiles.args/output_files_noNQ_Thres.txt ${srcdir}/data/fastq.raw/klebsiella_100_1.fq,${srcdir}/data/fastq.raw/klebsiella_100_2.fq || exit 191 + +../src/rock -C 100 -c 1 -l 2 -q 2 -o ${srcdir}/data/iofiles.args/output_files_NQ_Thres_very_low.txt ${srcdir}/data/fastq.raw/klebsiella_100_1.fq,${srcdir}/data/fastq.raw/klebsiella_100_2.fq || exit 192 + +ret1=`diff tmp/klebsiella_100_1_very_low_qual_thres.fq tmp/klebsiella_100_1_no_qual_thres.fq` +test $ret1=0 || exit 1921 +ret2=`diff tmp/klebsiella_100_2_very_low_qual_thres.fq tmp/klebsiella_100_2_no_qual_thres.fq` +test $ret2=0 || exit 1922 + +../src/rock -C 100 -c 1 -l 2 -q 12 -o ${srcdir}/data/iofiles.args/output_files_NQ_Thres_12.txt ${srcdir}/data/fastq.raw/klebsiella_100_1.fq,${srcdir}/data/fastq.raw/klebsiella_100_2.fq||exit 193 + +../src/rock -C 100 -c 1 -l 2 -q 13 -o ${srcdir}/data/iofiles.args/output_files_NQ_Thres_13.txt ${srcdir}/data/fastq.raw/klebsiella_100_1.fq,${srcdir}/data/fastq.raw/klebsiella_100_2.fq||exit 194 + +#diff_PE1=`diff tmp/klebsiella_100_1_13_qual_thres.fq tmp/klebsiella_100_1_12_qual_thres.fq` +#diff_PE2=`diff tmp/klebsiella_100_2_13_qual_thres.fq tmp/klebsiella_100_2_12_qual_thres.fq` + +#test "$diff_PE1" = "$expected_diff1" || exit 1931 # problem : even if strings are identical test returns false. TO DO solve that later +#test "$diff_PE2" = "$expected_diff2" || exit 1932 + +ret1=`diff tmp/klebsiella_100_1_13_qual_thres.fq tmp/klebsiella_100_1_12_qual_thres.fq|grep -c "@SRR1222430.37"` +test $ret1 = 1 || exit 1931 + +ret2=`diff tmp/klebsiella_100_1_13_qual_thres.fq tmp/klebsiella_100_1_12_qual_thres.fq|grep -c "length"` +test $ret2 = 2 || exit 1932 + +ret1=`diff tmp/klebsiella_100_2_13_qual_thres.fq tmp/klebsiella_100_2_12_qual_thres.fq|grep -c "@SRR1222430.37"` +test $ret1 = 1 || exit 1933 + +ret2=`diff tmp/klebsiella_100_2_13_qual_thres.fq tmp/klebsiella_100_2_12_qual_thres.fq|grep -c "length"` +test $ret2 = 2 || exit 1934 + +rm -fr tmp + echo " " echo "##################################################################################" echo "unit testing for cms component"