diff --git a/src/.settings/language.settings.xml b/src/.settings/language.settings.xml new file mode 100644 index 0000000000000000000000000000000000000000..527fb011044a60d3943527299156f9d5de8be1d0 --- /dev/null +++ b/src/.settings/language.settings.xml @@ -0,0 +1,15 @@ +<?xml version="1.0" encoding="UTF-8" standalone="no"?> +<project> + <configuration id="cdt.managedbuild.toolchain.gnu.macosx.base.481822517" name="Default"> + <extension point="org.eclipse.cdt.core.LanguageSettingsProvider"> + <provider copy-of="extension" id="org.eclipse.cdt.ui.UserLanguageSettingsProvider"/> + <provider-reference id="org.eclipse.cdt.core.ReferencedProjectsLanguageSettingsProvider" ref="shared-provider"/> + <provider class="org.eclipse.cdt.managedbuilder.language.settings.providers.GCCBuildCommandParser" id="org.eclipse.cdt.managedbuilder.core.GCCBuildCommandParser" keep-relative-paths="false" name="CDT GCC Build Output Parser" parameter="(g?cc)|([gc]\+\+)|(clang)" prefer-non-shared="true"/> + <provider class="org.eclipse.cdt.managedbuilder.language.settings.providers.GCCBuiltinSpecsDetector" console="false" env-hash="-1667358530298693438" id="org.eclipse.cdt.managedbuilder.core.GCCBuiltinSpecsDetector" keep-relative-paths="false" name="CDT GCC Built-in Compiler Settings" parameter="${COMMAND} ${FLAGS} -E -P -v -dD "${INPUTS}"" prefer-non-shared="true"> + <language-scope id="org.eclipse.cdt.core.gcc"/> + <language-scope id="org.eclipse.cdt.core.g++"/> + </provider> + <provider-reference id="org.eclipse.cdt.managedbuilder.core.MBSLanguageSettingsProvider" ref="shared-provider"/> + </extension> + </configuration> +</project> \ No newline at end of file diff --git a/src/ReadProcessor.cpp b/src/ReadProcessor.cpp index 27030213a7bd5fe4ca2db89827e8b22880665bae..dde661758b9606f3f50919c0342cb5279cc885af 100644 --- a/src/ReadProcessor.cpp +++ b/src/ReadProcessor.cpp @@ -25,7 +25,7 @@ #include "ReadProcessor.h" #include <iostream> -//! Tried to inline this bur slightly slower. Stays here. +//! Tried to inline this but slightly slower. Stays here. unsigned long ReadProcessor::nucleoToNumberWrap(char s) { (s=='N')?n=k:n; n--; @@ -36,7 +36,7 @@ void ReadProcessor::kMerAndRevToNumber(char * k_m,KmRevNumbers& numbers) { unsigned long c,crev; if (numbers.first) { // first k_mer conversion int i; - int j=k; + int j=k; for (i=0;i<k;i++) { c=nucleoToNumberWrap(k_m[i]); // do not catch exception for the moment. If nucleoToNumber throws -1 and there is no catch clause for it, the terminate() function will be called automatically. Anyway, there's not munch we can do if it returns -1. numbers.nbr=numbers.nbr<<2; @@ -45,7 +45,7 @@ void ReadProcessor::kMerAndRevToNumber(char * k_m,KmRevNumbers& numbers) { numbers.nbr_rev=numbers.nbr_rev|crev; j--; } - numbers.first=0; + numbers.first=0; } else { c=nucleoToNumberWrap(k_m[k-1]); numbers.nbr=numbers.nbr<<2; @@ -62,6 +62,7 @@ void ReadProcessor::kMerAndRevToNumber(char * k_m,KmRevNumbers& numbers) { void ReadProcessor::getKMerNumbers(char * dnaStr,int l,std::vector<unsigned long>& my_vect) { // See simple_test.cpp and results. benchmark showed that std::vector is very slightly faster than C array and doesn't require more memory in our case. So, I am using it since it makes code simpler. int i; KmRevNumbers numbers; + initNumbers(numbers); char * p_char=dnaStr; int nb_k_m=l-k+1; for (i=0; i<nb_k_m;i++) { diff --git a/src/ReadProcessor.h b/src/ReadProcessor.h index 62f688179fb833171cd1c1080fee81381000bede..8caa552c8da2795bf4f864211d853010ffa36c4d 100644 --- a/src/ReadProcessor.h +++ b/src/ReadProcessor.h @@ -35,12 +35,8 @@ #include <iostream> using namespace std; -//! Gather here the numbers corresponding to a kmer and it's reverse complement once they were hashed. -typedef struct { - unsigned long nbr; - unsigned long nbr_rev; - unsigned int first; //! set this to 1 before processing the 1rst kmer of the read. -} KmRevNumbers; + + class ReadProcessor { @@ -51,6 +47,7 @@ class ReadProcessor { std::map<char, char> nucleo_rev_cpl; unsigned long mask_kMer; + void init_nucleo_rev_cpl() { nucleo_rev_cpl['A']='T'; nucleo_rev_cpl['T']='A'; @@ -114,11 +111,24 @@ public: init_mask(k); } + //! Gather here the numbers corresponding to a kmer and it's reverse complement once they were hashed. + typedef struct { + unsigned long nbr; + unsigned long nbr_rev; + unsigned int first; //! set this to 1 before processing the 1rst kmer of the read. + } KmRevNumbers; + + void initNumbers(KmRevNumbers& numbers) { + numbers.first=1; + numbers.nbr=0; + numbers.nbr_rev=0; + } + // Do not inline this; it is slightly slower. // These functions are no more used. Now using the same loop to convert hash kmer and reverse complement. /*unsigned long kMerToNumberReverse(char * k_m,unsigned long * p_prev); unsigned long kMerToNumber(char * k_m,unsigned long * p_prev);*/ - + //! The 2 previous methods are now replacs by the one below. void kMerAndRevToNumber(char * k_m,KmRevNumbers& numbers); unsigned long nucleoToNumberWrap(char s); diff --git a/src/unit_test_cms b/src/unit_test_cms new file mode 100755 index 0000000000000000000000000000000000000000..27b3a8364dac9209abcb6410d8a5fb313c7b7b36 Binary files /dev/null and b/src/unit_test_cms differ diff --git a/src/unit_test_fqreader b/src/unit_test_fqreader new file mode 100755 index 0000000000000000000000000000000000000000..056c2b1304d00ab6a76e4bcae8e3a99ed54f62eb Binary files /dev/null and b/src/unit_test_fqreader differ diff --git a/src/unit_test_fqwriter b/src/unit_test_fqwriter new file mode 100755 index 0000000000000000000000000000000000000000..c13adacc2b01f4236a8ba2c702d175fc36616b55 Binary files /dev/null and b/src/unit_test_fqwriter differ diff --git a/src/unit_test_math_utils b/src/unit_test_math_utils new file mode 100755 index 0000000000000000000000000000000000000000..2b841e519706a85f8914034929fbccc3909d2173 Binary files /dev/null and b/src/unit_test_math_utils differ diff --git a/src/unit_test_read_utils b/src/unit_test_read_utils new file mode 100755 index 0000000000000000000000000000000000000000..fcb34f67c20c3b60f24feb096ea77ae0940c29db Binary files /dev/null and b/src/unit_test_read_utils differ diff --git a/src/unit_test_read_utils.cpp b/src/unit_test_read_utils.cpp index 3e6439964b89c3e060874bbe4e5639f8ade0f8e5..44560b997ed5b4905bf5c3bb64927f946d2a90e1 100644 --- a/src/unit_test_read_utils.cpp +++ b/src/unit_test_read_utils.cpp @@ -226,7 +226,18 @@ void test_decomposeReadInkMerNums() { decomposeReadInKMerNums(read_p,nbrKmerDecompo,k,a_seqs); assert(nbrKmerDecompo.size()==147); } - +/* +void initNumbers(KmRevNumbers& numbers) { + numbers.first=1; + numbers.nbr=0; + numbers.nbr_rev=0; +}*/ + +void savNumbers(ReadProcessor::KmRevNumbers& numbers,ReadProcessor::KmRevNumbers& numbers_sav) { + numbers_sav.first=numbers.first; + numbers_sav.nbr=numbers.nbr; + numbers_sav.nbr_rev=numbers.nbr_rev; +} /* * Test the basic methods of the ReadProcessor object. @@ -244,32 +255,42 @@ void testDNAToNumberSimple() { assert(read_p.nucleoToNumber('N')==0); // unsigned long * p_prev=NULL; - KmRevNumbers numbers; + ReadProcessor::KmRevNumbers numbers; + read_p.initNumbers(numbers); read_p.set_k(4); read_p.kMerAndRevToNumber((char *) "AAAA",numbers); assert(numbers.nbr==0); // TODO : this method should be private in ReadProcessor. Design a friend class to test them or declare this testing function friend in ReadProcessor. assert(numbers.nbr_rev==255); read_p.set_k(5); + read_p.initNumbers(numbers); read_p.kMerAndRevToNumber((char *) "AANAA",numbers); assert(numbers.nbr==0); assert(numbers.nbr_rev==975); read_p.set_k(14); + read_p.initNumbers(numbers); read_p.kMerAndRevToNumber((char *) "AANAATCCGATTGG",numbers); assert(numbers.nbr==219386); assert(numbers.nbr_rev==84779983); + ReadProcessor::KmRevNumbers numbers_orig,numbers_w; + savNumbers(numbers,numbers_orig); + savNumbers(numbers_orig,numbers_w); read_p.set_k(15); - read_p.kMerAndRevToNumber((char *) "AANAATCCGATTGGA",numbers); - assert(numbers.nbr==877544); - read_p.kMerAndRevToNumber((char *) "AANAATCCGATTGGN",numbers); - assert(numbers.nbr==877544); - read_p.kMerAndRevToNumber((char *) "AANAATCCGATTGGC",numbers); - assert(numbers.nbr==877545); - read_p.kMerAndRevToNumber((char *) "AANAATCCGATTGGG",numbers); - assert(numbers.nbr==877546); - read_p.kMerAndRevToNumber((char *) "AANAATCCGATTGGT",numbers); - assert(numbers.nbr==877547); + read_p.kMerAndRevToNumber((char *) "AANAATCCGATTGGA",numbers_w); + assert(numbers_w.nbr==877544); + savNumbers(numbers_orig,numbers_w); + read_p.kMerAndRevToNumber((char *) "AANAATCCGATTGGN",numbers_w); + assert(numbers_w.nbr==877544); + savNumbers(numbers_orig,numbers_w); + read_p.kMerAndRevToNumber((char *) "AANAATCCGATTGGC",numbers_w); + assert(numbers_w.nbr==877545); + savNumbers(numbers_orig,numbers_w); + read_p.kMerAndRevToNumber((char *) "AANAATCCGATTGGG",numbers_w); + assert(numbers_w.nbr==877546); + savNumbers(numbers_orig,numbers_w); + read_p.kMerAndRevToNumber((char *) "AANAATCCGATTGGT",numbers_w); + assert(numbers_w.nbr==877547); read_p.set_k(1); assert(read_p.nucleoToNumberReverse('A',1)==3); @@ -285,30 +306,31 @@ void testDNAToNumberSimple() { assert(read_p.nucleoToNumberReverse('T',20)==0); assert(read_p.nucleoToNumberReverse('N',20)==0); - numbers.nbr=84779983; + numbers.nbr_rev=84779983; read_p.set_k(14); read_p.kMerAndRevToNumber((char *)"ANAATCCGATTGGA",numbers); assert(numbers.nbr_rev==222521587); - numbers.nbr=84779983; + numbers.nbr_rev=84779983; read_p.kMerAndRevToNumber((char *)"ANAATCCGATTGGN",numbers); assert(numbers.nbr_rev==21194995); - numbers.nbr=84779983; + numbers.nbr_rev=84779983; read_p.kMerAndRevToNumber((char *)"ANAATCCGATTGGT",numbers); assert(numbers.nbr_rev==21194995); - numbers.nbr=84779983; + numbers.nbr_rev=84779983; read_p.kMerAndRevToNumber((char *)"ANAATCCGATTGGC",numbers); assert(numbers.nbr_rev==155412723); - numbers.nbr=84779983; + numbers.nbr_rev=84779983; read_p.kMerAndRevToNumber((char *)"ANAATCCGATTGGG",numbers); assert(numbers.nbr_rev==88303859); } + + /* * Test the ReadProcessor method that will be called by the main program : the one who takes as input a dna string and returns * a serie of numbers all corresponding to its k-mer composition + reverse complement. */ - void testDNAToNumberMoreComplex() { // TODO in main, check that k<=32. // from a dna string a found in fq files, return its k-mer decomposition; k-mers are numbers. char dnaStr[]="TTTTTAGGTGCTACCATAACACCAACTGTTTTCACCATAATTTTAAAATCAAGCATTAGAGACGCTTCTCTAATGTATTGCAAATCTAGTTCTACCATTTGATGAAAATCTAATTCATTTCTTCCACTAACCTGCCATAATCCAGTACAACCTGGTATAACGGTCAAACGCTTTTTATCATAGGAACTGTATTCTCCTACCTCACGTGGCAAAGGAGGTCTTGGACCAACAATTGCCATGTCTCCTTTAACCACATTCCAAAGCTGTGGTA"; @@ -325,35 +347,44 @@ void testDNAToNumberMoreComplex() { // TODO in main, check that k<=32. assert(nbrKmerDecompo.size()==nb_expected_k_mers); // for each k_mer, we also expect to have a number for its reverse complement. int cnt_k_mer=0; unsigned long num,num2; + ReadProcessor::KmRevNumbers numbers,numbers_start; + read_p.initNumbers(numbers); + read_p.initNumbers(numbers_start); + read_p.kMerAndRevToNumber((char *) "TTTTTAGGTGCTACCATAACACCAACTGTT",numbers); + // 1rst k-mer is : TTTTTAGGTGCTACCATAACACCAACTGTT. + // its reverse complement is : AACAGTTGGTGTTATGGTAGCACCTAAAAA for (it=nbrKmerDecompo.begin();it!=nbrKmerDecompo.end();it++) { cnt_k_mer++; - // 1rst k-mer is : TTTTTAGGTGCTACCATAACACCAACTGTT. - // its reverse complement is : AACAGTTGGTGTTATGGTAGCACCTAAAAA if (cnt_k_mer==1) { assert(*it==1151987406330741231); - num=*it; + assert(*it==max(numbers.nbr,numbers.nbr_rev)); + //num=*it; } if (cnt_k_mer==2) { // 2nd k-mer is : TTTTAGGTGCTACCATAACACCAACTGTTT - num2=read_p.kMerToNumber((char *) "TTTTAGGTGCTACCATAACACCAACTGTTT",NULL); - assert(*it==num2); - unsigned long num3=read_p.kMerToNumber((char *) "TTTTAGGTGCTACCATAACACCAACTGTTT",&num); - assert(num3==num2); + read_p.kMerAndRevToNumber((char *) "TTTTAGGTGCTACCATAACACCAACTGTTT",numbers_start); + cout<<numbers_start.nbr<<endl; + cout<<numbers_start.nbr_rev<<endl; + cout<<*it<<endl; + assert(*it==numbers_start.nbr); + read_p.kMerAndRevToNumber((char *) "TTTTAGGTGCTACCATAACACCAACTGTTT",numbers); + assert(numbers.nbr==numbers_start.nbr); } if (cnt_k_mer==3) { - num=read_p.kMerToNumber((char *) "TTTAGGTGCTACCATAACACCAACTGTTTT",NULL); - unsigned long num_rev=read_p.kMerToNumber((char *) "AAAACAGTTGGTGTTATGGTAGCACCTAAA",NULL); - assert(max(num,num_rev)==*it); + read_p.initNumbers(numbers); + read_p.kMerAndRevToNumber((char *) "TTTAGGTGCTACCATAACACCAACTGTTTT",numbers); + assert(max(numbers.nbr,numbers.nbr_rev)==*it); } if (cnt_k_mer==4) { - num=read_p.kMerToNumber((char *) "TTAGGTGCTACCATAACACCAACTGTTTTC",NULL); - assert(num==*it); + read_p.initNumbers(numbers); + read_p.kMerAndRevToNumber((char *) "TTAGGTGCTACCATAACACCAACTGTTTTC",numbers); + assert(numbers.nbr==*it); } // k-mer number 10 is GCTACCATAACACCAACTGTTTTCACCATA // its reverse complement is : TATGGTGAAAACAGTTGGTGTTATGGTAGC if (cnt_k_mer==10) { - num=read_p.kMerToNumber((char *) "GCTACCATAACACCAACTGTTTTCACCATA",NULL); + //num=read_p.kMerToNumber((char *) "GCTACCATAACACCAACTGTTTTCACCATA",NULL); assert(*it==max(704021989794238796,930978566989888201)); } /*if (cnt_k_mer==20) {