Skip to content
Snippets Groups Projects
Commit de35e3b4 authored by Veronique Legrand's avatar Veronique Legrand
Browse files

Now handle the case of k-mers that contain unknown nucleotides (labelled 'N')...

 Now handle the case of k-mers that contain unknown nucleotides (labelled 'N') and who must be processed but not taken into account for CMS.


Former-commit-id: 4df2b1be
parent c5a10623
Branches
Tags
No related merge requests found
......@@ -25,6 +25,12 @@ inline void ReadProcessor::init_mask(int k) {
mask_kMer=mask_kMer-1;
}*/
inline unsigned long ReadProcessor::nucleoToNumberWrap(char s) {
if (s=='N') n=k;
n--;
return nucleoToNumber(s);
}
inline unsigned long ReadProcessor::nucleoToNumber(char s) {
unsigned long nbr;
switch(s)
......@@ -43,11 +49,13 @@ inline unsigned long ReadProcessor::nucleoToNumber(char s) {
break;
case 'N':
nbr=0;
// n=2*k; // do that because n will also be decremented for reverse complement
break;
default:
throw -1; // TODO Benchmark this inside try catch statement to see if try catch+exception really costs so long.
// throw an integer for the moment. An exception object may not be the most optimal choice in terms of performance.
}
// n--;
return nbr;
}
......@@ -57,13 +65,13 @@ inline unsigned long ReadProcessor::kMerToNumber(char * k_m,unsigned long * p_pr
if (p_prev==NULL) { // first k_mer conversion
int i;
for (i=0;i<k;i++) {
c=nucleoToNumber(k_m[i]); // do not catch exception for the moment (not until I have checked it doesn't slow down execution). If nucleoToNumber returns -1, program will simply crash
c=nucleoToNumberWrap(k_m[i]); // do not catch exception for the moment (not until I have checked it doesn't slow down execution). If nucleoToNumber returns -1, program will simply crash
nbr=nbr<<2;
nbr=nbr|c;
}
} else {
nbr=*p_prev;
c=nucleoToNumber(k_m[k-1]);
c=nucleoToNumberWrap(k_m[k-1]);
nbr=nbr<<2;
nbr=nbr&mask_kMer;
nbr=nbr|c;
......@@ -110,9 +118,11 @@ void ReadProcessor::getKMerNumbers(char * dnaStr,int l,std::vector<unsigned long
int nb_k_m=l-k+1;
for (i=0; i<nb_k_m;i++) {
num=kMerToNumber(p_char,p_prev);
my_vect.push_back(num);
num_rev=kMerToNumberReverse(p_char,p_prev_rev);
if (n<0) {
my_vect.push_back(num);
my_vect.push_back(num_rev);
}
p_char++;
p_prev=&num;
p_prev_rev=&num_rev;
......
......@@ -12,6 +12,7 @@
class ReadProcessor {
int k;
int n;
std::map<char, char> nucleo_rev_cpl;
unsigned long mask_kMer;
......@@ -33,7 +34,7 @@ class ReadProcessor {
public:
ReadProcessor(int k=3) {
// init_mask(k);
n=0;
set_k(k);
init_nucleo_rev_cpl();
}
......@@ -49,7 +50,7 @@ public:
unsigned long kMerToNumber(char * k_m,unsigned long * p_prev);
unsigned long nucleoToNumber(char s);
unsigned long nucleoToNumberWrap(char s);
void getKMerNumbers(char * dnaStr,int l,std::vector<unsigned long>& my_vect);
......
......@@ -203,7 +203,6 @@ void testDNAToNumberMoreComplex() { // TODO in main, check that k<=32.
ReadProcessor read_p(k);
std::vector<unsigned long> nbrKmerDecompo;
std::vector<unsigned long>::iterator it;
int nb_expected_k_mers=strlen(dnaStr)+1;
nb_expected_k_mers-=k;
......@@ -251,6 +250,37 @@ void testDNAToNumberMoreComplex() { // TODO in main, check that k<=32.
}
}
// NTTTTNGGTGCTACCATAACACCAACTGTTTTCACCATAATTTTAANATCAAGCATTAGAGACGCTTCTCTAATGTATTGCAAATCTAGTTCTACCATTTGATGAAAATCTAATTCATTTCTTCCACTAACCTGCCATAATCCAGTACAACCTGGTATAACGGTCAAACGCTTTTTATCATAGGAACTGTATTCTCCTACCTCACGTGGCAAAGGAGGTCTTGGACCAACAATTGCCATGTCTCCTTTAACCACATTCCAAAGCTGTGGTA
/*
* Tests the handling of N nucleotides.
* k-mers containing such nucleotides must not be converted into number and they are not in the result returned to the caller.
*/
void testDNAToNumberWithN() {
char dnaStr[]="NTTTTNGGTGCTACCATAACACCAACTGTTTTCACCATAATTTTAANATCAAGCATTAGAGACGCTTCTCTAATGTATTGCAAATCTAGTTCTACCATTTGATGAAAATCTAATTCATTTCTTCCACTAACCTGCCATAATCCAGTACAACCTGGTATAACGGTCAAACGCTTTTTATCATAGGAACTGTATTCTCCTACCTCACGTGGCAAAGGAGGTCTTGGACCAACAATTGCCATGTCTCCTTTAACCACATTCCAAAGCTGTGGTA";
int k=30;
ReadProcessor read_p(k);
std::vector<unsigned long> nbrKmerDecompo;
std::vector<unsigned long>::iterator it;
int nb_expected_k_mers=strlen(dnaStr)+1;
nb_expected_k_mers-=k;
nb_expected_k_mers-=47; // expect the 1rts 76 k-mers to be skipped due to the presence of the N (unknown) nucleotide.
nb_expected_k_mers+=11; // between the 2nd and 3rd N character, there are 11 k-mers of length 30 that are correct (don't contain any N). no
nbrKmerDecompo.reserve(nb_expected_k_mers);
read_p.getKMerNumbers((char *) dnaStr,strlen(dnaStr),nbrKmerDecompo);
assert(nbrKmerDecompo.size()==nb_expected_k_mers*2);
it=nbrKmerDecompo.begin();
it+=22;
// The 1rst expected k-mer is the one after the 3rd N char (starting at position 47 in dnaStr). ATCAAGCATTAGAGACGCTTCTCTAATGTA
assert(*it==234837138459816172);
it++;
assert(*it==886965076742957027);
}
int main(int argc, char **argv) {
cout<<"test getting single reads."<<endl;
......@@ -261,6 +291,8 @@ int main(int argc, char **argv) {
testDNAToNumberSimple();
cout<<"test converting an entire read to a series of numbers"<<endl;
testDNAToNumberMoreComplex();
cout<<"testing the case of N nucleotides"<<endl;
testDNAToNumberWithN();
cout<<"done"<<endl;
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment