Skip to content
Snippets Groups Projects
Commit c5a10623 authored by Veronique Legrand's avatar Veronique Legrand
Browse files

did encapsulation for functions transforming reads/k_mers into numbers

Former-commit-id: 4562a5c5
parent 2db09bcd
No related branches found
No related tags found
No related merge requests found
/*
* ReadProcessor.cpp
*
* Created on: Feb 5, 2016
* Author: vlegrand
*/
#include <stdlib.h>
#include "ReadProcessor.h"
/*
inline void ReadProcessor::init_nucleo_rev_cpl () {
nucleo_rev_cpl['A']='T';
nucleo_rev_cpl['T']='A';
nucleo_rev_cpl['C']='G';
nucleo_rev_cpl['G']='C';
nucleo_rev_cpl['N']='N';
}*/
/*
inline void ReadProcessor::init_mask(int k) {
int nb_bits=2*k;
mask_kMer=pow(2,nb_bits);
mask_kMer=mask_kMer-1;
}*/
inline unsigned long ReadProcessor::nucleoToNumber(char s) {
unsigned long nbr;
switch(s)
{
case 'A':
nbr=0;
break;
case 'C':
nbr=1;
break;
case 'G':
nbr=2;
break;
case 'T':
nbr=3;
break;
case 'N':
nbr=0;
break;
default:
throw -1; // TODO Benchmark this inside try catch statement to see if try catch+exception really costs so long.
// throw an integer for the moment. An exception object may not be the most optimal choice in terms of performance.
}
return nbr;
}
inline unsigned long ReadProcessor::kMerToNumber(char * k_m,unsigned long * p_prev) {
unsigned long nbr=0;
unsigned long c;
if (p_prev==NULL) { // first k_mer conversion
int i;
for (i=0;i<k;i++) {
c=nucleoToNumber(k_m[i]); // do not catch exception for the moment (not until I have checked it doesn't slow down execution). If nucleoToNumber returns -1, program will simply crash
nbr=nbr<<2;
nbr=nbr|c;
}
} else {
nbr=*p_prev;
c=nucleoToNumber(k_m[k-1]);
nbr=nbr<<2;
nbr=nbr&mask_kMer;
nbr=nbr|c;
}
return nbr;
}
inline unsigned long ReadProcessor::nucleoToNumberReverse(char s,int i) {
unsigned long nbr=0;
char cpl=nucleo_rev_cpl[s];
nbr=nucleoToNumber(cpl);
nbr=nbr<<2*(i-1);
return nbr;
}
inline unsigned long ReadProcessor::kMerToNumberReverse(char * k_m,unsigned long * p_prev) {
unsigned long nbr=0;
unsigned long c;
if (p_prev==NULL) { // first k_mer conversion
int i;
for (i=k;i>0;i--) {
c=nucleoToNumberReverse(k_m[i-1],i);
// nbr=nbr>>2;
nbr=nbr|c;
}
} else {
nbr=*p_prev;
c=nucleoToNumberReverse(k_m[k-1],k);
nbr=nbr>>2;
nbr=nbr|c;
}
return nbr;
}
void ReadProcessor::getKMerNumbers(char * dnaStr,int l,std::vector<unsigned long>& my_vect) { // See simple_test.cpp and results. benchmark showed that std::vector is very slightly faster than C array and doesn't require more memory in our case. So, I am using it since it makes code simpler.
/*std::vector<unsigned long> my_vect;
return my_vect;*/
int i;
unsigned long num;
unsigned long num_rev;
unsigned long * p_prev=NULL;
unsigned long * p_prev_rev=NULL;
char * p_char=dnaStr;
int nb_k_m=l-k+1;
for (i=0; i<nb_k_m;i++) {
num=kMerToNumber(p_char,p_prev);
my_vect.push_back(num);
num_rev=kMerToNumberReverse(p_char,p_prev_rev);
my_vect.push_back(num_rev);
p_char++;
p_prev=&num;
p_prev_rev=&num_rev;
}
}
/*
* Utility class to process read DNA strings
*/
#ifndef READPROCESSOR_H_
#define READPROCESSOR_H_
#include <map>
#include <vector>
#include <math.h>
class ReadProcessor {
int k;
std::map<char, char> nucleo_rev_cpl;
unsigned long mask_kMer;
void init_nucleo_rev_cpl() {
nucleo_rev_cpl['A']='T';
nucleo_rev_cpl['T']='A';
nucleo_rev_cpl['C']='G';
nucleo_rev_cpl['G']='C';
nucleo_rev_cpl['N']='N';
}
void init_mask(int k){
int nb_bits=2*k;
mask_kMer=pow(2,nb_bits);
mask_kMer=mask_kMer-1;
}
public:
ReadProcessor(int k=3) {
// init_mask(k);
set_k(k);
init_nucleo_rev_cpl();
}
void set_k(int k) {
this->k=k;
init_mask(k);
}
unsigned long kMerToNumberReverse(char * k_m,unsigned long * p_prev);
unsigned long nucleoToNumberReverse(char s,int i);
unsigned long kMerToNumber(char * k_m,unsigned long * p_prev);
unsigned long nucleoToNumber(char s);
void getKMerNumbers(char * dnaStr,int l,std::vector<unsigned long>& my_vect);
};
#endif
......@@ -11,7 +11,7 @@
* This function processes the content of an rpos structure and returns the character string "ATCG..." corresponding to the read record.
* PE reads are concatenated.
*/
#include <math.h>
#include "FqConstants.h"
#include "rock_types.h"
#include "srp.h"
......@@ -19,24 +19,6 @@
// TODO Encapsulate that+ all those functions in a object (ReadProcessor or something like that...)
std::map<char, char> nucleo_rev_cpl;
void init_nucleo_rev_cpl () {
nucleo_rev_cpl['A']='T';
nucleo_rev_cpl['T']='A';
nucleo_rev_cpl['C']='G';
nucleo_rev_cpl['G']='C';
nucleo_rev_cpl['N']='N';
}
unsigned long mask_kMer;
void init_mask(int k) {
int nb_bits=2*k;
mask_kMer=pow(2,nb_bits);
mask_kMer=mask_kMer-1;
}
void getFqRecord(FqBaseBackend* fq_files_be[],const unsigned char& f_id,const long& offset,
......@@ -105,96 +87,8 @@ void getDNASeqstr(FqBaseBackend* fq_files_be[],
}
unsigned long nucleoToNumber(char s) {
unsigned long nbr;
switch(s)
{
case 'A':
nbr=0;
break;
case 'C':
nbr=1;
break;
case 'G':
nbr=2;
break;
case 'T':
nbr=3;
break;
case 'N':
nbr=0;
break;
default:
throw -1; // TODO Benchmark this inside try catch statement to see if try catch+exception really costs so long.
// throw an integer for the moment. An exception object may not be the most optimal choice in terms of performance.
}
return nbr;
}
unsigned long kMerToNumber(char * k_m,int k,unsigned long * p_prev) {
unsigned long nbr=0;
unsigned long c;
if (p_prev==NULL) { // first k_mer conversion
int i;
for (i=0;i<k;i++) {
c=nucleoToNumber(k_m[i]); // do not catch exception for the moment (not until I have checked it doesn't slow down execution). If nucleoToNumber returns -1, program will simply crash
nbr=nbr<<2;
nbr=nbr|c;
}
} else {
nbr=*p_prev;
c=nucleoToNumber(k_m[k-1]);
nbr=nbr<<2;
nbr=nbr&mask_kMer;
nbr=nbr|c;
}
return nbr;
}
unsigned long nucleoToNumberReverse(char s,int k) {
unsigned long nbr=0;
char cpl=nucleo_rev_cpl[s];
nbr=nucleoToNumber(cpl);
nbr=nbr<<2*(k-1);
return nbr;
}
unsigned long kMerToNumberReverse(char * k_m,int k,unsigned long * p_prev) {
unsigned long nbr=0;
unsigned long c;
if (p_prev==NULL) { // first k_mer conversion
int i;
for (i=k;i>0;i--) {
c=nucleoToNumberReverse(k_m[i-1],i);
// nbr=nbr>>2;
nbr=nbr|c;
}
} else {
nbr=*p_prev;
c=nucleoToNumberReverse(k_m[k-1],k);
nbr=nbr>>2;
nbr=nbr|c;
}
return nbr;
}
void getKMerNumbers(char * dnaStr,int l,int k,std::vector<unsigned long>& my_vect) { // See simple_test.cpp and results. benchmark showed that std::vector is very slightly faster than C array and doesn't require more memory in our case. So, I am using it since it makes code simpler.
/*std::vector<unsigned long> my_vect;
return my_vect;*/
int i;
unsigned long num;
unsigned long num_rev;
unsigned long * p_prev=NULL;
unsigned long * p_prev_rev=NULL;
char * p_char=dnaStr;
int nb_k_m=l-k+1;
for (i=0; i<nb_k_m;i++) {
num=kMerToNumber(p_char,k, p_prev);
my_vect.push_back(num);
num_rev=kMerToNumberReverse(p_char,k, p_prev_rev);
my_vect.push_back(num_rev);
p_char++;
p_prev=&num;
p_prev_rev=&num_rev;
}
}
......@@ -30,14 +30,5 @@ void getDNASeqstr(FqBaseBackend* [],
void init_DnaSeqStr(DnaSeqStr * dna_seq);
void getKMerNumbers(char * dnaStr,int l,int k,std::vector<unsigned long>& my_vect);
unsigned long kMerToNumberReverse(char * k_m,int k,unsigned long * p_prev);
unsigned long nucleoToNumberReverse(char s,int k);
unsigned long kMerToNumber(char * k_m,int k,unsigned long * p_prev);
unsigned long nucleoToNumber(char s);
void init_nucleo_rev_cpl ();
void init_mask(int k);
#endif /* READ_UTILS_H_ */
......@@ -13,6 +13,7 @@
#include "rock_types.h"
#include "FqMainBackend.h"
#include "srp.h"
#include "ReadProcessor.h"
using namespace std;
......@@ -129,70 +130,77 @@ void test_getReadPE() {
}
/*
* Test the basic methods of the ReadProcessor object.
*/
void testDNAToNumberSimple() {
unsigned long nbr;
init_nucleo_rev_cpl();
assert(sizeof(nbr)==8); // This may not be true on some platform; so better know it now.
assert(nucleoToNumber('A')==0); // TODO: use macro or inline functions for converting just a nucleotide.
assert(nucleoToNumber('C')==1);
assert(nucleoToNumber('G')==2);
assert(nucleoToNumber('T')==3);
assert(nucleoToNumber('N')==0);
ReadProcessor read_p;
assert(read_p.nucleoToNumber('A')==0); // TODO: use macro or inline functions for converting just a nucleotide.
assert(read_p.nucleoToNumber('C')==1);
assert(read_p.nucleoToNumber('G')==2);
assert(read_p.nucleoToNumber('T')==3);
assert(read_p.nucleoToNumber('N')==0);
unsigned long * p_prev=NULL;
assert(kMerToNumber((char *) "AAAA",4,p_prev)==0);
assert(kMerToNumber((char *) "AANAA",5,p_prev)==0);
assert(kMerToNumber((char *) "AANAATCCGATTGG",14,p_prev)==219386);
read_p.set_k(4);
assert(read_p.kMerToNumber((char *) "AAAA",p_prev)==0);
assert(read_p.kMerToNumberReverse((char *) "AAAA",p_prev)==255);
read_p.set_k(5);
assert(read_p.kMerToNumber((char *) "AANAA",p_prev)==0);
assert(read_p.kMerToNumberReverse((char *)"AANAA",p_prev)==975);
read_p.set_k(14);
assert(read_p.kMerToNumber((char *) "AANAATCCGATTGG",p_prev)==219386);
assert(read_p.kMerToNumberReverse((char *)"AANAATCCGATTGG",p_prev)==84779983);
unsigned long prev=219386;
p_prev=&prev;
init_mask(15);
assert(kMerToNumber((char *) "AANAATCCGATTGGA",15,p_prev)==877544);
assert(kMerToNumber((char *) "AANAATCCGATTGGN",15,p_prev)==877544);
assert(kMerToNumber((char *) "AANAATCCGATTGGC",15,p_prev)==877545);
assert(kMerToNumber((char *) "AANAATCCGATTGGG",15,p_prev)==877546);
assert(kMerToNumber((char *) "AANAATCCGATTGGT",15,p_prev)==877547);
int k=1;
assert(nucleoToNumberReverse('A',k)==3);
assert(nucleoToNumberReverse('C',k)==2);
assert(nucleoToNumberReverse('G',k)==1);
assert(nucleoToNumberReverse('T',k)==0);
assert(nucleoToNumberReverse('N',k)==0);
k=20;
assert(nucleoToNumberReverse('A',k)==824633720832);
assert(nucleoToNumberReverse('C',k)==549755813888);
assert(nucleoToNumberReverse('G',k)==274877906944);
assert(nucleoToNumberReverse('T',k)==0);
assert(nucleoToNumberReverse('N',k)==0);
p_prev=NULL;
assert(kMerToNumberReverse((char *) "AAAA",4,p_prev)==255);
assert(kMerToNumberReverse((char *)"AANAA",5,p_prev)==975);
assert(kMerToNumberReverse((char *)"AANAATCCGATTGG",14,p_prev)==84779983);
prev=84779983;
read_p.set_k(15);
assert(read_p.kMerToNumber((char *) "AANAATCCGATTGGA",p_prev)==877544);
assert(read_p.kMerToNumber((char *) "AANAATCCGATTGGN",p_prev)==877544);
assert(read_p.kMerToNumber((char *) "AANAATCCGATTGGC",p_prev)==877545);
assert(read_p.kMerToNumber((char *) "AANAATCCGATTGGG",p_prev)==877546);
assert(read_p.kMerToNumber((char *) "AANAATCCGATTGGT",p_prev)==877547);
read_p.set_k(1);
assert(read_p.nucleoToNumberReverse('A',1)==3);
assert(read_p.nucleoToNumberReverse('C',1)==2);
assert(read_p.nucleoToNumberReverse('G',1)==1);
assert(read_p.nucleoToNumberReverse('T',1)==0);
assert(read_p.nucleoToNumberReverse('N',1)==0);
read_p.set_k(20);
assert(read_p.nucleoToNumberReverse('A',20)==824633720832);
assert(read_p.nucleoToNumberReverse('C',20)==549755813888);
assert(read_p.nucleoToNumberReverse('G',20)==274877906944);
assert(read_p.nucleoToNumberReverse('T',20)==0);
assert(read_p.nucleoToNumberReverse('N',20)==0);
prev=84779983;
p_prev=&prev;
init_mask(14);
assert(kMerToNumberReverse((char *)"ANAATCCGATTGGA",14,p_prev)==222521587);
assert(kMerToNumberReverse((char *)"ANAATCCGATTGGN",14,p_prev)==21194995);
assert(kMerToNumberReverse((char *)"ANAATCCGATTGGT",14,p_prev)==21194995);
assert(kMerToNumberReverse((char *)"ANAATCCGATTGGC",14,p_prev)==155412723);
assert(kMerToNumberReverse((char *)"ANAATCCGATTGGG",14,p_prev)==88303859);
read_p.set_k(14);
assert(read_p.kMerToNumberReverse((char *)"ANAATCCGATTGGA",p_prev)==222521587);
assert(read_p.kMerToNumberReverse((char *)"ANAATCCGATTGGN",p_prev)==21194995);
assert(read_p.kMerToNumberReverse((char *)"ANAATCCGATTGGT",p_prev)==21194995);
assert(read_p.kMerToNumberReverse((char *)"ANAATCCGATTGGC",p_prev)==155412723);
assert(read_p.kMerToNumberReverse((char *)"ANAATCCGATTGGG",p_prev)==88303859);
}
/*
* Test the ReadProcessor method that will be called by the main program : the one who takes as input a dna string and returns
* a serie of numbers all corresponding to its k-mer composition + reverse complement.
*/
void testDNAToNumberMoreComplex() { // TODO in main, check that k<=32.
// from a dna string a found in fq files, return its k-mer decomposition; k-mers are numbers.
init_nucleo_rev_cpl();
char dnaStr[]="TTTTTAGGTGCTACCATAACACCAACTGTTTTCACCATAATTTTAAAATCAAGCATTAGAGACGCTTCTCTAATGTATTGCAAATCTAGTTCTACCATTTGATGAAAATCTAATTCATTTCTTCCACTAACCTGCCATAATCCAGTACAACCTGGTATAACGGTCAAACGCTTTTTATCATAGGAACTGTATTCTCCTACCTCACGTGGCAAAGGAGGTCTTGGACCAACAATTGCCATGTCTCCTTTAACCACATTCCAAAGCTGTGGTA";
int k=30;
init_mask(k);
ReadProcessor read_p(k);
std::vector<unsigned long> nbrKmerDecompo;
......@@ -200,10 +208,10 @@ void testDNAToNumberMoreComplex() { // TODO in main, check that k<=32.
int nb_expected_k_mers=strlen(dnaStr)+1;
nb_expected_k_mers-=k;
nbrKmerDecompo.reserve(nb_expected_k_mers);
unsigned long expected=kMerToNumber((char *) "GCTACCATAACACCAACTGTTTTCACCATA",30,NULL);
unsigned long expected_rev=kMerToNumber((char *) "TATGGTGAAAACAGTTGGTGTTATGGTAGC",30,NULL);
unsigned long expected=read_p.kMerToNumber((char *) "GCTACCATAACACCAACTGTTTTCACCATA",NULL);
unsigned long expected_rev=read_p.kMerToNumber((char *) "TATGGTGAAAACAGTTGGTGTTATGGTAGC",NULL);
getKMerNumbers((char *) dnaStr,strlen(dnaStr),k,nbrKmerDecompo);
read_p.getKMerNumbers((char *) dnaStr,strlen(dnaStr),nbrKmerDecompo);
assert(nbrKmerDecompo.size()==nb_expected_k_mers*2); // for each k_mer, we also expect to have a number for its reverse complement.
int cnt_k_mer=0;
unsigned long num,num2;
......@@ -217,24 +225,24 @@ void testDNAToNumberMoreComplex() { // TODO in main, check that k<=32.
}
if (cnt_k_mer==3) {
// 2nd k-mer is : TTTTAGGTGCTACCATAACACCAACTGTTT
num2=kMerToNumber((char *) "TTTTAGGTGCTACCATAACACCAACTGTTT",30,NULL);
num2=read_p.kMerToNumber((char *) "TTTTAGGTGCTACCATAACACCAACTGTTT",NULL);
assert(*it==num2);
unsigned long num3=kMerToNumber((char *) "TTTTAGGTGCTACCATAACACCAACTGTTT",30,&num);
unsigned long num3=read_p.kMerToNumber((char *) "TTTTAGGTGCTACCATAACACCAACTGTTT",&num);
assert(num3==num2);
}
if (cnt_k_mer==5) {
num=kMerToNumber((char *) "TTTAGGTGCTACCATAACACCAACTGTTTT",30,NULL);
num=read_p.kMerToNumber((char *) "TTTAGGTGCTACCATAACACCAACTGTTTT",NULL);
assert(num==*it);
}
if (cnt_k_mer==7) {
num=kMerToNumber((char *) "TTAGGTGCTACCATAACACCAACTGTTTTC",30,NULL);
num=read_p.kMerToNumber((char *) "TTAGGTGCTACCATAACACCAACTGTTTTC",NULL);
assert(num==*it);
}
// k-mer number 10 is GCTACCATAACACCAACTGTTTTCACCATA
// its reverse complement is : TATGGTGAAAACAGTTGGTGTTATGGTAGC
if (cnt_k_mer==19) {
num=kMerToNumber((char *) "GCTACCATAACACCAACTGTTTTCACCATA",30,NULL);
num=read_p.kMerToNumber((char *) "GCTACCATAACACCAACTGTTTTCACCATA",NULL);
assert(*it==704021989794238796);
}
if (cnt_k_mer==20) {
......@@ -253,5 +261,6 @@ int main(int argc, char **argv) {
testDNAToNumberSimple();
cout<<"test converting an entire read to a series of numbers"<<endl;
testDNAToNumberMoreComplex();
cout<<"done"<<endl;
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment