Skip to content
Snippets Groups Projects
Select Git revision
  • b47d896d889ffa3ee22f1745b553bbdc917ec9c5
  • master default protected
  • jkende-master-patch-56415
3 results

multiple_fasta_reader.py

Blame
  • ReadProcessor.cpp 3.21 KiB
    /*
     * ReadProcessor.cpp
     *
     *  Created on: Feb 5, 2016
     *      Author: vlegrand
     */
    
    #include <stdlib.h>
    #include "ReadProcessor.h"
    
    /*
    inline void ReadProcessor::init_nucleo_rev_cpl () {
        nucleo_rev_cpl['A']='T';
        nucleo_rev_cpl['T']='A';
        nucleo_rev_cpl['C']='G';
        nucleo_rev_cpl['G']='C';
        nucleo_rev_cpl['N']='N';
    }*/
    
    
    /*
    inline void ReadProcessor::init_mask(int k) {
        int nb_bits=2*k;
        mask_kMer=pow(2,nb_bits);
        mask_kMer=mask_kMer-1;
    }*/
    
    // Converts one nucleotide character into its 2-bit numeric code.
    //   'A' -> 0, 'C' -> 1, 'G' -> 2, 'T' -> 3, and 'N' -> 0 (same code as 'A').
    // Any other character (lower-case letters included) makes the function
    // throw the integer -1.
    inline unsigned long ReadProcessor::nucleoToNumber(char s) {
        switch(s)
        {
            case 'A':
            case 'N': // 'N' shares the code of 'A'
                return 0;
            case 'C':
                return 1;
            case 'G':
                return 2;
            case 'T':
                return 3;
            default:
                break;
        }
        // TODO Benchmark this inside a try/catch statement to see whether try/catch + exception is really that costly.
        // A plain integer is thrown for the moment; an exception object may not be the best choice in terms of performance.
        throw -1;
    }
    
    // Encodes the k-mer starting at k_m as a number, 2 bits per nucleotide,
    // with k_m[0] ending up in the most significant position.
    //   k_m    : points to the first of the k characters of the k-mer.
    //   p_prev : NULL to encode all k characters from scratch; otherwise points
    //            to the number already computed for the k-mer starting one
    //            character earlier, in which case only k_m[k-1] is converted.
    // Exceptions thrown by nucleoToNumber (an int) are not caught here; they are
    // left to propagate until their cost has been benchmarked.
    inline unsigned long ReadProcessor::kMerToNumber(char * k_m,unsigned long * p_prev) {
        if (p_prev!=NULL) {
            // Incremental update: make room for one nucleotide, drop the bits
            // that fall outside mask_kMer, then append the newest nucleotide.
            unsigned long shifted=*p_prev;
            const unsigned long last=nucleoToNumber(k_m[k-1]);
            shifted<<=2;
            shifted&=mask_kMer;
            return shifted|last;
        }
        // First k-mer: accumulate every nucleotide from left to right.
        unsigned long value=0;
        for (int pos=0;pos<k;++pos) {
            value=(value<<2)|nucleoToNumber(k_m[pos]);
        }
        return value;
    }
    
    // Looks s up in nucleo_rev_cpl (presumably the complement table, filled in
    // elsewhere -- confirm against the header), converts the result with
    // nucleoToNumber and moves it to the 2-bit slot of 1-based position i,
    // i.e. shifts it left by 2*(i-1) bits.
    inline unsigned long ReadProcessor::nucleoToNumberReverse(char s,int i) {
        const char mapped=nucleo_rev_cpl[s];
        const int shift=2*(i-1);
        return nucleoToNumber(mapped)<<shift;
    }
    
    // Encodes the k-mer starting at k_m through nucleoToNumberReverse, so that
    // k_m[0] lands in the least significant 2 bits and k_m[k-1] in the most
    // significant ones.
    //   k_m    : points to the first of the k characters of the k-mer.
    //   p_prev : NULL to encode all k characters from scratch; otherwise points
    //            to the reverse number of the k-mer starting one character
    //            earlier, in which case only k_m[k-1] is converted.
    inline unsigned long ReadProcessor::kMerToNumberReverse(char * k_m,unsigned long * p_prev) {
        if (p_prev!=NULL) {
            // Incremental update: discard the lowest nucleotide of the previous
            // value and put the newest one into the top slot (position k).
            const unsigned long older=*p_prev;
            const unsigned long top=nucleoToNumberReverse(k_m[k-1],k);
            return (older>>2)|top;
        }
        // First k-mer: each nucleotide is already shifted to its final slot by
        // nucleoToNumberReverse, so the contributions only need to be or-ed.
        unsigned long value=0;
        for (int pos=k;pos>0;--pos) {
            value|=nucleoToNumberReverse(k_m[pos-1],pos);
        }
        return value;
    }
    
    // Appends to my_vect the numeric encodings of every k-mer of dnaStr.
    //   dnaStr  : the nucleotide characters to process.
    //   l       : number of characters of dnaStr to consider; l-k+1 k-mers are
    //             processed (none when that count is not positive).
    //   my_vect : output; for each k-mer, in order of position, the value from
    //             kMerToNumber is pushed, immediately followed by the value
    //             from kMerToNumberReverse.
    // The first k-mer is encoded from scratch; each later one is derived from
    // the values obtained for the previous position.
    // Author's note: see simple_test.cpp and its results. The benchmark showed
    // std::vector to be very slightly faster than a C array without needing
    // more memory in this case, so it is used because it keeps the code simpler.
    void ReadProcessor::getKMerNumbers(char * dnaStr,int l,std::vector<unsigned long>& my_vect) {
        const int kmer_count=l-k+1;
        unsigned long fwd=0;
        unsigned long rev=0;
        for (int pos=0;pos<kmer_count;++pos) {
            char * window=dnaStr+pos;
            // No previous value exists for the very first k-mer.
            unsigned long * prev_fwd=(pos==0)?NULL:&fwd;
            unsigned long * prev_rev=(pos==0)?NULL:&rev;
            fwd=kMerToNumber(window,prev_fwd);
            my_vect.push_back(fwd);
            rev=kMerToNumberReverse(window,prev_rev);
            my_vect.push_back(rev);
        }
    }