Skip to content
Snippets Groups Projects
Select Git revision
  • a0014ac62aeadefe8257ce8bc1e43e47f6ceab0e
  • master default protected
  • dev_nanopore
  • 2.0_beta
  • V_3.0
  • v_2.1
  • V_2.0
  • V_1.9.6
  • v_1.9.6
  • v_1.9.5
  • v_1.9.4
  • V_1.9.3
  • V_1.9.2
  • V_1.9.1
  • V_1.9
  • V_1.8
  • V_1.6
  • 1.5.1
  • V_1.5
  • V1.4
  • V1.3
21 results

read_utils.cpp

Blame
  • read_utils.cpp 5.73 KiB
    /*
      Utility methods for processing DNA reads : decomposition in overlapping k-mers, converting a k-mer into a number,
             reverse complement and so on...
    
      Copyright (C) 2016-2021  Institut Pasteur
     
      This program is part of the ROCK software.
      
      This program  is free software:  you can  redistribute it  and/or modify it  under the terms  of the GNU
      General Public License as published by the Free Software Foundation, either version 3 of the License, or
      (at your option) any later version.
      
      This program is distributed in the hope that it will be useful,  but WITHOUT ANY WARRANTY;  without even
      the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
      License for more details.
      
      You should have received a copy of the  GNU General Public License along with this program.  If not, see
      <http://www.gnu.org/licenses/>.
      
      Contact:
       Alexis Criscuolo                                                            alexis.criscuolo@pasteur.fr
       Veronique Legrand                                                           veronique.legrand@pasteur.fr
     
     */
    
    
    #include "FqConstants.h"
    #include "srp.h"
    #include "DnaSeqStr.h"
    #include "read_utils.h"
    #include "rock_commons.h"
    //#define DEBUG
    
    #ifdef DEBUG
    #include <cassert>
    #endif
    
    void UpdateReadForScoreThreshold(DnaSeqStr&  dna_seq,const int & nucl_qual_score_thres,const int& idx_start_qual_score) {
        char * read_data_start=dna_seq.fq_record_buf+dna_seq.start_idx;
        char * qual_score_start=dna_seq.fq_record_buf+idx_start_qual_score;
        int idx=0;
        while (idx<dna_seq.length) {
            if (*(qual_score_start+idx)<=nucl_qual_score_thres) *(read_data_start+idx)=k_nucl_in_error;
            idx++;
        }
    }
    
    
    void getFqRecord(FqBaseBackend* fq_files_be[],const unsigned char& f_id,const unsigned long& offset,
                     char * fq_record) {
        FqBaseBackend * be=fq_files_be[f_id-1];
        try {
            be->getRead(offset,fq_record);
        } catch (int e) {
    #ifdef DEBUG
            cout<<"couldn't retrieve read at offset: "<<offset<<endl;
    #endif
            throw e;
        }
    }
    
    /*
    void init_DnaSeqStr(DnaSeqStr * dna_seq) {
        dna_seq->fq_record_buf[0]='\0';
        dna_seq->length=0;
        dna_seq->start_idx=0;
    }*/
    
    void processFqRecord(DnaSeqStr& dna_seq,const int & nucl_qual_score_thres) {
        char * pchar=dna_seq.fq_record_buf;
        int cnt=0;
        int nb_l=0;
        int idx_start_qual_score;
    #ifdef DEBUG
        //assert(*pchar==k_read_id_start);
        if (*pchar!=k_read_id_start) throw -1;
    #endif
        while (cnt<=MAX_FQ_RECORD_LENGTH-1) {
            if (*pchar=='\n') {
                    nb_l++;
                    if (nb_l==1) dna_seq.start_idx=cnt+1;
                    if (nb_l==2) {
                        dna_seq.length=cnt-dna_seq.start_idx;
                        // break;
                    }
                    if (nb_l==3) {
                        idx_start_qual_score=cnt+1;
                        break;
                    }
            }
            pchar++;
            cnt++;
        }
        // here, change read if there is a threshold for nucleotide quality score.
        if (nucl_qual_score_thres-k_phred_32) UpdateReadForScoreThreshold(dna_seq,nucl_qual_score_thres,idx_start_qual_score);
    
    #ifdef DEBUG
        assert(nb_l>=2);
    #endif
    }
    
    
    void getDNASeqstr(FqBaseBackend* fq_files_be [],
                    const rpos& sr,
                    unsigned long j,
                    DnaSeqStr* dna_seqs,const int& nucl_qual_score_thres)
    {
        unsigned char f_id1;
        unsigned char f_id2;
        //DnaSeqStr * p_dna_seqs=dna_seqs;
    
        unsigned char fid_stored=sr.fileid;
    
        f_id1=fid_stored >>4;
        f_id2=fid_stored &mask_fid;
    
        unsigned long offset1=j*UINT_MAX+sr.read_a1;
    #ifdef DEBUG
        cout<<"processing record at j="<<j<<"and read_a1="<<sr.read_a1<<endl;
        cout<<"getting record at offset: "<<offset1<<endl;
    #endif
        try {
            getFqRecord(fq_files_be,f_id1,offset1,dna_seqs[0].fq_record_buf);
            processFqRecord(dna_seqs[0],nucl_qual_score_thres);
        } catch (int e) {
    #ifdef DEBUG
            cout<<"j="<<j<<" sr content: a1="<<sr.read_a1<<" a2= "<<sr.read_a2<<endl;
            cout<<"pb reading/seeking at offset1="<<offset1<<endl;
            cout<<"got: "<<endl;
            cout<<dna_seqs[0].fq_record_buf<<endl;
    #endif
            exit(e); 
        }
        if (f_id2!=0) { // case of PE reads.
            //p_dna_seqs+=1;
            unsigned long offset2=sr.read_a2+offset1;
            try {
                getFqRecord(fq_files_be,f_id2,offset2,dna_seqs[1].fq_record_buf);
            } catch (int e) {
    #ifdef DEBUG
                cout<<"j="<<j<<" sr content: a1="<<sr.read_a1<<" a2= "<<sr.read_a2<<endl;
                cout<<"pb reading/seeking at offset2="<<offset2<<endl;
                cout<<"got: "<<endl;
                cout<<dna_seqs[1].fq_record_buf<<endl;
    #endif
                exit(e);
            }
            processFqRecord(dna_seqs[1],nucl_qual_score_thres);
        }
    
    }
    
    void decomposeReadInKMerNums(ReadProcessor& read_p, T_read_numericValues& nbrKmerDecompo,int k,DnaSeqStr a_seqs[2], int process_PE_separately) {
        int nb_expected_k_mers,nb_expected_k_mers_PE1,nb_expected_k_mers_PE2=0;
        nb_expected_k_mers_PE1=a_seqs[0].length+1-k;
        nbrKmerDecompo.idx_start_PE2=0;
        if (a_seqs[1].length) nb_expected_k_mers_PE2=a_seqs[1].length+1-k;
        nb_expected_k_mers=nb_expected_k_mers_PE1+nb_expected_k_mers_PE2;
        nbrKmerDecompo.single_or_PE_val.reserve(nb_expected_k_mers);
        //nbrKmerDecompo.nb_kmer_PE1=nb_expected_k_mers_PE1;
    
        char * start_dna_str=a_seqs[0].fq_record_buf+a_seqs[0].start_idx;
        read_p.getKMerNumbers(start_dna_str,a_seqs[0].length,nbrKmerDecompo.single_or_PE_val);
    
        if (a_seqs[1].length) { // case of PE reads
        	if (process_PE_separately) nbrKmerDecompo.idx_start_PE2=nb_expected_k_mers_PE1;
            start_dna_str=a_seqs[1].fq_record_buf+a_seqs[1].start_idx;
            read_p.getKMerNumbers(start_dna_str,a_seqs[1].length,nbrKmerDecompo.single_or_PE_val);
        }
    }