Skip to content
Snippets Groups Projects
FqMainBackend.cpp 8.03 KiB
/*

  Copyright (C) 2016-2021  Institut Pasteur
 
  This program is part of the ROCK software.
  
  This program  is free software:  you can  redistribute it  and/or modify it  under the terms  of the GNU
  General Public License as published by the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.
  
  This program is distributed in the hope that it will be useful,  but WITHOUT ANY WARRANTY;  without even
  the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
  License for more details.
  
  You should have received a copy of the  GNU General Public License along with this program.  If not, see
  <http://www.gnu.org/licenses/>.
  
  Contact:
   Alexis Criscuolo                                                            alexis.criscuolo@pasteur.fr
   Veronique Legrand                                                           veronique.legrand@pasteur.fr
 
 */

#include <sys/stat.h>

#include <stdio.h>
//#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <errno.h>
#include <err.h>
#include <limits.h>
#include <assert.h>
#include <stdlib.h>
#include <stdexcept>
#include <sstream>
#include <string>
#include <iostream>

#define _FILE_OFFSET_BITS 64 // for portability reasons on 32bits systems.

//#define DEBUG

#ifdef DEBUG
#include <string.h>
#include <iostream>
using namespace std;
#endif

#include "FqMainBackend.h"
#include "srp.h"


FqMainBackend::FqMainBackend(srp * io_sr):FqBaseBackend() {
    p_auxFqProcessor=NULL;
    p_scoreReadStruct=io_sr;
    cnt_k_mers=0;
}



void FqMainBackend::processFile(char * filename,unsigned char f_id) {
    FILE * fp;
    int nread;
    long cur_offset;
    mode_t mode = S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH;
    this->i_filename=filename;
    this->f_id=f_id;
    char* buf;
    
    size_t buffer_size;
    (test_mode)?buffer_size=test_bufsize:buffer_size=FqBaseBackend::bufsize;

    int f_single=open(filename,O_RDONLY,mode);
    if (f_single==-1) {
        err(errno,"cannot open file: %s.",filename);
    }

    buf=(char *) malloc(buffer_size*sizeof(char));
    if (buf==NULL) {
        err(errno,"cannot allocate memory: %lu bytes.",buffer_size);
    }
    /* for each read, we want : offset, filenb, total quality score. */
    while ((nread=read(f_single,buf,buffer_size))!=0) {
        cur_offset = lseek(f_single, 0, SEEK_CUR);
        T_buf_info buf_info=init_buf_info(nread,buf);
        processBuf(buf_info,f_id,cur_offset);
    }
    close(f_single);
    free(buf);
}



void FqMainBackend::onEndFastqRecord(T_fq_rec_info& rec_info,const T_buf_info& bufinfo) {
    rpos rp=init_rpos(f_id,rec_info.rstart_offset);
    rinfo pe2info;
    int nb_k_mer_PE2;
    if (p_auxFqProcessor!=NULL) {
         int eof=p_auxFqProcessor->getNextRead(&pe2info);
         assert(!eof); // There should be the same number of reads in both PE files.
         rec_info.nb_k_mers_in_error_in_PE2=pe2info.nb_k_mers_in_error;
         rec_info.nb_nucleotides_in_read_PE2=pe2info.nb_nucleotides_in_read;
         rec_info.st+=pe2info.score;
         unsigned long j=rec_info.rstart_offset/UINT_MAX;
         update_rpos(pe2info.f_id,pe2info.rstart_offset,j,&rp);
         //debug_processBuf(on_record_end_pe,bufinfo,rec_info.rstart_offset);
     }
     i_dim& ref_i_dim=(*p_scoreReadStruct)[rec_info.st/K_SCORE_NORM_FACTOR];
     k_dim& ref_k_dim=ref_i_dim[rec_info.rstart_offset/UINT_MAX];
     //debug_processBuf(on_record_end,bufinfo,rec_info.rstart_offset);
     int nb_k_mer=rec_info.nb_nucleotides_in_read+1-qual_thres.k;
     rec_info.nb_nucleotides_in_read_PE2>0?nb_k_mer_PE2=rec_info.nb_nucleotides_in_read_PE2+1-qual_thres.k:nb_k_mer_PE2=0;
     if (p_auxFqProcessor==NULL) { //case of single reads
         nb_k_mer+=nb_k_mer_PE2;
         //(nb_k_mer-rec_info.nb_k_mers_in_error-rec_info.nb_k_mers_in_error_in_PE2>=qual_thres.min_correct_k_mers_in_read)?ref_k_dim.push_back(rp):writeToUndefFile(bufinfo);
         if (nb_k_mer-rec_info.nb_k_mers_in_error-rec_info.nb_k_mers_in_error_in_PE2>=qual_thres.min_correct_k_mers_in_read) ref_k_dim.push_back(rp);
     } else { // PE reads are processed separately by default
         // ((nb_k_mer_PE2-rec_info.nb_k_mers_in_error_in_PE2>=1) && (nb_k_mer-rec_info.nb_k_mers_in_error>=1))?ref_k_dim.push_back(rp):writeToUndefFile(bufinfo);
         // ((nb_k_mer_PE2-rec_info.nb_k_mers_in_error_in_PE2>=qual_thres.min_correct_k_mers_in_read) && (nb_k_mer-rec_info.nb_k_mers_in_error>=qual_thres.min_correct_k_mers_in_read))?ref_k_dim.push_back(rp):writeToUndefFile(bufinfo);
    	 //if ((nb_k_mer_PE2-rec_info.nb_k_mers_in_error_in_PE2>=qual_thres.min_correct_k_mers_in_read) && (nb_k_mer-rec_info.nb_k_mers_in_error>=qual_thres.min_correct_k_mers_in_read)) ref_k_dim.push_back(rp);
    	 cout<<"nb_k_mer_PE2="<<nb_k_mer_PE2<<" nb_k_mer="<<nb_k_mer<<endl;
    	 cout<<" nb_k_mers_in_error_in_PE2="<<rec_info.nb_k_mers_in_error_in_PE2<<" nb_k_mers_in_error="<<rec_info.nb_k_mers_in_error<<endl;
    	 if ((nb_k_mer_PE2-rec_info.nb_k_mers_in_error_in_PE2+nb_k_mer-rec_info.nb_k_mers_in_error>=qual_thres.min_correct_k_mers_in_read)) ref_k_dim.push_back(rp);
     }
     // empty buffer keeping current record
     cur_fq_record[0]='\0';
     if (p_auxFqProcessor!=NULL) p_auxFqProcessor->resetCurFqRecord();
     cnt_k_mers+=nb_k_mer;
}




void FqMainBackend::processBuf(T_buf_info& buf_info,unsigned char f_id,unsigned long cur_offset) {
    static T_fq_rec_info fq_rec_info;
    static int num_l_in_rec=5; /* counter to know on which line inside the fastq record we are */
    //static int qual_score=0;
    static int n;
    int start_rec_in_buf=0;

    // debug_processBuf(init_debug,buf_info,fq_rec_info.rstart_offset);
    while (buf_info.cnt<=buf_info.real_bufsize-1) {
        switch (*buf_info.pchar){
            case k_read_id_start:
                if (num_l_in_rec==4) goto inc_score;
                else {
                    buf_info.p_start_cur_rec=buf_info.pchar;
                    fq_rec_info.rstart_offset=cur_offset-buf_info.real_bufsize+buf_info.cnt;
                    start_rec_in_buf=buf_info.cnt;
                    //debug_processBuf(on_record_new,buf_info,fq_rec_info.rstart_offset);
                    num_l_in_rec=1;
                    fq_rec_info.nb_nucleotides_in_read=0;
                    fq_rec_info.nb_nucleotides_in_read_PE2=0;
                    fq_rec_info.nb_k_mers_in_error=0;
                    fq_rec_info.nb_k_mers_in_error_in_PE2=0;
                    fq_rec_info.st=0;
                    fq_rec_info.idx_nucl_in_read=0;
                    n=0; // counter for k-mers in error
                }
                break;
            case '\n':
                //debug_processBuf(on_line_end,buf_info,fq_rec_info.rstart_offset);
                num_l_in_rec+=1;
                if (num_l_in_rec==5) onEndFastqRecord(fq_rec_info,buf_info);
                break;
            default:
            //debug_processBuf(on_store_read_id,buf_info,fq_rec_info.rstart_offset);
            (num_l_in_rec==2)?fq_rec_info.nb_nucleotides_in_read++:fq_rec_info.nb_nucleotides_in_read;
            inc_score:
                { if (num_l_in_rec==4) onIncScore(fq_rec_info,buf_info,n);
                }
                break;
        }
        buf_info.pchar++;
        buf_info.cnt++;
    }
    keepCurFastqRecord(buf_info.buf,start_rec_in_buf,buf_info.real_bufsize);
    //debug_processBuf(on_buf_end,buf_info,fq_rec_info.rstart_offset);
}


/*
 * 2 cases when writing records to undef file.
 * 1- record was all inside current buffer => write it as is.
 * 2- beginning of record was at the end of one buffer and the end of the record was processed in a 2nd buffer; so, record was stored in memory. In that case, write cur_fq_record to undef file.
 */
/*
void FqMainBackend::writeToUndefFile(const T_buf_info& buf_info) {
    FqBaseBackend::writeToUndefFile(buf_info,1); // if this is slow due to inheritance resolution, use template method or other solution. This doesn't semm to slow the perfs according to profiling. So I leave it as is; it's clearer than templates.
    if (p_auxFqProcessor!=NULL) {
        p_auxFqProcessor->writeToUndefFile();
    }
}
*/