-
Veronique Legrand authored
version 2.0_beta where PE reads are processed separately and selected or removed from the cms if at least one of the PE has a median coverage below threshold.
Veronique Legrand authoredversion 2.0_beta where PE reads are processed separately and selected or removed from the cms if at least one of the PE has a median coverage below threshold.
FqMainBackend.cpp 8.03 KiB
/*
Copyright (C) 2016-2021 Institut Pasteur
This program is part of the ROCK software.
This program is free software: you can redistribute it and/or modify it under the terms of the GNU
General Public License as published by the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even
the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
License for more details.
You should have received a copy of the GNU General Public License along with this program. If not, see
<http://www.gnu.org/licenses/>.
Contact:
Alexis Criscuolo alexis.criscuolo@pasteur.fr
Veronique Legrand veronique.legrand@pasteur.fr
*/
#include <sys/stat.h>
#include <stdio.h>
//#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <errno.h>
#include <err.h>
#include <limits.h>
#include <assert.h>
#include <stdlib.h>
#include <stdexcept>
#include <sstream>
#include <string>
#include <iostream>
#define _FILE_OFFSET_BITS 64 // for portability reasons on 32bits systems.
//#define DEBUG
#ifdef DEBUG
#include <string.h>
#include <iostream>
using namespace std;
#endif
#include "FqMainBackend.h"
#include "srp.h"
FqMainBackend::FqMainBackend(srp * io_sr):FqBaseBackend() {
p_auxFqProcessor=NULL;
p_scoreReadStruct=io_sr;
cnt_k_mers=0;
}
void FqMainBackend::processFile(char * filename,unsigned char f_id) {
FILE * fp;
int nread;
long cur_offset;
mode_t mode = S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH;
this->i_filename=filename;
this->f_id=f_id;
char* buf;
size_t buffer_size;
(test_mode)?buffer_size=test_bufsize:buffer_size=FqBaseBackend::bufsize;
int f_single=open(filename,O_RDONLY,mode);
if (f_single==-1) {
err(errno,"cannot open file: %s.",filename);
}
buf=(char *) malloc(buffer_size*sizeof(char));
if (buf==NULL) {
err(errno,"cannot allocate memory: %lu bytes.",buffer_size);
}
/* for each read, we want : offset, filenb, total quality score. */
while ((nread=read(f_single,buf,buffer_size))!=0) {
cur_offset = lseek(f_single, 0, SEEK_CUR);
T_buf_info buf_info=init_buf_info(nread,buf);
processBuf(buf_info,f_id,cur_offset);
}
close(f_single);
free(buf);
}
void FqMainBackend::onEndFastqRecord(T_fq_rec_info& rec_info,const T_buf_info& bufinfo) {
rpos rp=init_rpos(f_id,rec_info.rstart_offset);
rinfo pe2info;
int nb_k_mer_PE2;
if (p_auxFqProcessor!=NULL) {
int eof=p_auxFqProcessor->getNextRead(&pe2info);
assert(!eof); // There should be the same number of reads in both PE files.
rec_info.nb_k_mers_in_error_in_PE2=pe2info.nb_k_mers_in_error;
rec_info.nb_nucleotides_in_read_PE2=pe2info.nb_nucleotides_in_read;
rec_info.st+=pe2info.score;
unsigned long j=rec_info.rstart_offset/UINT_MAX;
update_rpos(pe2info.f_id,pe2info.rstart_offset,j,&rp);
//debug_processBuf(on_record_end_pe,bufinfo,rec_info.rstart_offset);
}
i_dim& ref_i_dim=(*p_scoreReadStruct)[rec_info.st/K_SCORE_NORM_FACTOR];
k_dim& ref_k_dim=ref_i_dim[rec_info.rstart_offset/UINT_MAX];
//debug_processBuf(on_record_end,bufinfo,rec_info.rstart_offset);
int nb_k_mer=rec_info.nb_nucleotides_in_read+1-qual_thres.k;
rec_info.nb_nucleotides_in_read_PE2>0?nb_k_mer_PE2=rec_info.nb_nucleotides_in_read_PE2+1-qual_thres.k:nb_k_mer_PE2=0;
if (p_auxFqProcessor==NULL) { //case of single reads
nb_k_mer+=nb_k_mer_PE2;
//(nb_k_mer-rec_info.nb_k_mers_in_error-rec_info.nb_k_mers_in_error_in_PE2>=qual_thres.min_correct_k_mers_in_read)?ref_k_dim.push_back(rp):writeToUndefFile(bufinfo);
if (nb_k_mer-rec_info.nb_k_mers_in_error-rec_info.nb_k_mers_in_error_in_PE2>=qual_thres.min_correct_k_mers_in_read) ref_k_dim.push_back(rp);
} else { // PE reads are processed separately by default
// ((nb_k_mer_PE2-rec_info.nb_k_mers_in_error_in_PE2>=1) && (nb_k_mer-rec_info.nb_k_mers_in_error>=1))?ref_k_dim.push_back(rp):writeToUndefFile(bufinfo);
// ((nb_k_mer_PE2-rec_info.nb_k_mers_in_error_in_PE2>=qual_thres.min_correct_k_mers_in_read) && (nb_k_mer-rec_info.nb_k_mers_in_error>=qual_thres.min_correct_k_mers_in_read))?ref_k_dim.push_back(rp):writeToUndefFile(bufinfo);
//if ((nb_k_mer_PE2-rec_info.nb_k_mers_in_error_in_PE2>=qual_thres.min_correct_k_mers_in_read) && (nb_k_mer-rec_info.nb_k_mers_in_error>=qual_thres.min_correct_k_mers_in_read)) ref_k_dim.push_back(rp);
cout<<"nb_k_mer_PE2="<<nb_k_mer_PE2<<" nb_k_mer="<<nb_k_mer<<endl;
cout<<" nb_k_mers_in_error_in_PE2="<<rec_info.nb_k_mers_in_error_in_PE2<<" nb_k_mers_in_error="<<rec_info.nb_k_mers_in_error<<endl;
if ((nb_k_mer_PE2-rec_info.nb_k_mers_in_error_in_PE2+nb_k_mer-rec_info.nb_k_mers_in_error>=qual_thres.min_correct_k_mers_in_read)) ref_k_dim.push_back(rp);
}
// empty buffer keeping current record
cur_fq_record[0]='\0';
if (p_auxFqProcessor!=NULL) p_auxFqProcessor->resetCurFqRecord();
cnt_k_mers+=nb_k_mer;
}
void FqMainBackend::processBuf(T_buf_info& buf_info,unsigned char f_id,unsigned long cur_offset) {
static T_fq_rec_info fq_rec_info;
static int num_l_in_rec=5; /* counter to know on which line inside the fastq record we are */
//static int qual_score=0;
static int n;
int start_rec_in_buf=0;
// debug_processBuf(init_debug,buf_info,fq_rec_info.rstart_offset);
while (buf_info.cnt<=buf_info.real_bufsize-1) {
switch (*buf_info.pchar){
case k_read_id_start:
if (num_l_in_rec==4) goto inc_score;
else {
buf_info.p_start_cur_rec=buf_info.pchar;
fq_rec_info.rstart_offset=cur_offset-buf_info.real_bufsize+buf_info.cnt;
start_rec_in_buf=buf_info.cnt;
//debug_processBuf(on_record_new,buf_info,fq_rec_info.rstart_offset);
num_l_in_rec=1;
fq_rec_info.nb_nucleotides_in_read=0;
fq_rec_info.nb_nucleotides_in_read_PE2=0;
fq_rec_info.nb_k_mers_in_error=0;
fq_rec_info.nb_k_mers_in_error_in_PE2=0;
fq_rec_info.st=0;
fq_rec_info.idx_nucl_in_read=0;
n=0; // counter for k-mers in error
}
break;
case '\n':
//debug_processBuf(on_line_end,buf_info,fq_rec_info.rstart_offset);
num_l_in_rec+=1;
if (num_l_in_rec==5) onEndFastqRecord(fq_rec_info,buf_info);
break;
default:
//debug_processBuf(on_store_read_id,buf_info,fq_rec_info.rstart_offset);
(num_l_in_rec==2)?fq_rec_info.nb_nucleotides_in_read++:fq_rec_info.nb_nucleotides_in_read;
inc_score:
{ if (num_l_in_rec==4) onIncScore(fq_rec_info,buf_info,n);
}
break;
}
buf_info.pchar++;
buf_info.cnt++;
}
keepCurFastqRecord(buf_info.buf,start_rec_in_buf,buf_info.real_bufsize);
//debug_processBuf(on_buf_end,buf_info,fq_rec_info.rstart_offset);
}
/*
* 2 cases when writing records to undef file.
* 1- record was all inside current buffer => write it as is.
* 2- beginning of record was at the end of one buffer and the end of the record was processed in a 2nd buffer; so, record was stored in memory. In that case, write cur_fq_record to undef file.
*/
/*
void FqMainBackend::writeToUndefFile(const T_buf_info& buf_info) {
FqBaseBackend::writeToUndefFile(buf_info,1); // if this is slow due to inheritance resolution, use template method or other solution. This doesn't semm to slow the perfs according to profiling. So I leave it as is; it's clearer than templates.
if (p_auxFqProcessor!=NULL) {
p_auxFqProcessor->writeToUndefFile();
}
}
*/