Select Git revision
read_utils.cpp
Véronique LEGRAND authored
read_utils.cpp 5.73 KiB
/*
Utility methods for processing DNA reads : decomposition in overlapping k-mers, converting a k-mer into a number,
reverse complement and so on...
Copyright (C) 2016-2021 Institut Pasteur
This program is part of the ROCK software.
This program is free software: you can redistribute it and/or modify it under the terms of the GNU
General Public License as published by the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even
the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
License for more details.
You should have received a copy of the GNU General Public License along with this program. If not, see
<http://www.gnu.org/licenses/>.
Contact:
Alexis Criscuolo alexis.criscuolo@pasteur.fr
Veronique Legrand veronique.legrand@pasteur.fr
*/
#include "FqConstants.h"
#include "srp.h"
#include "DnaSeqStr.h"
#include "read_utils.h"
#include "rock_commons.h"
//#define DEBUG
#ifdef DEBUG
#include <cassert>
#endif
void UpdateReadForScoreThreshold(DnaSeqStr& dna_seq,const int & nucl_qual_score_thres,const int& idx_start_qual_score) {
char * read_data_start=dna_seq.fq_record_buf+dna_seq.start_idx;
char * qual_score_start=dna_seq.fq_record_buf+idx_start_qual_score;
int idx=0;
while (idx<dna_seq.length) {
if (*(qual_score_start+idx)<=nucl_qual_score_thres) *(read_data_start+idx)=k_nucl_in_error;
idx++;
}
}
void getFqRecord(FqBaseBackend* fq_files_be[],const unsigned char& f_id,const unsigned long& offset,
char * fq_record) {
FqBaseBackend * be=fq_files_be[f_id-1];
try {
be->getRead(offset,fq_record);
} catch (int e) {
#ifdef DEBUG
cout<<"couldn't retrieve read at offset: "<<offset<<endl;
#endif
throw e;
}
}
/*
void init_DnaSeqStr(DnaSeqStr * dna_seq) {
dna_seq->fq_record_buf[0]='\0';
dna_seq->length=0;
dna_seq->start_idx=0;
}*/
void processFqRecord(DnaSeqStr& dna_seq,const int & nucl_qual_score_thres) {
char * pchar=dna_seq.fq_record_buf;
int cnt=0;
int nb_l=0;
int idx_start_qual_score;
#ifdef DEBUG
//assert(*pchar==k_read_id_start);
if (*pchar!=k_read_id_start) throw -1;
#endif
while (cnt<=MAX_FQ_RECORD_LENGTH-1) {
if (*pchar=='\n') {
nb_l++;
if (nb_l==1) dna_seq.start_idx=cnt+1;
if (nb_l==2) {
dna_seq.length=cnt-dna_seq.start_idx;
// break;
}
if (nb_l==3) {
idx_start_qual_score=cnt+1;
break;
}
}
pchar++;
cnt++;
}
// here, change read if there is a threshold for nucleotide quality score.
if (nucl_qual_score_thres-k_phred_32) UpdateReadForScoreThreshold(dna_seq,nucl_qual_score_thres,idx_start_qual_score);
#ifdef DEBUG
assert(nb_l>=2);
#endif
}
void getDNASeqstr(FqBaseBackend* fq_files_be [],
const rpos& sr,
unsigned long j,
DnaSeqStr* dna_seqs,const int& nucl_qual_score_thres)
{
unsigned char f_id1;
unsigned char f_id2;
//DnaSeqStr * p_dna_seqs=dna_seqs;
unsigned char fid_stored=sr.fileid;
f_id1=fid_stored >>4;
f_id2=fid_stored &mask_fid;
unsigned long offset1=j*UINT_MAX+sr.read_a1;
#ifdef DEBUG
cout<<"processing record at j="<<j<<"and read_a1="<<sr.read_a1<<endl;
cout<<"getting record at offset: "<<offset1<<endl;
#endif
try {
getFqRecord(fq_files_be,f_id1,offset1,dna_seqs[0].fq_record_buf);
processFqRecord(dna_seqs[0],nucl_qual_score_thres);
} catch (int e) {
#ifdef DEBUG
cout<<"j="<<j<<" sr content: a1="<<sr.read_a1<<" a2= "<<sr.read_a2<<endl;
cout<<"pb reading/seeking at offset1="<<offset1<<endl;
cout<<"got: "<<endl;
cout<<dna_seqs[0].fq_record_buf<<endl;
#endif
exit(e);
}
if (f_id2!=0) { // case of PE reads.
//p_dna_seqs+=1;
unsigned long offset2=sr.read_a2+offset1;
try {
getFqRecord(fq_files_be,f_id2,offset2,dna_seqs[1].fq_record_buf);
} catch (int e) {
#ifdef DEBUG
cout<<"j="<<j<<" sr content: a1="<<sr.read_a1<<" a2= "<<sr.read_a2<<endl;
cout<<"pb reading/seeking at offset2="<<offset2<<endl;
cout<<"got: "<<endl;
cout<<dna_seqs[1].fq_record_buf<<endl;
#endif
exit(e);
}
processFqRecord(dna_seqs[1],nucl_qual_score_thres);
}
}
void decomposeReadInKMerNums(ReadProcessor& read_p, T_read_numericValues& nbrKmerDecompo,int k,DnaSeqStr a_seqs[2], int process_PE_separately) {
int nb_expected_k_mers,nb_expected_k_mers_PE1,nb_expected_k_mers_PE2=0;
nb_expected_k_mers_PE1=a_seqs[0].length+1-k;
nbrKmerDecompo.idx_start_PE2=0;
if (a_seqs[1].length) nb_expected_k_mers_PE2=a_seqs[1].length+1-k;
nb_expected_k_mers=nb_expected_k_mers_PE1+nb_expected_k_mers_PE2;
nbrKmerDecompo.single_or_PE_val.reserve(nb_expected_k_mers);
//nbrKmerDecompo.nb_kmer_PE1=nb_expected_k_mers_PE1;
char * start_dna_str=a_seqs[0].fq_record_buf+a_seqs[0].start_idx;
read_p.getKMerNumbers(start_dna_str,a_seqs[0].length,nbrKmerDecompo.single_or_PE_val);
if (a_seqs[1].length) { // case of PE reads
if (process_PE_separately) nbrKmerDecompo.idx_start_PE2=nb_expected_k_mers_PE1;
start_dna_str=a_seqs[1].fq_record_buf+a_seqs[1].start_idx;
read_p.getKMerNumbers(start_dna_str,a_seqs[1].length,nbrKmerDecompo.single_or_PE_val);
}
}