diff --git a/src/FqAuxBackend.cpp b/src/FqAuxBackend.cpp index a17c28e391773b06065f9102c16fd5775ecccd16..7e78a1750a2d60535f918ee453aca5448927e31a 100644 --- a/src/FqAuxBackend.cpp +++ b/src/FqAuxBackend.cpp @@ -126,8 +126,8 @@ int FqAuxBackend::processBuffer(rinfo * p_nr) { void FqAuxBackend::readBuffer() { - if ((nread=read(f_desc,buf,bufsize))!=0) { - cur_offset=lseek(f_desc,0,SEEK_CUR); + if ((nread=read(i_f_desc,buf,bufsize))!=0) { + cur_offset=lseek(i_f_desc,0,SEEK_CUR); //ftell(f_stream); pos_in_buf=0; } @@ -136,7 +136,7 @@ void FqAuxBackend::readBuffer() { * Opens file and performs the fist read operation. */ void FqAuxBackend::openFile(char * ficname, unsigned char id) { - FqBaseBackend::openFile(ficname,id); + FqBaseBackend::openInputFile(ficname,id); buf=(char *) malloc(bufsize*sizeof(char)); if (buf==NULL) { err(errno,"cannot allocate memory: %lu bytes.",bufsize); @@ -145,8 +145,8 @@ void FqAuxBackend::openFile(char * ficname, unsigned char id) { } void FqAuxBackend::closeFile() { - if (f_desc!=-1) { - FqBaseBackend::closeFile(); + if (i_f_desc!=-1) { + FqBaseBackend::closeInputFile(); free(buf); buf=NULL; } diff --git a/src/FqBaseBackend.cpp b/src/FqBaseBackend.cpp index b702d5bf7c8ae2cd753ae47f043f9206a18c94a8..bd79935494f057b8573b75c370aba6fca827f378 100644 --- a/src/FqBaseBackend.cpp +++ b/src/FqBaseBackend.cpp @@ -13,56 +13,114 @@ #include <err.h> + #include "FqBaseBackend.h" -#define DEBUG +#define DEBUG #ifdef DEBUG +#include <cassert> #include <iostream> using namespace std; #endif -void FqBaseBackend::openFile(char * ficname, unsigned char id) { + +void FqBaseBackend::openInputFile(char * ficname, unsigned char id) { int st,s; // unsigned long cur_offset; mode_t mode = S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH; - closeFile(); - filename=ficname; + closeInputFile(); + i_filename=ficname; f_id=id; - f_desc=open(filename,O_RDONLY,mode); - if (f_desc==-1) { - err(errno,"cannot open file: %s.",filename); + i_f_desc=open(i_filename,O_RDONLY,mode); + if (i_f_desc==-1) { + err(errno,"cannot open file: %s.",i_filename); } - f_stream=fdopen(f_desc,"r"); + f_stream=fdopen(i_f_desc,"r"); if (f_stream==NULL) { - err(errno,"cannot open file: %s.",filename); + err(errno,"cannot open file: %s.",i_filename); } } -void FqBaseBackend::closeFile() { - if (f_desc!=-1) { - close(f_desc); +void FqBaseBackend::closeInputFile() { + if (i_f_desc!=-1) { + close(i_f_desc); // filename=NULL; - f_desc=-1; + i_f_desc=-1; f_stream=NULL; } } +void FqBaseBackend::setOutputFile(char * ofilename) { + o_filename=ofilename; +} + + +void FqBaseBackend::openInputFile() { + if (i_filename==NULL || f_id==0) throw std::runtime_error("No file currently associated to this backend"); + if (i_f_desc!=-1) { + std::string err_msg("file: "); + err_msg+=i_filename; + err_msg+=" is already open!"; + throw std::runtime_error(err_msg); + } + openInputFile(i_filename,f_id); +} + -void FqBaseBackend::openFile() { - if (filename==NULL || f_id==0) throw std::runtime_error("No file currently associated to this backend"); - if (f_desc!=-1) { +void FqBaseBackend::openOutputFile(){ + mode_t mode = S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH; + if (o_filename==NULL) throw std::runtime_error("No output file currently associated to this backend"); + if (o_f_desc!=-1) { std::string err_msg("file: "); - err_msg+=filename; + err_msg+=o_filename; err_msg+=" is already open!"; throw std::runtime_error(err_msg); } + o_f_desc=open(o_filename,O_WRONLY|O_CREAT,mode); + if (o_f_desc==-1) { + err(errno,"cannot open file: %s.",o_filename); + } +} + + +void FqBaseBackend::closeOutputFile() { + if (pos_in_w_buf!=o_buf) { // check if some fq records remains in buffer before closing file, if so, flush. + if (write(o_f_desc, o_buf, pos_in_w_buf-o_buf)==-1) err(errno,"cannot write in file: %s.",o_filename); + pos_in_w_buf=o_buf; + } + if (o_f_desc!=-1) { + if (close(o_f_desc)==-1) err(errno,"cannot close file: %s.",o_filename); + o_f_desc=-1; + } +} + +void FqBaseBackend::writeToOutput(const unsigned long& offset) { + // static char * pos_in_w_buf; + if (o_buf==NULL) { + o_buf=(char *) malloc(FqBaseBackend::bufsize*sizeof(char)); + pos_in_w_buf=o_buf; + } + if (o_buf+FqBaseBackend::bufsize-pos_in_w_buf<MAX_FQ_RECORD_LENGTH) { + // may not have enough place in buffer to store another fq record. + // => flush buffer into output file. + if (write(o_f_desc, o_buf, pos_in_w_buf-o_buf)==-1) err(errno,"cannot write in file: %s.",o_filename); + pos_in_w_buf=o_buf; + } + + int fq_rec_size=getRead(offset,pos_in_w_buf); // here aim is to avoid multiple strcpy into buffer: read directly at the right place in the buffer. #ifdef DEBUG -cout<<"going to open file : "<<filename<<" id="<<f_id<<endl; + cout<<*(pos_in_w_buf+fq_rec_size-1)<<endl; + assert(*(pos_in_w_buf+fq_rec_size-1)=='\n'); + assert(*(pos_in_w_buf)=='@'); +#endif + pos_in_w_buf+=fq_rec_size; + *pos_in_w_buf='\0'; +#ifdef DEBUG + cout<<o_buf; #endif - openFile(filename,f_id); } /* @@ -71,14 +129,29 @@ cout<<"going to open file : "<<filename<<" id="<<f_id<<endl; * * Note: It is up to the caller to determine where the end of the record is (5th \n character in the fq_record string). */ -void FqBaseBackend::getRead(const long& offset, char * fq_record) { +int FqBaseBackend::getRead(const unsigned long& offset, char * fq_record) { - int nread,i,nb_lines; + int nread,i,nb_lines,fq_rec_size; char * pchar; - if (f_desc==-1) throw std::runtime_error("No open file currently associated to this backend"); + if (i_f_desc==-1) throw std::runtime_error("No open file currently associated to this backend"); - int res=lseek(f_desc,offset,SEEK_SET); + int res=lseek(i_f_desc,offset,SEEK_SET); if (res==-1) err(errno,"fseek problem when trying to retrieve dna string."); - nread=read(f_desc,fq_record,MAX_FQ_RECORD_LENGTH); + nread=read(i_f_desc,fq_record,MAX_FQ_RECORD_LENGTH); + nb_lines=0; + i=1; + pchar=fq_record; + while(i<nread) { + if (*pchar=='\n') { + nb_lines++; + if (nb_lines==4) { + fq_rec_size=i; + break; + } + } + pchar++; + i++; + } + return fq_rec_size; } diff --git a/src/FqBaseBackend.h b/src/FqBaseBackend.h index 16dd424675c8efe03437c01498197a34781f1b87..309da828b21d6ebb38ae15e4756dcf1b05d3fd15 100644 --- a/src/FqBaseBackend.h +++ b/src/FqBaseBackend.h @@ -9,6 +9,7 @@ #define FQBASEBACKEND_H_ #include <cstdio> +#include <stdlib.h> #include "FqConstants.h" #include "srp.h" @@ -24,25 +25,47 @@ class FqBaseBackend { protected: static const size_t bufsize=6048000; // It seems that I can't have a much bigger value than that or else my objects construction throws exception. Strange. - char * filename; + char * i_filename; unsigned char f_id; - int f_desc; // for calling read + int i_f_desc; // for calling read FILE * f_stream; // for calling ftell + + // for writing output (filtered reads) + char * o_filename; + int o_f_desc; + char * o_buf; + char * pos_in_w_buf; + friend void test_processInputFiles(); + friend void test_write_PE(); public: FqBaseBackend() { - filename=NULL; - f_desc=-1; + i_filename=NULL; + i_f_desc=-1; f_stream=NULL; f_id=0; + o_f_desc=-1; + o_filename=NULL; + o_buf=NULL; + pos_in_w_buf=NULL; + } + + ~FqBaseBackend() { + if (o_buf!=NULL) { + free(o_buf); + } } - void openFile(char * ficname, unsigned char id); - void openFile(); - void closeFile(); - void getRead(const long&,char *); + void openInputFile(char * ficname, unsigned char id); + void openInputFile(); + void closeInputFile(); + int getRead(const unsigned long&,char *); + void setOutputFile(char * ofilename); + void openOutputFile(); + void writeToOutput(const unsigned long&); + void closeOutputFile(); }; diff --git a/src/FqMainBackend.cpp b/src/FqMainBackend.cpp index 72d1b15e98d952e1fec7442462137f713d886c4d..777a02a4ab1a2d8b793e70b1ea2bca4ceaa4cff8 100644 --- a/src/FqMainBackend.cpp +++ b/src/FqMainBackend.cpp @@ -15,6 +15,8 @@ #include <assert.h> #include <stdlib.h> +#define _FILE_OFFSET_BITS 64 // for portability reasons on 32bits systems. + //#define DEBUG #ifdef DEBUG @@ -42,7 +44,7 @@ void FqMainBackend::processFile(char * filename,unsigned char f_id) { mode_t mode = S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH; //cout<<"INT_MAX="<<INT_MAX<<endl; // cout<<filename<<endl; - this->filename=filename; + this->i_filename=filename; this->f_id=f_id; int f_single=open(filename,O_RDONLY,mode); diff --git a/src/Makefile.am b/src/Makefile.am index ea8749be6deb03029816dd580ecab922fe7002e7..576428dd191af24317f72f48a0492cd63df3b592 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -3,7 +3,7 @@ LINTDEFS = $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) AM_CPPFLAGS = -DDATADIR=\"$(pkgdatadir)\" bin_PROGRAMS=rock -noinst_PROGRAMS = unit_test_fqreader perf_test_fqreader unit_test_read_utils unit_test_cms +noinst_PROGRAMS = unit_test_fqreader perf_test_fqreader unit_test_read_utils unit_test_cms unit_test_fqwriter ## noinst_PROGRAMS = noinst_LIBRARIES = librock.a @@ -24,8 +24,10 @@ unit_test_read_utils_LDADD=librock.a unit_test_cms_SOURCES=unit_test_cms.cpp unit_test_cms_LDADD=librock.a +unit_test_fqwriter_SOURCES=unit_test_fqwriter.cpp +unit_test_fqwriter_LDADD=librock.a librock_a_SOURCES = $(SRC) -SRC = fqreader.cpp FqBaseBackend.cpp FqAuxBackend.cpp FqMainBackend.cpp read_utils.cpp ReadProcessor.cpp CountMinSketch.cpp -HDR = srp.h fqreader.h FqConstants.h FqBaseBackend.h FqAuxBackend.h FqMainBackend.h ReadProcessor.h rock_commons.h read_utils.h CountMinSketch.h +SRC = fqreader.cpp FqBaseBackend.cpp FqAuxBackend.cpp FqMainBackend.cpp read_utils.cpp ReadProcessor.cpp CountMinSketch.cpp fqwriter.cpp +HDR = srp.h fqreader.h FqConstants.h FqBaseBackend.h FqAuxBackend.h FqMainBackend.h ReadProcessor.h rock_commons.h read_utils.h CountMinSketch.h fqwriter.h diff --git a/src/fqwriter.cpp b/src/fqwriter.cpp new file mode 100644 index 0000000000000000000000000000000000000000..7042b3e5cba22bff0d6f23ff408cd34f006d59da --- /dev/null +++ b/src/fqwriter.cpp @@ -0,0 +1,60 @@ +/* + * fqwriter.cpp + * + * Created on: Mar 15, 2016 + * Author: vlegrand + * Here gather functions to write filtered fastq records to output files + */ + +#include <map> +#include "srp.h" +#include "FqBaseBackend.h" + +// TODO this looks a little bit like getDnaStr in read_utils.cpp. Some refactoring could be useful. Think about +// classes to encapsulate that without decreasing perfs. +void writeFilteredFastq(FqBaseBackend* map_id_backend[],int nb_be, const srp& io_sr){ + srp::const_iterator it; + i_dim::const_iterator it_offs; + k_dim::const_iterator it_struct; + + unsigned char f_id1; + unsigned char f_id2; + unsigned char fid_stored; + + // 1rst step, open files before fetching/writing reads from/to them + int i; + for (i=0;i<nb_be;i++){ + map_id_backend[i]->openInputFile(); + map_id_backend[i]->openOutputFile(); + } + + for (it=io_sr.begin(); it!=io_sr.end(); ++it) { + for (it_offs=it->second.begin();it_offs!=it->second.end();it_offs++) { + unsigned int j=it_offs->first; + for (it_struct=it_offs->second.begin();it_struct!=it_offs->second.end();it_struct++) { + if (it_struct->fileid) { + fid_stored=it_struct->fileid; + f_id1=fid_stored >>4; + f_id2=fid_stored &mask_fid; + unsigned long offset1=j*INT_MAX+it_struct->read_a1; + FqBaseBackend * be=map_id_backend[f_id1-1]; + be->writeToOutput(offset1); + if (f_id2!=0) { // case of PE reads. + unsigned long offset2=it_struct->read_a2+j*INT_MAX; + map_id_backend[f_id2-1]->writeToOutput(offset2); + } + } + } + } + } + + // last step, close files. + for (i=0;i<nb_be;i++){ + map_id_backend[i]->closeInputFile(); + map_id_backend[i]->closeOutputFile(); + } +} + + + + diff --git a/src/fqwriter.h b/src/fqwriter.h new file mode 100644 index 0000000000000000000000000000000000000000..97d90b44bc98e75818d57d7052b588a492de7cc4 --- /dev/null +++ b/src/fqwriter.h @@ -0,0 +1,15 @@ +/* + * fqwriter.h + * + * Created on: Mar 15, 2016 + * Author: vlegrand + */ + +#ifndef FQWRITER_H_ +#define FQWRITER_H_ + +void writeFilteredFastq(FqBaseBackend* map_id_backend[],int nb_be, const srp& io_sr); + + + +#endif /* FQWRITER_H_ */ diff --git a/src/read_utils.cpp b/src/read_utils.cpp index a446cb89464b164f5045fe8af02168281ad648f5..5ce7952fa902d4815e2edcf61b816adad298cc59 100644 --- a/src/read_utils.cpp +++ b/src/read_utils.cpp @@ -21,7 +21,7 @@ -void getFqRecord(FqBaseBackend* fq_files_be[],const unsigned char& f_id,const long& offset, +void getFqRecord(FqBaseBackend* fq_files_be[],const unsigned char& f_id,const unsigned long& offset, char * fq_record) { FqBaseBackend * be=fq_files_be[f_id-1]; be->getRead(offset,fq_record); @@ -73,13 +73,13 @@ void getDNASeqstr(FqBaseBackend* fq_files_be [], f_id1=fid_stored >>4; f_id2=fid_stored &mask_fid; - long offset1=j*INT_MAX+sr.read_a1; + unsigned long offset1=j*INT_MAX+sr.read_a1; getFqRecord(fq_files_be,f_id1,offset1,dna_seqs[0].fq_record_buf); processFqRecord(p_dna_seqs); if (f_id2!=0) { // case of PE reads. p_dna_seqs+=1; - long offset2=sr.read_a2+j*INT_MAX; + unsigned long offset2=sr.read_a2+j*INT_MAX; getFqRecord(fq_files_be,f_id2,offset2,dna_seqs[1].fq_record_buf); processFqRecord(p_dna_seqs); } diff --git a/src/read_utils.h b/src/read_utils.h index c2c0d397276757aed4234778d9fba02f6f81551b..09bb6c160479ba9bd12ec1c2401cb741ce791f59 100644 --- a/src/read_utils.h +++ b/src/read_utils.h @@ -12,23 +12,26 @@ #include "rock_commons.h" #include "ReadProcessor.h" + /* * Here, for performance matters, do not want to have to do a memmove nor strcpy or strcat to extract only DNA sequence for all * the fastq records that we'll have to process (potentially millions). So, use a structure to store start position of DNA sequence in buffer and its length. * We will not process more than 2 fastq records at a time (case of he PE reads) so it will not take too much space in memory. */ - typedef struct { char fq_record_buf[MAX_FQ_RECORD_LENGTH]; int start_idx; int length; }DnaSeqStr; + + void getDNASeqstr(FqBaseBackend* [], const rpos&, unsigned int, DnaSeqStr *); + void init_DnaSeqStr(DnaSeqStr * dna_seq); void decomposeReadInKMerNums(ReadProcessor& read_p, readNumericValues& nbrKmerDecompo,int k,DnaSeqStr a_seqs[2]); diff --git a/src/rock.cpp b/src/rock.cpp index c915439d258ffb081fa4b9534239a5652688492b..5e82871c4c5eaa22be9080d37ad4f3ff5fa172b7 100644 --- a/src/rock.cpp +++ b/src/rock.cpp @@ -30,7 +30,6 @@ using namespace std; -//#define DEBUG #define BENCHMARK #ifdef BENCHMARK @@ -75,7 +74,7 @@ CountMinSketch * fillCMS(FqBaseBackend* map_id_backend[],int nb_be, int k,CMSpar // 1rst step, open files before fetching reads in them int i; for (i=0;i<nb_be;i++){ - map_id_backend[i]->openFile(); + map_id_backend[i]->openInputFile(); } for (rit=io_sr->rbegin(); rit!=io_sr->rend(); ++rit) { //process map in reverse order (by decreasing scores). @@ -104,7 +103,7 @@ CountMinSketch * fillCMS(FqBaseBackend* map_id_backend[],int nb_be, int k,CMSpar // last step, close fq files. TODO this step may be moved somewhere else for optimization. for (i=0;i<nb_be;i++){ - map_id_backend[i]->closeFile(); + map_id_backend[i]->closeInputFile(); } return cms; } @@ -120,35 +119,28 @@ void lowFilterCMS(FqBaseBackend* map_id_backend[],int nb_be,int k, CountMinSketc // 1rst step, open files before fetching reads in them int i; for (i=0;i<nb_be;i++){ - map_id_backend[i]->openFile(); + map_id_backend[i]->openInputFile(); } - cout<<"all input files are open"<<endl; + for (it=io_sr->begin(); it!=io_sr->end(); ++it) { for (it_offs=it->second.begin();it_offs!=it->second.end();it_offs++) { unsigned int j=it_offs->first; for (it_struct=it_offs->second.begin();it_struct!=it_offs->second.end();it_struct++) { - if (it_struct->fileid) { - // read dna string corresponding to fastq record - DnaSeqStr a_seqs[2]; - init_DnaSeqStr(&a_seqs[0]); - init_DnaSeqStr(&a_seqs[1]); - getDNASeqstr(map_id_backend,*it_struct,j,a_seqs); -#ifdef DEBUG -cout<<a_seqs[0].fq_record_buf<<" "<<a_seqs[0].start_idx<<" "<<a_seqs[0].length<<endl; -cout<<a_seqs[1].fq_record_buf<<" "<<a_seqs[1].start_idx<<" "<<a_seqs[1].length<<endl; -#endif - // decompose dna string into k-mers and turn k_mers into numbers. - decomposeReadInKMerNums(read_p, nbrKmerDecompo,k,a_seqs); - ret=pcms->isBeneathMinKappa(nbrKmerDecompo); - if (ret) it_struct->fileid=0; - nbrKmerDecompo.clear(); - } + // read dna string corresponding to fastq record + DnaSeqStr a_seqs[2]; + init_DnaSeqStr(&a_seqs[0]); + init_DnaSeqStr(&a_seqs[1]); + getDNASeqstr(map_id_backend,*it_struct,j,a_seqs); + // decompose dna string into k-mers and turn k_mers into numbers. + decomposeReadInKMerNums(read_p, nbrKmerDecompo,k,a_seqs); + ret=pcms->isBeneathMinKappa(nbrKmerDecompo); + if (ret) it_struct->fileid=0; + nbrKmerDecompo.clear(); } } } - cout<<"going to close input files."<<endl; for (i=0;i<nb_be;i++){ - map_id_backend[i]->closeFile(); + map_id_backend[i]->closeInputFile(); } } diff --git a/src/rock_commons.h b/src/rock_commons.h index bac4abdf172b247f87ebe77bceb65faad73663f4..10af4ca1c9adb69f9b7dfd91b382a91519c9f0e1 100644 --- a/src/rock_commons.h +++ b/src/rock_commons.h @@ -28,4 +28,6 @@ typedef std::vector<unsigned long> readNumericValues; + + #endif /* ROCK_COMMONS_H_ */ diff --git a/src/unit_test_fqreader.cpp b/src/unit_test_fqreader.cpp index 56fb9317732a819efcabbb4e9162754f3a6a57e1..0c546df114d9b01f6bc716e2ac44bb4777e867f7 100644 --- a/src/unit_test_fqreader.cpp +++ b/src/unit_test_fqreader.cpp @@ -253,15 +253,15 @@ void test_processInputFiles() { FqAuxBackend * pbe2=(FqAuxBackend *) array_be[2]; FqMainBackend * pbe3=(FqMainBackend *) array_be[1]; - assert(strcmp(pbe->filename,fq_single)==0); + assert(strcmp(pbe->i_filename,fq_single)==0); assert(pbe->f_id==1); assert(pbe->p_auxFqProcessor==NULL); assert(pbe3->p_auxFqProcessor==pbe2); - assert(strcmp(pbe3->filename,fq_1_test)==0); + assert(strcmp(pbe3->i_filename,fq_1_test)==0); assert(pbe3->f_id==2); - assert(strcmp(pbe2->filename,fq_2_test)==0); + assert(strcmp(pbe2->i_filename,fq_2_test)==0); assert(pbe2->f_id==3); diff --git a/src/unit_test_fqwriter.cpp b/src/unit_test_fqwriter.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a9f02c56c7ba82b308899b4875c3a915fa61fb38 --- /dev/null +++ b/src/unit_test_fqwriter.cpp @@ -0,0 +1,138 @@ +/* + * unit_test_fqwriter.cpp + * + * Created on: Mar 14, 2016 + * Author: vlegrand + */ +#include <cstdio> +#include <cassert> +#include <unistd.h> +#include <fcntl.h> +#include "rock_commons.h" +#include "FqConstants.h" +#include "srp.h" +#include "FqMainBackend.h" +#include "fqwriter.h" + +#include <iostream> +using namespace std; + +#define DEBUG + + + +char expected_content_PE1 []="@SRR1222430.1 1 length=251\n\ +GCCCGCGAAGCGGAGCTGGCCGCCTGCAAAGGCCGTTCGCGCTCGCTGTCGCTGGATCTGTACGCCCAGTGGCGCTGCATGGAGGACAACCACGGCAAGTGGCGCTTCACCTCGCCGACCCATACCGTGCTGGCCTTCGCCCAGGCGCTGAAAGAGCTGGCGCAGGAGGGCGGCGTCAGCGCTCGCCATCAGCGCTACCGCAACAATCAGCGCCGTCTGGTGGCAGGGATGCGCGCGCTCGGCTTCCGGCC\n\ ++SRR1222430.1 1 length=251\n\ +CCCCCCCCCBBCGGCGGGGG@GGGGGHHHGHBHH@GGHGGGGGGGGG@GHGHGGGHHHHHHHHHGGGGGHHHFCGGGGHHHGHHGGHHHHGGGGCCGGGGHFFGGGGGHHHHHGGGGGGCGHHHHGGGGGGGGGGGGGGGGGGAEGGFFFFFFFFFFFFFFAFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEDFFFFFFFFFFF:DA;CCFFBBFDAFFFAFFFFFFFFA-9A>DFFFFFBFFB\n\ +@SRR1222430.2 2 length=250\n\ +AGAAATTCGCCATCAGAATAAAAACCTCATATGCACATTTTCTTGTTATTGCACAGCCTGTGCCACTTTAGCGCCAGCCTCTCCGGCAATCGTGGAGAAATTAAGGAGATAGTGTAATTTATCATGTTGCTTTTGCCGTATCGTAAAGAAACCTCGAGCTTTCCTGCCAGCAGGTAGCGAGTCTGCTTCGTCACCGCAGACCGGCGCATTATCCCTTGCCGGTGTGAAACCTCATTTCATTTAAGTCAAA\n\ ++SRR1222430.2 2 length=250\n\ +BBBBBFFFBBBBFGGGGGGGGGHHGGHHHHHHHHGFGHHHHHHHHGHHHFFFHHFHHHHHHEHGFHHHFGFGFGGGEGGFH@@HGCGGGHHGHHGGHFAFHHHEGFHHGGHHFHFHHHHHEHHHHHHHHHHHGHHHGGGGHHGHFGGFDGFGFHGEGFCDHGGHHHHHHHGHHEHHGHGGGGHGFHHEHGHGGGGDFGGGHHGADA?DBGGGGGGFFFFBBFFFFFFFFFFFFFFFFFBFFFFFFFFF/B\n\ +@SRR1222430.3 3 length=236\n\ +CAAACACCTGACGCGGTTCCAGCAGGTACTCCTGCACGCCAATTTCCGGGCGGGCAGTAAAGCGCTGTTTGCAGCCCGTCTGGTGCAGGCGCCCCAGATAGCGGCCAACCCATTCCATCTGATCAAGGTTATCCGCTTCGAACTGACGACCGCCAAGGCTTGGGAAAACGGCAAAGTAGAATCCCTGATGCTGATGAAGCGTGCTGTCATTAAATAAGAGCGGCGCAGCAACGGGC\n\ ++SRR1222430.3 3 length=236\n\ +CCCCCFFCFFFFGGGGGGGGGGHHHHHFGHHHHHHHHGGGGGHHHHHGGGGGGGGGGHHHHHHGGGGGHHHHHHHHGGGGHGHGHHGGHGGGGGGGGHHHHHGGGGGHGGGGHHHHGHHHHHHHHHHHHHHHHHGGGGGGGGGGGGGGGGGGGFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFBFFFFFFFFFFFFFFFABF\n\ +"; + +char expected_content_PE2[]="@SRR1222430.1 1 length=250\n\ +CGATGCTGCGCTGCATCAATAAGCTGGAAGAGATCACCAGCGGCGATCTGATTGTCGATGGTCTGAAGGTCAACGACCCGAAGGTGGACGAACGTCTGATTCGTCAGGAAGCCGGGATGGTTTTCCAGCAGTTTTATCTGTTCCCGCACCTCACGGCGCTGGAAAACGTGATGTTTGGCCCGCTGCGGGTACGCGGCGCCAGCAAGCAGGCGGCGAGTGCAGGCTATCGTCGAGCAGCGGCCGGAAGCCG\n\ ++SRR1222430.1 1 length=250\n\ +CCCCCCFFFCCCGGGGGGGGGGHHHHHHHHGHHHHHHHHHHGGGGGGGGHHGHHHHGHGHGHHHHHHHHHFHHHGGGGGGGGGGBFFHHGGGGGGHHGGHHHGHGGGHHHHHHGGGGGGGFGHGHHHHHHHHHGHHHFHHHHHHDFGGGGHHHHHGGAGGGGGGGGGGGGGGGFGGGFFFFFFFFFFFA=@FDF-BFFFFFFFFFFFFFEDFBDFFF-:FFFFFF/BFFFEFFFFFFFFFFF;--:DEFF\n\ +@SRR1222430.2 2 length=251\n\ +AGAAATTCGCCAGGGTGATCAACGTCTCATCGTCGGCGAGGGTCAGCGCCTGCGCGGGACAGTTTTCCACACAGGCCGGCCCCGCCTCGCGGCCCTGGCATAGATCGCACTTGTGCGCGCTGGCTTTCACCAGGCCTGCGGCCTGCGGCGTCACCACCACCTGCATCACCCCGAACGGGCAGGCCACCATGCAGGATTTACAGCCAATGCATTTCTCCTGACGGACCTGAACGCTGTCGCCGGACTGGGCG\n\ ++SRR1222430.2 2 length=251\n\ +>1>AAFFF@11AEGGFFCGGGGGFGHFH2FF1F00EEE/@AEE0FGGFGEGGHGGCGC?EEFHEFEEHDF1EECHEFE/@@/BCCCFGAC@CC@C.CEGFHFHGHFHCEC?FH;CC?CG@@?-AF.BB0BFGF?E./EF;@?;AFF@<-@@??BFF?F-:A?BF999BBBF@@?@@@F-;@B@FF-A-9FF/BFFE/F//B/BBEFBFFFFF/BFFFFFFFEB?-@=B-/BBF--:;/A-B>--;>?EFE9\n\ +@SRR1222430.3 3 length=236\n\ +GCCCGTTGCTGCGCCGCTCTTATTTAATGACAGCACGCTTCATCAGCATCAGGGATTCTACTTTGCCGTTTTCCCAAGCCTTGGCGGTCGTCAGTTCGAAGCGGATAACCTTGATCAGATGGAATGGGTTGGCCGCTATCTGGGGCGCCTGCACCAGACGGGCTGCAAACAGCGCTTTACTGCCCGCCCGGAAATTGGCGTGCAGGAGTACCTGCTGGAACCGCGTCAGGTGTTTG\n\ ++SRR1222430.3 3 length=236\n\ +?BBABBFBBFFFGGGGGGGGGGHHHHHHHHHHFHHHGGGGGHHHHHHHGHHHHHGHGHHHHHHHGHFEGGHGHHHHHHGGHHHHHGGGEGGGHGHHHGHHGGCGGGDHHHHHHHHGHHGHGHHHHHHFHGGGHFGGGGGHHHHEFGGCFGHHHHGGFGGGGGGGGGGGGGGGGGFFFEFFFFFFFBFFFFF=;ABFF/FA:DB;BF.9.BFFFFFFF/AFFFFFABDFFB/9BD.;\n\ +"; + +void test_write_PE() { + srp sr; + srp::reverse_iterator rit; + i_dim::iterator it_offs; + k_dim::iterator it_struct; + + // step 1 : fill srp with fake data + rpos rp=init_rpos(1,1114); + update_rpos(2,1114,0,&rp); + + rpos rp2=init_rpos(1,558); + update_rpos(2,556,0,&rp2); + + rpos rp3=init_rpos(1,0); + update_rpos(2,0,0,&rp3); + + i_dim& ref_i_dim=sr[600]; + k_dim& ref_k_dim=ref_i_dim[0]; + ref_k_dim.push_back(rp); + + i_dim& ref_i_dim2=sr[500]; + k_dim& ref_k_dim2=ref_i_dim2[0]; + ref_k_dim2.push_back(rp2); + + i_dim& ref_i_dim3=sr[450]; + k_dim& ref_k_dim3=ref_i_dim3[0]; + ref_k_dim3.push_back(rp3); + + // step2 : create the backends + FqBaseBackend * map_id_backend[k_max_input_files]; + FqMainBackend * be_fq1=new FqMainBackend(&sr); + unsigned char f_id1=1; + FqAuxBackend * be_fq2=new FqAuxBackend(); + map_id_backend[0]=be_fq1; + map_id_backend[1]=be_fq2; + + be_fq1->i_filename=(char *) "../data/klebsiella_PE1.fq"; + be_fq1->f_id=1; + be_fq2->f_id=2; + be_fq2->i_filename=(char *) "../data/klebsiella_PE2.fq"; + be_fq1->setOutputFile((char *) "../data/klebsiella_PE1_filtered.fq"); + be_fq2->setOutputFile((char *) "../data/klebsiella_PE2_filtered.fq"); + + writeFilteredFastq(map_id_backend,2, sr); + + // step 4 : re-read output files and check their content. + mode_t mode = S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH; + int f_pe1=open((char *) "../data/klebsiella_PE1_filtered.fq",O_RDONLY,mode); + assert(f_pe1!=-1); + char* buf=(char *) malloc(FqBaseBackend::bufsize*sizeof(char)); // test files are very small, bufsize is enough to read them entirely. + assert(buf!=NULL); + int nread=read(f_pe1,buf,FqBaseBackend::bufsize); + assert(nread<FqBaseBackend::bufsize); + // std::cout<<buf; +#ifdef DEBUG + int i; + for (i=0;i<nread;i++) { + if (buf[i]!=expected_content_PE1[i]) cout<<" character at position"<<i<<" differs: "<<buf[i]<<"!="<<expected_content_PE1[i]<<endl; + } +#endif + assert(strcmp(buf,expected_content_PE1)==0); + int f_pe2=open((char *) "../data/klebsiella_PE2_filtered.fq",O_RDONLY,mode); + assert(f_pe2!=-1); + nread=read(f_pe2,buf,FqBaseBackend::bufsize); + assert(nread<FqBaseBackend::bufsize); +#ifdef DEBUG + for (i=0;i<nread;i++) { + if (buf[i]!=expected_content_PE2[i]) cout<<" character at position"<<i<<" differs: "<<buf[i]<<"!="<<expected_content_PE1[i]<<endl; + } +#endif + assert(strcmp(buf,expected_content_PE2)==0); + free(buf); + + + // step5 : remove output files from disk + + assert(remove((char *) "../data/klebsiella_PE1_filtered.fq")==0); + assert(remove((char *) "../data/klebsiella_PE2_filtered.fq")==0); + +} + +int main(int argc, char **argv) { + test_write_PE(); + +} + + diff --git a/src/unit_test_read_utils.cpp b/src/unit_test_read_utils.cpp index 5bc8ca8e5279bfa700a74b7726c531f7c9ff5436..d01afa18f59281859a189bbeb567ee9aa0b10ec5 100644 --- a/src/unit_test_read_utils.cpp +++ b/src/unit_test_read_utils.cpp @@ -35,7 +35,7 @@ void test_getReadSingle() { char * fq_single2=(char *) "../data/test_single2.fq"; FqMainBackend be_fq=FqMainBackend(&io_sr); // TODO, remove argument from constructor - be_fq.openFile(fq_single2, 4); + be_fq.openInputFile(fq_single2, 4); FqBaseBackend * fic_map[k_max_input_files]; @@ -62,7 +62,7 @@ void test_getReadSingle() { if (a_seqs.length<MAX_READ_LENGTH) dna_read[a_seqs.length]='\0'; std::cout<<dna_read<<endl; assert(strcmp(dna_read,"ACCCAAACTTGCCAGACTTGTGTAGAACGTCCAATATGTATCGGCATCGCTTCCACATGAATGAATCCTTGTTCCACACTTTTTATATGATTCGCATTAATTTCTTGTCCGAAAATCAACTGATTTTTTGCAACATTTTCTCCCGCTCCAAGACTGGCTGCATGTTCTGCAAGCGCAACAGAAACACCACCATGCAAGTAGCCAAAGGGTTGTTTGACCTGATCTGTTATTTCAAGCGCCAGTTCCACTC")==0); - be_fq.closeFile(); + be_fq.closeInputFile(); } void getDnaStr(FqBaseBackend * fic_map[],rpos my_struct,DnaSeqStr* a_seqs,char * dna_read) { // Auxilliary method for test_getReadPE(). @@ -95,7 +95,7 @@ void test_getReadPE() { FqAuxBackend be_fq2; be_fq1.setAuxProcessor(&be_fq2); - be_fq1.openFile(fq_PE1, 4); + be_fq1.openInputFile(fq_PE1, 4); be_fq2.openFile(fq_PE2, 5); FqBaseBackend * fic_map[k_max_input_files]; @@ -126,7 +126,7 @@ void test_getReadPE() { getDnaStr(fic_map,my_struct2,a_seqs,dna_read); assert(strcmp(dna_read,"ATTGTGGGGTTCCTTTTTGTAGCATTGGAATGGAAATTAAAAATGGGGCTTCAGGATGCCCGCTCCATTATTTAATTCCAGAATGTAACGATGCTGTTTACCGGGGGGACTGGAAAGATGCACTTGAGCTTTTGATTAAAACAAATAACATGCCCAGAACCAATCACTGCAATTTTTTTATCCCACCGCACTATCGGTGGAGTCGGCATGAACCAACCTAAACCAAACCCACGGTCAATAATAGCCCGTTCAATCGAATTAATACCCACAGCAGGATCAGAAATTGCAACCGTACAACTTC")==0); - be_fq1.closeFile(); + be_fq1.closeInputFile(); be_fq2.closeFile(); //cout<<"done"<<endl;