diff --git a/src/main_utils.cpp b/src/main_utils.cpp index 0263fd7f8c950ea32457ecb9ec23e9fa7af73238..14017c7a29743ffdc3b8919f0784307e997c958d 100644 --- a/src/main_utils.cpp +++ b/src/main_utils.cpp @@ -4,7 +4,6 @@ * Created on: May 9, 2016 * Author: vlegrand */ -#include <stdint.h> #include <unistd.h> #include <sys/types.h> #include <sys/sysctl.h> @@ -52,7 +51,7 @@ void checkDirExists(const string o_file) { std::size_t o_found = o_file.find_last_of("/"); if (o_found!=string::npos) { string parent_dir=o_file.substr(0,o_found); - cout<<parent_dir<<endl; + // cout<<parent_dir<<endl; struct stat info; if (stat(parent_dir.c_str(),&info)!=0) { cout<<"parent directory for output files: "<<parent_dir<<" doesn't exist."<<endl; @@ -62,78 +61,139 @@ void checkDirExists(const string o_file) { } -/* - * Reads the names of input and output fastq files that ROCK must process from text files whose names are passed as - * argument of the -i/-o options. Fills the appropriate structures. - * orresponding output files are supposed to be in the same order as input files. No checking here. It is ut to the user to give something correct in input. - */ -int processInOutFileArgs(const std::string& input_file,const std::string output_file,std::vector<IO_fq_files>& single_files,vector<PE_files>& v_PE_files,int& f_id) { - ifstream infiles_names(input_file.c_str()); - if (!infiles_names) cout<<"couldn't open file: "<<input_file<<endl; - ifstream ofiles_names(output_file.c_str()); - if (!ofiles_names) cout<<"couldn't open file: "<<output_file<<endl; - /*cout<<"processing in/out files: "<<input_file<<" "<<output_file<<endl; - cout<<f_id<<endl;*/ - while (infiles_names && ofiles_names && f_id<=k_max_input_files) { - // cout<<"entering while loop"<<endl; - string iline; - string oline; - if (!getline(infiles_names,iline)) break; - if (!getline(ofiles_names,oline)) break; - /*cout<< iline<<endl; - cout<<oline<<endl;*/ - std::size_t i_found = iline.find_first_of(k_sep_pair_end); - if (i_found!=std::string::npos) { - // this is PE - f_id+=2; - string i_name_PE1=iline.substr(0,i_found); - string i_name_PE2=iline.substr(i_found+1); - std::size_t o_found = oline.find_first_of(k_sep_pair_end); - if (o_found==std::string::npos) { - cout<< "Inconsistency between input and output files lists!"<<endl; - return EXIT_FAILURE; - } - string o_name_PE1=oline.substr(0,o_found); - checkDirExists(o_name_PE1); - string o_name_PE2=oline.substr(o_found+1); - //cout<<o_name_PE2<<endl; - checkDirExists(o_name_PE2); - PE_files pe; - pe.PE1.in_fq_file=i_name_PE1; - pe.PE1.out_fq_file=o_name_PE1; - pe.PE2.in_fq_file=i_name_PE2; - pe.PE2.out_fq_file=o_name_PE2; - v_PE_files.push_back(pe); +void removePathfromFName(string& FName) { + std::size_t i_found2 =FName.find_last_of(path_sep); // remove path from filename. + if (i_found2!=std::string::npos) { + FName=FName.substr(i_found2+1); + } +} + +void changeExtension(string& FName) { // changes .fq into .rock.fq or adds .rock.fq + std::size_t o_found = FName.find_last_of(k_ext); + if (o_found!=std::string::npos) FName.replace(o_found,1,".rock."); + else FName.append(".rock.fq"); +} + +void genOutFilenames(const std::vector<string>& v_input_lines,std::vector<string>& v_output_lines) { + std::vector<string>::const_iterator it_in; + string o_line; + for (it_in=v_input_lines.begin();it_in!=v_input_lines.end();++it_in) { + std::size_t i_found = (*it_in).find_first_of(k_sep_pair_end); + if (i_found!=std::string::npos) {// PE files are separared by a ',' + string i_name_PE1=(*it_in).substr(0,i_found); + removePathfromFName(i_name_PE1); + string i_name_PE2=(*it_in).substr(i_found+1); + removePathfromFName(i_name_PE2); + changeExtension(i_name_PE1); + changeExtension(i_name_PE2); + o_line=i_name_PE1; + o_line+=k_sep_pair_end; + o_line.append(i_name_PE2); } else { - // this is single. - f_id+=1; - IO_fq_files p; - p.in_fq_file=iline; - checkDirExists(oline); - p.out_fq_file=oline; - single_files.push_back(p); + o_line=*it_in; + removePathfromFName(o_line); + changeExtension(o_line); } + v_output_lines.push_back(o_line); } - if (!infiles_names.eof()) - { - std::cout<<"error while reading input or output file"<<std::endl; - return EXIT_FAILURE; +} + + +int processInOutFileArgs(const std::vector<string>& v_input_lines,std::vector<string>& v_output_lines,std::vector<IO_fq_files>& single_files,vector<PE_files>& v_PE_files,int& f_id) { + if (v_output_lines.empty()) { + // in that case, generate output filenames from input filenames + genOutFilenames(v_input_lines,v_output_lines); } - /*if (!ofiles_names.eof()) - { - std::cout<<"error while reading input or output file"<<std::endl; + if (v_input_lines.size()!=v_output_lines.size()) { + cout<< "Inconsistency between input and output files lists!"<<endl; return EXIT_FAILURE; - }*/ + } else { + std::vector<string>::const_iterator it_in; + std::vector<string>::const_iterator it_out; + it_in=v_input_lines.begin(); + it_out=v_output_lines.begin(); + while (it_in!=v_input_lines.end()) { + std::size_t i_found = (*it_in).find_first_of(k_sep_pair_end); + if (i_found!=std::string::npos) { + // this is PE + f_id+=2; + string i_name_PE1=(*it_in).substr(0,i_found); + string i_name_PE2=(*it_in).substr(i_found+1); + std::size_t o_found = (*it_out).find_first_of(k_sep_pair_end); + if (o_found==std::string::npos) { + cout<< "Inconsistency between input and output files lists!"<<endl; + return EXIT_FAILURE; + } + string o_name_PE1=(*it_out).substr(0,o_found); + checkDirExists(o_name_PE1); + string o_name_PE2=(*it_out).substr(o_found+1); + //cout<<o_name_PE2<<endl; + checkDirExists(o_name_PE2); + PE_files pe; + pe.PE1.in_fq_file=i_name_PE1; + pe.PE1.out_fq_file=o_name_PE1; + pe.PE2.in_fq_file=i_name_PE2; + pe.PE2.out_fq_file=o_name_PE2; + v_PE_files.push_back(pe); - if (f_id>k_max_input_files) { - cout<<"ROCK cannot handle more than "<<k_max_input_files<<" input files."<<endl; - return EXIT_FAILURE; + } else { + // this is single. + f_id+=1; + IO_fq_files p; + p.in_fq_file=*it_in; + checkDirExists(*it_out); + p.out_fq_file=*it_out; + single_files.push_back(p); + } + ++it_in; + ++it_out; + } + if (f_id>k_max_input_files) { + cout<<"ROCK cannot handle more than "<<k_max_input_files<<" input files."<<endl; + return EXIT_FAILURE; + } } return EXIT_SUCCESS; } +/* + * Loads the content of a text file (containing input fastq file names to be filtered or names of files that ROCK must generate). + */ +int loadFileArgs(const std::string& afile,std::vector<string>& v_lines) { + ifstream infiles_names(afile.c_str()); + if (!infiles_names) cout<<"couldn't open file: "<<afile<<endl; + while (infiles_names) { + string iline; + if (!getline(infiles_names,iline)) break; + v_lines.push_back(iline); + } + if (!infiles_names.eof()) + { + std::cout<<"error while reading input or output file"<<std::endl; + return EXIT_FAILURE; + } + return EXIT_SUCCESS; +} + +/* + * Reads the names of input and output fastq files that ROCK must process from text files whose names are passed as + * argument of the -i/-o options. Fills the appropriate structures. + * orresponding output files are supposed to be in the same order as input files. No checking here. It is ut to the user to give something correct in input. + */ +int loadInOutFileArgs(const std::string& input_file,const std::string& output_file,std::vector<string>& v_input_lines,std::vector<string>& v_output_lines) { + int reti=EXIT_SUCCESS; + int reto=EXIT_SUCCESS; + reti=loadFileArgs(input_file,v_input_lines); + if (!output_file.empty()) { + reto=loadFileArgs(output_file,v_output_lines); + } + if (reti==EXIT_SUCCESS && reto==EXIT_SUCCESS) return EXIT_SUCCESS; + else return EXIT_FAILURE; +} + + /* * Minimize lambda for a given number of k-mers. * We want p<=0.01, diff --git a/src/main_utils.h b/src/main_utils.h index b6ce6e5c0168debfe404e961bb51c7175c6b1ec3..7f3a8c296f318bea70a124a56bd6dfc67204c41b 100644 --- a/src/main_utils.h +++ b/src/main_utils.h @@ -10,12 +10,15 @@ #define k_max_input_files 15 #define k_sep_pair_end ',' +#define k_ext '.' +#define path_sep '/' #include <vector> #include "rock_commons.h" unsigned long getNodePhysMemory(); -int processInOutFileArgs(const std::string& input_file,const std::string output_file,std::vector<IO_fq_files>& single_files,std::vector<PE_files>& v_PE_files,int& f_id); +int loadInOutFileArgs(const std::string& input_file,const std::string& output_file,std::vector<std::string>& v_input_lines,std::vector<std::string>& v_output_lines); +int processInOutFileArgs(const std::vector<std::string>& v_input_lines,std::vector<std::string>& v_output_lines,std::vector<IO_fq_files>& single_files,std::vector<PE_files>& v_PE_files,int& f_id); int getBestLambdaForN(const unsigned long& nb_k_mers,int lambda_max); float getCollisionProba(const unsigned long& nb_k_mers,const int& lambda); diff --git a/src/unit_test_main_utils.cpp b/src/unit_test_main_utils.cpp index 57781f05e4c1c627d234c54b51bdf7dd5bb2c957..0e9947dc24af06088496f7007cca6b4a17ce8382 100644 --- a/src/unit_test_main_utils.cpp +++ b/src/unit_test_main_utils.cpp @@ -19,18 +19,28 @@ using namespace std; void test_processIOFileArgs() { std::vector<IO_fq_files> single_files; vector<PE_files> v_PE_files; + std::vector<string> v_input_lines; + std::vector<string> v_output_lines; int f_id=0; - int ret=processInOutFileArgs("../test/data/unit/list_input1.txt","../test/data/unit/list_output1.txt",single_files,v_PE_files,f_id); + int ret=loadInOutFileArgs("../test/data/unit/list_input1.txt","../test/data/unit/list_output1.txt",v_input_lines,v_output_lines); + assert(ret==EXIT_SUCCESS); + ret=processInOutFileArgs(v_input_lines,v_output_lines,single_files,v_PE_files,f_id); assert(ret==EXIT_FAILURE); f_id=0; v_PE_files.clear(); single_files.clear(); - ret=processInOutFileArgs("../test/data/unit/list_input2.txt","../test/data/unit/list_output2.txt",single_files,v_PE_files,f_id); + v_input_lines.clear(); + v_output_lines.clear(); + ret=loadInOutFileArgs("../test/data/unit/list_input2.txt","../test/data/unit/list_output2.txt",v_input_lines,v_output_lines); + ret=processInOutFileArgs(v_input_lines,v_output_lines,single_files,v_PE_files,f_id); assert(ret==EXIT_FAILURE); f_id=0; v_PE_files.clear(); single_files.clear(); - ret=processInOutFileArgs("../test/data/unit/list_input3.txt","../test/data/unit/list_output3.txt",single_files,v_PE_files,f_id); + v_input_lines.clear(); + v_output_lines.clear(); + ret=loadInOutFileArgs("../test/data/unit/list_input3.txt","../test/data/unit/list_output3.txt",v_input_lines,v_output_lines); + ret=processInOutFileArgs(v_input_lines,v_output_lines,single_files,v_PE_files,f_id); assert(ret==EXIT_SUCCESS); assert(f_id==14); assert(single_files.size()==2); @@ -41,6 +51,18 @@ void test_processIOFileArgs() { PE_files s2=v_PE_files[5]; assert(s2.PE2.in_fq_file.compare("nono")==0); assert(s2.PE2.out_fq_file.compare("onono")==0); + + v_output_lines.clear(); + v_input_lines.clear(); + v_PE_files.clear(); + single_files.clear(); + ret=loadInOutFileArgs("../test/data/unit/list_input3.txt","",v_input_lines,v_output_lines); + ret=processInOutFileArgs(v_input_lines,v_output_lines,single_files,v_PE_files,f_id); + s=single_files[0]; + assert(s.out_fq_file.compare("fifi.rock.fq")==0); + s2=v_PE_files[5]; + assert(s2.PE2.out_fq_file.compare("nono.rock.fq")==0); + } @@ -50,7 +72,7 @@ void test_getBestLambdaForN() { int best=getBestLambdaForN(nb_k_mer,lambda_max); assert(best==2); nb_k_mer=600000000; - best=getBestLambdaForN(nb_k_mer,lambda_max); + best=getBestLambdaForN(nb_k_mer,lambda_max); assert(best==4); nb_k_mer=2000000000; best=getBestLambdaForN(nb_k_mer,lambda_max);