diff --git a/Makefile.am b/Makefile.am index 1bfdcf48680594bc8757ad43f3144bbdca65cd65..94c45230274fe7315e516349e7d59f6f4ea884fd 100644 --- a/Makefile.am +++ b/Makefile.am @@ -1 +1 @@ -SUBDIRS=src +SUBDIRS=src test diff --git a/configure.ac b/configure.ac index f810bdd7b22c2081017c2bb3038b5702f6fd656e..03c172a77162abdeaf4105685a08f6fff52b7f31 100644 --- a/configure.ac +++ b/configure.ac @@ -13,6 +13,6 @@ AC_PROG_RANLIB # AC_CHECK_PROG(POD2MAN, pod2man, pod2man, :) -AC_CONFIG_FILES(Makefile src/Makefile) +AC_CONFIG_FILES(Makefile src/Makefile test/Makefile) AC_OUTPUT diff --git a/src/CountMinSketch.hpp b/src/CountMinSketch.hpp index 53991c0f846fe9094264b1f2e3eeb019c80c8aab..d4d448a12b88c8fd8714d5e2a37f1e3f17a2262d 100644 --- a/src/CountMinSketch.hpp +++ b/src/CountMinSketch.hpp @@ -72,6 +72,7 @@ typedef struct { int lambda; int kappa; int kappa_prime; + int filter_size; // max amount of RAM wanted for the CMS. } CMSparams; template <typename T> diff --git a/src/Filter.hpp b/src/Filter.hpp index dd0620be5066f685c5c07c76895a86822f2c6026..6676fc974d0c4f74a2a0e7a534d5f41e9113643b 100644 --- a/src/Filter.hpp +++ b/src/Filter.hpp @@ -57,8 +57,8 @@ template <typename T> void Filter::underlyingfillCMS(FqBaseBackend* map_id_backe } for (rit=io_sr->rbegin(); rit!=io_sr->rend(); ++rit) { //process map in reverse order (by decreasing scores). - cout << "score="<<rit->first<<endl; - unsigned long score=rit->first; + //cout << "score="<<rit->first<<endl; + // unsigned long score=rit->first; for (it_offs=rit->second.begin();it_offs!=rit->second.end();it_offs++) { unsigned long j=it_offs->first; for (it_struct=it_offs->second.begin();it_struct!=it_offs->second.end();it_struct++) { diff --git a/src/FqBaseBackend.cpp b/src/FqBaseBackend.cpp index 83e69c83ac3c5b3d652b5196c287738088b454e7..74c8a6ae552f9b305c9673bb16a8af5f828eec34 100644 --- a/src/FqBaseBackend.cpp +++ b/src/FqBaseBackend.cpp @@ -140,8 +140,6 @@ int FqBaseBackend::getRead(const unsigned long& offset, char * fq_record) { #endif int res=lseek(i_f_desc,offset,SEEK_SET); if (res==-1) { - // err(errno,"fseek problem when trying to retrieve dna string."); - // add exception for debug #ifdef DEBUG cout<<"Couldn't get read at offset: "<<offset<<endl; #endif diff --git a/src/Makefile.am b/src/Makefile.am index 27bad12c8640c2cbcac18a677dcbb894fbf12eca..f21fe59daa1b0f0088d2abafe8f0b9b7fb13da2b 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -3,7 +3,7 @@ LINTDEFS = $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) AM_CPPFLAGS = -DDATADIR=\"$(pkgdatadir)\" bin_PROGRAMS=rock -noinst_PROGRAMS = unit_test_fqreader perf_test_fqreader unit_test_read_utils unit_test_cms unit_test_fqwriter +noinst_PROGRAMS = unit_test_fqreader unit_test_read_utils unit_test_cms unit_test_fqwriter ## noinst_PROGRAMS = noinst_LIBRARIES = librock.a @@ -15,9 +15,6 @@ rock_LDADD = librock.a unit_test_fqreader_SOURCES=unit_test_fqreader.cpp unit_test_fqreader_LDADD=librock.a -perf_test_fqreader_SOURCES=perf_test_fqreader.cpp -perf_test_fqreader_LDADD=librock.a - unit_test_read_utils_SOURCES=unit_test_read_utils.cpp unit_test_read_utils_LDADD=librock.a @@ -29,5 +26,5 @@ unit_test_fqwriter_LDADD=librock.a librock_a_SOURCES = $(SRC) -SRC = fqreader.cpp FqBaseBackend.cpp FqAuxBackend.cpp FqMainBackend.cpp read_utils.cpp ReadProcessor.cpp fqwriter.cpp -HDR = srp.h fqreader.h FqConstants.h FqBaseBackend.h FqAuxBackend.h FqMainBackend.h ReadProcessor.h rock_commons.h read_utils.h CountMinSketch.hpp Filter.hpp fqwriter.h +SRC = fqreader.cpp FqBaseBackend.cpp FqAuxBackend.cpp FqMainBackend.cpp read_utils.cpp ReadProcessor.cpp fqwriter.cpp main_utils.cpp +HDR = srp.h fqreader.h FqConstants.h FqBaseBackend.h FqAuxBackend.h FqMainBackend.h ReadProcessor.h rock_commons.h read_utils.h CountMinSketch.hpp Filter.hpp fqwriter.h main_utils.h diff --git a/src/main_utils.cpp b/src/main_utils.cpp new file mode 100644 index 0000000000000000000000000000000000000000..83386c689e6c4402c5db1dffdaaae9534b183544 --- /dev/null +++ b/src/main_utils.cpp @@ -0,0 +1,153 @@ +/* + * main_utils.cpp + * + * Created on: May 9, 2016 + * Author: vlegrand + */ + +#include <sys/types.h> +#include <sys/sysctl.h> +#include <sys/stat.h> +#include <string.h> +#include <iostream> +#include <fstream> +#include <cmath> +#include "main_utils.h" + +using namespace std; + +/* + * Use this function to get the amount of RAM on the machine (used or not). + * Aim is to make sure that ROCK can run on the current machine with the requirements that the user gave. + */ +unsigned long getNodePhysMemory() { +#ifdef __linux__ + return sysconf(_SC_PHYS_PAGES)*sysconf(_SC_PAGESIZE)/1024/1024/1024; +#elif __APPLE__ + int64_t mem; + size_t len = sizeof(mem); + int ret=sysctlbyname("hw.memsize", (void*) &mem, &len, NULL,0); // total RAM is given in bytes; so convert it to MB.*/ + if (ret==-1) { + cout<<"Couldn't determine how much RAM is usable on your system."<<errno<<endl; + return errno; + } + return mem/1024/1024/1024; +#else + cout<<"ROCK has not been tested on this operating system, please contact author."<<endl; + return unknown_os; +#endif +} + + +/* + * Given a filename (including full path), determine if parent directory exists. + * If is doesn't, display error message and exit. + */ +void checkDirExists(const string o_file) { + std::size_t o_found = o_file.find_last_of("/"); + if (o_found!=string::npos) { + string parent_dir=o_file.substr(0,o_found); + cout<<parent_dir<<endl; + struct stat info; + if (stat(parent_dir.c_str(),&info)!=0) { + cout<<"parent directory for output files: "<<parent_dir<<" doesn't exist."<<endl; + exit(EXIT_FAILURE); + } + } +} + + +/* + * Reads the names of input and output fastq files that ROCK must process from text files whose names are passed as + * argument of the -i/-o options. Fills the appropriate structures. + * orresponding output files are supposed to be in the same order as input files. No checking here. It is ut to the user to give something correct in input. + */ +int processInOutFileArgs(const std::string& input_file,const std::string output_file,std::vector<IO_fq_files>& single_files,vector<PE_files>& v_PE_files,int& f_id) { + ifstream infiles_names(input_file.c_str()); + if (!infiles_names) cout<<"couldn't open file: "<<input_file<<endl; + ifstream ofiles_names(output_file.c_str()); + if (!ofiles_names) cout<<"couldn't open file: "<<output_file<<endl; + /*cout<<"processing in/out files: "<<input_file<<" "<<output_file<<endl; + cout<<f_id<<endl;*/ + while (infiles_names && ofiles_names && f_id<=k_max_input_files) { + // cout<<"entering while loop"<<endl; + string iline; + string oline; + if (!getline(infiles_names,iline)) break; + if (!getline(ofiles_names,oline)) break; + /*cout<< iline<<endl; + cout<<oline<<endl;*/ + std::size_t i_found = iline.find_first_of(k_sep_pair_end); + if (i_found!=std::string::npos) { + // this is PE + f_id+=2; + string i_name_PE1=iline.substr(0,i_found); + string i_name_PE2=iline.substr(i_found+1); + std::size_t o_found = oline.find_first_of(k_sep_pair_end); + if (o_found==std::string::npos) { + cout<< "Inconsistency between input and output files lists!"<<endl; + return EXIT_FAILURE; + } + string o_name_PE1=oline.substr(0,o_found); + checkDirExists(o_name_PE1); + string o_name_PE2=oline.substr(0,o_found+1); + //cout<<o_name_PE2<<endl; + checkDirExists(o_name_PE2); + PE_files pe; + pe.PE1.in_fq_file=i_name_PE1; + pe.PE1.out_fq_file=o_name_PE1; + pe.PE2.in_fq_file=i_name_PE2; + pe.PE2.out_fq_file=o_name_PE2; + v_PE_files.push_back(pe); + + } else { + // this is single. + f_id+=1; + IO_fq_files p; + p.in_fq_file=iline; + checkDirExists(oline); + p.out_fq_file=oline; + single_files.push_back(p); + } + } + if (!infiles_names.eof()) + { + std::cout<<"error while reading input or output file"<<std::endl; + return EXIT_FAILURE; + } + /*if (!ofiles_names.eof()) + { + std::cout<<"error while reading input or output file"<<std::endl; + return EXIT_FAILURE; + }*/ + + if (f_id>k_max_input_files) { + cout<<"ROCK cannot handle more than "<<k_max_input_files<<" input files."<<endl; + return EXIT_FAILURE; + } + return EXIT_SUCCESS; +} + + +/* + * Minimize lambda for a given number of k-mers. + * We want p<=0.01, + * so choose smallest lambda so that [1-(1-1/m)exp n] exp lambda<=0.01 + */ +int getBestLambdaForN(const unsigned long& nb_k_mers,const int& lambda_max) { + int lambda=2; + int min_lambda=lambda; + float tmp=1/INT_MAX; + tmp=1-tmp; + tmp=pow(tmp,nb_k_mers); + tmp=1-tmp; + while (lambda<=lambda_max) { + min_lambda=lambda; + float p=pow(tmp,lambda); + if (p<=0.01) break; + lambda+=1; + } + return min_lambda; +} + + diff --git a/src/main_utils.h b/src/main_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..80cd267c29f1a5960dd32b27faa1d80f92d99ffe --- /dev/null +++ b/src/main_utils.h @@ -0,0 +1,22 @@ +/* + * main_utils.h + * + * Created on: May 9, 2016 + * Author: vlegrand + */ + +#ifndef MAIN_UTILS_H_ +#define MAIN_UTILS_H_ + +#define k_max_input_files 15 +#define k_sep_pair_end ',' + +#include <vector> +#include "rock_commons.h" + +unsigned long getNodePhysMemory(); +int processInOutFileArgs(const std::string& input_file,const std::string output_file,std::vector<IO_fq_files>& single_files,std::vector<PE_files>& v_PE_files,int& f_id); +int getBestLambdaForN(const unsigned long& nb_k_mers,const int& lambda_max); + + +#endif /* MAIN_UTILS_H_ */ diff --git a/src/rock.cpp b/src/rock.cpp index 938f6ee64c3823d3b5b4f51861e9c9250bca9aa9..fbe8f1f2d2018b5e9d239f4e0f1af8fca3d7b38d 100644 --- a/src/rock.cpp +++ b/src/rock.cpp @@ -7,14 +7,15 @@ Description : see : https://projets.pasteur.fr/projects/homogeneisation-couverture-pre-assemblage/wiki ============================================================================ */ - +#include <sys/sysctl.h> #include <cstdlib> #include <string.h> #include <getopt.h> #include <iostream> -#include <fstream> +//#include <fstream> #include <map> #include <vector> +#include <unistd.h> #include "CountMinSketch.hpp" #include "rock_commons.h" @@ -22,13 +23,17 @@ #include "FqMainBackend.h" #include "fqreader.h" -// #include "ReadProcessor.h" -// #include "read_utils.h" #include "fqwriter.h" #include "Filter.hpp" +#include "main_utils.h" + + -#define k_max_input_files 15 -#define k_sep_pair_end ',' +#define unknown_os 1 +#define defaultGRPMAXSize 1 //(in gigabytes) Default maximum size of srp structure but of course this size depends of the number of reads we are processing. + // 1 GB is enough to store approx 82 millions of reads (must take into account memory needed for stl containers). + // This constant is used only if you use rock's default behavior (use 90% of the machine's RAM). + // If you specify a lambda, it is possible to use more memory for storing more reads and less memory for the CMS. using namespace std; @@ -48,91 +53,76 @@ void printRUsage() { - - -static void usage(int status) { //TODO write me. - /* FILE *f = stdout; - (void)fprintf(f, "usage: %s [options] <dbase:name> <dbase:name> <dbase:name>...\n\n", prog); - (void)fprintf(f, "options:\n"); - (void)fprintf(f, " -a ... Search query by accession number.\n"); - (void)fprintf(f, " -c ... Check query.\n"); - (void)fprintf(f, " -h ... Prints this message and exit.\n"); - (void)fprintf(f, " -i ... Search query by entry name.\n"); - (void)fprintf(f, " -l ... List available databases.\n"); - (void)fprintf(f, " -o <file> ... Place output into <file>.\n"); - (void)fprintf(f, " -f <file1> <file2> <file3>... Read input from <file1> <file2> <file3>.\n");*/ +static void usage(int status) { + cout<<"usage: rock [options]"<<endl<<endl; + cout<<" -i <input_filename.txt> .... In input_filename.txt put the names of the fastq files to filter."<<endl; + cout<<" -o <output_filename.txt> .... In output_filename.txt put the names of the filtered fastq files."<<endl; + cout<<" -h .... Print this message and exit."<<endl; + cout<<" -k <k_mer_size> .... Specify k-mer size. Default is 30."<<endl; + cout<<" -c <kappa_prime> .... Specify lower threshold for coverage. Defaut is 0 (None)."<<endl; + cout<<" -C <kappa> .... Specify upper threshold for coverage. Default is 50. Max is 65535."<<endl; + cout<<" -l <lambda> .... Indicate number of arrays wanted in the CMS. Default is biggest l so that l<B/(b*MAX_INT) where b is 1 or 2 depending on kappa and B is the RAM quantity on the machine."<<endl; + cout<<" -n <nb_distinct_k_mer> .... Indicate the number of distinct k-mer. Useful to compute lambda if not specified with -l."<<endl; + cout<<" -g <size in GB> .... Wanted size for the CMS."<<endl; exit(status); } - - - - - - - /* - * Reads the names of input and output fastq files that ROCK must process from text files whose names are passed as - * argument of the -i/-o options. Fills the appropriate structures. - * orresponding output files are supposed to be in the same order as input files. No checking here. It is ut to the user to give something correct in input. - */ -int processInOutFileArgs(const std::string& input_file,const std::string output_file,std::vector<IO_fq_files>& single_files,vector<PE_files>& v_PE_files,int& f_id) { - ifstream infiles_names(input_file.c_str()); - ifstream ofiles_names(output_file.c_str()); - while (infiles_names && ofiles_names && f_id<=k_max_input_files) { - string iline; - string oline; - if (!getline(infiles_names,iline)) break; - if (!getline(ofiles_names,oline)) break; - std::size_t i_found = iline.find_first_of(k_sep_pair_end); - if (i_found!=std::string::npos) { - // this is PE - f_id+=2; - string i_name_PE1=iline.substr(0,i_found); - string i_name_PE2=iline.substr(i_found+1); - std::size_t o_found = oline.find_first_of(k_sep_pair_end); - if (o_found==std::string::npos) { - cout<< "Inconsistency between input and output files lists!"<<endl; - return EXIT_FAILURE; - } - string o_name_PE1=oline.substr(0,o_found); - string o_name_PE2=oline.substr(o_found+1); - /* debug stuff, TODO comment that */ - /*cout<<"PE1="<<name_PE1<<endl; - cout<<"PE2="<<name_PE2<<endl;*/ - /* */ - PE_files pe; - pe.PE1.in_fq_file=i_name_PE1; - pe.PE1.out_fq_file=o_name_PE1; - pe.PE2.in_fq_file=i_name_PE2; - pe.PE2.out_fq_file=o_name_PE2; - v_PE_files.push_back(pe); - - } else { - // this is single. - f_id+=1; - // cout<<"single="<<iline; - IO_fq_files p; - p.in_fq_file=iline; - p.out_fq_file=oline; - single_files.push_back(p); - } - - } - if (!infiles_names.eof()) - { - std::cout<<"error while reading input file"<<std::endl; - return EXIT_FAILURE; - } - if (f_id>k_max_input_files) { - cout<<"ROCK cannot handle more than "<<k_max_input_files<<" input files."<<endl; - return EXIT_FAILURE; + * Checks options and arguments consistency (ex kappa_prime not superior to kappa; output file directory exists and so on...) + * Also set some CMS parameters depending on value of other options. +*/ +void optArgConsistency(const string& input_file,const string& output_file,const int& g,CMSparams& parms,const int& nb_k_mers) { + if (input_file.empty()) { + std::cout<<"input file name is mandatory" <<std::endl; + usage(EXIT_FAILURE); + } + if (output_file.empty()) { + std::cout<<"output file name is mandatory" <<std::endl; + usage(EXIT_FAILURE); + } + cout<<"g="<<g<<endl; + cout<<"lambda="<<parms.lambda<<endl; + if (g!=0 and parms.lambda!=0) {// user set both lambda and cms size=> inconsistency. + cout<<"-l and -g options are mutually exclusive."<<endl; + usage(EXIT_FAILURE); + } + if (nb_k_mers!=0 and parms.lambda!=0) {// user set both lambda and cms size=> inconsistency. + cout<<"-l and -n options are mutually exclusive."<<endl; + usage(EXIT_FAILURE); + } + float avail_mem=getNodePhysMemory()-defaultGRPMAXSize; + //cout<<avail_mem; + if (g>avail_mem) { + std::cout<<"This machine only has "<<getNodePhysMemory()<<" Gigabytes of RAM. This is not enough to run ROCK with a CMS of size: "<<g<<endl; + exit(EXIT_FAILURE); + } else { + parms.filter_size=g; + // compute lambda if not set by the user. + if (parms.lambda==0) { + parms.lambda=parms.filter_size/INT_MAX; + if (parms.kappa>get_mask<unsigned char>::value) parms.lambda=parms.lambda/sizeof(unsigned short); } - /*if (!ofiles_names.eof()) - { - std::cout<<"error while reading output file"<<std::endl; - return EXIT_FAILURE; - }*/ - return EXIT_SUCCESS; + } + if (nb_k_mers) { // user indicated number of k-mers; use it to minimize parms.lambda + parms.lambda=getBestLambdaForN(nb_k_mers,parms.lambda); + } + if (parms.kappa_prime>=parms.kappa) { + cout<<"ERROR lower filter is higher than high filter!"<<endl; + exit(EXIT_FAILURE); + } + + //cout<<"parms.lambda="<<parms.lambda<<" INT_MAX="<<INT_MAX<<endl; + unsigned long cms_size=INT_MAX; + cms_size*=2; + //cout<<"minimum cms size:"<<cms_size<<endl; + if (parms.kappa>get_mask<unsigned char>::value) cms_size=cms_size*sizeof(unsigned short); + //cout<<"cms_size in B="<<cms_size<<endl; + cms_size=ceil(cms_size/1024.0/1024/1024); // convert to gigabytes. + //cout<<"cms_size in GB="<<cms_size<<endl; + if (cms_size>getNodePhysMemory()-defaultGRPMAXSize) { + cout<<"Not enough RAM on the machine to run rock with a CMS of size:"<<cms_size<<" GB."<<endl; + exit(EXIT_FAILURE); + } + } @@ -143,51 +133,67 @@ int main(int argc,char * argv[]) { printRUsage(); #endif srp sr; - int i; + int i,g; std::string input_file,output_file; - int kappa,kappa_prime,lambda,k,n; - while((i = getopt(argc, argv, "i:o:l:k:c:C")) != -1) { + int k,nb_k_mers; + g=0; + k=30; + nb_k_mers=0; + CMSparams parms; + parms.kappa=50; + parms.kappa_prime=0; + parms.lambda=0; + parms.filter_size=getNodePhysMemory()/100*90-defaultGRPMAXSize; // Prefer not to use all the machine's memory + + if (parms.filter_size==unknown_os) return EXIT_FAILURE; + while((i = getopt(argc, argv, "i:o:l:k:c:C:g:n:")) != -1) { switch(i) { case 'i': input_file=optarg;break; case 'c': - kappa_prime=atoi(optarg);break; + parms.kappa_prime=atoi(optarg);break; case 'h': usage(EXIT_SUCCESS); break; case 'o': output_file=optarg; break; case 'C': - kappa=atoi(optarg);break; + parms.kappa=atoi(optarg); + if (parms.kappa<=0 || parms.kappa>get_mask<unsigned short>::value) { + cout<<"Bad value for kappa. Must choose kappa so that 0<kappa<="<<get_mask<unsigned short>::value<<endl; + usage(EXIT_FAILURE); + } + break; case 'l': - lambda = atoi(optarg); break; + parms.lambda = atoi(optarg); + if (parms.lambda<=0) { + cout<<"Bad value for lambda. Choose a value that is >0 or let ROCK choose for you."<<endl; + usage(EXIT_FAILURE); + } + break; case 'k': - k=atoi(optarg); break; + k=atoi(optarg); + if (k<=0 || k>32) { + cout<<"Bad value for k. Must choose k so that 0<k<=32."<<endl; + usage(EXIT_FAILURE); + } + break; case 'n': - n=atoi(optarg);break; + nb_k_mers=atoi(optarg);break; // number of distinct k-mers + case 'g': + cout<<optarg<<endl; + g=atoi(optarg); + break; default: usage(EXIT_FAILURE); break; } } - /* for the moment, use default values for k, kappa_prime, lambda and kappa*/ - k=30; - CMSparams parms; - parms.kappa=50; - parms.kappa_prime=20; - parms.lambda=4; - if (input_file.empty()) { - std::cout<<"input file name is mandatory" <<std::endl; - return EXIT_FAILURE; - } - if (output_file.empty()) { - std::cout<<"output file name is mandatory" <<std::endl; - return EXIT_FAILURE; + cout<<"g="<<g<<endl; + optArgConsistency(input_file,output_file,g,parms,nb_k_mers); - } int f_id=0; std::vector<IO_fq_files> single_files; vector<PE_files> v_PE_files; if (processInOutFileArgs(input_file,output_file,single_files,v_PE_files,f_id)!=EXIT_SUCCESS) return EXIT_FAILURE; - cout<<"f_id="<<f_id<<endl; - + //cout<<"f_id="<<f_id<<endl; FqBaseBackend * map_id_backend[k_max_input_files]; Filter the_filter(parms); @@ -228,8 +234,6 @@ int main(int argc,char * argv[]) { cout<<"Should now free memory"<<endl; #endif for (i=1;i<=f_id;i++) delete map_id_backend[i-1]; - // free(map_id_backend); - // delete pcms; #ifdef BENCHMARK cout<<"finished,going to exit "<<endl; printRUsage();