diff --git a/NEWS b/NEWS index e16d2968ce40675b15c8da496e49618c5d017852..a1d3c41ec3febef608ed8eb128f07f1ecbad8314 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,6 @@ +New in version 1.2 + * -v option for verbose mode. + New in version 1.1 * input file names can be provided as arguments in command-line. -i option is no more mandatory. * output filenames are generated by ROCK if -o option is not used. -o option is no more mandatory. diff --git a/configure b/configure index 332dd0853a93b3a30e168d2ae76dcbc704209343..38895ab85417a30bf7d581ccf233d82a7b819e94 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.69 for rock 1.1. +# Generated by GNU Autoconf 2.69 for rock 1.2. # # # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. @@ -576,8 +576,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='rock' PACKAGE_TARNAME='rock' -PACKAGE_VERSION='1.1' -PACKAGE_STRING='rock 1.1' +PACKAGE_VERSION='1.2' +PACKAGE_STRING='rock 1.2' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1226,7 +1226,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures rock 1.1 to adapt to many kinds of systems. +\`configure' configures rock 1.2 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1297,7 +1297,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of rock 1.1:";; + short | recursive ) echo "Configuration of rock 1.2:";; esac cat <<\_ACEOF @@ -1383,7 +1383,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -rock configure 1.1 +rock configure 1.2 generated by GNU Autoconf 2.69 Copyright (C) 2012 Free Software Foundation, Inc. @@ -1438,7 +1438,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by rock $as_me 1.1, which was +It was created by rock $as_me 1.2, which was generated by GNU Autoconf 2.69. Invocation command line was $ $0 $@ @@ -2217,7 +2217,7 @@ fi # Define the identity of the package. PACKAGE='rock' - VERSION='1.1' + VERSION='1.2' cat >>confdefs.h <<_ACEOF @@ -3726,7 +3726,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by rock $as_me 1.1, which was +This file was extended by rock $as_me 1.2, which was generated by GNU Autoconf 2.69. Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -3783,7 +3783,7 @@ _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -rock config.status 1.1 +rock config.status 1.2 configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" diff --git a/configure.ac b/configure.ac index be1a23ea68322b8596d7f185fc2d79de070c4da6..2235b95494d21ba35729a1699deb188c72666cd3 100644 --- a/configure.ac +++ b/configure.ac @@ -1,7 +1,7 @@ dnl Process this file with autoconf to produce a configure script. AC_PREREQ(2.59) -AC_INIT(rock, 1.1) +AC_INIT(rock, 1.2) AC_CANONICAL_SYSTEM diff --git a/doc/rock.pod b/doc/rock.pod index 89cd73a54141a295bb0cf6b463958c61705d5d52..54f67120a9a73fc1f086af5de43dd935d0e7b916 100644 --- a/doc/rock.pod +++ b/doc/rock.pod @@ -65,6 +65,10 @@ Default is to use the minumum between 1) all the machine's memory minus a constant for storing imput fastq files indexes. 2) a default value of 4. +=item -v + +verbose mode. At the end of execution displays a recap ofall ROCK parameters (name of input files, values of, kappa and happa_prime) plus, CMS size in Bytes, plus if -n was provided, the expected probability of collision. + =item -h Usage display. diff --git a/src/Filter.hpp b/src/Filter.hpp index a0acab56fd18c9aaa79bb11191f9ac71d0e4f9df..5ecea7614bb3ae5f45fb3c32e4f5c17a2fdd6208 100644 --- a/src/Filter.hpp +++ b/src/Filter.hpp @@ -25,6 +25,12 @@ class Filter { ByteCountMinSketch * pByteCMS; ShortCountMinSketch * pShortCMS; +#// long CMS_size_in_Bytes; + long nb_bytes_before_fill_CMS,nb_bytes_after_fill_CMS; + + void getRSS(int before_fill=0); + + template <typename T> void underlyingfillCMS(FqBaseBackend* map_id_backend[],int nb_be, int k, srp* io_sr, CountMinSketch<T>* cms); template <typename T> void underlyinglowFilterCMS(FqBaseBackend* map_id_backend[],int nb_be,int k,srp* io_sr, CountMinSketch<T>* pcms); @@ -33,13 +39,18 @@ public: Filter(const CMSparams& parms) { pByteCMS=NULL; pShortCMS=NULL; + getRSS(); if (parms.kappa<ubytemask) pByteCMS=new ByteCountMinSketch(parms); - else pShortCMS=new ShortCountMinSketch(parms); + else pShortCMS=new ShortCountMinSketch(parms); + //CMS_size_in_Bytes=0; + + nb_bytes_after_fill_CMS=0; } void fillCMS(FqBaseBackend* map_id_backend[],int nb_be,int k,srp* io_sr); void lowFilterCMS(FqBaseBackend* map_id_backend[],int nb_be,int k, srp* io_sr); unsigned long getApproxNbDistinctKMers(); + unsigned long getSize(); }; template <typename T> void Filter::underlyingfillCMS(FqBaseBackend* map_id_backend[],int nb_be, int k, srp* io_sr, CountMinSketch<T>* cms) { @@ -127,6 +138,7 @@ template <typename T> void Filter::underlyinglowFilterCMS(FqBaseBackend* map_id_ void Filter::fillCMS(FqBaseBackend* map_id_backend[],int nb_be,int k,srp* io_sr) { if (pByteCMS!=NULL) underlyingfillCMS<unsigned char>(map_id_backend,nb_be, k, io_sr,pByteCMS); else underlyingfillCMS<unsigned short>(map_id_backend,nb_be, k, io_sr,pShortCMS); + getRSS(1); } void Filter::lowFilterCMS(FqBaseBackend* map_id_backend[],int nb_be,int k, srp* io_sr) { @@ -139,4 +151,18 @@ unsigned long Filter::getApproxNbDistinctKMers() { else return pShortCMS->getNbDistinctKMers(); } +void Filter::getRSS(int before_fill) { + struct rusage usage; + int res2=getrusage(RUSAGE_SELF,&usage); + if (res2==-1) err(errno,"cannot get resource usage."); + if (before_fill==0) nb_bytes_before_fill_CMS=usage.ru_maxrss; + else nb_bytes_after_fill_CMS=usage.ru_maxrss; +} + +unsigned long Filter::getSize() { + unsigned long cms_size=nb_bytes_after_fill_CMS-nb_bytes_before_fill_CMS; + // cms_size/=8; ru_maxrss seems to be already in Bytes. + return cms_size; +} + #endif diff --git a/src/Makefile.am b/src/Makefile.am index d83e7beef20b4d53967e70d775d2be0beb8cb95d..8176ea400dbac74fb73c19791aef892be93c1516 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -29,5 +29,5 @@ unit_test_main_utils_LDADD=librock.a librock_a_SOURCES = $(SRC) -SRC = fqreader.cpp FqBaseBackend.cpp FqAuxBackend.cpp FqMainBackend.cpp read_utils.cpp ReadProcessor.cpp fqwriter.cpp main_utils.cpp -HDR = srp.h fqreader.h FqConstants.h FqBaseBackend.h FqAuxBackend.h FqMainBackend.h ReadProcessor.h rock_commons.h read_utils.h CountMinSketch.hpp Filter.hpp fqwriter.h main_utils.h +SRC = fqreader.cpp FqBaseBackend.cpp FqAuxBackend.cpp FqMainBackend.cpp read_utils.cpp ReadProcessor.cpp fqwriter.cpp main_utils.cpp ROCKparams.cpp +HDR = srp.h fqreader.h FqConstants.h FqBaseBackend.h FqAuxBackend.h FqMainBackend.h ReadProcessor.h rock_commons.h read_utils.h CountMinSketch.hpp Filter.hpp fqwriter.h main_utils.h ROCKparams.h diff --git a/src/ROCKparams.cpp b/src/ROCKparams.cpp new file mode 100644 index 0000000000000000000000000000000000000000..947c64c1caa1794bd88d97426931a28d230b5ac0 --- /dev/null +++ b/src/ROCKparams.cpp @@ -0,0 +1,224 @@ +/* + * ROCKparams.cpp + * + * Created on: Jul 25, 2016 + * Author: vlegrand + */ +#include <iostream> +#include <fstream> +#include <string> +#include <vector> +#include <cmath> +#include "ROCKparams.h" +using namespace std; + + +void ROCKparams::computeLambda() { + unsigned long tmp=parms.filter_size; + tmp*=1073741824; // this is in fact 1024*1024*1024. + parms.lambda=tmp/INT_MAX; + if (parms.kappa>get_mask<unsigned char>::value) parms.lambda=parms.lambda/sizeof(unsigned short); +} + + +void ROCKparams::processMainArgs(int optind, const int argc, char ** argv,std::vector<string>& v_input_lines) { + if (optind==argc) return; // no arguments + while (optind<argc) { // otherwise, only arguments expected are names of files to filter. + string iline=argv[optind]; + v_input_lines.push_back(iline); + optind++; + } + } + + +void ROCKparams::optArgConsistency(const string& input_file,const string& output_file, + const int& g,CMSparams& parms,const int& nb_k_mers, + const std::vector<string>& v_input_lines) { + if (input_file.empty() && v_input_lines.empty()) { + std::cout<<"You must provide filename via -i or arguments to indicate ROCK what to filter." <<std::endl; + usage(EXIT_FAILURE); + } + if (!input_file.empty() && !v_input_lines.empty()) { + std::cout<<"files to filter are provided via the -i option or as arguments. It cannot be both." <<std::endl; + usage(EXIT_FAILURE); + } + if (g!=0 and parms.lambda!=0) {// user set both lambda and cms size=> inconsistency. + cout<<"-l and -g options are mutually exclusive."<<endl; + usage(EXIT_FAILURE); + } + if (nb_k_mers!=0 and parms.lambda!=0) {// user set both lambda and number of k-mers=> inconsistency. + cout<<"-l and -n options are mutually exclusive."<<endl; + usage(EXIT_FAILURE); + } + float avail_mem=getNodePhysMemory()-defaultGRPMAXSize; + // cout<<avail_mem; + if (g>avail_mem) { + std::cout<<"This machine only has "<<getNodePhysMemory()<<" Gigabytes of RAM. This is not enough to run ROCK with a CMS of size: "<<g<<endl; + exit(EXIT_FAILURE); + } else if (g!=0) { + parms.filter_size=g; + computeLambda(); + } + if (nb_k_mers) { // user indicated number of k-mers; use it to minimize parms.lambda + parms.lambda=getBestLambdaForN(nb_k_mers,parms.lambda); + } + if (parms.kappa_prime>=parms.kappa) { + cout<<"ERROR lower filter is higher than high filter!"<<endl; + exit(EXIT_FAILURE); + } + + if (parms.lambda==0) { + // This happens when the user doesn't specify lambda nor g nor nb_k_mers. + computeLambda(); + parms.lambda=min(parms.lambda,proposedLambda); + if (parms.lambda==0) { + cout<<"Not enough RAM on the machine. TotalRAM -1GB must be bigger than INT_MAX to allow at least 1 array in the CMS."<<endl; + exit(EXIT_FAILURE); + } + } +#ifdef DEBUG + cout<<"parms.lambda="<<parms.lambda<<" INT_MAX="<<INT_MAX<<endl; +#endif + unsigned long cms_size=INT_MAX; + cms_size*=parms.lambda; + //cout<<"minimum cms size:"<<cms_size<<endl; + if (parms.kappa>get_mask<unsigned char>::value) cms_size=cms_size*sizeof(unsigned short); + //cout<<"cms_size in B="<<cms_size<<endl; + cms_size=ceil(cms_size/1024.0/1024/1024); // convert to gigabytes. + //cout<<"cms_size in GB="<<cms_size<<endl; + if (cms_size>getNodePhysMemory()-defaultGRPMAXSize) { + cout<<"Not enough RAM on the machine to run rock with a CMS of size:"<<cms_size<<" GB."<<endl; + exit(EXIT_FAILURE); + } +} + + +/* + * Loads the content of a text file (containing input fastq file names to be filtered or names of files that ROCK must generate). + */ +int ROCKparams::loadFileArgs(const std::string& afile,std::vector<string>& v_lines) { + ifstream infiles_names(afile.c_str()); + if (!infiles_names) cout<<"couldn't open file: "<<afile<<endl; + while (infiles_names) { + string iline; + if (!getline(infiles_names,iline)) break; + v_lines.push_back(iline); + } + if (!infiles_names.eof()) + { + std::cout<<"error while reading input or output file"<<std::endl; + return EXIT_FAILURE; + } + return EXIT_SUCCESS; +} + +/* + * Reads the names of input and output fastq files that ROCK must process from text files whose names are passed as + * argument of the -i/-o options. Fills the appropriate structures. + * orresponding output files are supposed to be in the same order as input files. No checking here. It is ut to the user to give something correct in input. + */ +int ROCKparams::loadInOutFileArgs(const std::string& input_file,const std::string& output_file,std::vector<string>& v_input_lines,std::vector<string>& v_output_lines) { + int reti=EXIT_SUCCESS; + int reto=EXIT_SUCCESS; + reti=loadFileArgs(input_file,v_input_lines); + if (!output_file.empty()) { + reto=loadFileArgs(output_file,v_output_lines); + } + if (reti==EXIT_SUCCESS && reto==EXIT_SUCCESS) return EXIT_SUCCESS; + else return EXIT_FAILURE; +} + +void ROCKparams::removePathfromFName(string& FName) { + std::size_t i_found2 =FName.find_last_of(path_sep); // remove path from filename. + if (i_found2!=std::string::npos) { + FName=FName.substr(i_found2+1); + } +} + +void ROCKparams::changeExtension(string& FName) { // changes .fq into .rock.fq or adds .rock.fq + std::size_t o_found = FName.find_last_of(k_ext); + if (o_found!=std::string::npos) FName.replace(o_found,1,".rock."); + else FName.append(".rock.fq"); +} + +void ROCKparams::genOutFilenames(const std::vector<string>& v_input_lines,std::vector<string>& v_output_lines) { + std::vector<string>::const_iterator it_in; + string o_line; + for (it_in=v_input_lines.begin();it_in!=v_input_lines.end();++it_in) { + std::size_t i_found = (*it_in).find_first_of(k_sep_pair_end); + if (i_found!=std::string::npos) {// PE files are separared by a ',' + string i_name_PE1=(*it_in).substr(0,i_found); + removePathfromFName(i_name_PE1); + string i_name_PE2=(*it_in).substr(i_found+1); + removePathfromFName(i_name_PE2); + changeExtension(i_name_PE1); + changeExtension(i_name_PE2); + o_line=i_name_PE1; + o_line+=k_sep_pair_end; + o_line.append(i_name_PE2); + + } else { + o_line=*it_in; + removePathfromFName(o_line); + changeExtension(o_line); + } + v_output_lines.push_back(o_line); + } +} + +int ROCKparams::processInOutFileArgs(const std::vector<string>& v_input_lines,std::vector<string>& v_output_lines,std::vector<IO_fq_files>& single_files,vector<PE_files>& v_PE_files,int& f_id) { + if (v_output_lines.empty()) { + // in that case, generate output filenames from input filenames + genOutFilenames(v_input_lines,v_output_lines); + } + if (v_input_lines.size()!=v_output_lines.size()) { + cout<< "Inconsistency between input and output files lists!"<<endl; + return EXIT_FAILURE; + } else { + std::vector<string>::const_iterator it_in; + std::vector<string>::const_iterator it_out; + it_in=v_input_lines.begin(); + it_out=v_output_lines.begin(); + while (it_in!=v_input_lines.end()) { + std::size_t i_found = (*it_in).find_first_of(k_sep_pair_end); + if (i_found!=std::string::npos) { + // this is PE + f_id+=2; + string i_name_PE1=(*it_in).substr(0,i_found); + string i_name_PE2=(*it_in).substr(i_found+1); + std::size_t o_found = (*it_out).find_first_of(k_sep_pair_end); + if (o_found==std::string::npos) { + cout<< "Inconsistency between input and output files lists!"<<endl; + return EXIT_FAILURE; + } + string o_name_PE1=(*it_out).substr(0,o_found); + checkDirExists(o_name_PE1); + string o_name_PE2=(*it_out).substr(o_found+1); + //cout<<o_name_PE2<<endl; + checkDirExists(o_name_PE2); + PE_files pe; + pe.PE1.in_fq_file=i_name_PE1; + pe.PE1.out_fq_file=o_name_PE1; + pe.PE2.in_fq_file=i_name_PE2; + pe.PE2.out_fq_file=o_name_PE2; + v_PE_files.push_back(pe); + + } else { + // this is single. + f_id+=1; + IO_fq_files p; + p.in_fq_file=*it_in; + checkDirExists(*it_out); + p.out_fq_file=*it_out; + single_files.push_back(p); + } + ++it_in; + ++it_out; + } + if (f_id>k_max_input_files) { + cout<<"ROCK cannot handle more than "<<k_max_input_files<<" input files."<<endl; + return EXIT_FAILURE; + } + } + return EXIT_SUCCESS; +} diff --git a/src/ROCKparams.h b/src/ROCKparams.h new file mode 100644 index 0000000000000000000000000000000000000000..5448008ea62115fbeddaae8b691afb16d7db8516 --- /dev/null +++ b/src/ROCKparams.h @@ -0,0 +1,192 @@ +/* + * ROCKparams.h + * + * Created on: Jul 25, 2016 + * Author: vlegrand + */ + +#ifndef ROCKPARAMS_H_ +#define ROCKPARAMS_H_ + + +#include <cstdlib> +#include <string> +#include <vector> +#include <getopt.h> +#include "CountMinSketch.hpp" +#include "main_utils.h" + +#define defaultGRPMAXSize 1 //(in gigabytes) Default maximum size of srp structure but of course this size depends of the number of reads we are processing. + // 1 GB is enough to store approx 82 millions of reads (must take into account memory needed for stl containers). + // This constant is used only if you use rock's default behavior (use 90% of the machine's RAM). + // If you specify a lambda, it is possible to use more memory for storing more reads and less memory for the CMS. + +#define proposedLambda 4 + +using namespace std; + + +class ROCKparams{ + CMSparams parms; + int g; // max RAM to use for the CMS if specified by the user. + int nb_k_mers; // expected number of k-mers in input data if specified by the user. + int k; // size of the k-mers + int verbose_mode; + std::string input_file,output_file; + + std::vector<IO_fq_files> single_files; + vector<PE_files> v_PE_files; + int f_id; + unsigned long cms_size; + float expected_collision_proba; + + + void computeLambda(); + void processMainArgs(int optind, const int argc, char ** argv,std::vector<string>& v_input_lines); + int loadInOutFileArgs(const std::string& input_file,const std::string& output_file,std::vector<string>& v_input_lines,std::vector<string>& v_output_lines); + int loadFileArgs(const std::string& afile,std::vector<string>& v_lines); + void removePathfromFName(string& FName); + void changeExtension(string& FName); + void genOutFilenames(const std::vector<string>& v_input_lines,std::vector<string>& v_output_lines); + int processInOutFileArgs(const std::vector<string>& v_input_lines,std::vector<string>& v_output_lines,std::vector<IO_fq_files>& single_files,vector<PE_files>& v_PE_files,int& f_id); + + /* + * Checks options and arguments consistency (ex kappa_prime not superior to kappa; output file directory exists and so on...) + * Also set some CMS parameters depending on value of other options. + */ + void optArgConsistency(const string& input_file,const string& output_file, + const int& g,CMSparams& parms,const int& nb_k_mers, + const std::vector<string>& v_input_lines); + + friend void test_processIOFileArgs(); + +public: + + ROCKparams() { + f_id=0; // to generate id of input/output fastq files. +1=total number of files. + g=0; + k=30; + nb_k_mers=0; + parms.kappa=50; + parms.kappa_prime=0; + parms.lambda=0; + verbose_mode=0; + cms_size=0; + expected_collision_proba=0.0; + parms.filter_size=getNodePhysMemory()/100.0*90-defaultGRPMAXSize; // Prefer not to use all the machine's memory + if (parms.filter_size==0) throw EXIT_FAILURE; + } + + + + void initFromMainOptsArgs(int argc,char * argv[]) { + int i; + std::vector<string> v_input_lines; + std::vector<string> v_output_lines; + + while((i = getopt(argc, argv, "i:o:l:k:c:C:g:n:v")) != -1) { + switch(i) { + case 'i': + input_file=optarg;break; + case 'c': + parms.kappa_prime=atoi(optarg);break; + case 'h': + usage(EXIT_SUCCESS); break; + case 'o': + output_file=optarg; break; + case 'C': + parms.kappa=atoi(optarg); + if (parms.kappa<=0 || parms.kappa>get_mask<unsigned short>::value) { + cout<<"Bad value for kappa. Must choose kappa so that 0<kappa<="<<get_mask<unsigned short>::value<<endl; + usage(EXIT_FAILURE); + } + break; + case 'l': + parms.lambda = atoi(optarg); + if (parms.lambda<=0) { + cout<<"Bad value for lambda. Choose a value that is >0 or let ROCK choose for you."<<endl; + usage(EXIT_FAILURE); + } + break; + case 'k': + k=atoi(optarg); + if (k<=0 || k>32) { + cout<<"Bad value for k. Must choose k so that 0<k<=32."<<endl; + usage(EXIT_FAILURE); + } + break; + case 'n': + nb_k_mers=atoi(optarg);break; // number of distinct k-mers + case 'g': + // cout<<optarg<<endl; + g=atoi(optarg); + break; + case 'v': + verbose_mode=1; + break; + default: + usage(EXIT_FAILURE); break; } + } + processMainArgs(optind, argc, argv,v_input_lines); + optArgConsistency(input_file,output_file,g,parms,nb_k_mers,v_input_lines); + if (nb_k_mers) { + expected_collision_proba=getCollisionProba(nb_k_mers,parms.lambda); + } + if (v_input_lines.empty() && (loadInOutFileArgs(input_file,output_file,v_input_lines,v_output_lines)==EXIT_FAILURE)) throw EXIT_FAILURE; + if (processInOutFileArgs(v_input_lines,v_output_lines,single_files,v_PE_files,f_id)!=EXIT_SUCCESS) throw EXIT_FAILURE; + } + + CMSparams getCMSparams() { + return parms; + } + + int get_f_id() { + return f_id; + } + + std::vector<IO_fq_files> get_single_files() { + return single_files; + } + + vector<PE_files> get_PE_files() { + return v_PE_files; + } + + int get_k() { + return k; + } + + void printVerboseInfo() { + std::vector<IO_fq_files>::iterator it_s; + vector<PE_files>::iterator it_pe; + cout<<"Input files: "<<endl; + cout<<"single:"<<endl; + for (it_s=single_files.begin();it_s!=single_files.end();it_s++) { + cout<<" "<<it_s->in_fq_file<<endl; + } + cout<<"pair-end:"<<endl; + for (it_pe=v_PE_files.begin();it_pe!=v_PE_files.end();it_pe++) { + cout<<" "<<it_pe->PE1.in_fq_file<<" "<<it_pe->PE2.in_fq_file<<endl; + } + cout<<endl; + cout<<"k="<<k<<endl; + cout<<"kappa="<<parms.kappa<<endl; + cout<<"kappa prime="<<parms.kappa_prime; + cout<<endl; + cout<<"CMS size="<<cms_size<<" Bytes for "<<parms.lambda<<" arrays."<<endl; + cout<<endl; + if (nb_k_mers!=0) cout<<"expected probability of collision was: "<<expected_collision_proba<<endl; + } + + void setFilterSize(unsigned long fsize) { + cms_size=fsize; + } + + int is_verbose() { + return verbose_mode; + } + +}; + + +#endif /* ROCKPARAMS_H_ */ diff --git a/src/main_utils.cpp b/src/main_utils.cpp index 6b6a02a46069a2a61f58f42a548781d0795dfd97..dc1ba44e4f854658141f2ed8bf5dd009590cfdb7 100644 --- a/src/main_utils.cpp +++ b/src/main_utils.cpp @@ -62,139 +62,6 @@ void checkDirExists(const string o_file) { } -void removePathfromFName(string& FName) { - std::size_t i_found2 =FName.find_last_of(path_sep); // remove path from filename. - if (i_found2!=std::string::npos) { - FName=FName.substr(i_found2+1); - } -} - -void changeExtension(string& FName) { // changes .fq into .rock.fq or adds .rock.fq - std::size_t o_found = FName.find_last_of(k_ext); - if (o_found!=std::string::npos) FName.replace(o_found,1,".rock."); - else FName.append(".rock.fq"); -} - -void genOutFilenames(const std::vector<string>& v_input_lines,std::vector<string>& v_output_lines) { - std::vector<string>::const_iterator it_in; - string o_line; - for (it_in=v_input_lines.begin();it_in!=v_input_lines.end();++it_in) { - std::size_t i_found = (*it_in).find_first_of(k_sep_pair_end); - if (i_found!=std::string::npos) {// PE files are separared by a ',' - string i_name_PE1=(*it_in).substr(0,i_found); - removePathfromFName(i_name_PE1); - string i_name_PE2=(*it_in).substr(i_found+1); - removePathfromFName(i_name_PE2); - changeExtension(i_name_PE1); - changeExtension(i_name_PE2); - o_line=i_name_PE1; - o_line+=k_sep_pair_end; - o_line.append(i_name_PE2); - - } else { - o_line=*it_in; - removePathfromFName(o_line); - changeExtension(o_line); - } - v_output_lines.push_back(o_line); - } -} - - -int processInOutFileArgs(const std::vector<string>& v_input_lines,std::vector<string>& v_output_lines,std::vector<IO_fq_files>& single_files,vector<PE_files>& v_PE_files,int& f_id) { - if (v_output_lines.empty()) { - // in that case, generate output filenames from input filenames - genOutFilenames(v_input_lines,v_output_lines); - } - if (v_input_lines.size()!=v_output_lines.size()) { - cout<< "Inconsistency between input and output files lists!"<<endl; - return EXIT_FAILURE; - } else { - std::vector<string>::const_iterator it_in; - std::vector<string>::const_iterator it_out; - it_in=v_input_lines.begin(); - it_out=v_output_lines.begin(); - while (it_in!=v_input_lines.end()) { - std::size_t i_found = (*it_in).find_first_of(k_sep_pair_end); - if (i_found!=std::string::npos) { - // this is PE - f_id+=2; - string i_name_PE1=(*it_in).substr(0,i_found); - string i_name_PE2=(*it_in).substr(i_found+1); - std::size_t o_found = (*it_out).find_first_of(k_sep_pair_end); - if (o_found==std::string::npos) { - cout<< "Inconsistency between input and output files lists!"<<endl; - return EXIT_FAILURE; - } - string o_name_PE1=(*it_out).substr(0,o_found); - checkDirExists(o_name_PE1); - string o_name_PE2=(*it_out).substr(o_found+1); - //cout<<o_name_PE2<<endl; - checkDirExists(o_name_PE2); - PE_files pe; - pe.PE1.in_fq_file=i_name_PE1; - pe.PE1.out_fq_file=o_name_PE1; - pe.PE2.in_fq_file=i_name_PE2; - pe.PE2.out_fq_file=o_name_PE2; - v_PE_files.push_back(pe); - - } else { - // this is single. - f_id+=1; - IO_fq_files p; - p.in_fq_file=*it_in; - checkDirExists(*it_out); - p.out_fq_file=*it_out; - single_files.push_back(p); - } - ++it_in; - ++it_out; - } - if (f_id>k_max_input_files) { - cout<<"ROCK cannot handle more than "<<k_max_input_files<<" input files."<<endl; - return EXIT_FAILURE; - } - } - return EXIT_SUCCESS; -} - - -/* - * Loads the content of a text file (containing input fastq file names to be filtered or names of files that ROCK must generate). - */ -int loadFileArgs(const std::string& afile,std::vector<string>& v_lines) { - ifstream infiles_names(afile.c_str()); - if (!infiles_names) cout<<"couldn't open file: "<<afile<<endl; - while (infiles_names) { - string iline; - if (!getline(infiles_names,iline)) break; - v_lines.push_back(iline); - } - if (!infiles_names.eof()) - { - std::cout<<"error while reading input or output file"<<std::endl; - return EXIT_FAILURE; - } - return EXIT_SUCCESS; -} - -/* - * Reads the names of input and output fastq files that ROCK must process from text files whose names are passed as - * argument of the -i/-o options. Fills the appropriate structures. - * orresponding output files are supposed to be in the same order as input files. No checking here. It is ut to the user to give something correct in input. - */ -int loadInOutFileArgs(const std::string& input_file,const std::string& output_file,std::vector<string>& v_input_lines,std::vector<string>& v_output_lines) { - int reti=EXIT_SUCCESS; - int reto=EXIT_SUCCESS; - reti=loadFileArgs(input_file,v_input_lines); - if (!output_file.empty()) { - reto=loadFileArgs(output_file,v_output_lines); - } - if (reti==EXIT_SUCCESS && reto==EXIT_SUCCESS) return EXIT_SUCCESS; - else return EXIT_FAILURE; -} - - /* * Minimize lambda for a given number of k-mers. * We want p<=0.01, @@ -235,3 +102,17 @@ float getCollisionProba(const unsigned long& nb_k_mers,const int& lambda) { //return p; } +void usage(int status) { + cout<<"usage: rock [options] [args]"<<endl<<endl; + cout<<" -i <file> .... file name containing the list of the FASTQ files to process."<<endl; + cout<<" -o <file> .... file name containing the list of the output FASTQ file names."<<endl; + cout<<" -h .... Print this message and exit."<<endl; + cout<<" -k <int> .... k-mer size. (default 30)."<<endl; + cout<<" -c <int> .... lower coverage threshold (default: 0)."<<endl; + cout<<" -C <int> .... upper coverage threshold (default: 50; max: 65535)."<<endl; + cout<<" -l <int> .... size of the count min sketch (default: at most 4, depending on the available RAM)"<<endl; + cout<<" -n <int> .... (expected) number of distinct k-mers within the processed reads."<<endl; + cout<<" -g <int> .... maximum RAM to use (in Gb) for the CMS."<<endl; + cout<<" -v .... verbose"<<endl; + exit(status); } + diff --git a/src/main_utils.h b/src/main_utils.h index 7f3a8c296f318bea70a124a56bd6dfc67204c41b..ed50a9cea471b62d7e0f7cc1303248a54778bc8a 100644 --- a/src/main_utils.h +++ b/src/main_utils.h @@ -13,14 +13,17 @@ #define k_ext '.' #define path_sep '/' +#include <string> #include <vector> #include "rock_commons.h" unsigned long getNodePhysMemory(); -int loadInOutFileArgs(const std::string& input_file,const std::string& output_file,std::vector<std::string>& v_input_lines,std::vector<std::string>& v_output_lines); -int processInOutFileArgs(const std::vector<std::string>& v_input_lines,std::vector<std::string>& v_output_lines,std::vector<IO_fq_files>& single_files,std::vector<PE_files>& v_PE_files,int& f_id); +void checkDirExists(const std::string o_file); +/*int loadInOutFileArgs(const std::string& input_file,const std::string& output_file,std::vector<std::string>& v_input_lines,std::vector<std::string>& v_output_lines); +int processInOutFileArgs(const std::vector<std::string>& v_input_lines,std::vector<std::string>& v_output_lines,std::vector<IO_fq_files>& single_files,std::vector<PE_files>& v_PE_files,int& f_id);*/ int getBestLambdaForN(const unsigned long& nb_k_mers,int lambda_max); float getCollisionProba(const unsigned long& nb_k_mers,const int& lambda); +void usage(int status); #endif /* MAIN_UTILS_H_ */ diff --git a/src/rock.cpp b/src/rock.cpp index 33fc9a23d81c296a13bd0e9a23d2ef878111c516..b53e64cf3ad3c610cdefdd6223e73e01b9e49fc8 100644 --- a/src/rock.cpp +++ b/src/rock.cpp @@ -9,8 +9,9 @@ */ #include <sys/sysctl.h> #include <cstdlib> +#include "err.h" #include <string.h> -#include <getopt.h> + #include <iostream> #include <map> #include <vector> @@ -26,15 +27,9 @@ #include "fqwriter.h" #include "Filter.hpp" #include "main_utils.h" +#include "ROCKparams.h" - -#define defaultGRPMAXSize 1 //(in gigabytes) Default maximum size of srp structure but of course this size depends of the number of reads we are processing. - // 1 GB is enough to store approx 82 millions of reads (must take into account memory needed for stl containers). - // This constant is used only if you use rock's default behavior (use 90% of the machine's RAM). - // If you specify a lambda, it is possible to use more memory for storing more reads and less memory for the CMS. -#define proposedLambda 4 - using namespace std; @@ -53,115 +48,6 @@ void printRUsage() { -static void usage(int status) { - cout<<"usage: rock [options] [args]"<<endl<<endl; - cout<<" -i <file> .... file name containing the list of the FASTQ files to process."<<endl; - cout<<" -o <file> .... file name containing the list of the output FASTQ file names."<<endl; - cout<<" -h .... Print this message and exit."<<endl; - cout<<" -k <int> .... k-mer size. (default 30)."<<endl; - cout<<" -c <int> .... lower coverage threshold (default: 0)."<<endl; - cout<<" -C <int> .... upper coverage threshold (default: 50; max: 65535)."<<endl; - cout<<" -l <int> .... size of the count min sketch (default: at most 4, depending on the available RAM)"<<endl; - cout<<" -n <int> .... (expected) number of distinct k-mers within the processed reads."<<endl; - cout<<" -g <int> .... maximum RAM to use (in Gb) for the CMS."<<endl; - exit(status); } - - -void computeLambda(CMSparams& parms) { - /*cout<<ULONG_MAX<<endl; - cout<<UINT_MAX<<endl; - cout<<INT_MAX<<endl;*/ - unsigned long tmp=parms.filter_size; - tmp*=1073741824; // this is in fact 1024*1024*1024. - parms.lambda=tmp/INT_MAX; - //cout<<"parms.lambda="<<parms.lambda<<endl; - if (parms.kappa>get_mask<unsigned char>::value) parms.lambda=parms.lambda/sizeof(unsigned short); -} - - -/* - * Checks options and arguments consistency (ex kappa_prime not superior to kappa; output file directory exists and so on...) - * Also set some CMS parameters depending on value of other options. -*/ -void optArgConsistency(const string& input_file,const string& output_file, - const int& g,CMSparams& parms,const int& nb_k_mers, - const std::vector<string>& v_input_lines) { - if (input_file.empty() && v_input_lines.empty()) { - std::cout<<"You must provide filename via -i or arguments to indicate ROCK what to filter." <<std::endl; - usage(EXIT_FAILURE); - } - if (!input_file.empty() && !v_input_lines.empty()) { - std::cout<<"files to filter are provided via the -i option or as arguments. It cannot be both." <<std::endl; - usage(EXIT_FAILURE); - } - /*if (output_file.empty()) { - std::cout<<"output file name is mandatory" <<std::endl; - usage(EXIT_FAILURE); - }*/ - // cout<<"g="<<g<<endl; - // cout<<"lambda="<<parms.lambda<<endl; - if (g!=0 and parms.lambda!=0) {// user set both lambda and cms size=> inconsistency. - cout<<"-l and -g options are mutually exclusive."<<endl; - usage(EXIT_FAILURE); - } - if (nb_k_mers!=0 and parms.lambda!=0) {// user set both lambda and number of k-mers=> inconsistency. - cout<<"-l and -n options are mutually exclusive."<<endl; - usage(EXIT_FAILURE); - } - float avail_mem=getNodePhysMemory()-defaultGRPMAXSize; - // cout<<avail_mem; - if (g>avail_mem) { - std::cout<<"This machine only has "<<getNodePhysMemory()<<" Gigabytes of RAM. This is not enough to run ROCK with a CMS of size: "<<g<<endl; - exit(EXIT_FAILURE); - } else if (g!=0) { - parms.filter_size=g; - computeLambda(parms); - } - if (nb_k_mers) { // user indicated number of k-mers; use it to minimize parms.lambda - parms.lambda=getBestLambdaForN(nb_k_mers,parms.lambda); - } - if (parms.kappa_prime>=parms.kappa) { - cout<<"ERROR lower filter is higher than high filter!"<<endl; - exit(EXIT_FAILURE); - } - - if (parms.lambda==0) { - // This happens when the user doesn't specify lambda nor g nor nb_k_mers. - computeLambda(parms); - parms.lambda=min(parms.lambda,proposedLambda); - if (parms.lambda==0) { - cout<<"Not enough RAM on the machine. TotalRAM -1GB must be bigger than INT_MAX to allow at least 1 array in the CMS."<<endl; - exit(EXIT_FAILURE); - } - } -#ifdef DEBUG - cout<<"parms.lambda="<<parms.lambda<<" INT_MAX="<<INT_MAX<<endl; -#endif - unsigned long cms_size=INT_MAX; - cms_size*=parms.lambda; - //cout<<"minimum cms size:"<<cms_size<<endl; - if (parms.kappa>get_mask<unsigned char>::value) cms_size=cms_size*sizeof(unsigned short); - //cout<<"cms_size in B="<<cms_size<<endl; - cms_size=ceil(cms_size/1024.0/1024/1024); // convert to gigabytes. - //cout<<"cms_size in GB="<<cms_size<<endl; - if (cms_size>getNodePhysMemory()-defaultGRPMAXSize) { - cout<<"Not enough RAM on the machine to run rock with a CMS of size:"<<cms_size<<" GB."<<endl; - exit(EXIT_FAILURE); - } - -} - -void processArgs(int optind, const int argc, char ** argv,std::vector<string>& v_input_lines) { - if (optind==argc) return; // no arguments - while (optind<argc) { // otherwise, only arguments expected are names of files to filter. - string iline=argv[optind]; - v_input_lines.push_back(iline); - optind++; - } -} - - - int main(int argc,char * argv[]) { #ifdef BENCHMARK @@ -169,74 +55,12 @@ int main(int argc,char * argv[]) { printRUsage(); #endif srp sr; - int i,g; - std::string input_file,output_file; - std::vector<string> v_input_lines; - int k,nb_k_mers; - g=0; - k=30; - nb_k_mers=0; - CMSparams parms; - parms.kappa=50; - parms.kappa_prime=0; - parms.lambda=0; - int f_id=0; - // cout<<getNodePhysMemory()<<endl; - // cout<<getNodePhysMemory()/100.0*90<<endl; - parms.filter_size=getNodePhysMemory()/100.0*90-defaultGRPMAXSize; // Prefer not to use all the machine's memory - if (parms.filter_size==0) return EXIT_FAILURE; - while((i = getopt(argc, argv, "i:o:l:k:c:C:g:n:")) != -1) { - /*cout<<i<<endl; - cout<<optind<<endl;*/ - switch(i) { - case 'i': - input_file=optarg;break; - case 'c': - parms.kappa_prime=atoi(optarg);break; - case 'h': - usage(EXIT_SUCCESS); break; - case 'o': - output_file=optarg; break; - case 'C': - parms.kappa=atoi(optarg); - if (parms.kappa<=0 || parms.kappa>get_mask<unsigned short>::value) { - cout<<"Bad value for kappa. Must choose kappa so that 0<kappa<="<<get_mask<unsigned short>::value<<endl; - usage(EXIT_FAILURE); - } - break; - case 'l': - parms.lambda = atoi(optarg); - if (parms.lambda<=0) { - cout<<"Bad value for lambda. Choose a value that is >0 or let ROCK choose for you."<<endl; - usage(EXIT_FAILURE); - } - break; - case 'k': - k=atoi(optarg); - if (k<=0 || k>32) { - cout<<"Bad value for k. Must choose k so that 0<k<=32."<<endl; - usage(EXIT_FAILURE); - } - break; - case 'n': - nb_k_mers=atoi(optarg);break; // number of distinct k-mers - case 'g': - // cout<<optarg<<endl; - g=atoi(optarg); - break; - default: - usage(EXIT_FAILURE); break; } - } - /*cout<<optind<<endl; - cout<<*argv<<endl;*/ - processArgs(optind, argc, argv,v_input_lines); - optArgConsistency(input_file,output_file,g,parms,nb_k_mers,v_input_lines); - std::vector<string> v_output_lines; - if (v_input_lines.empty() && (loadInOutFileArgs(input_file,output_file,v_input_lines,v_output_lines)==EXIT_FAILURE)) return EXIT_FAILURE; - std::vector<IO_fq_files> single_files; - vector<PE_files> v_PE_files; - if (processInOutFileArgs(v_input_lines,v_output_lines,single_files,v_PE_files,f_id)!=EXIT_SUCCESS) return EXIT_FAILURE; - //cout<<"f_id="<<f_id<<endl; + ROCKparams main_parms; + main_parms.initFromMainOptsArgs(argc,argv); + int f_id=main_parms.get_f_id(); + CMSparams parms=main_parms.getCMSparams(); + std::vector<IO_fq_files> single_files=main_parms.get_single_files(); + vector<PE_files> v_PE_files=main_parms.get_PE_files(); FqBaseBackend * map_id_backend[k_max_input_files]; Filter the_filter(parms); @@ -252,6 +76,9 @@ int main(int argc,char * argv[]) { printRUsage(); cout<<"Now going to fill CMS"<<endl; #endif + + int k=main_parms.get_k(); + the_filter.fillCMS(map_id_backend,f_id,k, &sr); #ifdef BENCHMARK @@ -268,6 +95,7 @@ int main(int argc,char * argv[]) { cout<<"Now going to remove read with cov<kappa_prime"<<endl; #endif + main_parms.setFilterSize(the_filter.getSize()); // Now, remove reads that are beneath kappa_prime if (parms.kappa_prime) the_filter.lowFilterCMS(map_id_backend,f_id,k,&sr); @@ -283,6 +111,7 @@ int main(int argc,char * argv[]) { printRUsage(); cout<<"Should now free memory"<<endl; #endif + int i; for (i=1;i<=f_id;i++) { // cout<<"deleting backend: "<<i-1<<endl; delete map_id_backend[i-1]; @@ -297,6 +126,8 @@ int main(int argc,char * argv[]) { cout<<"estimated probability of collision:"<<p<<endl; cout<<"estimated number of distinct k-mers:"<<approx_nb_k_mers<<endl; + if (main_parms.is_verbose()) main_parms.printVerboseInfo(); + #ifdef BENCHMARK cout<<"finished,going to exit "<<endl; printRUsage(); diff --git a/src/unit_test_main_utils.cpp b/src/unit_test_main_utils.cpp index 0e9947dc24af06088496f7007cca6b4a17ce8382..5d05fa1646b6a1f2688d27bdf3917b2d6e975508 100644 --- a/src/unit_test_main_utils.cpp +++ b/src/unit_test_main_utils.cpp @@ -15,32 +15,34 @@ using namespace std; #include "rock_commons.h" #include "main_utils.h" +#include "ROCKparams.h" void test_processIOFileArgs() { std::vector<IO_fq_files> single_files; vector<PE_files> v_PE_files; std::vector<string> v_input_lines; std::vector<string> v_output_lines; + ROCKparams main_parms; int f_id=0; - int ret=loadInOutFileArgs("../test/data/unit/list_input1.txt","../test/data/unit/list_output1.txt",v_input_lines,v_output_lines); + int ret=main_parms.loadInOutFileArgs("../test/data/unit/list_input1.txt","../test/data/unit/list_output1.txt",v_input_lines,v_output_lines); assert(ret==EXIT_SUCCESS); - ret=processInOutFileArgs(v_input_lines,v_output_lines,single_files,v_PE_files,f_id); + ret=main_parms.processInOutFileArgs(v_input_lines,v_output_lines,single_files,v_PE_files,f_id); assert(ret==EXIT_FAILURE); f_id=0; v_PE_files.clear(); single_files.clear(); v_input_lines.clear(); v_output_lines.clear(); - ret=loadInOutFileArgs("../test/data/unit/list_input2.txt","../test/data/unit/list_output2.txt",v_input_lines,v_output_lines); - ret=processInOutFileArgs(v_input_lines,v_output_lines,single_files,v_PE_files,f_id); + ret=main_parms.loadInOutFileArgs("../test/data/unit/list_input2.txt","../test/data/unit/list_output2.txt",v_input_lines,v_output_lines); + ret=main_parms.processInOutFileArgs(v_input_lines,v_output_lines,single_files,v_PE_files,f_id); assert(ret==EXIT_FAILURE); f_id=0; v_PE_files.clear(); single_files.clear(); v_input_lines.clear(); v_output_lines.clear(); - ret=loadInOutFileArgs("../test/data/unit/list_input3.txt","../test/data/unit/list_output3.txt",v_input_lines,v_output_lines); - ret=processInOutFileArgs(v_input_lines,v_output_lines,single_files,v_PE_files,f_id); + ret=main_parms.loadInOutFileArgs("../test/data/unit/list_input3.txt","../test/data/unit/list_output3.txt",v_input_lines,v_output_lines); + ret=main_parms.processInOutFileArgs(v_input_lines,v_output_lines,single_files,v_PE_files,f_id); assert(ret==EXIT_SUCCESS); assert(f_id==14); assert(single_files.size()==2); @@ -56,8 +58,8 @@ void test_processIOFileArgs() { v_input_lines.clear(); v_PE_files.clear(); single_files.clear(); - ret=loadInOutFileArgs("../test/data/unit/list_input3.txt","",v_input_lines,v_output_lines); - ret=processInOutFileArgs(v_input_lines,v_output_lines,single_files,v_PE_files,f_id); + ret=main_parms.loadInOutFileArgs("../test/data/unit/list_input3.txt","",v_input_lines,v_output_lines); + ret=main_parms.processInOutFileArgs(v_input_lines,v_output_lines,single_files,v_PE_files,f_id); s=single_files[0]; assert(s.out_fq_file.compare("fifi.rock.fq")==0); s2=v_PE_files[5]; diff --git a/test/rock_mem.sh b/test/rock_mem.sh index 970acad63e0cd7b2ac06306d7b334e6eeff2503c..b1e4023bfea696a6c5f2f660597a46e67fe9d65f 100755 --- a/test/rock_mem.sh +++ b/test/rock_mem.sh @@ -66,6 +66,19 @@ rm -fr "klebsiella_100_2.rock.fq" || exit 1817 rm -fr "test_single.rock.fq"|| exit 1818 rm -fr "test_single2.rock.fq"|| exit 1819 +# test the -v option +echo " " +echo "##################################################################################" +echo "testing verbose mode" +../src/rock -C 100 -c 1 -l 2 -v ${srcdir}/data/fastq.raw/klebsiella_100_1.fq,${srcdir}/data/fastq.raw/klebsiella_100_2.fq ${srcdir}/data/unit/test_single.fq ${srcdir}/data/unit/test_single2.fq|grep "CMS size=" >/dev/null || exit 182 + +../src/rock -C 100 -c 1 -v -n 1000 ${srcdir}/data/fastq.raw/klebsiella_100_1.fq,${srcdir}/data/fastq.raw/klebsiella_100_2.fq ${srcdir}/data/unit/test_single.fq ${srcdir}/data/unit/test_single2.fq|grep "expected probability of collision was:" >/dev/null || exit 182 + +echo "erasing test result files" +rm -fr "klebsiella_100_1.rock.fq" || exit 1816 +rm -fr "klebsiella_100_2.rock.fq" || exit 1817 +rm -fr "test_single.rock.fq"|| exit 1818 +rm -fr "test_single2.rock.fq"|| exit 1819 echo " " echo "##################################################################################"