diff --git a/configure b/configure index c0e34311226408d35ec5e271cb21744bd1231a6c..b461cb9240113ded0b3366bdfce8d18f0994fd6a 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.69 for rock 1.9.1. +# Generated by GNU Autoconf 2.69 for rock 1.9.2. # # # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. @@ -576,8 +576,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='rock' PACKAGE_TARNAME='rock' -PACKAGE_VERSION='1.9.1' -PACKAGE_STRING='rock 1.9.1' +PACKAGE_VERSION='1.9.2' +PACKAGE_STRING='rock 1.9.2' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1236,7 +1236,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures rock 1.9.1 to adapt to many kinds of systems. +\`configure' configures rock 1.9.2 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1307,7 +1307,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of rock 1.9.1:";; + short | recursive ) echo "Configuration of rock 1.9.2:";; esac cat <<\_ACEOF @@ -1397,7 +1397,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -rock configure 1.9.1 +rock configure 1.9.2 generated by GNU Autoconf 2.69 Copyright (C) 2012 Free Software Foundation, Inc. @@ -1452,7 +1452,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by rock $as_me 1.9.1, which was +It was created by rock $as_me 1.9.2, which was generated by GNU Autoconf 2.69. Invocation command line was $ $0 $@ @@ -2423,7 +2423,7 @@ fi # Define the identity of the package. PACKAGE='rock' - VERSION='1.9.1' + VERSION='1.9.2' cat >>confdefs.h <<_ACEOF @@ -3916,7 +3916,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by rock $as_me 1.9.1, which was +This file was extended by rock $as_me 1.9.2, which was generated by GNU Autoconf 2.69. Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -3973,7 +3973,7 @@ _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -rock config.status 1.9.1 +rock config.status 1.9.2 configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" diff --git a/src/CountMinSketch.hpp b/src/CountMinSketch.hpp index a7814d24da68c240e1c8c1741a003940d5d93ff8..bd3edc9231b956156e5472ade065d692f040d530 100644 --- a/src/CountMinSketch.hpp +++ b/src/CountMinSketch.hpp @@ -12,6 +12,7 @@ #include <stdlib.h> #include <string.h> #include <limits.h> +#include <vector> #include <cmath> #include "rock_commons.h" @@ -60,6 +61,7 @@ private: T ** cms_lambda_array; T mask; + std::vector<unsigned long> zeroes; //! Number of unset buckets per hash. inline unsigned int hash64to32(const unsigned long& w ,const int& j) { return w % Pi_js[j]; @@ -94,19 +96,24 @@ public: free(cms_lambda_array); } + //! This method must be called after getNbDistinctKMers because getNbDistinctKmers sets the counters. + std::vector<unsigned long> getUnsetBuckets() { + return zeroes; + } + inline void addKMer(const unsigned long& val1) { - int j; - unsigned int h; - T cnt; - j=lambda; - while(--j>=0) { - h=hash64to32(val1,j); - cnt=cms_lambda_array[j] [h]; - cnt++; - cms_lambda_array[j] [h]=(cnt & mask); - } - } + int j; + unsigned int h; + T cnt; + j=lambda; + while(--j>=0) { + h=hash64to32(val1,j); + cnt=cms_lambda_array[j] [h]; + cnt++; + cms_lambda_array[j] [h]=(cnt & mask); + } + } // keep that just for unit testing purpose T getEstimatedNbOcc(const unsigned long& val); @@ -305,6 +312,7 @@ template<typename T> unsigned long CountMinSketch<T>::getNbDistinctKMers() { (cms_lambda_array[j] [h]==0)? z+=1: z; } (cms_lambda_array[j] [0]==0)? z+=1: z; + zeroes.push_back(z); double lnz=log(z); double lnm=log(m); double lnm1=log(m-1); diff --git a/src/Filter.hpp b/src/Filter.hpp index 9b2eaf5e54d877f52b43e26096f4d9487e0825cf..1c1d9daacf2c36d6598a0cf2ccc9c632ee187ebe 100644 --- a/src/Filter.hpp +++ b/src/Filter.hpp @@ -7,12 +7,14 @@ #ifndef FILTER_HPP_ #define FILTER_HPP_ #include <errno.h> +#include <err.h> #include <sys/time.h> #include <sys/resource.h> #include "CountMinSketch.hpp" #include "ReadProcessor.h" #include "read_utils.h" +#include "ROCKparams.h" static const unsigned short ushortmask=65535; @@ -52,8 +54,10 @@ public: void fillCMS(FqBaseBackend* map_id_backend[],int nb_be,int k,srp* io_sr); void lowFilterCMS(FqBaseBackend* map_id_backend[],int nb_be,int k, srp* io_sr); - unsigned long getApproxNbDistinctKMers(); + unsigned long getApproxNbDistinctKMers() const ; unsigned long getSize(); + unsigned int getNbBitsForCounters() const; // returns the size of the dada type used for the counters of the CMS. + std::vector<unsigned long> getUnsetBuckets() const; }; template <typename T> void Filter::underlyingfillCMS(FqBaseBackend* map_id_backend[],int nb_be, int k, srp* io_sr, CountMinSketch<T>* cms) { @@ -146,11 +150,16 @@ void Filter::lowFilterCMS(FqBaseBackend* map_id_backend[],int nb_be,int k, srp* else underlyinglowFilterCMS<unsigned short>(map_id_backend,nb_be, k, io_sr,pShortCMS); } -unsigned long Filter::getApproxNbDistinctKMers() { +unsigned long Filter::getApproxNbDistinctKMers() const { if (pByteCMS!=NULL) return pByteCMS->getNbDistinctKMers(); else return pShortCMS->getNbDistinctKMers(); } +std::vector<unsigned long> Filter::getUnsetBuckets() const { + if (pByteCMS!=NULL) return pByteCMS->getUnsetBuckets(); + else return pShortCMS->getUnsetBuckets(); +} + void Filter::getRSS(int before_fill) { struct rusage usage; int res2=getrusage(RUSAGE_SELF,&usage); @@ -165,4 +174,9 @@ unsigned long Filter::getSize() { return cms_size; } +unsigned int Filter::getNbBitsForCounters() const { + if (pByteCMS!=NULL) return sizeof(unsigned char)*8; + else return sizeof(unsigned short)*8; +} + #endif diff --git a/src/FqMainBackend.cpp b/src/FqMainBackend.cpp index 3f7783e11616234089a0ab828803ab132c96c584..e779cd8f45399b282f04ee29673960b0f0eadef1 100644 --- a/src/FqMainBackend.cpp +++ b/src/FqMainBackend.cpp @@ -38,6 +38,7 @@ using namespace std; FqMainBackend::FqMainBackend(srp * io_sr):FqBaseBackend() { p_auxFqProcessor=NULL; p_scoreReadStruct=io_sr; + cnt_k_mers=0; } @@ -62,10 +63,6 @@ void FqMainBackend::processFile(char * filename,unsigned char f_id) { err(errno,"cannot open file: %s.",filename); } - /*fp=fdopen(f_single,"r"); - if (fp==NULL) { - err(errno,"cannot open file: %s.",filename); - }*/ buf=(char *) malloc(buffer_size*sizeof(char)); if (buf==NULL) { err(errno,"cannot allocate memory: %lu bytes.",buffer_size); @@ -114,6 +111,7 @@ void FqMainBackend::onEndFastqRecord(T_fq_rec_info& rec_info,const T_buf_info& b // empty buffer keeping current record cur_fq_record[0]='\0'; if (p_auxFqProcessor!=NULL) p_auxFqProcessor->resetCurFqRecord(); + cnt_k_mers+=nb_k_mer; } diff --git a/src/FqMainBackend.h b/src/FqMainBackend.h index 656ab4bb210533bdcf5d52c848591276482f944c..73770438fbf89088b19a0333782d53fe55dbf82a 100644 --- a/src/FqMainBackend.h +++ b/src/FqMainBackend.h @@ -19,10 +19,11 @@ class FqMainBackend : public FqBaseBackend { // static int treat_PE_as_single; static int treat_PE_separately; + unsigned long cnt_k_mers; //! counter for the total number of k-mers found in a single file or in a pair of PE files. + //! This includes k-mers in error. FqAuxBackend * p_auxFqProcessor; /* Another fastq processor component is necessary for handling the case of PE reads.*/ srp * p_scoreReadStruct; /* Where we store information about the reads. */ // char current_id[50]; used only for debug - // void debug_processBuf(int evt,const T_buf_info&, const unsigned long &); void writeToUndefFile(const T_buf_info&); void onEndFastqRecord(T_fq_rec_info& rec_info,const T_buf_info& bufinfo); @@ -44,6 +45,10 @@ public: static void setTreatPEMode(const int& treat_PE){ FqMainBackend::treat_PE_separately=treat_PE; } + + unsigned long getCntKMers() { + return cnt_k_mers; + } }; #endif /* FQMAINBACKEND_H_ */ diff --git a/src/Makefile.am b/src/Makefile.am index 9a87fb480901152739dc7409b5ee6e0be55360da..362bef115255dac3e46d19fa3b05d2b01a0b9347 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -3,7 +3,7 @@ LINTDEFS = $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) AM_CPPFLAGS = -DDATADIR=\"$(pkgdatadir)\" bin_PROGRAMS=rock -noinst_PROGRAMS = unit_test_fqreader unit_test_read_utils unit_test_cms unit_test_fqwriter unit_test_main_utils +noinst_PROGRAMS = unit_test_fqreader unit_test_read_utils unit_test_cms unit_test_fqwriter unit_test_math_utils ## noinst_PROGRAMS = noinst_LIBRARIES = librock.a @@ -24,10 +24,10 @@ unit_test_cms_LDADD=librock.a unit_test_fqwriter_SOURCES=unit_test_fqwriter.cpp unit_test_fqwriter_LDADD=librock.a -unit_test_main_utils_SOURCES=unit_test_main_utils.cpp -unit_test_main_utils_LDADD=librock.a +unit_test_math_utils_SOURCES=unit_test_math_utils.cpp +unit_test_math_utils_LDADD=librock.a librock_a_SOURCES = $(SRC) -SRC = fqreader.cpp FqBaseBackend.cpp FqAuxBackend.cpp FqMainBackend.cpp read_utils.cpp ReadProcessor.cpp fqwriter.cpp main_utils.cpp ROCKparams.cpp unit_tests_tools.cpp -HDR = srp.h fqreader.h FqConstants.h FqBaseBackend.h FqAuxBackend.h FqMainBackend.h ReadProcessor.h rock_commons.h read_utils.h CountMinSketch.hpp Filter.hpp fqwriter.h main_utils.h ROCKparams.h unit_tests_tools.h +SRC = math_utils.cpp low_level_utils.cpp fqreader.cpp FqBaseBackend.cpp FqAuxBackend.cpp FqMainBackend.cpp read_utils.cpp ReadProcessor.cpp fqwriter.cpp ROCKparams.cpp unit_tests_tools.cpp +HDR = math_utils.h low_level_utils.h srp.h fqreader.h FqConstants.h FqBaseBackend.h FqAuxBackend.h FqMainBackend.h ReadProcessor.h rock_commons.h read_utils.h CountMinSketch.hpp Filter.hpp fqwriter.h ROCKparams.h unit_tests_tools.h diff --git a/src/Makefile.in b/src/Makefile.in index 29a6f9b1ff141f346b465d7280338acc53b08349..6eb87c3d511facc312efa2af50a48dccc8637ed7 100644 --- a/src/Makefile.in +++ b/src/Makefile.in @@ -94,7 +94,7 @@ target_triplet = @target@ bin_PROGRAMS = rock$(EXEEXT) noinst_PROGRAMS = unit_test_fqreader$(EXEEXT) \ unit_test_read_utils$(EXEEXT) unit_test_cms$(EXEEXT) \ - unit_test_fqwriter$(EXEEXT) unit_test_main_utils$(EXEEXT) + unit_test_fqwriter$(EXEEXT) unit_test_math_utils$(EXEEXT) subdir = src ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 am__aclocal_m4_deps = $(top_srcdir)/configure.ac @@ -116,10 +116,11 @@ am__v_AR_0 = @echo " AR " $@; am__v_AR_1 = librock_a_AR = $(AR) $(ARFLAGS) librock_a_LIBADD = -am__objects_1 = fqreader.$(OBJEXT) FqBaseBackend.$(OBJEXT) \ +am__objects_1 = math_utils.$(OBJEXT) low_level_utils.$(OBJEXT) \ + fqreader.$(OBJEXT) FqBaseBackend.$(OBJEXT) \ FqAuxBackend.$(OBJEXT) FqMainBackend.$(OBJEXT) \ read_utils.$(OBJEXT) ReadProcessor.$(OBJEXT) \ - fqwriter.$(OBJEXT) main_utils.$(OBJEXT) ROCKparams.$(OBJEXT) \ + fqwriter.$(OBJEXT) ROCKparams.$(OBJEXT) \ unit_tests_tools.$(OBJEXT) am_librock_a_OBJECTS = $(am__objects_1) librock_a_OBJECTS = $(am_librock_a_OBJECTS) @@ -135,9 +136,9 @@ unit_test_fqreader_DEPENDENCIES = librock.a am_unit_test_fqwriter_OBJECTS = unit_test_fqwriter.$(OBJEXT) unit_test_fqwriter_OBJECTS = $(am_unit_test_fqwriter_OBJECTS) unit_test_fqwriter_DEPENDENCIES = librock.a -am_unit_test_main_utils_OBJECTS = unit_test_main_utils.$(OBJEXT) -unit_test_main_utils_OBJECTS = $(am_unit_test_main_utils_OBJECTS) -unit_test_main_utils_DEPENDENCIES = librock.a +am_unit_test_math_utils_OBJECTS = unit_test_math_utils.$(OBJEXT) +unit_test_math_utils_OBJECTS = $(am_unit_test_math_utils_OBJECTS) +unit_test_math_utils_DEPENDENCIES = librock.a am_unit_test_read_utils_OBJECTS = unit_test_read_utils.$(OBJEXT) unit_test_read_utils_OBJECTS = $(am_unit_test_read_utils_OBJECTS) unit_test_read_utils_DEPENDENCIES = librock.a @@ -160,11 +161,11 @@ am__depfiles_remade = ./$(DEPDIR)/FqAuxBackend.Po \ ./$(DEPDIR)/FqBaseBackend.Po ./$(DEPDIR)/FqMainBackend.Po \ ./$(DEPDIR)/ROCKparams.Po ./$(DEPDIR)/ReadProcessor.Po \ ./$(DEPDIR)/fqreader.Po ./$(DEPDIR)/fqwriter.Po \ - ./$(DEPDIR)/main_utils.Po ./$(DEPDIR)/read_utils.Po \ - ./$(DEPDIR)/rock.Po ./$(DEPDIR)/unit_test_cms.Po \ - ./$(DEPDIR)/unit_test_fqreader.Po \ + ./$(DEPDIR)/low_level_utils.Po ./$(DEPDIR)/math_utils.Po \ + ./$(DEPDIR)/read_utils.Po ./$(DEPDIR)/rock.Po \ + ./$(DEPDIR)/unit_test_cms.Po ./$(DEPDIR)/unit_test_fqreader.Po \ ./$(DEPDIR)/unit_test_fqwriter.Po \ - ./$(DEPDIR)/unit_test_main_utils.Po \ + ./$(DEPDIR)/unit_test_math_utils.Po \ ./$(DEPDIR)/unit_test_read_utils.Po \ ./$(DEPDIR)/unit_tests_tools.Po am__mv = mv -f @@ -183,11 +184,11 @@ am__v_CXXLD_0 = @echo " CXXLD " $@; am__v_CXXLD_1 = SOURCES = $(librock_a_SOURCES) $(rock_SOURCES) \ $(unit_test_cms_SOURCES) $(unit_test_fqreader_SOURCES) \ - $(unit_test_fqwriter_SOURCES) $(unit_test_main_utils_SOURCES) \ + $(unit_test_fqwriter_SOURCES) $(unit_test_math_utils_SOURCES) \ $(unit_test_read_utils_SOURCES) DIST_SOURCES = $(librock_a_SOURCES) $(rock_SOURCES) \ $(unit_test_cms_SOURCES) $(unit_test_fqreader_SOURCES) \ - $(unit_test_fqwriter_SOURCES) $(unit_test_main_utils_SOURCES) \ + $(unit_test_fqwriter_SOURCES) $(unit_test_math_utils_SOURCES) \ $(unit_test_read_utils_SOURCES) am__can_run_installinfo = \ case $$AM_UPDATE_INFO_DIR in \ @@ -328,11 +329,11 @@ unit_test_cms_SOURCES = unit_test_cms.cpp unit_test_cms_LDADD = librock.a unit_test_fqwriter_SOURCES = unit_test_fqwriter.cpp unit_test_fqwriter_LDADD = librock.a -unit_test_main_utils_SOURCES = unit_test_main_utils.cpp -unit_test_main_utils_LDADD = librock.a +unit_test_math_utils_SOURCES = unit_test_math_utils.cpp +unit_test_math_utils_LDADD = librock.a librock_a_SOURCES = $(SRC) -SRC = fqreader.cpp FqBaseBackend.cpp FqAuxBackend.cpp FqMainBackend.cpp read_utils.cpp ReadProcessor.cpp fqwriter.cpp main_utils.cpp ROCKparams.cpp unit_tests_tools.cpp -HDR = srp.h fqreader.h FqConstants.h FqBaseBackend.h FqAuxBackend.h FqMainBackend.h ReadProcessor.h rock_commons.h read_utils.h CountMinSketch.hpp Filter.hpp fqwriter.h main_utils.h ROCKparams.h unit_tests_tools.h +SRC = math_utils.cpp low_level_utils.cpp fqreader.cpp FqBaseBackend.cpp FqAuxBackend.cpp FqMainBackend.cpp read_utils.cpp ReadProcessor.cpp fqwriter.cpp ROCKparams.cpp unit_tests_tools.cpp +HDR = math_utils.h low_level_utils.h srp.h fqreader.h FqConstants.h FqBaseBackend.h FqAuxBackend.h FqMainBackend.h ReadProcessor.h rock_commons.h read_utils.h CountMinSketch.hpp Filter.hpp fqwriter.h ROCKparams.h unit_tests_tools.h all: all-am .SUFFIXES: @@ -436,9 +437,9 @@ unit_test_fqwriter$(EXEEXT): $(unit_test_fqwriter_OBJECTS) $(unit_test_fqwriter_ @rm -f unit_test_fqwriter$(EXEEXT) $(AM_V_CXXLD)$(CXXLINK) $(unit_test_fqwriter_OBJECTS) $(unit_test_fqwriter_LDADD) $(LIBS) -unit_test_main_utils$(EXEEXT): $(unit_test_main_utils_OBJECTS) $(unit_test_main_utils_DEPENDENCIES) $(EXTRA_unit_test_main_utils_DEPENDENCIES) - @rm -f unit_test_main_utils$(EXEEXT) - $(AM_V_CXXLD)$(CXXLINK) $(unit_test_main_utils_OBJECTS) $(unit_test_main_utils_LDADD) $(LIBS) +unit_test_math_utils$(EXEEXT): $(unit_test_math_utils_OBJECTS) $(unit_test_math_utils_DEPENDENCIES) $(EXTRA_unit_test_math_utils_DEPENDENCIES) + @rm -f unit_test_math_utils$(EXEEXT) + $(AM_V_CXXLD)$(CXXLINK) $(unit_test_math_utils_OBJECTS) $(unit_test_math_utils_LDADD) $(LIBS) unit_test_read_utils$(EXEEXT): $(unit_test_read_utils_OBJECTS) $(unit_test_read_utils_DEPENDENCIES) $(EXTRA_unit_test_read_utils_DEPENDENCIES) @rm -f unit_test_read_utils$(EXEEXT) @@ -457,13 +458,14 @@ distclean-compile: @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ReadProcessor.Po@am__quote@ # am--include-marker @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/fqreader.Po@am__quote@ # am--include-marker @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/fqwriter.Po@am__quote@ # am--include-marker -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/main_utils.Po@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/low_level_utils.Po@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/math_utils.Po@am__quote@ # am--include-marker @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/read_utils.Po@am__quote@ # am--include-marker @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rock.Po@am__quote@ # am--include-marker @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/unit_test_cms.Po@am__quote@ # am--include-marker @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/unit_test_fqreader.Po@am__quote@ # am--include-marker @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/unit_test_fqwriter.Po@am__quote@ # am--include-marker -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/unit_test_main_utils.Po@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/unit_test_math_utils.Po@am__quote@ # am--include-marker @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/unit_test_read_utils.Po@am__quote@ # am--include-marker @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/unit_tests_tools.Po@am__quote@ # am--include-marker @@ -622,13 +624,14 @@ distclean: distclean-am -rm -f ./$(DEPDIR)/ReadProcessor.Po -rm -f ./$(DEPDIR)/fqreader.Po -rm -f ./$(DEPDIR)/fqwriter.Po - -rm -f ./$(DEPDIR)/main_utils.Po + -rm -f ./$(DEPDIR)/low_level_utils.Po + -rm -f ./$(DEPDIR)/math_utils.Po -rm -f ./$(DEPDIR)/read_utils.Po -rm -f ./$(DEPDIR)/rock.Po -rm -f ./$(DEPDIR)/unit_test_cms.Po -rm -f ./$(DEPDIR)/unit_test_fqreader.Po -rm -f ./$(DEPDIR)/unit_test_fqwriter.Po - -rm -f ./$(DEPDIR)/unit_test_main_utils.Po + -rm -f ./$(DEPDIR)/unit_test_math_utils.Po -rm -f ./$(DEPDIR)/unit_test_read_utils.Po -rm -f ./$(DEPDIR)/unit_tests_tools.Po -rm -f Makefile @@ -683,13 +686,14 @@ maintainer-clean: maintainer-clean-am -rm -f ./$(DEPDIR)/ReadProcessor.Po -rm -f ./$(DEPDIR)/fqreader.Po -rm -f ./$(DEPDIR)/fqwriter.Po - -rm -f ./$(DEPDIR)/main_utils.Po + -rm -f ./$(DEPDIR)/low_level_utils.Po + -rm -f ./$(DEPDIR)/math_utils.Po -rm -f ./$(DEPDIR)/read_utils.Po -rm -f ./$(DEPDIR)/rock.Po -rm -f ./$(DEPDIR)/unit_test_cms.Po -rm -f ./$(DEPDIR)/unit_test_fqreader.Po -rm -f ./$(DEPDIR)/unit_test_fqwriter.Po - -rm -f ./$(DEPDIR)/unit_test_main_utils.Po + -rm -f ./$(DEPDIR)/unit_test_math_utils.Po -rm -f ./$(DEPDIR)/unit_test_read_utils.Po -rm -f ./$(DEPDIR)/unit_tests_tools.Po -rm -f Makefile diff --git a/src/ROCKparams.cpp b/src/ROCKparams.cpp index 5a0cac8f801c302ac0412777712fd6917a2afa1b..571fd4ec49b1805a5908f8a7e0419c06b141b2b3 100755 --- a/src/ROCKparams.cpp +++ b/src/ROCKparams.cpp @@ -12,6 +12,7 @@ #include <vector> #include <cmath> #include "FqConstants.h" +#include "math_utils.h" #include "ROCKparams.h" using namespace std; @@ -31,6 +32,41 @@ int ROCKparams::getfilterPEMode() { return parms.filter_PE_separately; }*/ +void usage(int status) { +cout<<"ROCK v@@@ Copyright (C) 2016-2021 Institut Pasteur"<<endl; +cout<<endl; +cout<<"Reducing Over-Covering K-mers within FASTQ file(s)"<<endl; +cout<<endl; +cout<<"USAGE: rock [options] [files]"<<endl; +cout<<endl; +cout<<"OPTIONS:"<<endl; +cout<<" -i <file> file containing the name(s) of the input FASTQ file(s) to"<<endl; +cout<<" process; single-end: one file name per line; paired-end:"<<endl; +cout<<" two file names per line separated by a comma; up to 15"<<endl; +cout<<" FASTQ file names can be specified; of note, input file"<<endl; +cout<<" name(s) can also be specified as program argument(s)"<<endl; +cout<<" -o <file> file containing the name(s) of the output FASTQ file(s);"<<endl; +cout<<" FASTQ file name(s) should be structured in the same way as"<<endl; +cout<<" the file specified in option -i."<<endl; +cout<<" -k <int> k-mer length (default 25)"<<endl; +cout<<" -c <int> lower-bound k-mer coverage depth threshold (default: 0)"<<endl; +cout<<" -C <int> upper-bound k-mer coverage depth threshold (default: 70)"<<endl; +cout<<" -l <int> number of hashing function(s) (default: 4)"<<endl; +cout<<" -n <int> expected total number of distinct k-mers within the input"<<endl; +cout<<" read sequences; not compatible with option -l."<<endl; +cout<<" -f <float> maximum expected false positive probability when computing"<<endl; +cout<<" the optimal number of hashing functions from the number of"<<endl; +cout<<" distinct k-mers specified with option -n (default: 0.05)."<<endl; +cout<<" -q <int> sets as valid only k-mers made up of nucleotides with"<<endl; +cout<<" Phred score (+33 offset) above this cutoff (default: 0)"<<endl; +cout<<" -m <int> minimum number of valid k-mer(s) to consider a read; all"<<endl; +cout<<" non-considered reads are written into output file(s) with"<<endl; +cout<<" extension undefined.fastq (default: 1)"<<endl; +cout<<" -v verbose mode"<<endl; +cout<<" -h prints this message and exit"<<endl; +exit(status); +} + void ROCKparams::processMainArgs(int optind, const int argc, char ** argv,std::vector<string>& v_input_lines) { if (optind==argc) return; // no arguments @@ -42,6 +78,26 @@ void ROCKparams::processMainArgs(int optind, const int argc, char ** argv,std::v } + +/* + * Minimize lambda for a given number of k-mers. + * We want p<=0.01, + * so choose smallest lambda so that ccdf(kappa_prime,nb_k_mer,array_size) exp lambda<=0.01. + * Smallest is either low filter either high filter if there is no low filter. + */ +int ROCKparams::getBestLambdaForN(const unsigned long& nb_k_mers,const int& smallest_kappa,const unsigned int& m,const float& max_collision_proba) { + float lnp=log(max_collision_proba); + float ccdfr=ccdf(smallest_kappa,nb_k_mers,m); + float lnccdf=log(ccdfr); + float tmp=lnp/lnccdf; + int min_l=ceil(tmp); + if (min_l==0) min_l=1; + return min_l; +} + + + + void ROCKparams::optArgConsistency(const string& input_file,const string& output_file, const int& g,CMSparams& parms,const unsigned long& nb_k_mers, const std::vector<string>& v_input_lines) { @@ -69,16 +125,16 @@ void ROCKparams::optArgConsistency(const string& input_file,const string& output } if (parms.kappa_prime>=parms.kappa) { cout<<"ERROR lower filter is higher than high filter!"<<endl; - exit(EXIT_FAILURE); + usage(EXIT_FAILURE); } if (qual_thres.nucl_score_threshold<0) { cout<<"ERROR nucleotide score threshold must be positive."<<endl; - exit(EXIT_FAILURE); + usage(EXIT_FAILURE); } if (qual_thres.min_correct_k_mers_in_read<1) { cout<<"minimum number of correct k-mers in read must be a positive integer."<<endl; - exit(EXIT_FAILURE); + usage(EXIT_FAILURE); } if (parms.lambda==0) { // This happens when the user doesn't specify lambda nor nb_k_mers. @@ -94,7 +150,7 @@ void ROCKparams::optArgConsistency(const string& input_file,const string& output if (cms_size>getNodePhysMemory()-defaultGRPMAXSize) { cout<<"Not enough RAM on the machine to run rock with a CMS of size:"<<cms_size<<" GB."<<endl; cout<<" Maybe you should think of increasing the value for the low filter (-c option)"<<endl; - exit(EXIT_FAILURE); + usage(EXIT_FAILURE); } } @@ -333,6 +389,7 @@ void ROCKparams::initFromMainOptsArgs(int argc,char ** argv) { int smallest_kappa; if (parms.kappa_prime==0) smallest_kappa=parms.kappa; else smallest_kappa=parms.kappa_prime; + //float ROCKparams::getCollisionProba(const int& smallest_kappa,const unsigned long& nb_k_mers,const unsigned int& arr_cms_size,const int& lambda) const expected_collision_proba=getCollisionProba(smallest_kappa,nb_k_mers,Pi_js[0],parms.lambda); } if ((v_input_lines.empty() || v_output_lines.empty()) && (loadInOutFileArgs(input_file,output_file,v_input_lines,v_output_lines)==EXIT_FAILURE)) throw EXIT_FAILURE; diff --git a/src/ROCKparams.h b/src/ROCKparams.h index 93cfaaf07dee1cc8865e2580d3324eb0c79955e3..56f63882ab7f27553ec52a9a1d590847ecc648c0 100755 --- a/src/ROCKparams.h +++ b/src/ROCKparams.h @@ -14,9 +14,10 @@ #include <vector> #include <getopt.h> #include <iostream> +#include "low_level_utils.h" #include "FqConstants.h" +#include "rock_commons.h" #include "CountMinSketch.hpp" -#include "main_utils.h" #define defaultGRPMAXSize 1 //(in gigabytes) Default maximum size of srp structure but of course this size depends of the number of reads we are processing. // 1 GB is enough to store approx 82 millions of reads (must take into account memory needed for stl containers). @@ -28,6 +29,8 @@ #define path_sep '/' #define k_proposed_lamda 4 +#define k_err_parms_inconsistency 1 + typedef struct { @@ -40,6 +43,7 @@ typedef struct { using namespace std; + class ROCKparams{ static const int output_ext=1; static const int undef_ext=2; @@ -77,7 +81,11 @@ class ROCKparams{ const int& g,CMSparams& parms,const unsigned long& nb_k_mers, const std::vector<string>& v_input_lines); + int getBestLambdaForN(const unsigned long& nb_k_mers,const int& smallest_kappa,const unsigned int& m,const float& max_collision_proba); + friend void test_processIOFileArgs(); + friend void test_getBestLambdaForN(); + public: @@ -99,7 +107,7 @@ public: qual_thres.k=k; verbose_mode=0; cms_size=0; - expected_collision_proba=0.0; + expected_collision_proba=0.0; //! collision probability that is computed at the beginning of ROCK from the expected number of distinct k-mers provided by the user. //min_correct_k_mers_in_read=0; parms.filter_size=getNodePhysMemory()/100.0*90-defaultGRPMAXSize; // Prefer not to use all the machine's memory if (parms.filter_size==0) throw EXIT_FAILURE; @@ -109,7 +117,7 @@ public: void initFromMainOptsArgs(int argc,char ** argv); - CMSparams getCMSparams() { + CMSparams getCMSparams() const { return parms; } @@ -122,48 +130,42 @@ public: return f_id; } - std::vector<IO_fq_files> get_single_files() { + std::vector<IO_fq_files> get_single_files() const { return single_files; } - vector<PE_files> get_PE_files() { + vector<PE_files> get_PE_files() const { return v_PE_files; } - int get_k() { + int get_k() const { return k; } - void printVerboseInfo() { - std::vector<IO_fq_files>::iterator it_s; - vector<PE_files>::iterator it_pe; - cout<<"Input files: "<<endl; - cout<<"single:"<<endl; - for (it_s=single_files.begin();it_s!=single_files.end();it_s++) { - cout<<" "<<it_s->in_fq_file<<endl; - } - cout<<"pair-end:"<<endl; - for (it_pe=v_PE_files.begin();it_pe!=v_PE_files.end();it_pe++) { - cout<<" "<<it_pe->PE1.in_fq_file<<" "<<it_pe->PE2.in_fq_file<<endl; - } - cout<<endl; - cout<<"k="<<k<<endl; - cout<<"kappa="<<parms.kappa<<endl; - cout<<"kappa prime="<<parms.kappa_prime; - cout<<endl; - cout<<"CMS size="<<cms_size<<" Bytes for "<<parms.lambda<<" arrays."<<endl; - cout<<endl; - if (nb_k_mers!=0) cout<<"expected probability of collision was: "<<expected_collision_proba<<endl; - } - void setFilterSize(unsigned long fsize) { cms_size=fsize; } + unsigned long getFilterSize() const { + unsigned long cms_size_in_GB=cms_size/1024; + cms_size_in_GB/=1024; + cms_size_in_GB/=1024; + return cms_size_in_GB; + } + + int is_verbose() { return verbose_mode; } + unsigned long get_expected_nbKmers() const { + return nb_k_mers; + } + + float get_expected_collision_proba() const { + return expected_collision_proba; + } + }; diff --git a/src/fqreader.cpp b/src/fqreader.cpp index d918e68fbfa4b385775db92a8f358168a3a557fe..6a79cac52e9abff170aac790cdb004301f3a5238 100644 --- a/src/fqreader.cpp +++ b/src/fqreader.cpp @@ -56,8 +56,12 @@ void processPEFiles(char * fq_1, unsigned char f_id1,char * fq_2, unsigned char * pointers to Fqbackend objects. * This function ALLOCATES MEMORY with new. */ -void processInputFiles(const std::vector<IO_fq_files>& single_files,const vector<PE_files>& v_PE_files,FqBaseBackend * array_be[],const FasqQualThreshold& a_qual_thres, srp* io_sr,const int& PE_process_mode) { - unsigned char f_id=1; +unsigned long processInputFiles(const std::vector<IO_fq_files>& single_files,\ + const vector<PE_files>& v_PE_files,FqBaseBackend * array_be[],\ + const FasqQualThreshold& a_qual_thres, srp* io_sr,\ + const int& PE_process_mode) { + unsigned long tot_nb_kmers=0; + unsigned char f_id=1; FqBaseBackend::setQualThreshold(a_qual_thres); FqMainBackend::setTreatPEMode(PE_process_mode); std::vector<IO_fq_files>::const_iterator it_single; @@ -68,6 +72,7 @@ void processInputFiles(const std::vector<IO_fq_files>& single_files,const vector be_fq->setOutputFile((char *) it_single->out_fq_file.c_str()); array_be[f_id-1]=be_fq; f_id+=1; + tot_nb_kmers+=be_fq->getCntKMers(); } vector<PE_files>::const_iterator it_pe; for (it_pe=v_PE_files.begin();it_pe!=v_PE_files.end();it_pe++) { @@ -87,7 +92,9 @@ void processInputFiles(const std::vector<IO_fq_files>& single_files,const vector array_be[f_id1-1]=be_fq1; array_be[f_id-1]=be_fq2; f_id++; + tot_nb_kmers+=be_fq1->getCntKMers(); } + return tot_nb_kmers; } diff --git a/src/fqreader.h b/src/fqreader.h index 17ce828ee3a317be0c6688c298d10834e5e39c44..79badaae6b25eed118d2716092c02e3a62486015 100644 --- a/src/fqreader.h +++ b/src/fqreader.h @@ -19,5 +19,5 @@ using namespace std; void processSingleFile(char *, unsigned char, srp*); void processPEFiles(char * fq_1, unsigned char f_id1,char * gq_2, unsigned char f_id2,srp *io_sr,char * fq_1_test_undef=NULL,char * fq_2_test_undef=NULL,size_t test_bufsize=0); -void processInputFiles(const std::vector<IO_fq_files>& ,const vector<PE_files>& ,FqBaseBackend * [], const FasqQualThreshold&,srp*,const int& PE_process_mode); +unsigned long processInputFiles(const std::vector<IO_fq_files>& ,const vector<PE_files>& ,FqBaseBackend * [], const FasqQualThreshold&,srp*,const int& PE_process_mode); #endif diff --git a/src/fqwriter.cpp b/src/fqwriter.cpp index 6ca2278364de6327eb68a5dc15e213a458802ced..eabb289fe52ac27db5231b94a4c9ca0266b8b89f 100644 --- a/src/fqwriter.cpp +++ b/src/fqwriter.cpp @@ -19,7 +19,7 @@ using namespace std; // TODO this looks a little bit like getDnaStr in read_utils.cpp. Some refactoring could be useful. Think about // classes to encapsulate that without decreasing perfs. -void writeFilteredFastq(FqBaseBackend* map_id_backend[],int nb_be, const srp& io_sr){ +void writeFilteredFastq(FqBaseBackend* map_id_backend[],int nb_be,T_read_counters& rc, const srp& io_sr){ srp::const_reverse_iterator it; i_dim::const_iterator it_offs; k_dim::const_iterator it_struct; @@ -30,6 +30,8 @@ void writeFilteredFastq(FqBaseBackend* map_id_backend[],int nb_be, const srp& io // 1rst step, open files before fetching/writing reads from/to them int i; + rc.nb_input_reads=0; + rc.nb_output_reads=0; for (i=0;i<nb_be;i++){ map_id_backend[i]->openInputFile(); map_id_backend[i]->openOutputFile(); @@ -39,7 +41,9 @@ void writeFilteredFastq(FqBaseBackend* map_id_backend[],int nb_be, const srp& io for (it_offs=it->second.begin();it_offs!=it->second.end();it_offs++) { unsigned long j=it_offs->first; for (it_struct=it_offs->second.begin();it_struct!=it_offs->second.end();it_struct++) { + rc.nb_input_reads++; if (it_struct->fileid) { + rc.nb_output_reads++; fid_stored=it_struct->fileid; f_id1=fid_stored >>4; f_id2=fid_stored &mask_fid; diff --git a/src/fqwriter.h b/src/fqwriter.h index 97d90b44bc98e75818d57d7052b588a492de7cc4..59514b63dc5854c8fbf07f81262825372cfb8700 100644 --- a/src/fqwriter.h +++ b/src/fqwriter.h @@ -8,7 +8,7 @@ #ifndef FQWRITER_H_ #define FQWRITER_H_ -void writeFilteredFastq(FqBaseBackend* map_id_backend[],int nb_be, const srp& io_sr); +void writeFilteredFastq(FqBaseBackend* map_id_backend[],int nb_be, T_read_counters& rc, const srp& io_sr); diff --git a/src/low_level_utils.cpp b/src/low_level_utils.cpp new file mode 100644 index 0000000000000000000000000000000000000000..7acd6a56e37f27d92916238416afab31612cc67f --- /dev/null +++ b/src/low_level_utils.cpp @@ -0,0 +1,62 @@ +/* + * low_level_utils.cpp + * + * Created on: 7 mai 2021 + * Author: vlegrand + */ + +#include <string> +#include <iostream> +#include <sys/stat.h> +#include <sys/sysctl.h> +#include "low_level_utils.h" + +using namespace std; + +/* + * Given a filename (including full path), determine if parent directory exists. + * If is doesn't, display error message and exit. + */ +std::string checkDirExists(const string o_file) { + string parent_dir="."; + std::size_t o_found = o_file.find_last_of("/"); + if (o_found!=string::npos) { + parent_dir=o_file.substr(0,o_found); + // cout<<parent_dir<<endl; + struct stat info; + if (stat(parent_dir.c_str(),&info)!=0) { + cout<<"parent directory for output files: "<<parent_dir<<" doesn't exist."<<endl; + exit(EXIT_FAILURE); + } + } + return parent_dir; +} + +/* + * Use this function to get the amount of RAM in GB on the machine (used or not). + * Aim is to make sure that ROCK can run on the current machine with the requirements that the user gave. + */ +unsigned long getNodePhysMemory() { + uint64_t mem = 0; + +#if defined(_SC_PHYS_PAGES) && defined(_SC_PAGESIZE) + mem = sysconf(_SC_PHYS_PAGES); + mem *= sysconf(_SC_PAGESIZE); + mem /= 1 * 1024 * 1024 * 1024; +#elif __APPLE__ + size_t len = sizeof(mem); + int ret=sysctlbyname("hw.memsize", (void*) &mem, &len, NULL,0); + if (ret==-1) { + cout<<"Couldn't determine how much RAM is usable on your system."<<errno<<endl; + mem=0; + } + mem /= 1 * 1024 * 1024 * 1024; +#else + cout<<"ROCK has not been tested on this operating system, please contact author."<<endl; +#endif + + return (unsigned long)mem; +} + + + diff --git a/src/low_level_utils.h b/src/low_level_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..081c045e964d8061b795ab588091097fccc10378 --- /dev/null +++ b/src/low_level_utils.h @@ -0,0 +1,22 @@ +/* + * low_level_utils.h + * + * Created on: 7 mai 2021 + * Author: vlegrand + * Low level utilities + */ + +#ifndef SRC_LOW_LEVEL_UTILS_H_ +#define SRC_LOW_LEVEL_UTILS_H_ + +#include <string> + +using namespace std; + +string checkDirExists(const string o_file); +unsigned long getNodePhysMemory(); + + + + +#endif /* SRC_LOW_LEVEL_UTILS_H_ */ diff --git a/src/main_utils.cpp b/src/main_utils.cpp deleted file mode 100644 index e75cfd08921d78d23c5dd3d9712e14d44289f2d2..0000000000000000000000000000000000000000 --- a/src/main_utils.cpp +++ /dev/null @@ -1,198 +0,0 @@ -/* - * main_utils.cpp - * - * Created on: May 9, 2016 - * Author: vlegrand - */ -#include <stdint.h> -#include <unistd.h> -#include <stdlib.h> -#include <sys/types.h> -#include <sys/sysctl.h> -#include <sys/stat.h> -#include <limits.h> -#include <string.h> -#include <iostream> -#include <fstream> -#include <cmath> -#include "main_utils.h" - -using namespace std; - -/* - * Use this function to get the amount of RAM in GB on the machine (used or not). - * Aim is to make sure that ROCK can run on the current machine with the requirements that the user gave. - */ -unsigned long getNodePhysMemory() { - uint64_t mem = 0; - -#if defined(_SC_PHYS_PAGES) && defined(_SC_PAGESIZE) - mem = sysconf(_SC_PHYS_PAGES); - mem *= sysconf(_SC_PAGESIZE); - mem /= 1 * 1024 * 1024 * 1024; -#elif __APPLE__ - size_t len = sizeof(mem); - int ret=sysctlbyname("hw.memsize", (void*) &mem, &len, NULL,0); - if (ret==-1) { - cout<<"Couldn't determine how much RAM is usable on your system."<<errno<<endl; - mem=0; - } - mem /= 1 * 1024 * 1024 * 1024; -#else - cout<<"ROCK has not been tested on this operating system, please contact author."<<endl; -#endif - - return (unsigned long)mem; -} - - -/* - * Given a filename (including full path), determine if parent directory exists. - * If is doesn't, display error message and exit. - */ -std::string checkDirExists(const string o_file) { - string parent_dir="."; - std::size_t o_found = o_file.find_last_of("/"); - if (o_found!=string::npos) { - parent_dir=o_file.substr(0,o_found); - // cout<<parent_dir<<endl; - struct stat info; - if (stat(parent_dir.c_str(),&info)!=0) { - cout<<"parent directory for output files: "<<parent_dir<<" doesn't exist."<<endl; - exit(EXIT_FAILURE); - } - } - return parent_dir; -} - -/* - * logarithm of the gamma function; - * from here: https://www.cec.uchile.cl/cinetica/pcordero/MC_libros/NumericalRecipesinC.pdf - */ - -double gammln(const unsigned long& xx) { - double x,y,tmp,ser; - - static double cof[6]={76.18009172947146,-86.50532032941677,\ - 24.01409824083091,-1.231739572450155,\ - 0.001208650973866179,-0.000005395239384953}; - int j; - y=x=xx; - tmp=x+5.5; - tmp -=(x+0.5)*log(tmp); - ser=1.000000000190015; - for (j=0;j<=5;j++) ser+=cof[j]/++y; - return -tmp+log(2.5066282746310005*ser/x); -} - -/* - * here x is the smallest value of the counter for a given k-mer in the CMS. - * m is the size of a CMS array (AC: in fact, we use the biggest prime number as an approximation of the array's size). - */ -double pmf(const float&x,const unsigned long& nb_k_mers,const unsigned int& m) { - double res; - double lnn=log(nb_k_mers); - double lnm=log(m); - double diff=lnn-lnm; - double p=x*diff; - diff=double(nb_k_mers)/m; - //double truc=exp(diff); - double tmp=p-diff; - double g=gammln(x+1); - tmp -=g; - res=exp(tmp); - return res; -} - -double ccdf(const int& smallest_kappa,const unsigned long& nb_k_mers,const unsigned int& m) { - double res; - double s=0.0; - for (int i=0;i<=smallest_kappa;i++) { - s+=pmf(i,nb_k_mers,m); - } - res=1-s; - if (res<0) res=0.00000001; // It happens because pmf if an approximation of the probability distribution. - // So it happens that ccdf is negative but very close to 0... As we need a non null value for the next calculation, - // return a very small constant. - if (res>=0.999) res=0.999; // Need to do that because we compule ln(ccdf) to determine lamda and we need it to be non null otherwise we'd get a division by 0. - return res; -} - -/* - * Minimize lambda for a given number of k-mers. - * We want p<=0.01, - * so choose smallest lambda so that ccdf(kappa_prime,nb_k_mer,array_size) exp lambda<=0.01. - * Smallest is either low filter either high filter if there is no low filter. - */ -int getBestLambdaForN(const unsigned long& nb_k_mers,const int& smallest_kappa,const unsigned int& m,const float& max_collision_proba) { - float lnp=log(max_collision_proba); - float ccdfr=ccdf(smallest_kappa,nb_k_mers,m); - float lnccdf=log(ccdfr); - float tmp=lnp/lnccdf; - int min_l=ceil(tmp); - if (min_l==0) min_l=1; - return min_l; -} - - - - -float getCollisionProba(const int& smallest_kappa,const unsigned long& nb_k_mers,const unsigned int& arr_cms_size,const int& lambda) { - float ccfdr=ccdf(smallest_kappa,nb_k_mers,arr_cms_size); - float fpp=pow(ccfdr,lambda); - return fpp; -} - - -void usage(int status) { -cout<<"ROCK v@@@ Copyright (C) 2016-2021 Institut Pasteur"<<endl; -cout<<endl; -cout<<"Reducing Over-Covering K-mers within FASTQ file(s)"<<endl; -cout<<endl; -cout<<"USAGE: rock [options] [files]"<<endl; -cout<<endl; -cout<<"OPTIONS:"<<endl; -cout<<" -i <file> file containing the name(s) of the input FASTQ file(s) to"<<endl; -cout<<" process; single-end: one file name per line; paired-end:"<<endl; -cout<<" two file names per line separated by a comma; up to 15"<<endl; -cout<<" FASTQ file names can be specified; of note, input file"<<endl; -cout<<" name(s) can also be specified as program argument(s)"<<endl; -cout<<" -o <file> file containing the name(s) of the output FASTQ file(s);"<<endl; -cout<<" FASTQ file name(s) should be structured in the same way as"<<endl; -cout<<" the file specified in option -i."<<endl; -cout<<" -k <int> k-mer length (default 25)"<<endl; -cout<<" -c <int> lower-bound k-mer coverage depth threshold (default: 0)"<<endl; -cout<<" -C <int> upper-bound k-mer coverage depth threshold (default: 70)"<<endl; -cout<<" -l <int> number of hashing function(s) (default: 4)"<<endl; -cout<<" -n <int> expected total number of distinct k-mers within the input"<<endl; -cout<<" read sequences; not compatible with option -l."<<endl; -cout<<" -f <float> maximum expected false positive probability when computing"<<endl; -cout<<" the optimal number of hashing functions from the number of"<<endl; -cout<<" distinct k-mers specified with option -n (default: 0.05)."<<endl; -cout<<" -q <int> sets as valid only k-mers made up of nucleotides with"<<endl; -cout<<" Phred score (+33 offset) above this cutoff (default: 0)"<<endl; -cout<<" -m <int> minimum number of valid k-mer(s) to consider a read; all"<<endl; -cout<<" non-considered reads are written into output file(s) with"<<endl; -cout<<" extension undefined.fastq (default: 1)"<<endl; -cout<<" -v verbose mode"<<endl; -cout<<" -h prints this message and exit"<<endl; -exit(status); -} - -/* -void usage(int status) { - cout<<"usage: rock [options] [args]"<<endl<<endl; - cout<<" -i <file> .... file name containing the list of the FASTQ files to process."<<endl; - cout<<" -o <file> .... file name containing the list of the output FASTQ file names."<<endl; - cout<<" -h .... Print this message and exit."<<endl; - cout<<" -k <int> .... k-mer size. (default 25)."<<endl; - cout<<" -c <int> .... lower coverage threshold (default: 0)."<<endl; - cout<<" -C <int> .... upper coverage threshold (default: 70; max: 65535)."<<endl; - cout<<" -f <float> .... maximum probability of collision in the count min sketch (default: 0.05)"<<endl; - cout<<" -l <int> .... size of the count min sketch (default: at most 4, depending on the available RAM)"<<endl; - cout<<" -n <int> .... (expected) number of distinct k-mers within the processed reads."<<endl; - cout<<" -q <int> .... minimum quality threshold for nucleotides. Default is 0."<<endl; - cout<<" -m <int> .... minimum of valid k-mer threshold for reads. Default is 1."<<endl; - cout<<" -v .... verbose"<<endl; - exit(status); }*/ - diff --git a/src/main_utils.h b/src/main_utils.h deleted file mode 100644 index bd6fef306fa510b0b5122cf8f51894335846d749..0000000000000000000000000000000000000000 --- a/src/main_utils.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * main_utils.h - * - * Created on: May 9, 2016 - * Author: vlegrand - */ - -#ifndef MAIN_UTILS_H_ -#define MAIN_UTILS_H_ - - -#define k_max_collision_proba 0.05 - -#include <string> -#include <vector> -#include "rock_commons.h" - - - - -unsigned long getNodePhysMemory(); -std::string checkDirExists(const std::string o_file); -int getBestLambdaForN(const unsigned long& nb_k_mers,const int& smallest_kappa,const unsigned int& m,const float& max_collision_proba); -int getBestLambdaForN_altn(const unsigned long& nb_k_mers,const int& smallest_kappa,const unsigned int& m,const float& max_collision_proba); -double gammln(const unsigned long& xx); -double pmf(const float&x,const unsigned long& nb_k_mers,const unsigned int& m); -double ccdf(const int& smallest_kappa,const unsigned long& nb_k_mers,const unsigned int& m); -float getCollisionProba(const int& smallest_kappa,const unsigned long& nb_k_mers,const unsigned int& arr_cms_size,const int& lambda); -void usage(int status); - - -#endif /* MAIN_UTILS_H_ */ diff --git a/src/math_utils.cpp b/src/math_utils.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6a952f4381123e1ab47dda0b71a966092da7138e --- /dev/null +++ b/src/math_utils.cpp @@ -0,0 +1,74 @@ +/* + * math_utils.cpp + * + * Created on: 7 mai 2021 + * Author: vlegrand + */ +#include <cmath> + + +/* + * logarithm of the gamma function; + * from here: https://www.cec.uchile.cl/cinetica/pcordero/MC_libros/NumericalRecipesinC.pdf + */ +double gammln(const unsigned long& xx) { + double x,y,tmp,ser; + + static double cof[6]={76.18009172947146,-86.50532032941677,\ + 24.01409824083091,-1.231739572450155,\ + 0.001208650973866179,-0.000005395239384953}; + int j; + y=x=xx; + tmp=x+5.5; + tmp -=(x+0.5)*log(tmp); + ser=1.000000000190015; + for (j=0;j<=5;j++) ser+=cof[j]/++y; + return -tmp+log(2.5066282746310005*ser/x); +} + + +/* + * here x is the smallest value of the counter for a given k-mer in the CMS. + * m is the size of a CMS array (AC: in fact, we use the biggest prime number as an approximation of the array's size). + */ +double pmf(const float&x,const unsigned long& nb_k_mers,const unsigned int& m) { + double res; + double lnn=log(nb_k_mers); + double lnm=log(m); + double diff=lnn-lnm; + double p=x*diff; + diff=double(nb_k_mers)/m; + //double truc=exp(diff); + double tmp=p-diff; + double g=gammln(x+1); + tmp -=g; + res=exp(tmp); + return res; +} + +double ccdf(const int& smallest_kappa,const unsigned long& nb_k_mers,const unsigned int& m) { + double res; + double s=0.0; + for (int i=0;i<=smallest_kappa;i++) { + s+=pmf(i,nb_k_mers,m); + } + res=1-s; + if (res<0) res=0.00000001; // It happens because pmf if an approximation of the probability distribution. + // So it happens that ccdf is negative but very close to 0... As we need a non null value for the next calculation, + // return a very small constant. + if (res>=0.999) res=0.999; // Need to do that because we compule ln(ccdf) to determine lamda and we need it to be non null otherwise we'd get a division by 0. + return res; +} + + +float getCollisionProba(const int& smallest_kappa,const unsigned long& nb_k_mers,const unsigned int& arr_cms_size,const int& lambda) { + float ccfdr=ccdf(smallest_kappa,nb_k_mers,arr_cms_size); + float fpp=pow(ccfdr,lambda); + return fpp; +} + + + + + + diff --git a/src/math_utils.h b/src/math_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..9c9d045c8b67b5b072b57e5192abd38dc2b2754f --- /dev/null +++ b/src/math_utils.h @@ -0,0 +1,20 @@ +/* + * math_utils.h + * + * Created on: 7 mai 2021 + * Author: vlegrand + * Gather here utilities methods for computing the collision probability + */ + +#ifndef SRC_MATH_UTILS_H_ +#define SRC_MATH_UTILS_H_ + + + +double gammln(const unsigned long& xx); +double pmf(const float&x,const unsigned long& nb_k_mers,const unsigned int& m); +double ccdf(const int& smallest_kappa,const unsigned long& nb_k_mers,const unsigned int& m); +float getCollisionProba(const int& smallest_kappa,const unsigned long& nb_k_mers,const unsigned int& arr_cms_size,const int& lambda); + + +#endif /* SRC_MATH_UTILS_H_ */ diff --git a/src/rock.cpp b/src/rock.cpp index bfaf0c74b59fb9cdc9a63d6b7d91ff4d36be844b..5dae10f3266a11a0c27879ae0c00bcfbd09d98f8 100755 --- a/src/rock.cpp +++ b/src/rock.cpp @@ -17,7 +17,7 @@ #include <vector> #include <unistd.h> - +#include "math_utils.h" #include "CountMinSketch.hpp" #include "rock_commons.h" #include "srp.h" @@ -26,7 +26,6 @@ #include "fqreader.h" #include "fqwriter.h" #include "Filter.hpp" -#include "main_utils.h" #include "ROCKparams.h" @@ -63,6 +62,50 @@ void printCMSparams(const CMSparams& p) { cout<<"filter_PE_separately="<<p.filter_PE_separately<<endl; } +const void printVerboseInfo(const ROCKparams& Rparms,const Filter& filter,const T_read_counters& rc,\ + const unsigned long& tot_nb_kmers) { + std::vector<IO_fq_files>::iterator it_s; + vector<PE_files>::iterator it_pe; + vector<PE_files> v_PE_files=Rparms.get_PE_files(); + vector<IO_fq_files> single_files=Rparms.get_single_files(); + + for (it_pe=v_PE_files.begin();it_pe!=v_PE_files.end();it_pe++) { + cout<<"PE input files "<<it_pe->PE1.in_fq_file<<" "<<it_pe->PE2.in_fq_file<<endl; + } + for (it_s=single_files.begin();it_s!=single_files.end();it_s++) { + cout<<"SE input file(s) "<<it_s->in_fq_file<<endl; + } + CMSparams parms=Rparms.getCMSparams(); + cout<<"upper-bound coverage depth "<<parms.kappa<<endl; + cout<<"lower-bound coverage depth "<<parms.kappa_prime<<endl; + cout<<"k-mer length "<<Rparms.get_k()<<endl; + cout<<"expected no. distinct k-mers "<<Rparms.get_expected_nbKmers()<<endl; + cout<<"no. hash. function(s) "<<parms.lambda<<endl; + + cout<<"expected false positive proba. "<<Rparms.get_expected_collision_proba() <<"(cov. depth > 1)"<<endl; + for (int i=0;i<parms.lambda;i++) cout<<"no. buckets for hash. "<<i+1<<" "<<Pi_js[i]<<endl; + cout<<"no. bits per bucket "<<filter.getNbBitsForCounters()<<endl; + cout<<"count-min sketch size (Gb) "<<Rparms.getFilterSize()<<endl; + cout<<"no. input reads "<<rc.nb_input_reads<<endl; + cout<<"no. input k-mers"<<tot_nb_kmers<<endl; + unsigned long approx_nbDiffKm=filter.getApproxNbDistinctKMers(); + std::vector<unsigned long> zeroes=filter.getUnsetBuckets(); + for (int i=0;i<=parms.lambda-1;i++) cout<<"no. unset buckets for hash. "<<i+1<<zeroes[parms.lambda-1-i]<<endl; + cout<<"approx. no. distinct k-mers "<<approx_nbDiffKm<<endl; + int smallest_kappa=parms.kappa; + if (parms.kappa_prime>0) smallest_kappa=parms.kappa_prime; + float p =getCollisionProba(smallest_kappa,approx_nbDiffKm,Pi_js[0],parms.lambda); + cout<<"approx. false positive proba. "<<p<<endl; + cout<<"no. selected reads "<<rc.nb_output_reads<<endl; + for (it_pe=v_PE_files.begin();it_pe!=v_PE_files.end();it_pe++) { + cout<<"PE output files "<<it_pe->PE1.out_fq_file<<" "<<it_pe->PE2.out_fq_file<<" "<<it_pe->PE1.undef_fq_file<<" "<<it_pe->PE2.undef_fq_file<<endl; + } + for (it_s=single_files.begin();it_s!=single_files.end();it_s++) { + cout<<"SE output file(s) "<<it_s->out_fq_file<<" "<<it_s->undef_fq_file<<endl; + } +} + + int main(int argc,char * argv[]) { #ifdef BENCHMARK @@ -70,13 +113,14 @@ int main(int argc,char * argv[]) { printRUsage(); #endif srp sr; + T_read_counters rc; ROCKparams main_parms; main_parms.initFromMainOptsArgs(argc,argv); int f_id=main_parms.get_f_id(); CMSparams parms=main_parms.getCMSparams(); - printCMSparams(parms); + //printCMSparams(parms); FasqQualThreshold qual_thres=main_parms.getQualThresholds(); - printFastqQualThreshold(qual_thres); + //printFastqQualThreshold(qual_thres); std::vector<IO_fq_files> single_files=main_parms.get_single_files(); vector<PE_files> v_PE_files=main_parms.get_PE_files(); @@ -87,7 +131,7 @@ int main(int argc,char * argv[]) { cout<<"processed input args; going to start reading fastq files"<<endl; printRUsage(); #endif - processInputFiles(single_files,v_PE_files,map_id_backend,main_parms.getQualThreshold(),&sr,0); // Removing -p option, set default mode to 0 (PE processed as single). + unsigned long tot_nb_kmers=processInputFiles(single_files,v_PE_files,map_id_backend,main_parms.getQualThreshold(),&sr,0); // Removing -p option, set default mode to 0 (PE processed as single). #ifdef BENCHMARK cout<<"finished loading fastq file into sr structure"<<endl; cout<<"size of srp structure="<<sizeof(sr)<<endl; @@ -123,7 +167,7 @@ int main(int argc,char * argv[]) { printRUsage(); cout<<"Should now write output files"<<endl; #endif - writeFilteredFastq(map_id_backend,f_id,sr); + writeFilteredFastq(map_id_backend,f_id,rc,sr); #ifdef BENCHMARK cout<<"finished writing filtered reads"<<endl; printRUsage(); @@ -139,13 +183,14 @@ int main(int argc,char * argv[]) { cout<<"going to compute collision probability"<<endl; printRUsage(); #endif + /* int smallest_kappa=parms.kappa; if (parms.kappa_prime>0) smallest_kappa=parms.kappa_prime; float p =getCollisionProba(smallest_kappa,approx_nb_k_mers,Pi_js[0],parms.lambda); cout<<"estimated probability of collision:"<<p<<endl; cout<<"estimated number of distinct k-mers:"<<approx_nb_k_mers<<endl; - - if (main_parms.is_verbose()) main_parms.printVerboseInfo(); + */ + if (main_parms.is_verbose()) printVerboseInfo(main_parms,the_filter,rc,tot_nb_kmers); #ifdef BENCHMARK cout<<"finished,going to exit "<<endl; diff --git a/src/rock_commons.h b/src/rock_commons.h index 0ac2a4f150e3d10914a1221e3a4e6e6c595401ef..09533f6035b9aa29cd49f00297ca2a681a183959 100644 --- a/src/rock_commons.h +++ b/src/rock_commons.h @@ -1,5 +1,5 @@ /* - * rock_types2.h + * rock_commons.h * * Created on: Jan 20, 2016 * Author: vlegrand @@ -23,18 +23,18 @@ * (fqreader, read processing, CMS filling, result writing) and to the main program. * */ - +#define k_max_collision_proba 0.05 #define k_max_input_files 15 #define k_max_array_in_cms 50 // Theorical maximum of arrays in the count min sketch. #define k_arr_cms_size UINT_MAX // size of 1 CMS array // Store the prime numbers for modulo hashing in this array. -/* Use the 50 biggest prime numbers < 2^32. They have the propriety thatthat their binary representation 9i) +/* Use the 50 biggest prime numbers < 2^32. They have the propriety that their binary representation 9i) * contains at least 13 zeroes and (ii) at least 11 occurrences of '10'. * This disparity in the binary representation of a primary number guaranties that all bits of a numerator x * will be well affected during the x%p operation. */ -const long int Pi_js[k_max_array_in_cms]={4283781797,4283781461,4283738773,4283738453,4280985941,4280898901,4275393833,4275393173,\ +const unsigned int Pi_js[k_max_array_in_cms]={4283781797,4283781461,4283738773,4283738453,4280985941,4280898901,4275393833,4275393173,\ 4275382933,4274694821,4273296553,4273285717,4273253029,4273121621,4273034581,4272772181,\ 4272771749,4272769621,4272761509,4272728741,4272630421,4272629077,4272608581,4272608533,\ 4272599701,4272598181,4272597673,4272597353,4272597349,4272597157,4272596149,4272596117,\ @@ -62,6 +62,11 @@ typedef struct { // indicate at which index the single_or_PE_val array starts containing values for PE2 k-mers. }T_read_numericValues; +typedef struct { + unsigned int nb_input_reads; + unsigned int nb_output_reads; +}T_read_counters; + diff --git a/src/unit_test_cms.cpp b/src/unit_test_cms.cpp index cf3a0fa83e0cc26276753f1c4038c22ed34f763b..4f0bfbef0ca58be26183ed766b2efa52926b1103 100644 --- a/src/unit_test_cms.cpp +++ b/src/unit_test_cms.cpp @@ -11,8 +11,103 @@ #include <climits> using namespace std; +#include "ROCKparams.h" #include "CountMinSketch.hpp" -#include "main_utils.h" + + + +void test_getBestLambdaForN() { + ROCKparams Rparms; + unsigned long nb_k_mer=5000000000; + int best=Rparms.getBestLambdaForN(nb_k_mer,2,UINT_MAX,0.05); + assert(best==2); + nb_k_mer=200000000; + best=Rparms.getBestLambdaForN(nb_k_mer,5,UINT_MAX,0.1); + assert(best==1); + nb_k_mer=600000000; + best=Rparms.getBestLambdaForN(nb_k_mer,70,UINT_MAX,0.1); + assert(best==1); + nb_k_mer=2000000000; + best=Rparms.getBestLambdaForN(nb_k_mer,10,UINT_MAX,0.1); + assert(best==1); + nb_k_mer=10000000000; + best=Rparms.getBestLambdaForN(nb_k_mer,5,UINT_MAX,0.1); + //cout<<best<<endl; + assert(best==1); + best=Rparms.getBestLambdaForN(nb_k_mer,1,UINT_MAX,0.05); + //cout<<best<<endl; + assert(best==8); + + nb_k_mer=50000000000; + best=Rparms.getBestLambdaForN(nb_k_mer,5,UINT_MAX,0.1);// this returns 90 + assert(best==90); + best=Rparms.getBestLambdaForN(nb_k_mer,50,UINT_MAX,0.1); + assert(best==1); + + best=Rparms.getBestLambdaForN(nb_k_mer,50,UINT_MAX,0.05); + assert(best==1); + best=Rparms.getBestLambdaForN(nb_k_mer,10,UINT_MAX,0.05); + assert(best==7); +} + +void test_processIOFileArgs() { + std::vector<IO_fq_files> single_files; + vector<PE_files> v_PE_files; + std::vector<string> v_input_lines; + std::vector<string> v_output_lines; + ROCKparams main_parms; + int f_id=0; + int ret=main_parms.loadInOutFileArgs("../test/data/unit/list_input1.txt","../test/data/unit/list_output1.txt",v_input_lines,v_output_lines); + assert(ret==EXIT_SUCCESS); + ret=main_parms.processInOutFileArgs(v_input_lines,v_output_lines,single_files,v_PE_files,f_id); + assert(ret==EXIT_FAILURE); + f_id=0; + v_PE_files.clear(); + single_files.clear(); + v_input_lines.clear(); + v_output_lines.clear(); + ret=main_parms.loadInOutFileArgs("../test/data/unit/list_input2.txt","../test/data/unit/list_output2.txt",v_input_lines,v_output_lines); + ret=main_parms.processInOutFileArgs(v_input_lines,v_output_lines,single_files,v_PE_files,f_id); + assert(ret==EXIT_FAILURE); + f_id=0; + v_PE_files.clear(); + single_files.clear(); + v_input_lines.clear(); + v_output_lines.clear(); + ret=main_parms.loadInOutFileArgs("../test/data/unit/list_input3.txt","../test/data/unit/list_output3.txt",v_input_lines,v_output_lines); + ret=main_parms.processInOutFileArgs(v_input_lines,v_output_lines,single_files,v_PE_files,f_id); + assert(ret==EXIT_SUCCESS); + assert(f_id==14); + assert(single_files.size()==2); + assert(v_PE_files.size()==6); + IO_fq_files s=single_files[0]; + assert(s.in_fq_file.compare("fifi")==0); + assert(s.out_fq_file.compare("ofifi")==0); + PE_files s2=v_PE_files[5]; + assert(s2.PE2.in_fq_file.compare("nono")==0); + assert(s2.PE2.out_fq_file.compare("onono")==0); + + v_output_lines.clear(); + v_input_lines.clear(); + v_PE_files.clear(); + single_files.clear(); + ret=main_parms.loadInOutFileArgs("../test/data/unit/list_input3.txt","",v_input_lines,v_output_lines); + ret=main_parms.processInOutFileArgs(v_input_lines,v_output_lines,single_files,v_PE_files,f_id); + s=single_files[0]; + assert(s.out_fq_file.compare("fifi.rock.fq")==0); + s2=v_PE_files[5]; + assert(s2.PE2.out_fq_file.compare("nono.rock.fq")==0); + +} + + +void test_ROCKparams() { + cout<<"test computation of the best lambda value depending on nb distinct k-mers."<<endl; + test_getBestLambdaForN(); + cout<<"testing processing of IOFile arguments ( following 2 error messages are what is expected; don't worry)."<<endl; + test_processIOFileArgs(); + cout<<"done"<<endl; +} void test_CMS(int lambda,int kappa,int kappa_prime) { @@ -356,7 +451,8 @@ int main(int argc, char **argv) { return -1; } - + cout<<"testing CMS params"<<endl; + test_ROCKparams(); cout<<"testing CMS with lambda="<<lambda<<endl; test_CMS(lambda,kappa,kappa_prime); // Finally using C arrays (maps implied storing hash keys : 4 Bytes per k_mer overhead) but each array is of size INT_MAX... //cout<<"testing CMS with PE not as single strict kappa prime implementation"<<endl; diff --git a/src/unit_test_fqwriter.cpp b/src/unit_test_fqwriter.cpp index 21ad59c061b642b1a79a70d58ff975b6a2e96129..db0d8bf566b1ab383bcc3cd4b424575753db5c50 100644 --- a/src/unit_test_fqwriter.cpp +++ b/src/unit_test_fqwriter.cpp @@ -57,6 +57,7 @@ CCCCCCFFFCCCGGGGGGGGGGHHHHHHHHGHHHHHHHHHHGGGGGGGGHHGHHHHGHGHGHHHHHHHHHFHHHGGGGGG void test_write_PE() { srp sr; + T_read_counters rc; srp::reverse_iterator rit; i_dim::iterator it_offs; k_dim::iterator it_struct; @@ -98,7 +99,9 @@ void test_write_PE() { be_fq1->setOutputFile((char *) "../test/data/unit/klebsiella_PE1_filtered.fq"); be_fq2->setOutputFile((char *) "../test/data/unit/klebsiella_PE2_filtered.fq"); - writeFilteredFastq(map_id_backend,2, sr); + writeFilteredFastq(map_id_backend,2,rc,sr); + assert(rc.nb_input_reads==3); + assert(rc.nb_output_reads==3); // step 4 : re-read output files and check their content. mode_t mode = S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH; diff --git a/src/unit_test_main_utils.cpp b/src/unit_test_main_utils.cpp deleted file mode 100644 index 9e9a329d37281eb97b37c8a107789d8d18675ee8..0000000000000000000000000000000000000000 --- a/src/unit_test_main_utils.cpp +++ /dev/null @@ -1,185 +0,0 @@ -/* - * test_main_utils.cpp - * - * Created on: May 25, 2016 - * Author: vlegrand - */ - -#include <iostream> -#include <cassert> -#include <string> -#include <cmath> - - -using namespace std; - -#include "rock_commons.h" -#include "main_utils.h" -#include "ROCKparams.h" - -void test_processIOFileArgs() { - std::vector<IO_fq_files> single_files; - vector<PE_files> v_PE_files; - std::vector<string> v_input_lines; - std::vector<string> v_output_lines; - ROCKparams main_parms; - int f_id=0; - int ret=main_parms.loadInOutFileArgs("../test/data/unit/list_input1.txt","../test/data/unit/list_output1.txt",v_input_lines,v_output_lines); - assert(ret==EXIT_SUCCESS); - ret=main_parms.processInOutFileArgs(v_input_lines,v_output_lines,single_files,v_PE_files,f_id); - assert(ret==EXIT_FAILURE); - f_id=0; - v_PE_files.clear(); - single_files.clear(); - v_input_lines.clear(); - v_output_lines.clear(); - ret=main_parms.loadInOutFileArgs("../test/data/unit/list_input2.txt","../test/data/unit/list_output2.txt",v_input_lines,v_output_lines); - ret=main_parms.processInOutFileArgs(v_input_lines,v_output_lines,single_files,v_PE_files,f_id); - assert(ret==EXIT_FAILURE); - f_id=0; - v_PE_files.clear(); - single_files.clear(); - v_input_lines.clear(); - v_output_lines.clear(); - ret=main_parms.loadInOutFileArgs("../test/data/unit/list_input3.txt","../test/data/unit/list_output3.txt",v_input_lines,v_output_lines); - ret=main_parms.processInOutFileArgs(v_input_lines,v_output_lines,single_files,v_PE_files,f_id); - assert(ret==EXIT_SUCCESS); - assert(f_id==14); - assert(single_files.size()==2); - assert(v_PE_files.size()==6); - IO_fq_files s=single_files[0]; - assert(s.in_fq_file.compare("fifi")==0); - assert(s.out_fq_file.compare("ofifi")==0); - PE_files s2=v_PE_files[5]; - assert(s2.PE2.in_fq_file.compare("nono")==0); - assert(s2.PE2.out_fq_file.compare("onono")==0); - - v_output_lines.clear(); - v_input_lines.clear(); - v_PE_files.clear(); - single_files.clear(); - ret=main_parms.loadInOutFileArgs("../test/data/unit/list_input3.txt","",v_input_lines,v_output_lines); - ret=main_parms.processInOutFileArgs(v_input_lines,v_output_lines,single_files,v_PE_files,f_id); - s=single_files[0]; - assert(s.out_fq_file.compare("fifi.rock.fq")==0); - s2=v_PE_files[5]; - assert(s2.PE2.out_fq_file.compare("nono.rock.fq")==0); - -} - - -void test_getBestLambdaForN() { - unsigned long nb_k_mer=5000000000; - int best=getBestLambdaForN(nb_k_mer,2,UINT_MAX,0.05); - assert(best==2); - nb_k_mer=200000000; - best=getBestLambdaForN(nb_k_mer,5,UINT_MAX,0.1); - assert(best==1); - nb_k_mer=600000000; - best=getBestLambdaForN(nb_k_mer,70,UINT_MAX,0.1); - assert(best==1); - nb_k_mer=2000000000; - best=getBestLambdaForN(nb_k_mer,10,UINT_MAX,0.1); - assert(best==1); - nb_k_mer=10000000000; - best=getBestLambdaForN(nb_k_mer,5,UINT_MAX,0.1); - //cout<<best<<endl; - assert(best==1); - best=getBestLambdaForN(nb_k_mer,1,UINT_MAX,0.05); - //cout<<best<<endl; - assert(best==8); - - nb_k_mer=50000000000; - best=getBestLambdaForN(nb_k_mer,5,UINT_MAX,0.1);// this returns 90 - assert(best==90); - best=getBestLambdaForN(nb_k_mer,50,UINT_MAX,0.1); - assert(best==1); - - best=getBestLambdaForN(nb_k_mer,50,UINT_MAX,0.05); - assert(best==1); - best=getBestLambdaForN(nb_k_mer,10,UINT_MAX,0.05); -} - -void test_getCollisionProba() { - float p=getCollisionProba(2,5000000000,UINT_MAX,2); - assert(round(p*10000)==127); - p=getCollisionProba(2,5000000000,UINT_MAX,1); - assert(p=0.1128); - p =getCollisionProba(1,2,UINT_MAX,3); - //cout<<p<<endl; - assert(p==0.0); - p =getCollisionProba(1,1000000000,UINT_MAX,1); - assert(floor(p*1000+0.5)==23); - p =getCollisionProba(5,1000000000,UINT_MAX,1); - assert(floor(p*1000+0.5)==0); - p =getCollisionProba(5,50000000000,UINT_MAX,1); - assert(floor(p*1000+0.5)==975); -} - -void test_gammln() { - float a; - a=gammln(1); - assert(exp(a)==1); - a=gammln(2); - assert(exp(a)==1); - a=gammln(3); - assert(exp(a)==2); - a=gammln(4); - assert(exp(a)==6); - a=gammln(5); - assert(exp(a)==24); - a=gammln(6); - float tmp=exp(a); - float tmp2=tmp*10000; - float tmp3=round(tmp2)/10000; - //printf("%s \n",tmp3); - assert(tmp3==120); - /* unsigned long n=5000; - unsigned long m=n+1; - a=gammln(m); - float b=gammln(n); - double truc=exp(a-b); - assert(truc==n); - */ - -} - -void test_pmf() { - unsigned long nb_k_mers=5000000000; - float p=pmf(0,nb_k_mers,UINT_MAX); - assert(round(p*10000)==3122); - p=pmf(1,nb_k_mers,UINT_MAX); - assert(round(p*10000)==3634); - p=pmf(2,nb_k_mers,UINT_MAX); - assert(round(p*10000)==2115); -} - -void test_ccdf() { - unsigned long nb_k_mers=5000000000; - double res=ccdf(2,nb_k_mers,UINT_MAX); - assert(round(res*10000)==1128); -} - - - - - - -int main() { - cout<<"testing main_utils"<<endl; - cout<<"testing processing of IOFile arguments ( following 2 error messages are what is expected; don't worry)."<<endl; - test_processIOFileArgs(); - cout<<"testing the gammln function"<<endl; - test_gammln(); - cout<<"testing the pmf function"<<endl; - test_pmf(); - cout<<"testing the ccdf function"<<endl; - test_ccdf(); - cout<<"testing computation of collision probability."<<endl; - test_getCollisionProba(); - cout<<"test computation of the bast lambda value depending on nb distinct k-mers."<<endl; - test_getBestLambdaForN(); - cout<<"done"<<endl; -} - - diff --git a/test/rock.sh b/test/rock.sh index 753788a4a1fded8aa505155d9463f14bf46971f5..861642e3a6b70c9dbff7961f476ca5e0a6e27894 100755 --- a/test/rock.sh +++ b/test/rock.sh @@ -100,8 +100,8 @@ echo "unit testing for fqwriter" echo " " echo "#################################################################################" -echo "unit testing for main utils" -../src/unit_test_main_utils || exit 25 +echo "unit testing for math utils" +../src/unit_test_math_utils || exit 25 # cleanup echo " "