From 600ae26db4d11bbb32e4c3f37046382c05711819 Mon Sep 17 00:00:00 2001 From: Veronique Legrand <vlegrand@pasteur.fr> Date: Thu, 26 May 2016 14:28:38 +0200 Subject: [PATCH] added unit tests for main utils and fixed bug in computation of lambda and of collision probability --- src/main_utils.cpp | 23 +++++++++++++++++++---- src/main_utils.h | 1 + src/unit_test_cms.cpp | 30 ++++++++++-------------------- test/Makefile.am | 2 +- test/rock.sh | 12 +++++++++--- 5 files changed, 40 insertions(+), 28 deletions(-) diff --git a/src/main_utils.cpp b/src/main_utils.cpp index ac6653d..2f3e279 100644 --- a/src/main_utils.cpp +++ b/src/main_utils.cpp @@ -142,17 +142,32 @@ int processInOutFileArgs(const std::string& input_file,const std::string output_ int getBestLambdaForN(const unsigned long& nb_k_mers,const int& lambda_max) { int lambda=2; int min_lambda=lambda; - float tmp=1/INT_MAX; - tmp=1-tmp; + long double tmp=1.0/INT_MAX; + tmp=1.0-tmp; tmp=pow(tmp,nb_k_mers); - tmp=1-tmp; + tmp=1.0-tmp; while (lambda<=lambda_max) { min_lambda=lambda; - float p=pow(tmp,lambda); + long double p=pow(tmp,lambda); if (p<=0.01) break; lambda+=1; } return min_lambda; } +/* + * Compute collision probability + */ +float getCollisionProba(const unsigned long& nb_k_mers,const int& lambda) { + long double tmp=1.0/INT_MAX; + tmp=1.0-tmp; + tmp=pow(tmp,nb_k_mers); + tmp=1.0-tmp; + long double p=pow(tmp,lambda); + if (p<0.0001) p=0; + // rounding + p=p*1000+0.5; + return floor(p)/1000; + //return p; +} diff --git a/src/main_utils.h b/src/main_utils.h index 80cd267..50ebafe 100644 --- a/src/main_utils.h +++ b/src/main_utils.h @@ -17,6 +17,7 @@ unsigned long getNodePhysMemory(); int processInOutFileArgs(const std::string& input_file,const std::string output_file,std::vector<IO_fq_files>& single_files,std::vector<PE_files>& v_PE_files,int& f_id); int getBestLambdaForN(const unsigned long& nb_k_mers,const int& lambda_max); +float getCollisionProba(const unsigned long& nb_k_mers,const int& lambda); #endif /* MAIN_UTILS_H_ */ diff --git a/src/unit_test_cms.cpp b/src/unit_test_cms.cpp index 397ce29..47349ce 100644 --- a/src/unit_test_cms.cpp +++ b/src/unit_test_cms.cpp @@ -13,21 +13,7 @@ using namespace std; #include "CountMinSketch.hpp" -//TODO Add tests for hash functions? Yes -/* -void test_hash(int lambda,int kappa,int kappa_prime) { - int j,i; - unsigned long intermediate=ULONG_MAX; - unsigned long expected=intermediate; - CountMinSketch cms=CountMinSketch(lambda,kappa,kappa_prime); - - for (j=1;j<=lambda;j++) { - //assert(cms.hash64to32(0,j)==pi_j[j]); - } -} - -*/ void test_CMS(int lambda,int kappa,int kappa_prime) { CountMinSketch<unsigned char> cms(lambda,kappa,kappa_prime); int i; @@ -43,6 +29,9 @@ void test_CMS(int lambda,int kappa,int kappa_prime) { std::cout<<"min="<<min<<endl; assert((min==num) || (min==num-get_mask<unsigned char>::value-1)); // addKmer doesn't check kappa. + unsigned long nb_distinct_k_mers=cms.getNbDistinctKMers(); + assert(nb_distinct_k_mers==1); + // mimic a read (as given by part 2 output). readNumericValues read_values; for (i=1;i<=400;i++) { @@ -51,6 +40,9 @@ void test_CMS(int lambda,int kappa,int kappa_prime) { ret=cms.addRead(read_values); // read should be accepted since none of the values that it contains is equal to those I inserted previously. assert(ret!=rej_expected); + nb_distinct_k_mers=cms.getNbDistinctKMers(); + assert(nb_distinct_k_mers==401); + // mimic a vector that contains a lot of k_mers already present in the CMS and check that it is rejected. readNumericValues read_values2; for (i=1;i<=199;i++) { @@ -61,6 +53,10 @@ void test_CMS(int lambda,int kappa,int kappa_prime) { } ret=cms.addRead(read_values2); assert(ret==rej_expected); + + nb_distinct_k_mers=cms.getNbDistinctKMers(); + assert(nb_distinct_k_mers==401); + // test that reads with median cov<kappa_prime are rejected. ret=cms.isBeneathMinKappa(read_values); assert(ret>0); @@ -85,11 +81,5 @@ int main(int argc, char **argv) { cout<<"testing CMS with lambda="<<lambda<<endl; test_CMS(lambda,kappa,kappa_prime); // Finally using C arrays (maps implied storing hash keys : 4 Bytes per k_mer overhead) but each array is of size INT_MAX... - /*lambda=6; - cout<<"testing CMS with lambda="<<lambda<<endl; - test_CMS(lambda,kappa,kappa_prime); - lambda=8; - cout<<"testing CMS with lambda="<<lambda<<endl; - test_CMS(lambda,kappa,kappa_prime);*/ cout<<"done"<<endl; } diff --git a/test/Makefile.am b/test/Makefile.am index 9cb5109..7e30e82 100755 --- a/test/Makefile.am +++ b/test/Makefile.am @@ -4,6 +4,6 @@ LOG_COMPILER = $(SHELL) TESTS = rock.sh -EXTRA_DIST = $(TESTS) data/extract_klebsiella_long_reads_100.txt data/extract_klebsiella_long_reads_100_filtered.txt data/fastq.raw/klebsiella_100_1.fq data/fastq.raw/klebsiella_100_2.fq data/unit/klebsiella_PE1.fq data/unit/klebsiella_PE2.fq data/unit/test_PE1.fq data/unit/test_PE2.fq data/unit/test_PE1_2.fq data/unit/test_PE2_2.fq data/unit/test_single2.fq data/unit/test_single.fq +EXTRA_DIST = $(TESTS) data/extract_klebsiella_long_reads_100.txt data/extract_klebsiella_long_reads_100_filtered.txt data/fastq.raw/klebsiella_100_1.fq data/fastq.raw/klebsiella_100_2.fq data/unit/klebsiella_PE1.fq data/unit/klebsiella_PE2.fq data/unit/test_PE1.fq data/unit/test_PE2.fq data/unit/test_PE1_2.fq data/unit/test_PE2_2.fq data/unit/test_single2.fq data/unit/test_single.fq data/unit/list_input1.txt data/unit/list_output1.txt data/unit/list_input2.txt data/unit/list_output2.txt data/unit/list_input3.txt data/unit/list_output3.txt diff --git a/test/rock.sh b/test/rock.sh index 2c699ed..649fdbf 100755 --- a/test/rock.sh +++ b/test/rock.sh @@ -3,10 +3,12 @@ ## Debug mode test "x$VERBOSE" = "xx" && set -x -pwd -ls -l [ -z "$srcdir" ] && srcdir="." -echo ${srcdir} +echo "srcdir"=${srcdir} + +## doing some cleanup in case previous execution failed. +rm -fr data/fastq.filtered || echo "nothing to clean up" + ## check options echo " " echo "##################################################################################" @@ -116,6 +118,10 @@ echo "########################################################################## echo "unit testing for fqwriter" ../src/unit_test_fqwriter || exit 25 +echo " " +echo "#################################################################################" +echo "unit testing for main utils" +../src/unit_test_main_utils || exit 26 # cleanup echo " " -- GitLab