diff --git a/JolyTree.sh b/JolyTree.sh
index d0a7224612348667eba8edd073fecddfb936c6fa..0dcf55e6207840fd4a69b43fce72b3ecc7257b91 100755
--- a/JolyTree.sh
+++ b/JolyTree.sh
@@ -4,7 +4,7 @@
 #                                                                                                           #
 # JolyTree: fast distance-based phylogenetic inference from unaligned genome sequences                      #
 #                                                                                                           #
-# Copyright (C) 2017,2018,2019  Alexis Criscuolo                                                            #
+  COPYRIGHT="Copyright (C) 2017-2020 Institut Pasteur"                                                      #
 #                                                                                                           #
 # This program  is free software:  you can  redistribute it  and/or modify it  under the terms  of the GNU  #
 # General Public License as published by the Free Software Foundation, either version 3 of the License, or  #
@@ -17,13 +17,13 @@
 # You should have received a copy of the  GNU General Public License along with this program.  If not, see  #
 # <http://www.gnu.org/licenses/>.                                                                           #
 #                                                                                                           #
-#  Contact:                                                                                                 #
-#  Institut Pasteur                                                                                         #
-#  Bioinformatics and Biostatistics Hub                                                                     #
-#  C3BI, USR 3756 IP CNRS                                                                                   #
-#  Paris, FRANCE                                                                                            #
-#                                                                                                           #
-#  alexis.criscuolo@pasteur.fr                                                                              #
+# Contact:                                                                                                  #
+#  Alexis Criscuolo                                                            alexis.criscuolo@pasteur.fr  #
+#  Genome Informatics & Phylogenetics (GIPhy)                                             giphy.pasteur.fr  #
+#  Bioinformatics and Biostatistics Hub                                 research.pasteur.fr/team/hub-giphy  #
+#  USR 3756 IP CNRS                          research.pasteur.fr/team/bioinformatics-and-biostatistics-hub  #
+#  Dpt. Biologie Computationnelle                     research.pasteur.fr/department/computational-biology  #
+#  Institut Pasteur, Paris, FRANCE                                                     research.pasteur.fr  #
 #                                                                                                           #
 #############################################################################################################
 
@@ -33,8 +33,16 @@
 # = VERSIONS =                                                                                              #
 # ============                                                                                              #
 #                                                                                                           #
-  VERSION=1.1b.191021ac                                                                                     #
-# + exit 0 when displaying the doc                                                                          #
+  VERSION=2.0.190926ac                                                                                      #
+# + new F81/EI transformation formula using gamma shape parameter (option -a = 1.5 by default)              #
+# + option -f to to use the 4 nucleotide frequencies in F81/EI transformation; by default, to deal with     #
+#   multiple contig files, JolyTree sets f(A)=f(T)=0.5*(A+T)/(A+C+G+T) and f(C)=f(G)=0.5*(C+G)/(A+C+G+T)    #
+# + option -s modified                                                                                      #
+#                                                                                                           #
+# VERSION=1.2.190729ac                                                                                      #
+# + 3x faster minhash estimations on multiple threads                                                       #
+# + ratchet-based BME tree search based on multiple threads                                                 #
+# + temporary files deleted when JolyTree is interrupted                                                    #
 #                                                                                                           #
 # VERSION=1.1.181205ac                                                                                      #
 # + option -q to set desired probability of observing a random k-mer                                        #
@@ -81,23 +89,32 @@
 #                                                                                                           #
   if [ "$1" = "-?" ] || [ "$1" = "-h" ] || [ $# -le 1 ]                                                     #
   then                                                                                                      #
+    echo -e "\n\033[1m JolyTree v$VERSION                         $COPYRIGHT\033[0m";                       #
     cat << EOF                                                       
 
- JolyTree v.$VERSION
+ Criscuolo A  (2019)  A fast alignment-free  bioinformatics procedure to  infer accurate 
+ distance-based phylogenetic trees from genome assemblies. RIO. doi:10.3897/rio.5.e36178 
+
+ USAGE:  JolyTree.sh  -i <directory>  -b <basename>  [options]
+
+ OPTIONS:
 
- USAGE:
-    JolyTree.sh  [options]
- where:
     -i <directory>  directory name containing  FASTA-formatted contig files;  only files
                     ending with .fa, .fna, .fas or .fasta will be considered (mandatory)
     -b <basename>   basename of every written output file (mandatory)
-    -s <int>        sketch size (default: 25% of the largest genome size)
-    -q <double>     probability of observing a random k-mer (default: 0.0001)
-    -k <int>        k-mer size (default: estimated from the average genome size with the
+    -q <real>       probability of observing a random k-mer (default: 0.000000001)
+    -k <int>        k-mer size (default: estimated from the largest genome size with the
                     probability set by option -q)
+    -s <real|int>   sketch size estimated as the  proportion (up to 1.00) of the average 
+                    genome size;  the sketch size (integer > 2) can also be directly set 
+                    using this option (default: 0.25)
     -c <real>       if at least one of the estimated p-distances is above this specified
-                    cutoff, then a F81 correction is performed (default: 0.1)
-    -n              no BME tree inference (only pairwise distance estimation)
+                    cutoff, then a F81/EI correction is performed (default: 0.1)
+    -a <real>       F81/EI transformation gamma shape parameter (default: 1.5)
+    -f              using  the  four nucleotide  frequencies in  F81/EI  transformations  
+                    (default:  to  deal  with  multiple  contig  files,   only  the  two 
+                    frequencies A+T and C+G are used)
+    -n              no BME tree inference (only pairwise distance estimates)
     -r <int>        number of steps  when performing the  ratchet-based  BME tree search
                     (default: 100)
     -t <int>        number of threads (default: 2)
@@ -174,6 +191,7 @@ EOF
 #                                                                                                           #
 #                                                                                                           #
 #                                                                                                           #
+#                                                                                                           #
 # [2] EXECUTE PERMISSION =================================================================================  #
 #  In order to run JolyTree, give the execute permission on the script JolyTree.sh:                         #
 #    chmod +x JolyTree.sh                                                                                   #
@@ -195,7 +213,7 @@ EOF
 #  Moreover, it is also possible to launch JolyTree on  multiple cores on a cluster managed by Slurm. For   #
 #  this particular case, you should first uncomment the following line:                                     #
 #                                                                                                           #
-#  EXEC="srun -n 1 -N 1 $EXEC";                                                                             #
+#  EXEC="srun -n 1 -N 1 -Q $EXEC";                                                                          #
 #                                                                                                           #
 #  and launch JolyTree via the following command line models (with t = number of cores):                    #
 #    salloc <Slurm options> -n $t  ./JolyTree.sh  <JolyTree options>  -t $t                                 #
@@ -215,58 +233,102 @@ if [ ! $(command -v $GAWK) ];   then echo "$GAWK not found"   >&2 ; exit 1 ; fi
 if [ ! $(command -v $FASTME) ]; then echo "$FASTME not found" >&2 ; exit 1 ; fi
 if [ ! $(command -v $REQ) ];    then echo "$REQ not found"    >&2 ; exit 1 ; fi
 
-DATADIR="N.O.D.I.R";            # -i (mandatory)
-BASEFILE="N.O.B.A.S.E.F.I.L.E"; # -b (mandatory)
+export LC_ALL=C ;
+
+DATADIR="N.O_D.I.R";            # -i (mandatory)
+BASEFILE="N.O_B.A.S.E.F.I.L.E"; # -b (mandatory)
 
-SKETCH=0;                       # -s (auto from data)
-Q=0.00001;                      # -q (0.00001)
+SKETCH=0.25;                    # -s (0.25)
+Q=0.000000001;                  # -q (0.000000001)
 K=0;                            # -k (auto from -q)
 CUTOFF=0.1;                     # -c (0.1)
+ALPHA=1.5;                      # -a (1.5)
+NFQ=2;                          # -f (none)
 
 INFERTREE=true;                 # -n (none)
 RATCHET=100;                    # -r (100)
 RATCHET_LIMIT=200;              #    (static)
 
 NPROC=2;                        # -t (2)
+CHUNK=20;                       # -h (20)
 WAITIME=0.5;                    #    (auto from -t)
 
-while getopts :i:b:s:q:k:c:d:r:t:n option
+while getopts :i:b:s:q:k:c:a:d:r:t:h:nf option
 do
   case $option in
-    i) DATADIR="$OPTARG"                                  ;;
-    b) BASEFILE="$OPTARG"                                 ;;
-    s) SKETCH=$OPTARG                                     ;;
-    q) Q=$OPTARG                                          ;;
-    k) K=$OPTARG                                          ;;
-    c) CUTOFF=$OPTARG                                     ;;
-    n) INFERTREE=false                                    ;;
-    r) RATCHET=$OPTARG                                    ;;
-    t) NPROC=$OPTARG                                      ;;
-    :) echo "option $OPTARG : missing argument" ; exit 1  ;;
-   \?) echo "$OPTARG : option invalide" ;         exit 1  ;;
+    i) DATADIR="$OPTARG"                                                            ;;
+    b) BASEFILE="$OPTARG"                                                           ;;
+    s) SKETCH=$OPTARG                                                               ;;
+    q) Q="$($GAWK -v x=$OPTARG 'BEGIN{printf "%.20f", x+0}' | sed 's/0*$//g')"      ;;
+    k) K=$OPTARG                                                                    ;;
+    c) CUTOFF="$($GAWK -v x=$OPTARG 'BEGIN{printf "%.20f", x+0}' | sed 's/0*$//g')" ;;
+    a) ALPHA="$($GAWK -v x=$OPTARG 'BEGIN{printf "%.20f", x+0}' | sed 's/0*$//g')"  ;;
+    f) NFQ=4                                                                        ;;
+    n) INFERTREE=false                                                              ;;
+    r) RATCHET=$OPTARG                                                              ;;
+    h) CHUNK=$OPTARG                                                                ;;
+    t) NPROC=$OPTARG                                                                ;;
+    :) echo "missing argument ($OPTARG)" >&2 ; exit 1                               ;;
+   \?) echo "invalid option ($OPTARG)"   >&2 ; exit 1                               ;;
   esac
 done
-if [ "$DATADIR" == "N.O.D.I.R" ];             then echo "genome directory is not specified (option -i)" >&2 ; exit 1 ; fi
-if [ ! -e "$DATADIR" ];                       then echo "genome directory does not exist (option -i)"   >&2 ; exit 1 ; fi
-if [ ! -d "$DATADIR" ];                       then echo "$DATADIR is not a directory (option -i)"       >&2 ; exit 1 ; fi
-if [ "$BASEFILE" == "N.O.B.A.S.E.F.I.L.E" ];  then echo "basename is not specified (option -b)"         >&2 ; exit 1 ; fi
-if [ $SKETCH -ne 0 ] && [ $SKETCH -le 1000 ]; then echo "sketch size $SKETCH is too low (option -s)"    >&2 ; exit 1 ; fi
-
-### verifying the number of threads
+if [ "$DATADIR" == "N.O_D.I.R" ];             then echo "genome directory is not specified (option -i)"       >&2 ; exit 1 ; fi
+if [ ! -e "$DATADIR" ];                       then echo "genome directory does not exist (option -i)"         >&2 ; exit 1 ; fi
+if [ ! -d "$DATADIR" ];                       then echo "$DATADIR is not a directory (option -i)"             >&2 ; exit 1 ; fi
+if [ "$BASEFILE" == "N.O_B.A.S.E.F.I.L.E" ];  then echo "basename is not specified (option -b)"               >&2 ; exit 1 ; fi
+if [ $(echo "$SKETCH<=0" | bc) -ne 0 ];       then echo "incorrect value: $SKETCH (option -s)"                >&2 ; exit 1 ; fi
+if ! [[ $K =~ ^[0-9]+$ ]];                    then echo "incorrect k-mer size: $K (option -k)"                >&2 ; exit 1 ; fi
+if ! [[ $Q =~ ^0\.[0-9]+$ ]];                 then echo "incorrect probability q: $Q (option -q)"             >&2 ; exit 1 ; fi
+if ! [[ $CUTOFF =~ ^0\.[0-9]+$ ]];            then echo "incorrect cutoff: $CUTOFF (option -c)"               >&2 ; exit 1 ; fi
+if ! [[ $ALPHA =~ ^[0-9]+\.[0-9]+$ ]];        then echo "incorrect parameter: $ALPHA (option -a)"             >&2 ; exit 1 ; fi
+if ! [[ $RATCHET =~ ^[0-9]+$ ]];              then echo "incorrect ratchet step number: $RATCHET (option -r)" >&2 ; exit 1 ; fi
+if ! [[ $NPROC =~ ^[0-9]+$ ]];                then echo "incorrect value: $NPROC (option -t)"                 >&2 ; exit 1 ; fi
+
+### verifying the number of threads #########################################################################
 [ $NPROC -le 0 ] && NPROC=2;
 echo "$NPROC thread(s)" ;
-WAITIME=$($GAWK -v x=$NPROC 'BEGIN{print 1/sqrt(x)'});
-
-### gathering the genome list
-GLIST=$(ls $DATADIR/*.fna $DATADIR/*.fas $DATADIR/*.fa $DATADIR/*.fasta 2> /dev/null | sort);
+WAITIME=$($GAWK -v x=$NPROC 'BEGIN{print 1/sqrt(x)}');
+						     
+### gathering the genome list ###############################################################################
+GLIST=$(ls $DATADIR/*.fna $DATADIR/*.fas $DATADIR/*.fa $DATADIR/*.fasta 2>/dev/null);
+if   [ $(grep -c -F " " <<<"$GLIST") -ne 0 ]
+then
+  echo "found blank space(s) within the following file name(s):"                              >&2 ;
+  ls $DATADIR/*.fna $DATADIR/*.fas $DATADIR/*.fa $DATADIR/*.fasta 2>/dev/null | grep " "     1>&2 ;
+  echo "please avoid any blank space within every file name"                                  >&2 ; exit 1 ;
+elif [ $(grep -c -F "'" <<<"$GLIST") -ne 0 ]
+then
+  echo "found single quote(s) within the following file name(s):"                             >&2 ;
+  ls $DATADIR/*.fna $DATADIR/*.fas $DATADIR/*.fa $DATADIR/*.fasta 2>/dev/null | grep -F "'"  1>&2 ;
+  echo "please avoid any single quote within every file name"                                 >&2 ; exit 1 ;
+elif [ $(grep -c -F "\"" <<<"$GLIST") -ne 0 ]
+then
+  echo "found double quote(s) within the following file name(s):"                             >&2 ;
+  ls $DATADIR/*.fna $DATADIR/*.fas $DATADIR/*.fa $DATADIR/*.fasta 2>/dev/null | grep -F "\"" 1>&2 ;
+  echo "please avoid any double quote within every file name"                                 >&2 ; exit 1 ;
+elif [ $(grep -c -F ";" <<<"$GLIST") -ne 0 ]
+then
+  echo "found semicolon(s) within the following file name(s):"                                >&2 ;
+  ls $DATADIR/*.fna $DATADIR/*.fas $DATADIR/*.fa $DATADIR/*.fasta 2>/dev/null | grep -F ";"  1>&2 ;
+  echo "please avoid any semicolon within every file name"                                    >&2 ; exit 1 ;
+elif [ $(grep -c -F "," <<<"$GLIST") -ne 0 ]
+then
+  echo "found comma(s) within the following file name(s):"                                    >&2 ;
+  ls $DATADIR/*.fna $DATADIR/*.fas $DATADIR/*.fa $DATADIR/*.fasta 2>/dev/null | grep -F ","  1>&2 ;
+  echo "please avoid any comma within every file name"                                        >&2 ; exit 1 ;
+fi
+GLIST=$(ls $DATADIR/*.fna $DATADIR/*.fas $DATADIR/*.fa $DATADIR/*.fasta 2>/dev/null | sort);
 n=$(echo $GLIST | $GAWK '{print NF}');
-if [ $n -lt 4 ]; then echo "directory $DATADIR should contain at least 4 files *.fna, *.fas, *.fasta or *.fa" >&2 ; exit 1 ; fi
+if [ $n -lt 4 ]
+then
+  echo "directory $DATADIR should contain at least 4 files *.fna, *.fas, *.fasta or *.fa"     >&2 ; exit 1 ;
+fi
 echo "$n taxa" ;
 
-### creating output file names
+### creating output file names ##############################################################################
 ACGT=$BASEFILE.acgt;    # ACGT content of each input genome
 OEPL=$BASEFILE.oepl;    # p-distance estimates in OEPL (One Entry Per Line) format 
-DFILE=$BASEFILE.d;      # evolutioanry distances in PHYLIP square format
+DMAT=$BASEFILE.d;       # evolutionary distances in PHYLIP square format
 
 
 #############################################################################################################
@@ -275,13 +337,35 @@ DFILE=$BASEFILE.d;      # evolutioanry distances in PHYLIP square format
 #############################################################################################################
 #############################################################################################################
 
-### estimating ACGT content
+function ctrl_c() {
+  echo -n " process interrupted: deleting files ... " ;
+  sleep 5 ;
+  for f in $GLIST
+  do
+    rm -f ${f%.*}.msh ;
+  done
+  rm -f $ACGT $ACGT.tmp $OEPL $DMAT ;
+  echo "[ok]" ;
+  exit 1 ;
+}
+trap  ctrl_c  INT ; 
+
+### estimating ACGT content #################################################################################
 rm -f $ACGT ; 
 for f in $GLIST
 do
+  if [ ! -e $f ]
+  then
+    sleep 1 ; echo "ERROR!" >&2 ;
+    sleep 1 ; echo "blank space in  file name starting by:  $f" >&2 ;
+    sleep 1 ; echo "please avoid any blank space within every file name" >&2 ;
+    sleep 1 ; for f in $GLIST ; do if [ -e $f.nfh ]; then rm -f $f.nfh ; fi ; done
+    rm -f $ACGT ;
+    exit 1 ;
+  fi
   echo "parsing $(basename ${f%.*})" >&2 ;
-  $EXEC "x=\$($GAWK '! /^>/{i=split(\$0,c,\"\");++i;while(--i>0)w[c[i]]++}END{print w[\"A\"]+w[\"a\"]\" \"w[\"C\"]+w[\"c\"]\" \"w[\"G\"]+w[\"g\"]\" \"w[\"T\"]+w[\"t\"]}' $f); flock -x $ACGT echo \"$(basename $f) \$x\" >> $ACGT;" &
-  while [ $(jobs -r | wc -l) -gt $NPROC ]; do sleep $WAITIME ; done
+  $EXEC "grep -v '^>' $f > $f.nfh; a=\$(tr -cd Aa <$f.nfh | wc -c); c=\$(tr -cd Cc <$f.nfh | wc -c); g=\$(tr -cd Gg <$f.nfh | wc -c); t=\$(tr -cd Tt <$f.nfh | wc -c); flock -x $ACGT echo $(basename $f) \$a \$c \$g \$t >> $ACGT; rm -f $f.nfh;" &
+  while [ $(jobs -r | wc -l) -ge $NPROC ]; do sleep $WAITIME ; done
 done
 
 wait ;
@@ -289,55 +373,84 @@ wait ;
 sort $ACGT > $ACGT.tmp ;
 mv $ACGT.tmp $ACGT ;
 
-### estimating k-mer and sketch size
-[ $K -le 0 ] && K=$($GAWK -v q=$Q '{n=$2+$3+$4+$5;kc=int(log(n*(1-q)/q)/log(4))+1;k=(kc>k)?kc:k}END{print k}' $ACGT) && [ $K -le 0 ] && k=19;
+### estimating k-mer and sketch size ########################################################################
+[ $K -le 0 ] && K=$($GAWK -v q=$Q '{n=$2+$3+$4+$5;kc=int(log(n*(1-q)/q)/log(4)+0.5);k=(kc>k)?kc:k}END{print k}' $ACGT) && [ $K -le 0 ] && k=19;
 echo "k-mer size: $K (q=$Q)" ;
-[ $SKETCH -le 0 ] && SKETCH=$($GAWK '{s+=$2+$3+$4+$5;n+=4}END{printf("%d\n", 100000*int((s/n)/100000))}' $ACGT) && [ $SKETCH -eq 0 ] && SKETCH=10000;
+[ $(echo "$SKETCH<=1" | bc) -ne 0 ] && SKETCH=$($GAWK -v s=$SKETCH '{n+=$2+$3+$4+$5}END{printf("%d\n", 1000*int((s*n/NR)/1000))}' $ACGT) && [ $SKETCH -eq 0 ] && SKETCH=10000;
+[ $(echo "$SKETCH>1"  | bc) -ne 0 ] && SKETCH=$(echo "($SKETCH+0.5)/1" | bc);
 echo "sketch size: $SKETCH" ;
 
-### sketching genomes
+### sketching genomes #######################################################################################
 TLIST="" ;
 for f in $GLIST
 do
   TLIST="$TLIST $(basename ${f%.*})" ;
   echo "sketching $(basename ${f%.*})" >&2 ;
   $EXEC "$MASH sketch -o ${f%.*} -s $SKETCH -k $K $f" &> /dev/null &
-  while [ $(jobs -r | wc -l) -gt $NPROC ]; do sleep $WAITIME ; done
+  while [ $(jobs -r | wc -l) -ge $NPROC ]; do sleep $WAITIME ; done
 done
 
+
 wait ; 
 
+
 #############################################################################################################
 #############################################################################################################
-#### DISTANCE ESTIMATES                                                                                  ####
+#### P-DISTANCE ESTIMATES                                                                                ####
 #############################################################################################################
 #############################################################################################################
 
-### estimating and writing pairwise p-distances
+### estimating and writing pairwise p-distances #############################################################
 echo $TLIST > $OEPL ;
 a=($(ls $DATADIR/*.msh | sort));
-i=${#a[@]}; 
+i=${#a[@]};
 while [ $((j=--i)) -ge 0 ]
 do
   mi=${a[$i]};
   ti=$(basename ${mi%.*});
   while [ $((--j)) -ge 0 ]
   do
-    mj=${a[$j]};
-    tj=$(basename ${mj%.*});
-    echo "estimating p-distance between $ti ($(( $i + 1 ))) and $tj ($(( $j + 1 )))" >&2 ;
-    $EXEC "d=\$(timeout 5 $MASH dist -s $SKETCH $mi $mj | $GAWK '{printf(\"%.8f\\n\",\$3)}'); [ -n \"\$d\" ] && flock -x $OEPL echo \"$(( $i + 1 )) $(( $j + 1 )) \$d\" >> $OEPL ;" &
-    while [ $(jobs -r | wc -l) -gt $NPROC ]; do sleep $WAITIME ; done
+    if [ $CHUNK -eq 1 ] || [ $j -eq 0 ]
+    then
+      mj=${a[$j]};
+      tj=$(basename ${mj%.*});
+      echo "estimating p-distance between $ti ($(( $i + 1 ))) and $tj ($(( $j + 1 )))" >&2 ;
+      $EXEC "d=\$(timeout 5 $MASH dist -s $SKETCH $mi $mj | $GAWK '{printf(\"%.8f\\n\",1-exp(-\$3))}'); [ -n \"\$d\" ] && flock -x $OEPL echo \"$(( $i + 1 )) $(( $j + 1 )) \$d\" >> $OEPL ;" &
+    else
+      chk=$CHUNK;
+      let j++;
+      if [ $j -le $(( $CHUNK - 1 )) ]; then chk=$j; fi
+      mlist="";
+      c=$chk;
+      while [ $((--c)) -ge 0 ]
+      do
+	let j--;
+	mj=${a[$j]};
+	mlist="$mlist $mj";
+        tj=$(basename ${mj%.*});
+        echo "estimating p-distance between $ti ($(( $i + 1 ))) and $tj ($(( $j + 1 )))" >&2 ;
+      done
+      $EXEC "i=\$(( $i + 1 )); j=\$(( $j + $chk + 1 )); out=; for mj in $mlist; do d=\$(timeout 5 $MASH dist -s $SKETCH $mi \$mj | $GAWK '{printf(\"%.8f\\n\",(\$3==1)?1:1-exp(-\$3))}'); j=\$(( \$j - 1 )); [ -n \"\$d\" ] && out=\"\$out\"\"\$i \$j \$d\\n\"; done ; [ -n \"\$out\" ] && flock -x $OEPL echo -n -e \$out >> $OEPL ;" &
+    fi
+    while [ $(jobs -r | wc -l) -ge $NPROC ]; do sleep $WAITIME ; done
   done
 done
 
-wait ;
+### waiting up to one minute ################################################################################
+### if any job is still running (very rare), then killing everything and next re-estimating one by one 
+for t in {1..121}
+do
+  echo -n "." >&2 ;
+  [ $(jobs -r | wc -l) -eq 0 ] && break ;
+  [ $t -eq 121 ] && jobs -p | xargs kill &>/dev/null;
+  sleep 0.5 ;
+done
 
-### verifying every p-distance estimates
+### verifying every p-distance estimate #####################################################################
 $GAWK '(NR==1){n=NF;while((j=++i)<=n)while(--j>0)d[i][j]=d[j][i]=-1;next}
               {d[$1][$2]=(d[$2][$1]=$3)}
        END    {i=0;while((j=++i)<=n)while(--j>0)if(d[i][j]<0||d[j][i]<0)print i"\t"j}' $OEPL |
-  while read i j
+  while read -r i j
   do
     let i--;
     mi=${a[$i]};
@@ -346,30 +459,94 @@ $GAWK '(NR==1){n=NF;while((j=++i)<=n)while(--j>0)d[i][j]=d[j][i]=-1;next}
     mj=${a[$j]};
     tj=$(basename ${mj%.*});
     echo "re-estimating p-distance between $ti ($(( $i + 1 ))) and $tj ($(( $j + 1 )))" >&2 ;
-    d=$($MASH dist -s $SKETCH $mi $mj | $GAWK '{printf("%.8f\n",$3)}'); flock -x $OEPL echo "$(( $i + 1 )) $(( $j + 1 )) $d" >> $OEPL ;
-  done
+    d=$($MASH dist -s $SKETCH $mi $mj | $GAWK '{printf("%.8f\n",($3==1)?1:1-exp(-$3))}');
+    echo "$(( $i + 1 )) $(( $j + 1 )) $d" ;
+  done >> $OEPL 
+
+echo " [ok]" >&2 ;
 
-wait ;
 
-### transforming (if required) p-distances and writing in PHYLIP square format
+#############################################################################################################
+#############################################################################################################
+#### EVOLUTIONARY DISTANCE ESTIMATES                                                                     ####
+#############################################################################################################
+#############################################################################################################
+
+### transforming (if required) p-distances and writing in PHYLIP square format ##############################
+### F81/EI transformations can be forced with option -c 0
+### to have PC distances (option -a 0) or only p-distances (option -a > 0), F81/EI transformations should be prevented with option -c 1
+
 if [ -n "$($GAWK -v c=$CUTOFF '(NR==1){next}($3>c){print;exit}' $OEPL)" ]
 then
-  $GAWK -v p=8 'function s(x){return x*x}
-                (ARGIND==1)        {++x;sx=$2+$3+$4+$5;a[x]=$2/sx;c[x]=$3/sx;g[x]=$4/sx;t[x]=$5/sx}
-                (ARGIND==2&&FNR==1){while(++n<=NF){m=(m>(l=length(lbl[n]=$n)))?m:l;d[n][n]=0}--n;  print(b=" ")n;x=0.5;while((x*=2)<m)b=b""b;  next}
-                (ARGIND==2)        {d[$1][$2]=(d[$2][$1]=$3)}
-                END                {while(++i<=n){printf substr(lbl[i]b,1,m);ai=a[i];ci=c[i];gi=g[i];ti=t[i];j=0;
-                                      while(++j<=n)printf(" %."p"f",((dij=d[i][j])==0)?0:((x=1-dij/(1-ai*a[j]-ci*c[j]-gi*g[j]-ti*t[j]))>0)?((s(ai+a[j])+s(ci+c[j])+s(gi+g[j])+s(ti+t[j]))/4-1)*log(x):1.23456789);
-                                      print""}}' $ACGT $OEPL > $DFILE ;
-  echo "F81 distances written into $DFILE" ;
+  if [ $(echo "$ALPHA<=0" | bc) -ne 0 ]
+  then
+    ### F81/EI transformation without gamma shape parameter (option -a 0) ###################################
+    $GAWK -v p=8 -v f=$NFQ 'function s(x){return x*x}
+                            (ARGIND==1)        {++x;sx=$2+$3+$4+$5;if(f==2){a[x]=t[x]=0.5*($2+$5)/sx;c[x]=g[x]=0.5*($3+$4)/sx}else{a[x]=$2/sx;c[x]=$3/sx;g[x]=$4/sx;t[x]=$5/sx}}
+                            (ARGIND==2&&FNR==1){while(++n<=NF){m=(m>(l=length(lbl[n]=$n)))?m:l;d[n][n]=0}--n;  print(b=" ")n;x=0.5;while((x*=2)<m)b=b""b;  next}
+                            (ARGIND==2)        {d[$1][$2]=(d[$2][$1]=$3)}
+                            END                {while(++i<=n){
+                                                  printf substr(lbl[i]b,1,m);ai=a[i];ci=c[i];gi=g[i];ti=t[i];j=0;
+                                                  while(++j<=n)
+                                                    printf(" %."p"f",((dij=d[i][j])==0)?0:((x=1-dij/(1-ai*a[j]-ci*c[j]-gi*g[j]-ti*t[j]))>0)?((s(ai+a[j])+s(ci+c[j])+s(gi+g[j])+s(ti+t[j]))/4-1)*log(x):9.99999999);
+                                                  print""}  }' $ACGT $OEPL > $DMAT ;
+    echo "F81/EI distances written into $DMAT" ;
+  else
+    ### F81/EI transformation with gamma shape parameter (option -a > 0) ####################################
+    $GAWK -v p=8 -v f=$NFQ -v alp=$ALPHA 'function s(x){return x*x}
+                                          (ARGIND==1)        {++x;sx=$2+$3+$4+$5;if(f==2){a[x]=t[x]=0.5*($2+$5)/sx;c[x]=g[x]=0.5*($3+$4)/sx}else{a[x]=$2/sx;c[x]=$3/sx;g[x]=$4/sx;t[x]=$5/sx}}
+                                          (ARGIND==2&&FNR==1){while(++n<=NF){m=(m>(l=length(lbl[n]=$n)))?m:l;d[n][n]=0}--n;  print(b=" ")n;x=0.5;while((x*=2)<m)b=b""b;  next}
+                                          (ARGIND==2)        {d[$1][$2]=(d[$2][$1]=$3)}
+                                          END                {while(++i<=n){
+                                                                printf substr(lbl[i]b,1,m);ai=a[i];ci=c[i];gi=g[i];ti=t[i];j=0;
+                                                                while(++j<=n)
+                                                                  printf(" %."p"f",((dij=d[i][j])==0)?0:((x=1-dij/(1-ai*a[j]-ci*c[j]-gi*g[j]-ti*t[j]))>0)?alp*(1-(s(ai+a[j])+s(ci+c[j])+s(gi+g[j])+s(ti+t[j]))/4)*((x**(-1/alp))-1):9.99999999);
+                                                                print""}  }' $ACGT $OEPL > $DMAT ;
+    echo "F81/EI distances (a=$ALPHA) written into $DMAT" ;
+  fi    
 else
-  $GAWK -v p=8 '(NR==1){while(++n<=NF){m=(m>(l=length(lbl[n]=$n)))?m:l;d[n][n]=0}--n;  print(b=" ")n;x=0.5;while((x*=2)<m)b=b""b;  next}  
-                       {d[$1][$2]=(d[$2][$1]=$3)}
-                END    {while(++i<=n){printf substr(lbl[i]b,1,m);j=0;while(++j<=n)printf(" %."p"f",d[i][j]);print""}}' $OEPL > $DFILE ;
-  echo "p-distances written into $DFILE" ;
+  if [ $(echo "$ALPHA<=0" | bc) -ne 0 ]
+  then
+    ### PC transformation (option -a 0) #####################################################################
+    $GAWK -v p=8 '(NR==1){while(++n<=NF){m=(m>(l=length(lbl[n]=$n)))?m:l;d[n][n]=0}--n;  print(b=" ")n;x=0.5;while((x*=2)<m)b=b""b;  next}  
+                         {d[$1][$2]=(d[$2][$1]=$3)}
+                  END    {while(++i<=n){
+                            printf substr(lbl[i]b,1,m);j=0;
+                            while(++j<=n)
+                              printf(" %."p"f",((dij=d[i][j])==0)?0:(dij<1)?-log(1-dij):9.99999999);
+                            print""}  }' $OEPL > $DMAT ;
+    echo "PC distances written into $DMAT" ;
+  else
+    ### p-distance without transformation (option -a > 0) ###################################################
+    $GAWK -v p=8 '(NR==1){while(++n<=NF){m=(m>(l=length(lbl[n]=$n)))?m:l;d[n][n]=0}--n;  print(b=" ")n;x=0.5;while((x*=2)<m)b=b""b;  next}  
+                         {d[$1][$2]=(d[$2][$1]=$3)}
+                  END    {while(++i<=n){
+                            printf substr(lbl[i]b,1,m);j=0;
+                            while(++j<=n)
+                              printf(" %."p"f",((dij=d[i][j])==0)?0:(dij<1)?dij:9.99999999);
+                            print""}  }' $OEPL > $DMAT ;
+    echo "p-distances written into $DMAT" ;
+  fi
+fi
+
+### estimating (if required) missing distances, i.e. every missing dij is replaced by min[dix+djx] ##########
+miss=$(grep -o -F " 9.99999999" $DMAT | wc -l);
+if [ $miss -ne 0 ]
+then
+  echo "estimating $(( $miss / 2 )) missing distances inside $DMAT" ;
+  $GAWK '(NR>1){m=(m>(l=length(lbl[++n]=$(c=j=1))))?m:l;--j;while(++c<=NF){d[n][++j]=$c;(max<$c)&&max=$c}}
+         END{while(++i<=n){j=0;
+               while(++j<i){x=0;min=2*max;
+                 if(d[i][j]!=9.99999999)continue;
+                 while(++x<=n)x!=i&&x!=j&&d[i][x]!=9.99999999&&d[j][x]!=9.99999999&&d[i][x]>=0&&d[j][x]>=0&&min>(y=d[i][x]+d[j][x])&&min=y;
+                 print "estimated missing distance between "lbl[i]" ("i") and "lbl[j]" ("j"): "min > "/dev/stderr";
+                 d[i][j]=d[j][i]=-min}}
+             print(b=" ")n;x=0.5;while((x*=2)<m)b=b""b;i=0;
+             while(++i<=n){printf substr(lbl[i]b,1,m);j=0;while(++j<=n)printf(" %.8f",((d[i][j]>=0)?d[i][j]:-d[i][j]));print""}  }' $DMAT > $DMAT.tmp ;
+  mv $DMAT.tmp $DMAT ;
 fi
 
-### deleting all *.msh files
+### deleting all *.msh files ################################################################################
 for f in $DATADIR/*.msh ; do rm -f $f ; done
 
 
@@ -382,69 +559,108 @@ if ! $INFERTREE ; then exit 0 ; fi
 #############################################################################################################
 #############################################################################################################
 
+trap  INT ;
+function ctrl_c() {
+  echo -n " process interrupted: deleting files ... " ;
+  sleep 5 ;
+  for x in $(seq 1 $NPROC)
+  do
+    rm -f $BASEFILE.dd.$x.c $BASEFILE.dd.$x.c_fastme_stat.txt $BASEFILE.dd.$x.n $BASEFILE.dd.$x.n_fastme_stat.txt $BASEFILE.tt.$x.c $BASEFILE.tt.$x.n $BASEFILE.tt.$x.m ;
+  done
+  rm -f $ACGT $OEPL $BASEFILE.tax $BASEFILE.d $BASEFILE.dd $BASEFILE.dd_fastme_stat.txt $BASEFILE.nwk $BASEFILE.tt ;
+  echo "[ok]" ;
+  exit 1 ;
+}
+trap  ctrl_c  INT ; 
+
 echo "searching for the BME phylogenetic tree..." ;
 
 TAXFILE=$BASEFILE.tax;
-grep -v "^ " $DFILE | $GAWK '{print $1}' > $TAXFILE ;
+sed 1d $DMAT | $GAWK '{print "s/@"(++i)"@/"$1"/"}' > $TAXFILE ;
 
-$GAWK '(NR==1){print;next}{printf"@"(++i)"@";j=1;while(++j<=NF)printf" "$j;print""}' $DFILE > $BASEFILE.dd ;
-DFILE=$BASEFILE.dd;
+$GAWK '(NR==1){print;next}{printf"@"(++i)"@";j=1;while(++j<=NF)printf" "$j;print""}' $DMAT > $BASEFILE.dd ;
+DMAT=$BASEFILE.dd;
 
 BMETREE=$BASEFILE.nwk;  # BME phylogenetic tree in NEWICK format
 OUTTREE=$BASEFILE.tt;
-STATFILE=$DFILE""_fastme_stat.txt;
 
-### first BME tree inference
-$FASTME -i $DFILE -o $OUTTREE -s -f 12 -T 1 &> /dev/null ;
-tblo=$(grep -B1 "Performed" $STATFILE | sed -n 1p | sed 's/.* //g' | sed 's/\.$//g');
+### first BME tree inference ################################################################################
+$FASTME -i $DMAT -o $OUTTREE -s -f 12 -T 1 &> /dev/null ;
+tblo=$(grep -B1 "Performed" $BASEFILE.dd_fastme_stat.txt | sed -n 1p | sed 's/.* //g' | sed 's/\.$//g');
 [ -z "$tblo" ] && tblo=$(grep -o ":[0-9\.-]*" $OUTTREE | tr -d :- | paste -sd+ | bc -l | sed 's/^\./0./');
 echo "  step 0   $tblo" >&2 ;
 echo "step 0   tbl=$tblo" ;
 cp $OUTTREE $BMETREE;
-i=0; while read tax; do let i++; sed -i "s/@$i""@/$tax/" $BMETREE ; done < $TAXFILE ;
+sed -f $TAXFILE $BMETREE > $BMETREE.tmp ; mv $BMETREE.tmp $BMETREE ; # <=> sed -f $TAXFILE -i $BMETREE ;
+rm -f $BASEFILE.dd_fastme_stat.txt ;
 
-### ratchet-based search of the BME tree
-ct=0;
-for s in $(seq 1 $RATCHET)
-do
-  ### noising evolutionary distances
-  v=0.$s; [ $(echo "$v>=0.7" | bc) -eq 1 ] && v=0$(echo "scale=4;$v*$v" | bc -l);
-  $GAWK -v v=$v -v s=$s 'BEGIN  {srand(s)}
-                         (NR==1){n=$0;next}  {lbl[++i]=$1;d[i][i]=0;j=0;f=1;while(++f<=i){++j;d[i][j]=(d[j][i]=($f*(1-v)+2*v*$f*rand()))}}
-                         END    {print" "n;i=0;while(++i<=n){printf lbl[i];j=0;while(++j<=n){printf(" %.8f",d[i][j])}print""}}' $DFILE > $DFILE.noised ;
-
-  ### using current BME tree as starting tree for a new BME tree search
-  $FASTME -i $DFILE.noised -u $OUTTREE        -o $OUTTREE.noised    -nB -s -T 1 &> /dev/null ;
-  sed -i 's/:-/:/g' $OUTTREE.noised ;
-  $FASTME -i $DFILE        -u $OUTTREE.noised -o $OUTTREE.candidate     -s -T 1 &> /dev/null ;
-
-  tbl=$(grep -B1 "Performed" $STATFILE | sed -n 1p | sed 's/.* //g' | sed 's/\.$//g'); 
-  out=" ";
-  [ -z "$tbl" ] && tbl=$(grep -o ":[0-9\.-]*" $OUTTREE.candidate | tr -d :- | paste -sd+ | bc | sed 's/^\./0./') && out="+";
-  echo -n "$out step $s   $tbl" >&2 ; 
-  if [ $(echo "$tbl<$tblo" | bc) -eq 0 ]
-  then
-    echo >&2 ;
-  else
-    ct=0;
-    tblo=$tbl;
-    mv $OUTTREE.candidate $OUTTREE ;
-    cp $OUTTREE $BMETREE;
-    i=0; while read tax; do let i++; sed -i "s/@$i""@/$tax/" $BMETREE ; done < $TAXFILE ;
-    echo " *" >&2 ;
-    echo "step $s   tbl=$tbl"; 
-  fi
+### ratchet-based search of the BME tree ####################################################################
+if [ $RATCHET -gt 0 ]
+then
+  for x in $(seq 1 $NPROC); do cp $DMAT $DMAT.$x.c ; done
 
-  rm -f $DFILE.noised_fastme_stat.txt $STATFILE $OUTTREE.noised $OUTTREE.candidate $DFILE.noised ;
+  step=0; eps=0;
+  while [ $step -le $RATCHET ]
+  do
+    step_prev=$step;
+    eps_prev=$eps;
+    for x in $(seq 1 $NPROC)
+    do
+      if [ $((++step)) -gt $RATCHET ]; then break; fi
+
+      ### noising evolutionary distances ####################################################################
+      v=0.$((++eps))$step; [ $(echo "$v>=0.7" | bc) -eq 1 ] && v=0$(echo "scale=4;$v*$v/1.4" | bc -l);
+      $GAWK -v v=$v -v s=$step 'BEGIN  {srand(s)}
+                                (NR==1){n=$0;next}  {lbl[++i]=$1;d[i][i]=0;j=0;f=1;while(++f<=i){++j;d[i][j]=(d[j][i]=($f*(1-v)+2*v*$f*rand()))}}
+                                END    {print" "n;i=0;while(++i<=n){printf lbl[i];j=0;while(++j<=n){printf(" %.8f",d[i][j])}print""}}' $DMAT.$x.c > $DMAT.$x.n ;
+
+      ### ratchet-search tree search ########################################################################
+      $EXEC "$FASTME -i $DMAT.$x.n -u $OUTTREE -o $OUTTREE.$x.n -nB -s -T 1 ; sed 's/:-/:/g' $OUTTREE.$x.n > $OUTTREE.$x.m ; $FASTME -i $DMAT.$x.c -u $OUTTREE.$x.m -o $OUTTREE.$x.c -s -T 1 ; rm -f $DMAT.$x.n $DMAT.$x.m $DMAT.$x.n_fastme_stat.txt $OUTTREE.$x.n $OUTTREE.$x.m ;" &> /dev/null &
+    done
+
+    while [ $(jobs -r | wc -l) -gt 0 ]; do sleep $WAITIME ; done
+
+    for x in $(seq 1 $NPROC)
+    do
+      if [ $((++step_prev)) -gt $RATCHET ]; then break; fi
+      v=0.$((++eps_prev))$step_prev; [ $(echo "$v>=0.7" | bc) -eq 1 ] && v=0$(echo "scale=4;$v*$v/1.44" | bc -l);
+    
+      tbl=$(grep -B1 "Performed" $DMAT.$x.c_fastme_stat.txt | sed -n 1p | sed 's/.* //g' | sed 's/\.$//g');
+      rm -f $DMAT.$x.c_fastme_stat.txt ;
+      out=" ";
+      [ -z "$tbl" ] && tbl=$(grep -o ":[0-9\.-]*" $OUTTREE.$x.c | tr -d :- | paste -sd+ | bc | sed 's/^\./0./') && out="+";
+      echo -n "$out step $step_prev   $tbl" >&2 ; 
+      if [ $(echo "$tbl<$tblo" | bc) -eq 0 ]
+      then
+        rm -f $OUTTREE.$x.c ;
+        echo "  (epsilon=$v)" >&2 ;
+      else
+        tblo=$tbl;
+        mv $OUTTREE.$x.c $OUTTREE ;
+        cp $OUTTREE $BMETREE;
+        sed -f $TAXFILE $BMETREE > $BMETREE.tmp ; mv $BMETREE.tmp $BMETREE ; # <=> sed -f $TAXFILE -i $BMETREE ;
+        echo " *  (epsilon=$v)" >&2 ;
+        echo "step $step_prev   tbl=$tbl";
+        eps=0;
+      fi
+    done
+  done
 
-  if [ $((++ct)) -eq $RATCHET_LIMIT ]; then break; fi
-done
+  for x in $(seq 1 $NPROC); do rm -f $DMAT.$x.c ; done
+fi
+
+#############################################################################################################
+#############################################################################################################
+#### REQ CONFIDENCE VALUES AT BRANCHES                                                                   ####
+#############################################################################################################
+#############################################################################################################
 
-### confidence value at every branch
+echo -n "estimating branch supports ... " >&2 ;
 $REQ $BASEFILE.d $BMETREE $OUTTREE ;
+echo "[ok]" >&2 ;
 mv $OUTTREE $BMETREE ;
 echo "BME tree (tbl=$tblo) with branch supports written into $BMETREE" ;
-rm -f $DFILE $TAXFILE $OUTTREE ;
+rm -f $DMAT $TAXFILE $OUTTREE ;
 
 
 exit ;
diff --git a/README.md b/README.md
index 3ca3cd12df36ac795de362c22b22ed589e8cc876..ed56b7132e98e12b15eb871280b2fcf772a0deb5 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
 
 _JolyTree_ (named in memory of [Nicolas Joly](https://research.pasteur.fr/en/member/nicolas-joly/)) is a command line script written in [Bash](https://www.gnu.org/software/bash/) that allows a distance-based phylogenetic tree with branch supports to be quickly inferred from non-aligned genome sequences.
 _JolyTree_ runs on UNIX, Linux and most OS X operating systems.
-For more details, see the associated publication (Criscuolo 2019).
+For more details, see the associated publication ([Criscuolo 2019](https://riojournal.com/article/36178/)).
 
 ## Installation and execution
 
@@ -22,7 +22,7 @@ For more details, see the associated publication (Criscuolo 2019).
 git clone https://gitlab.pasteur.fr/GIPhy/JolyTree.git
 ```
 
-**C.** If at least one of the four required binaries (step A) is not available on your `$PATH` variable, edit the file `JolyTree.sh` and indicate the local path to the `mash`, `gawk`, `FastME` and/or `REQ` binary(ies) (approximately between lines 100 and 200):
+**C.** If at least one of the four required binaries (step A) is not available on your `$PATH` variable, edit the file `JolyTree.sh` and indicate the local path to the `mash`, `gawk`, `FastME` and/or `REQ` binary(ies) (approximately between lines 130 and 230):
 
 ```bash
 #############################################################################################################
@@ -95,19 +95,26 @@ chmod +x JolyTree.sh
 Launch _JolyTree_ without option to read the following documentation:
 
 ```
- USAGE:
-    JolyTree.sh  [options]
- where:
+ USAGE:  JolyTree.sh  -i <directory>  -b <basename>  [options]
+
+ OPTIONS:
+
     -i <directory>  directory name containing  FASTA-formatted contig files;  only files
                     ending with .fa, .fna, .fas or .fasta will be considered (mandatory)
     -b <basename>   basename of every written output file (mandatory)
-    -s <int>        sketch size (default: 25% of the largest genome size)
-    -q <double>     probability of observing a random k-mer (default: 0.0001)
-    -k <int>        k-mer size (default: estimated from the average genome size with the
+    -q <real>       probability of observing a random k-mer (default: 0.000000001)
+    -k <int>        k-mer size (default: estimated from the largest genome size with the
                     probability set by option -q)
+    -s <real|int>   sketch size estimated as the  proportion (up to 1.00) of the average 
+                    genome size;  the sketch size (integer > 2) can also be directly set 
+                    using this option (default: 0.25)
     -c <real>       if at least one of the estimated p-distances is above this specified
-                    cutoff, then a F81 correction is performed (default: 0.1)
-    -n              no BME tree inference (only pairwise distance estimation)
+                    cutoff, then a F81/EI correction is performed (default: 0.1)
+    -a <real>       F81/EI transformation gamma shape parameter (default: 1.5)
+    -f              using  the  four nucleotide  frequencies in  F81/EI  transformations  
+                    (default:  to  deal  with  multiple  contig  files,   only  the  two 
+                    frequencies A+T and C+G are used)
+    -n              no BME tree inference (only pairwise distance estimates)
     -r <int>        number of steps  when performing the  ratchet-based  BME tree search
                     (default: 100)
     -t <int>        number of threads (default: 2)
@@ -115,17 +122,32 @@ Launch _JolyTree_ without option to read the following documentation:
 
 ## Notes
 
-* It is not recommended to modify the option -k. The optimal value of _k_ is automatically estimated by equation (2) in [Ondov et al. (2016)](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-016-0997-x) from the desired probability _q_ of observing a random _k_-mer (option -q). Increasing _q_ (e.g. > 0.001) is not recommended, especially when dealing distantly-related genomes. Lowering _q_ (e.g. < 0.00001) leads to larger _k_-mer size that increases the variance of the estimated evolutionary distances.
+* In short, _JolyTree_ uses [_mash_](http://mash.readthedocs.io/en/latest/) to decompose each genome into a sketch of _k_-mers (options `-k`, `-q`, `-s`) and quickly estimate the _p_-distance between each pair of genomes. If required, every _p_-distance is transformed into an evolutionary distance (options `-c`, `-a`, `-f`). A ratchet-based optimal phylogenetic tree search is performed from the pairwise evolutionary distances using [_FastME_](http://www.atgc-montpellier.fr/fastme/usersguide.php) (option `-r`). Branch supports are finally estimated using [_REQ_](https://research.pasteur.fr/en/tool/r%ce%b5q-assessing-branch-supports-o%c6%92-a-distance-based-phylogenetic-tree-with-the-rate-o%c6%92-elementary-quartets/).
+
+* It is not recommended to modify the option `-k`. The optimal value of _k_ is automatically estimated by equation (2) in [Ondov et al. (2016)](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-016-0997-x) from the desired probability _q_ of observing a random _k_-mer (option `-q`). Since _JolyTree_ v2.0, the default probability is _q_ = 0.000000001, as it enables pairwise distances to be conveniently estimated in many cases. Increasing _q_ is not recommended, but can be useful to minimize the number of unknown distances when dealing with distantly-related genome sequences. Lowering _q_ leads to larger _k_-mer size that can be useful when dealing with very closely-related genome sequences.
+
+* Default sketch size is 25% of the average genome size, which enables pairwise distances to be efficiently estimated with fast running times. Increasing the sketch size (option `-s`) yields more accurate estimates (especially with distantly-related genomes), but requires important disk space to store the sketch files. 
+
+* Lowering the cutoff value for correcting the evolutionary distances (option `-c`) does generally not modify the inferred phylogenetic tree; on the other side, it is generally not recommended to increase this cutoff value.
+
+* The option `-c` allows multiple substitutions per character to be accurately estimated when an observed _p_-distance is quite large (e.g. > 0.1; see [Figure 3.1](https://books.google.fr/books?id=3Xc8DwAAQBAJ&pg=PA41) in Nei and Kumar 2000). In such cases, all the _p_-distances _p_ estimated by _mash_ are transformed into proper evolutionary distances _d_. Since _JolyTree_ v2.0, four _p_-distance transformations can be obtained using options `-c` and/or `-a`:
+<div align="center">
 
-* Increasing the sketch size (option -s) does not generally modify the inferred phylogenetic tree; on the other side, it is not recommended to set a sketch size lower than 10,000 (except for small genomic sequences, e.g. plasmids, viruses)
+| transformation          | formula                            | options                            |
+|:------------------------|:-----------------------------------|:-----------------------------------|
+| none                    | _d_ = _p_                          | `-c 1 -a $a` with any `$a` > 0 |   |
+| Poisson correction      | _d_ = - log<sub>_e_</sub>(1 - _p_) | `-c 1 -a 0`                      |
+| F81/EI correction       | _d_ = -_b_<sub>1</sub> log<sub>_e_</sub>(1 - _p_/_b_<sub>2</sub>) | `-a 0` |
+| F81/EI gamma correction | _d_ = -_ab_<sub>1</sub>[(1 - _p_/_b_<sub>2</sub>)<sup>-1/_a_</sup> - 1] | `-a $a` with specified gamma shape parameter |
 
-* Lowering the cutoff value for correcting the evolutionary distances (option -c) does generally not modify the inferred phylogenetic tree; on the other side, it is strongly not recommended to increase this cutoff value.
+</div>
 
-* The option -c allows multiple substitutions per character to be accurately estimated when an observed _p_-distance is quite large (e.g.> 0.1; see [Figure 3.1](https://books.google.fr/books?id=3Xc8DwAAQBAJ&pg=PA41) in Nei and Kumar 2000). In such cases, the F81 correction is performed by using the equation (4) in [Tamura and Kumar (2002)](https://academic.oup.com/mbe/article/19/10/1727/1258975) that allows estimating the pairwise distance based on the Equal-Input model of evolution ([Felsenstein 1981](https://link.springer.com/article/10.1007/BF01734359); [Tajima and Nei 1982](https://link.springer.com/article/10.1007/BF01810830), [1984](https://academic.oup.com/mbe/article/1/3/269/1244029)). This transformation was chosen because it could be directly computed from a _p_-distance value, and it takes into account putative unequal base frequencies and heterogeneous base composition among lineages.
+* The F81 corrections estimate pairwise distances based on the Equal-Input (EI) model of nucleotide substitution ([Felsenstein 1981](https://link.springer.com/article/10.1007/BF01734359); [Tajima and Nei 1982](https://link.springer.com/article/10.1007/BF01810830), [1984](https://academic.oup.com/mbe/article/1/3/269/1244029), [Tamura and Kumar 2002](https://academic.oup.com/mbe/article/19/10/1727/1258975)). These transformations were chosen because they can be directly computed from _p_-distances, and take into account putative unequal base frequencies and heterogeneous base composition among lineages (option `-f`; for details about the values _b_<sub>1</sub> and _b_<sub>2</sub>, see e.g. [Tamura and Kumar 2002](https://academic.oup.com/mbe/article/19/10/1727/1258975), [Criscuolo 2019](https://riojournal.com/article/36178/)). Thanks to the use of the supplementary gamma shape parameter (option `-a`), F81/EI gamma distance allows approximating evolutionary distances derived from complex nucleotide substitution models (the default parameter _a_ = 1.5 enables pairwise distances to be conveniently estimated in many cases).
 
-* Fast running times will be observed when using multiple threads; of note, only pairwise distance estimation step benefits from a large number of threads (other steps are quite fast).
+* Fast running times will be observed when using multiple threads. Since _JolyTree_ v2.0, almost all steps benefit from a large number of threads, i.e. genome parsing and sketching, _p_-distance estimates, and ratchet-based BME tree search.
+
+* The verbosity of _JolyTree_ can be reduced by ending the command line by `2>/dev/null`
 
-* The verbosity of _JolyTree_ could be reduced by ending the command line by `2>/dev/null`
 * To launch _JolyTree_ on multiple cores on a cluster managed by [SLURM](https://slurm.schedmd.com), edit the file `JolyTree.sh` and read the subsection [3] of the _Installation_ section (approximately line 200).
 
 
@@ -176,7 +198,7 @@ echo -e "oxytoca.ATCC13182\tCAAHFW01\naerogenes.ATCC13048\tQVMZ01\ngrimontii.06D
   while read -r s a; do echo $t.$s;([[ $a =~ $A1 ]]&&$EUTILS$a||$NCBIFTP${a:0:2}/${a:2:2}/$([[ $a =~ $A2 ]]&&echo ${a:4:2}/)$a/$a$Z|zcat)>genomes/$t.$s.fa; done
 ```
 
-##### Launching _jolyTree_
+##### Running _JolyTree_
 
 The following command line allows the script `jolyTree.sh` to be launched with default options on 8 threads:
 ```bash
@@ -212,3 +234,31 @@ Tajima F, Nei M (1982) Biases of the estimates of DNA divergence obtained by the
 Tajima F, Nei M (1984) Estimation of evolutionary distance between nucleotide sequences. Molecular Biology and Evolution, 1(3):269-285. [doi:10.1093/oxfordjournals.molbev.a040317](https://academic.oup.com/mbe/article/1/3/269/1244029).
 
 Tamura K, Kumar S (2002) Evolutionary distance estimation under heterogeneous substitution pattern among lineages. Molecular Biology and Evolution, 19(10):1727-1736. [doi:10.1093/oxfordjournals.molbev.a003995](https://academic.oup.com/mbe/article/19/10/1727/1258975).
+
+
+
+## Gallery
+
+Below is a list of some phylogenetic trees inferred using _JolyTree_:
+
+* [_Aggregatibacter_](https://riojournal.com/article/36178/zoom/fig/4969692/), [_Borreliella_](https://riojournal.com/article/36178/zoom/fig/4969672/), [_Elizabethkingia_](https://riojournal.com/article/36178/zoom/fig/4969676/), [_Lactococcus_](https://riojournal.com/article/36178/zoom/fig/4969680/), [_Providencia_](https://riojournal.com/article/36178/zoom/fig/4969684/), and [_Ralstonia_](https://riojournal.com/article/36178/zoom/fig/4969688/) genera ([Criscuolo 2019](https://riojournal.com/article/36178/))
+
+* [_Corynebacteria_ genus](https://www.microbiologyresearch.org/docserver/fulltext/ijsem/68/12/ijsem003069-f1.gif) ([Dazas et al. 2018](https://doi.org/10.1099/ijsem.0.003069))
+
+* [_Mucor circinelloides_ f. _circinelloides_](https://mbio.asm.org/content/mbio/9/2/e00573-18/F3.large.jpg) ([Garcia-Hermoso et al. 2018](https://doi.org/10.1128/mBio.00573-18))
+
+* [_Escherichia coli_](https://wwwnc.cdc.gov/eid/article/25/1/18-0534-f2) ([Nadimpali et al. 2019](https://doi.org/10.3201/eid2501.180534))
+
+* [_Klebsiella pneumoniae_](https://aem.asm.org/content/aem/86/7/e02711-19/F1.large.jpg) ([Barbier et al. 2020](https://doi.org/10.1128/AEM.02711-19))
+
+* [_Rathayibacter_ genus](https://mra.asm.org/content/ga/9/22/e00316-20/F1.large.jpg) ([Tarlachkov et al. 2020](https://doi.org/10.1128/MRA.00316-20))
+
+* [_Corynebacterium diphteriae_ complex](https://ars.els-cdn.com/content/image/1-s2.0-S092325082030019X-gr1_lrg.jpg) ([Badell et al. 2020](https://doi.org/10.1016/j.resmic.2020.02.003))
+
+* [_Klebsiella pneumoniae_](https://www.tandfonline.com/na101/home/literatum/publisher/tandf/journals/content/kgmi20/2020/kgmi20.v011.i05/19490976.2020.1748257/20200625/images/medium/kgmi_a_1748257_f0002_c.jpg) ([Huynh et al. 2020](https://doi.org/10.1080/19490976.2020.1748257 ))
+
+* [_Stromatolite bacterial communities_](https://www.biorxiv.org/content/biorxiv/early/2020/03/14/818625/F5.large.jpg) ([Waterworth et al. 2020](https://doi.org/10.1101/818625))
+
+* [_Campylobacter_ genus](https://figshare.com/articles/Data_Sheet_1_Taking_Control_Campylobacter_jejuni_Binding_to_Fibronectin_Sets_the_Stage_for_Cellular_Adherence_and_Invasion_pdf/12103260) ([Konkel et al. 2020](https://doi.org/10.3389/fmicb.2020.00564.s001))
+
+