diff --git a/JolyTree.sh b/JolyTree.sh index d0a7224612348667eba8edd073fecddfb936c6fa..0dcf55e6207840fd4a69b43fce72b3ecc7257b91 100755 --- a/JolyTree.sh +++ b/JolyTree.sh @@ -4,7 +4,7 @@ # # # JolyTree: fast distance-based phylogenetic inference from unaligned genome sequences # # # -# Copyright (C) 2017,2018,2019 Alexis Criscuolo # + COPYRIGHT="Copyright (C) 2017-2020 Institut Pasteur" # # # # This program is free software: you can redistribute it and/or modify it under the terms of the GNU # # General Public License as published by the Free Software Foundation, either version 3 of the License, or # @@ -17,13 +17,13 @@ # You should have received a copy of the GNU General Public License along with this program. If not, see # # <http://www.gnu.org/licenses/>. # # # -# Contact: # -# Institut Pasteur # -# Bioinformatics and Biostatistics Hub # -# C3BI, USR 3756 IP CNRS # -# Paris, FRANCE # -# # -# alexis.criscuolo@pasteur.fr # +# Contact: # +# Alexis Criscuolo alexis.criscuolo@pasteur.fr # +# Genome Informatics & Phylogenetics (GIPhy) giphy.pasteur.fr # +# Bioinformatics and Biostatistics Hub research.pasteur.fr/team/hub-giphy # +# USR 3756 IP CNRS research.pasteur.fr/team/bioinformatics-and-biostatistics-hub # +# Dpt. Biologie Computationnelle research.pasteur.fr/department/computational-biology # +# Institut Pasteur, Paris, FRANCE research.pasteur.fr # # # ############################################################################################################# @@ -33,8 +33,16 @@ # = VERSIONS = # # ============ # # # - VERSION=1.1b.191021ac # -# + exit 0 when displaying the doc # + VERSION=2.0.190926ac # +# + new F81/EI transformation formula using gamma shape parameter (option -a = 1.5 by default) # +# + option -f to to use the 4 nucleotide frequencies in F81/EI transformation; by default, to deal with # +# multiple contig files, JolyTree sets f(A)=f(T)=0.5*(A+T)/(A+C+G+T) and f(C)=f(G)=0.5*(C+G)/(A+C+G+T) # +# + option -s modified # +# # +# VERSION=1.2.190729ac # +# + 3x faster minhash estimations on multiple threads # +# + ratchet-based BME tree search based on multiple threads # +# + temporary files deleted when JolyTree is interrupted # # # # VERSION=1.1.181205ac # # + option -q to set desired probability of observing a random k-mer # @@ -81,23 +89,32 @@ # # if [ "$1" = "-?" ] || [ "$1" = "-h" ] || [ $# -le 1 ] # then # + echo -e "\n\033[1m JolyTree v$VERSION $COPYRIGHT\033[0m"; # cat << EOF - JolyTree v.$VERSION + Criscuolo A (2019) A fast alignment-free bioinformatics procedure to infer accurate + distance-based phylogenetic trees from genome assemblies. RIO. doi:10.3897/rio.5.e36178 + + USAGE: JolyTree.sh -i <directory> -b <basename> [options] + + OPTIONS: - USAGE: - JolyTree.sh [options] - where: -i <directory> directory name containing FASTA-formatted contig files; only files ending with .fa, .fna, .fas or .fasta will be considered (mandatory) -b <basename> basename of every written output file (mandatory) - -s <int> sketch size (default: 25% of the largest genome size) - -q <double> probability of observing a random k-mer (default: 0.0001) - -k <int> k-mer size (default: estimated from the average genome size with the + -q <real> probability of observing a random k-mer (default: 0.000000001) + -k <int> k-mer size (default: estimated from the largest genome size with the probability set by option -q) + -s <real|int> sketch size estimated as the proportion (up to 1.00) of the average + genome size; the sketch size (integer > 2) can also be directly set + using this option (default: 0.25) -c <real> if at least one of the estimated p-distances is above this specified - cutoff, then a F81 correction is performed (default: 0.1) - -n no BME tree inference (only pairwise distance estimation) + cutoff, then a F81/EI correction is performed (default: 0.1) + -a <real> F81/EI transformation gamma shape parameter (default: 1.5) + -f using the four nucleotide frequencies in F81/EI transformations + (default: to deal with multiple contig files, only the two + frequencies A+T and C+G are used) + -n no BME tree inference (only pairwise distance estimates) -r <int> number of steps when performing the ratchet-based BME tree search (default: 100) -t <int> number of threads (default: 2) @@ -174,6 +191,7 @@ EOF # # # # # # +# # # [2] EXECUTE PERMISSION ================================================================================= # # In order to run JolyTree, give the execute permission on the script JolyTree.sh: # # chmod +x JolyTree.sh # @@ -195,7 +213,7 @@ EOF # Moreover, it is also possible to launch JolyTree on multiple cores on a cluster managed by Slurm. For # # this particular case, you should first uncomment the following line: # # # -# EXEC="srun -n 1 -N 1 $EXEC"; # +# EXEC="srun -n 1 -N 1 -Q $EXEC"; # # # # and launch JolyTree via the following command line models (with t = number of cores): # # salloc <Slurm options> -n $t ./JolyTree.sh <JolyTree options> -t $t # @@ -215,58 +233,102 @@ if [ ! $(command -v $GAWK) ]; then echo "$GAWK not found" >&2 ; exit 1 ; fi if [ ! $(command -v $FASTME) ]; then echo "$FASTME not found" >&2 ; exit 1 ; fi if [ ! $(command -v $REQ) ]; then echo "$REQ not found" >&2 ; exit 1 ; fi -DATADIR="N.O.D.I.R"; # -i (mandatory) -BASEFILE="N.O.B.A.S.E.F.I.L.E"; # -b (mandatory) +export LC_ALL=C ; + +DATADIR="N.O_D.I.R"; # -i (mandatory) +BASEFILE="N.O_B.A.S.E.F.I.L.E"; # -b (mandatory) -SKETCH=0; # -s (auto from data) -Q=0.00001; # -q (0.00001) +SKETCH=0.25; # -s (0.25) +Q=0.000000001; # -q (0.000000001) K=0; # -k (auto from -q) CUTOFF=0.1; # -c (0.1) +ALPHA=1.5; # -a (1.5) +NFQ=2; # -f (none) INFERTREE=true; # -n (none) RATCHET=100; # -r (100) RATCHET_LIMIT=200; # (static) NPROC=2; # -t (2) +CHUNK=20; # -h (20) WAITIME=0.5; # (auto from -t) -while getopts :i:b:s:q:k:c:d:r:t:n option +while getopts :i:b:s:q:k:c:a:d:r:t:h:nf option do case $option in - i) DATADIR="$OPTARG" ;; - b) BASEFILE="$OPTARG" ;; - s) SKETCH=$OPTARG ;; - q) Q=$OPTARG ;; - k) K=$OPTARG ;; - c) CUTOFF=$OPTARG ;; - n) INFERTREE=false ;; - r) RATCHET=$OPTARG ;; - t) NPROC=$OPTARG ;; - :) echo "option $OPTARG : missing argument" ; exit 1 ;; - \?) echo "$OPTARG : option invalide" ; exit 1 ;; + i) DATADIR="$OPTARG" ;; + b) BASEFILE="$OPTARG" ;; + s) SKETCH=$OPTARG ;; + q) Q="$($GAWK -v x=$OPTARG 'BEGIN{printf "%.20f", x+0}' | sed 's/0*$//g')" ;; + k) K=$OPTARG ;; + c) CUTOFF="$($GAWK -v x=$OPTARG 'BEGIN{printf "%.20f", x+0}' | sed 's/0*$//g')" ;; + a) ALPHA="$($GAWK -v x=$OPTARG 'BEGIN{printf "%.20f", x+0}' | sed 's/0*$//g')" ;; + f) NFQ=4 ;; + n) INFERTREE=false ;; + r) RATCHET=$OPTARG ;; + h) CHUNK=$OPTARG ;; + t) NPROC=$OPTARG ;; + :) echo "missing argument ($OPTARG)" >&2 ; exit 1 ;; + \?) echo "invalid option ($OPTARG)" >&2 ; exit 1 ;; esac done -if [ "$DATADIR" == "N.O.D.I.R" ]; then echo "genome directory is not specified (option -i)" >&2 ; exit 1 ; fi -if [ ! -e "$DATADIR" ]; then echo "genome directory does not exist (option -i)" >&2 ; exit 1 ; fi -if [ ! -d "$DATADIR" ]; then echo "$DATADIR is not a directory (option -i)" >&2 ; exit 1 ; fi -if [ "$BASEFILE" == "N.O.B.A.S.E.F.I.L.E" ]; then echo "basename is not specified (option -b)" >&2 ; exit 1 ; fi -if [ $SKETCH -ne 0 ] && [ $SKETCH -le 1000 ]; then echo "sketch size $SKETCH is too low (option -s)" >&2 ; exit 1 ; fi - -### verifying the number of threads +if [ "$DATADIR" == "N.O_D.I.R" ]; then echo "genome directory is not specified (option -i)" >&2 ; exit 1 ; fi +if [ ! -e "$DATADIR" ]; then echo "genome directory does not exist (option -i)" >&2 ; exit 1 ; fi +if [ ! -d "$DATADIR" ]; then echo "$DATADIR is not a directory (option -i)" >&2 ; exit 1 ; fi +if [ "$BASEFILE" == "N.O_B.A.S.E.F.I.L.E" ]; then echo "basename is not specified (option -b)" >&2 ; exit 1 ; fi +if [ $(echo "$SKETCH<=0" | bc) -ne 0 ]; then echo "incorrect value: $SKETCH (option -s)" >&2 ; exit 1 ; fi +if ! [[ $K =~ ^[0-9]+$ ]]; then echo "incorrect k-mer size: $K (option -k)" >&2 ; exit 1 ; fi +if ! [[ $Q =~ ^0\.[0-9]+$ ]]; then echo "incorrect probability q: $Q (option -q)" >&2 ; exit 1 ; fi +if ! [[ $CUTOFF =~ ^0\.[0-9]+$ ]]; then echo "incorrect cutoff: $CUTOFF (option -c)" >&2 ; exit 1 ; fi +if ! [[ $ALPHA =~ ^[0-9]+\.[0-9]+$ ]]; then echo "incorrect parameter: $ALPHA (option -a)" >&2 ; exit 1 ; fi +if ! [[ $RATCHET =~ ^[0-9]+$ ]]; then echo "incorrect ratchet step number: $RATCHET (option -r)" >&2 ; exit 1 ; fi +if ! [[ $NPROC =~ ^[0-9]+$ ]]; then echo "incorrect value: $NPROC (option -t)" >&2 ; exit 1 ; fi + +### verifying the number of threads ######################################################################### [ $NPROC -le 0 ] && NPROC=2; echo "$NPROC thread(s)" ; -WAITIME=$($GAWK -v x=$NPROC 'BEGIN{print 1/sqrt(x)'}); - -### gathering the genome list -GLIST=$(ls $DATADIR/*.fna $DATADIR/*.fas $DATADIR/*.fa $DATADIR/*.fasta 2> /dev/null | sort); +WAITIME=$($GAWK -v x=$NPROC 'BEGIN{print 1/sqrt(x)}'); + +### gathering the genome list ############################################################################### +GLIST=$(ls $DATADIR/*.fna $DATADIR/*.fas $DATADIR/*.fa $DATADIR/*.fasta 2>/dev/null); +if [ $(grep -c -F " " <<<"$GLIST") -ne 0 ] +then + echo "found blank space(s) within the following file name(s):" >&2 ; + ls $DATADIR/*.fna $DATADIR/*.fas $DATADIR/*.fa $DATADIR/*.fasta 2>/dev/null | grep " " 1>&2 ; + echo "please avoid any blank space within every file name" >&2 ; exit 1 ; +elif [ $(grep -c -F "'" <<<"$GLIST") -ne 0 ] +then + echo "found single quote(s) within the following file name(s):" >&2 ; + ls $DATADIR/*.fna $DATADIR/*.fas $DATADIR/*.fa $DATADIR/*.fasta 2>/dev/null | grep -F "'" 1>&2 ; + echo "please avoid any single quote within every file name" >&2 ; exit 1 ; +elif [ $(grep -c -F "\"" <<<"$GLIST") -ne 0 ] +then + echo "found double quote(s) within the following file name(s):" >&2 ; + ls $DATADIR/*.fna $DATADIR/*.fas $DATADIR/*.fa $DATADIR/*.fasta 2>/dev/null | grep -F "\"" 1>&2 ; + echo "please avoid any double quote within every file name" >&2 ; exit 1 ; +elif [ $(grep -c -F ";" <<<"$GLIST") -ne 0 ] +then + echo "found semicolon(s) within the following file name(s):" >&2 ; + ls $DATADIR/*.fna $DATADIR/*.fas $DATADIR/*.fa $DATADIR/*.fasta 2>/dev/null | grep -F ";" 1>&2 ; + echo "please avoid any semicolon within every file name" >&2 ; exit 1 ; +elif [ $(grep -c -F "," <<<"$GLIST") -ne 0 ] +then + echo "found comma(s) within the following file name(s):" >&2 ; + ls $DATADIR/*.fna $DATADIR/*.fas $DATADIR/*.fa $DATADIR/*.fasta 2>/dev/null | grep -F "," 1>&2 ; + echo "please avoid any comma within every file name" >&2 ; exit 1 ; +fi +GLIST=$(ls $DATADIR/*.fna $DATADIR/*.fas $DATADIR/*.fa $DATADIR/*.fasta 2>/dev/null | sort); n=$(echo $GLIST | $GAWK '{print NF}'); -if [ $n -lt 4 ]; then echo "directory $DATADIR should contain at least 4 files *.fna, *.fas, *.fasta or *.fa" >&2 ; exit 1 ; fi +if [ $n -lt 4 ] +then + echo "directory $DATADIR should contain at least 4 files *.fna, *.fas, *.fasta or *.fa" >&2 ; exit 1 ; +fi echo "$n taxa" ; -### creating output file names +### creating output file names ############################################################################## ACGT=$BASEFILE.acgt; # ACGT content of each input genome OEPL=$BASEFILE.oepl; # p-distance estimates in OEPL (One Entry Per Line) format -DFILE=$BASEFILE.d; # evolutioanry distances in PHYLIP square format +DMAT=$BASEFILE.d; # evolutionary distances in PHYLIP square format ############################################################################################################# @@ -275,13 +337,35 @@ DFILE=$BASEFILE.d; # evolutioanry distances in PHYLIP square format ############################################################################################################# ############################################################################################################# -### estimating ACGT content +function ctrl_c() { + echo -n " process interrupted: deleting files ... " ; + sleep 5 ; + for f in $GLIST + do + rm -f ${f%.*}.msh ; + done + rm -f $ACGT $ACGT.tmp $OEPL $DMAT ; + echo "[ok]" ; + exit 1 ; +} +trap ctrl_c INT ; + +### estimating ACGT content ################################################################################# rm -f $ACGT ; for f in $GLIST do + if [ ! -e $f ] + then + sleep 1 ; echo "ERROR!" >&2 ; + sleep 1 ; echo "blank space in file name starting by: $f" >&2 ; + sleep 1 ; echo "please avoid any blank space within every file name" >&2 ; + sleep 1 ; for f in $GLIST ; do if [ -e $f.nfh ]; then rm -f $f.nfh ; fi ; done + rm -f $ACGT ; + exit 1 ; + fi echo "parsing $(basename ${f%.*})" >&2 ; - $EXEC "x=\$($GAWK '! /^>/{i=split(\$0,c,\"\");++i;while(--i>0)w[c[i]]++}END{print w[\"A\"]+w[\"a\"]\" \"w[\"C\"]+w[\"c\"]\" \"w[\"G\"]+w[\"g\"]\" \"w[\"T\"]+w[\"t\"]}' $f); flock -x $ACGT echo \"$(basename $f) \$x\" >> $ACGT;" & - while [ $(jobs -r | wc -l) -gt $NPROC ]; do sleep $WAITIME ; done + $EXEC "grep -v '^>' $f > $f.nfh; a=\$(tr -cd Aa <$f.nfh | wc -c); c=\$(tr -cd Cc <$f.nfh | wc -c); g=\$(tr -cd Gg <$f.nfh | wc -c); t=\$(tr -cd Tt <$f.nfh | wc -c); flock -x $ACGT echo $(basename $f) \$a \$c \$g \$t >> $ACGT; rm -f $f.nfh;" & + while [ $(jobs -r | wc -l) -ge $NPROC ]; do sleep $WAITIME ; done done wait ; @@ -289,55 +373,84 @@ wait ; sort $ACGT > $ACGT.tmp ; mv $ACGT.tmp $ACGT ; -### estimating k-mer and sketch size -[ $K -le 0 ] && K=$($GAWK -v q=$Q '{n=$2+$3+$4+$5;kc=int(log(n*(1-q)/q)/log(4))+1;k=(kc>k)?kc:k}END{print k}' $ACGT) && [ $K -le 0 ] && k=19; +### estimating k-mer and sketch size ######################################################################## +[ $K -le 0 ] && K=$($GAWK -v q=$Q '{n=$2+$3+$4+$5;kc=int(log(n*(1-q)/q)/log(4)+0.5);k=(kc>k)?kc:k}END{print k}' $ACGT) && [ $K -le 0 ] && k=19; echo "k-mer size: $K (q=$Q)" ; -[ $SKETCH -le 0 ] && SKETCH=$($GAWK '{s+=$2+$3+$4+$5;n+=4}END{printf("%d\n", 100000*int((s/n)/100000))}' $ACGT) && [ $SKETCH -eq 0 ] && SKETCH=10000; +[ $(echo "$SKETCH<=1" | bc) -ne 0 ] && SKETCH=$($GAWK -v s=$SKETCH '{n+=$2+$3+$4+$5}END{printf("%d\n", 1000*int((s*n/NR)/1000))}' $ACGT) && [ $SKETCH -eq 0 ] && SKETCH=10000; +[ $(echo "$SKETCH>1" | bc) -ne 0 ] && SKETCH=$(echo "($SKETCH+0.5)/1" | bc); echo "sketch size: $SKETCH" ; -### sketching genomes +### sketching genomes ####################################################################################### TLIST="" ; for f in $GLIST do TLIST="$TLIST $(basename ${f%.*})" ; echo "sketching $(basename ${f%.*})" >&2 ; $EXEC "$MASH sketch -o ${f%.*} -s $SKETCH -k $K $f" &> /dev/null & - while [ $(jobs -r | wc -l) -gt $NPROC ]; do sleep $WAITIME ; done + while [ $(jobs -r | wc -l) -ge $NPROC ]; do sleep $WAITIME ; done done + wait ; + ############################################################################################################# ############################################################################################################# -#### DISTANCE ESTIMATES #### +#### P-DISTANCE ESTIMATES #### ############################################################################################################# ############################################################################################################# -### estimating and writing pairwise p-distances +### estimating and writing pairwise p-distances ############################################################# echo $TLIST > $OEPL ; a=($(ls $DATADIR/*.msh | sort)); -i=${#a[@]}; +i=${#a[@]}; while [ $((j=--i)) -ge 0 ] do mi=${a[$i]}; ti=$(basename ${mi%.*}); while [ $((--j)) -ge 0 ] do - mj=${a[$j]}; - tj=$(basename ${mj%.*}); - echo "estimating p-distance between $ti ($(( $i + 1 ))) and $tj ($(( $j + 1 )))" >&2 ; - $EXEC "d=\$(timeout 5 $MASH dist -s $SKETCH $mi $mj | $GAWK '{printf(\"%.8f\\n\",\$3)}'); [ -n \"\$d\" ] && flock -x $OEPL echo \"$(( $i + 1 )) $(( $j + 1 )) \$d\" >> $OEPL ;" & - while [ $(jobs -r | wc -l) -gt $NPROC ]; do sleep $WAITIME ; done + if [ $CHUNK -eq 1 ] || [ $j -eq 0 ] + then + mj=${a[$j]}; + tj=$(basename ${mj%.*}); + echo "estimating p-distance between $ti ($(( $i + 1 ))) and $tj ($(( $j + 1 )))" >&2 ; + $EXEC "d=\$(timeout 5 $MASH dist -s $SKETCH $mi $mj | $GAWK '{printf(\"%.8f\\n\",1-exp(-\$3))}'); [ -n \"\$d\" ] && flock -x $OEPL echo \"$(( $i + 1 )) $(( $j + 1 )) \$d\" >> $OEPL ;" & + else + chk=$CHUNK; + let j++; + if [ $j -le $(( $CHUNK - 1 )) ]; then chk=$j; fi + mlist=""; + c=$chk; + while [ $((--c)) -ge 0 ] + do + let j--; + mj=${a[$j]}; + mlist="$mlist $mj"; + tj=$(basename ${mj%.*}); + echo "estimating p-distance between $ti ($(( $i + 1 ))) and $tj ($(( $j + 1 )))" >&2 ; + done + $EXEC "i=\$(( $i + 1 )); j=\$(( $j + $chk + 1 )); out=; for mj in $mlist; do d=\$(timeout 5 $MASH dist -s $SKETCH $mi \$mj | $GAWK '{printf(\"%.8f\\n\",(\$3==1)?1:1-exp(-\$3))}'); j=\$(( \$j - 1 )); [ -n \"\$d\" ] && out=\"\$out\"\"\$i \$j \$d\\n\"; done ; [ -n \"\$out\" ] && flock -x $OEPL echo -n -e \$out >> $OEPL ;" & + fi + while [ $(jobs -r | wc -l) -ge $NPROC ]; do sleep $WAITIME ; done done done -wait ; +### waiting up to one minute ################################################################################ +### if any job is still running (very rare), then killing everything and next re-estimating one by one +for t in {1..121} +do + echo -n "." >&2 ; + [ $(jobs -r | wc -l) -eq 0 ] && break ; + [ $t -eq 121 ] && jobs -p | xargs kill &>/dev/null; + sleep 0.5 ; +done -### verifying every p-distance estimates +### verifying every p-distance estimate ##################################################################### $GAWK '(NR==1){n=NF;while((j=++i)<=n)while(--j>0)d[i][j]=d[j][i]=-1;next} {d[$1][$2]=(d[$2][$1]=$3)} END {i=0;while((j=++i)<=n)while(--j>0)if(d[i][j]<0||d[j][i]<0)print i"\t"j}' $OEPL | - while read i j + while read -r i j do let i--; mi=${a[$i]}; @@ -346,30 +459,94 @@ $GAWK '(NR==1){n=NF;while((j=++i)<=n)while(--j>0)d[i][j]=d[j][i]=-1;next} mj=${a[$j]}; tj=$(basename ${mj%.*}); echo "re-estimating p-distance between $ti ($(( $i + 1 ))) and $tj ($(( $j + 1 )))" >&2 ; - d=$($MASH dist -s $SKETCH $mi $mj | $GAWK '{printf("%.8f\n",$3)}'); flock -x $OEPL echo "$(( $i + 1 )) $(( $j + 1 )) $d" >> $OEPL ; - done + d=$($MASH dist -s $SKETCH $mi $mj | $GAWK '{printf("%.8f\n",($3==1)?1:1-exp(-$3))}'); + echo "$(( $i + 1 )) $(( $j + 1 )) $d" ; + done >> $OEPL + +echo " [ok]" >&2 ; -wait ; -### transforming (if required) p-distances and writing in PHYLIP square format +############################################################################################################# +############################################################################################################# +#### EVOLUTIONARY DISTANCE ESTIMATES #### +############################################################################################################# +############################################################################################################# + +### transforming (if required) p-distances and writing in PHYLIP square format ############################## +### F81/EI transformations can be forced with option -c 0 +### to have PC distances (option -a 0) or only p-distances (option -a > 0), F81/EI transformations should be prevented with option -c 1 + if [ -n "$($GAWK -v c=$CUTOFF '(NR==1){next}($3>c){print;exit}' $OEPL)" ] then - $GAWK -v p=8 'function s(x){return x*x} - (ARGIND==1) {++x;sx=$2+$3+$4+$5;a[x]=$2/sx;c[x]=$3/sx;g[x]=$4/sx;t[x]=$5/sx} - (ARGIND==2&&FNR==1){while(++n<=NF){m=(m>(l=length(lbl[n]=$n)))?m:l;d[n][n]=0}--n; print(b=" ")n;x=0.5;while((x*=2)<m)b=b""b; next} - (ARGIND==2) {d[$1][$2]=(d[$2][$1]=$3)} - END {while(++i<=n){printf substr(lbl[i]b,1,m);ai=a[i];ci=c[i];gi=g[i];ti=t[i];j=0; - while(++j<=n)printf(" %."p"f",((dij=d[i][j])==0)?0:((x=1-dij/(1-ai*a[j]-ci*c[j]-gi*g[j]-ti*t[j]))>0)?((s(ai+a[j])+s(ci+c[j])+s(gi+g[j])+s(ti+t[j]))/4-1)*log(x):1.23456789); - print""}}' $ACGT $OEPL > $DFILE ; - echo "F81 distances written into $DFILE" ; + if [ $(echo "$ALPHA<=0" | bc) -ne 0 ] + then + ### F81/EI transformation without gamma shape parameter (option -a 0) ################################### + $GAWK -v p=8 -v f=$NFQ 'function s(x){return x*x} + (ARGIND==1) {++x;sx=$2+$3+$4+$5;if(f==2){a[x]=t[x]=0.5*($2+$5)/sx;c[x]=g[x]=0.5*($3+$4)/sx}else{a[x]=$2/sx;c[x]=$3/sx;g[x]=$4/sx;t[x]=$5/sx}} + (ARGIND==2&&FNR==1){while(++n<=NF){m=(m>(l=length(lbl[n]=$n)))?m:l;d[n][n]=0}--n; print(b=" ")n;x=0.5;while((x*=2)<m)b=b""b; next} + (ARGIND==2) {d[$1][$2]=(d[$2][$1]=$3)} + END {while(++i<=n){ + printf substr(lbl[i]b,1,m);ai=a[i];ci=c[i];gi=g[i];ti=t[i];j=0; + while(++j<=n) + printf(" %."p"f",((dij=d[i][j])==0)?0:((x=1-dij/(1-ai*a[j]-ci*c[j]-gi*g[j]-ti*t[j]))>0)?((s(ai+a[j])+s(ci+c[j])+s(gi+g[j])+s(ti+t[j]))/4-1)*log(x):9.99999999); + print""} }' $ACGT $OEPL > $DMAT ; + echo "F81/EI distances written into $DMAT" ; + else + ### F81/EI transformation with gamma shape parameter (option -a > 0) #################################### + $GAWK -v p=8 -v f=$NFQ -v alp=$ALPHA 'function s(x){return x*x} + (ARGIND==1) {++x;sx=$2+$3+$4+$5;if(f==2){a[x]=t[x]=0.5*($2+$5)/sx;c[x]=g[x]=0.5*($3+$4)/sx}else{a[x]=$2/sx;c[x]=$3/sx;g[x]=$4/sx;t[x]=$5/sx}} + (ARGIND==2&&FNR==1){while(++n<=NF){m=(m>(l=length(lbl[n]=$n)))?m:l;d[n][n]=0}--n; print(b=" ")n;x=0.5;while((x*=2)<m)b=b""b; next} + (ARGIND==2) {d[$1][$2]=(d[$2][$1]=$3)} + END {while(++i<=n){ + printf substr(lbl[i]b,1,m);ai=a[i];ci=c[i];gi=g[i];ti=t[i];j=0; + while(++j<=n) + printf(" %."p"f",((dij=d[i][j])==0)?0:((x=1-dij/(1-ai*a[j]-ci*c[j]-gi*g[j]-ti*t[j]))>0)?alp*(1-(s(ai+a[j])+s(ci+c[j])+s(gi+g[j])+s(ti+t[j]))/4)*((x**(-1/alp))-1):9.99999999); + print""} }' $ACGT $OEPL > $DMAT ; + echo "F81/EI distances (a=$ALPHA) written into $DMAT" ; + fi else - $GAWK -v p=8 '(NR==1){while(++n<=NF){m=(m>(l=length(lbl[n]=$n)))?m:l;d[n][n]=0}--n; print(b=" ")n;x=0.5;while((x*=2)<m)b=b""b; next} - {d[$1][$2]=(d[$2][$1]=$3)} - END {while(++i<=n){printf substr(lbl[i]b,1,m);j=0;while(++j<=n)printf(" %."p"f",d[i][j]);print""}}' $OEPL > $DFILE ; - echo "p-distances written into $DFILE" ; + if [ $(echo "$ALPHA<=0" | bc) -ne 0 ] + then + ### PC transformation (option -a 0) ##################################################################### + $GAWK -v p=8 '(NR==1){while(++n<=NF){m=(m>(l=length(lbl[n]=$n)))?m:l;d[n][n]=0}--n; print(b=" ")n;x=0.5;while((x*=2)<m)b=b""b; next} + {d[$1][$2]=(d[$2][$1]=$3)} + END {while(++i<=n){ + printf substr(lbl[i]b,1,m);j=0; + while(++j<=n) + printf(" %."p"f",((dij=d[i][j])==0)?0:(dij<1)?-log(1-dij):9.99999999); + print""} }' $OEPL > $DMAT ; + echo "PC distances written into $DMAT" ; + else + ### p-distance without transformation (option -a > 0) ################################################### + $GAWK -v p=8 '(NR==1){while(++n<=NF){m=(m>(l=length(lbl[n]=$n)))?m:l;d[n][n]=0}--n; print(b=" ")n;x=0.5;while((x*=2)<m)b=b""b; next} + {d[$1][$2]=(d[$2][$1]=$3)} + END {while(++i<=n){ + printf substr(lbl[i]b,1,m);j=0; + while(++j<=n) + printf(" %."p"f",((dij=d[i][j])==0)?0:(dij<1)?dij:9.99999999); + print""} }' $OEPL > $DMAT ; + echo "p-distances written into $DMAT" ; + fi +fi + +### estimating (if required) missing distances, i.e. every missing dij is replaced by min[dix+djx] ########## +miss=$(grep -o -F " 9.99999999" $DMAT | wc -l); +if [ $miss -ne 0 ] +then + echo "estimating $(( $miss / 2 )) missing distances inside $DMAT" ; + $GAWK '(NR>1){m=(m>(l=length(lbl[++n]=$(c=j=1))))?m:l;--j;while(++c<=NF){d[n][++j]=$c;(max<$c)&&max=$c}} + END{while(++i<=n){j=0; + while(++j<i){x=0;min=2*max; + if(d[i][j]!=9.99999999)continue; + while(++x<=n)x!=i&&x!=j&&d[i][x]!=9.99999999&&d[j][x]!=9.99999999&&d[i][x]>=0&&d[j][x]>=0&&min>(y=d[i][x]+d[j][x])&&min=y; + print "estimated missing distance between "lbl[i]" ("i") and "lbl[j]" ("j"): "min > "/dev/stderr"; + d[i][j]=d[j][i]=-min}} + print(b=" ")n;x=0.5;while((x*=2)<m)b=b""b;i=0; + while(++i<=n){printf substr(lbl[i]b,1,m);j=0;while(++j<=n)printf(" %.8f",((d[i][j]>=0)?d[i][j]:-d[i][j]));print""} }' $DMAT > $DMAT.tmp ; + mv $DMAT.tmp $DMAT ; fi -### deleting all *.msh files +### deleting all *.msh files ################################################################################ for f in $DATADIR/*.msh ; do rm -f $f ; done @@ -382,69 +559,108 @@ if ! $INFERTREE ; then exit 0 ; fi ############################################################################################################# ############################################################################################################# +trap INT ; +function ctrl_c() { + echo -n " process interrupted: deleting files ... " ; + sleep 5 ; + for x in $(seq 1 $NPROC) + do + rm -f $BASEFILE.dd.$x.c $BASEFILE.dd.$x.c_fastme_stat.txt $BASEFILE.dd.$x.n $BASEFILE.dd.$x.n_fastme_stat.txt $BASEFILE.tt.$x.c $BASEFILE.tt.$x.n $BASEFILE.tt.$x.m ; + done + rm -f $ACGT $OEPL $BASEFILE.tax $BASEFILE.d $BASEFILE.dd $BASEFILE.dd_fastme_stat.txt $BASEFILE.nwk $BASEFILE.tt ; + echo "[ok]" ; + exit 1 ; +} +trap ctrl_c INT ; + echo "searching for the BME phylogenetic tree..." ; TAXFILE=$BASEFILE.tax; -grep -v "^ " $DFILE | $GAWK '{print $1}' > $TAXFILE ; +sed 1d $DMAT | $GAWK '{print "s/@"(++i)"@/"$1"/"}' > $TAXFILE ; -$GAWK '(NR==1){print;next}{printf"@"(++i)"@";j=1;while(++j<=NF)printf" "$j;print""}' $DFILE > $BASEFILE.dd ; -DFILE=$BASEFILE.dd; +$GAWK '(NR==1){print;next}{printf"@"(++i)"@";j=1;while(++j<=NF)printf" "$j;print""}' $DMAT > $BASEFILE.dd ; +DMAT=$BASEFILE.dd; BMETREE=$BASEFILE.nwk; # BME phylogenetic tree in NEWICK format OUTTREE=$BASEFILE.tt; -STATFILE=$DFILE""_fastme_stat.txt; -### first BME tree inference -$FASTME -i $DFILE -o $OUTTREE -s -f 12 -T 1 &> /dev/null ; -tblo=$(grep -B1 "Performed" $STATFILE | sed -n 1p | sed 's/.* //g' | sed 's/\.$//g'); +### first BME tree inference ################################################################################ +$FASTME -i $DMAT -o $OUTTREE -s -f 12 -T 1 &> /dev/null ; +tblo=$(grep -B1 "Performed" $BASEFILE.dd_fastme_stat.txt | sed -n 1p | sed 's/.* //g' | sed 's/\.$//g'); [ -z "$tblo" ] && tblo=$(grep -o ":[0-9\.-]*" $OUTTREE | tr -d :- | paste -sd+ | bc -l | sed 's/^\./0./'); echo " step 0 $tblo" >&2 ; echo "step 0 tbl=$tblo" ; cp $OUTTREE $BMETREE; -i=0; while read tax; do let i++; sed -i "s/@$i""@/$tax/" $BMETREE ; done < $TAXFILE ; +sed -f $TAXFILE $BMETREE > $BMETREE.tmp ; mv $BMETREE.tmp $BMETREE ; # <=> sed -f $TAXFILE -i $BMETREE ; +rm -f $BASEFILE.dd_fastme_stat.txt ; -### ratchet-based search of the BME tree -ct=0; -for s in $(seq 1 $RATCHET) -do - ### noising evolutionary distances - v=0.$s; [ $(echo "$v>=0.7" | bc) -eq 1 ] && v=0$(echo "scale=4;$v*$v" | bc -l); - $GAWK -v v=$v -v s=$s 'BEGIN {srand(s)} - (NR==1){n=$0;next} {lbl[++i]=$1;d[i][i]=0;j=0;f=1;while(++f<=i){++j;d[i][j]=(d[j][i]=($f*(1-v)+2*v*$f*rand()))}} - END {print" "n;i=0;while(++i<=n){printf lbl[i];j=0;while(++j<=n){printf(" %.8f",d[i][j])}print""}}' $DFILE > $DFILE.noised ; - - ### using current BME tree as starting tree for a new BME tree search - $FASTME -i $DFILE.noised -u $OUTTREE -o $OUTTREE.noised -nB -s -T 1 &> /dev/null ; - sed -i 's/:-/:/g' $OUTTREE.noised ; - $FASTME -i $DFILE -u $OUTTREE.noised -o $OUTTREE.candidate -s -T 1 &> /dev/null ; - - tbl=$(grep -B1 "Performed" $STATFILE | sed -n 1p | sed 's/.* //g' | sed 's/\.$//g'); - out=" "; - [ -z "$tbl" ] && tbl=$(grep -o ":[0-9\.-]*" $OUTTREE.candidate | tr -d :- | paste -sd+ | bc | sed 's/^\./0./') && out="+"; - echo -n "$out step $s $tbl" >&2 ; - if [ $(echo "$tbl<$tblo" | bc) -eq 0 ] - then - echo >&2 ; - else - ct=0; - tblo=$tbl; - mv $OUTTREE.candidate $OUTTREE ; - cp $OUTTREE $BMETREE; - i=0; while read tax; do let i++; sed -i "s/@$i""@/$tax/" $BMETREE ; done < $TAXFILE ; - echo " *" >&2 ; - echo "step $s tbl=$tbl"; - fi +### ratchet-based search of the BME tree #################################################################### +if [ $RATCHET -gt 0 ] +then + for x in $(seq 1 $NPROC); do cp $DMAT $DMAT.$x.c ; done - rm -f $DFILE.noised_fastme_stat.txt $STATFILE $OUTTREE.noised $OUTTREE.candidate $DFILE.noised ; + step=0; eps=0; + while [ $step -le $RATCHET ] + do + step_prev=$step; + eps_prev=$eps; + for x in $(seq 1 $NPROC) + do + if [ $((++step)) -gt $RATCHET ]; then break; fi + + ### noising evolutionary distances #################################################################### + v=0.$((++eps))$step; [ $(echo "$v>=0.7" | bc) -eq 1 ] && v=0$(echo "scale=4;$v*$v/1.4" | bc -l); + $GAWK -v v=$v -v s=$step 'BEGIN {srand(s)} + (NR==1){n=$0;next} {lbl[++i]=$1;d[i][i]=0;j=0;f=1;while(++f<=i){++j;d[i][j]=(d[j][i]=($f*(1-v)+2*v*$f*rand()))}} + END {print" "n;i=0;while(++i<=n){printf lbl[i];j=0;while(++j<=n){printf(" %.8f",d[i][j])}print""}}' $DMAT.$x.c > $DMAT.$x.n ; + + ### ratchet-search tree search ######################################################################## + $EXEC "$FASTME -i $DMAT.$x.n -u $OUTTREE -o $OUTTREE.$x.n -nB -s -T 1 ; sed 's/:-/:/g' $OUTTREE.$x.n > $OUTTREE.$x.m ; $FASTME -i $DMAT.$x.c -u $OUTTREE.$x.m -o $OUTTREE.$x.c -s -T 1 ; rm -f $DMAT.$x.n $DMAT.$x.m $DMAT.$x.n_fastme_stat.txt $OUTTREE.$x.n $OUTTREE.$x.m ;" &> /dev/null & + done + + while [ $(jobs -r | wc -l) -gt 0 ]; do sleep $WAITIME ; done + + for x in $(seq 1 $NPROC) + do + if [ $((++step_prev)) -gt $RATCHET ]; then break; fi + v=0.$((++eps_prev))$step_prev; [ $(echo "$v>=0.7" | bc) -eq 1 ] && v=0$(echo "scale=4;$v*$v/1.44" | bc -l); + + tbl=$(grep -B1 "Performed" $DMAT.$x.c_fastme_stat.txt | sed -n 1p | sed 's/.* //g' | sed 's/\.$//g'); + rm -f $DMAT.$x.c_fastme_stat.txt ; + out=" "; + [ -z "$tbl" ] && tbl=$(grep -o ":[0-9\.-]*" $OUTTREE.$x.c | tr -d :- | paste -sd+ | bc | sed 's/^\./0./') && out="+"; + echo -n "$out step $step_prev $tbl" >&2 ; + if [ $(echo "$tbl<$tblo" | bc) -eq 0 ] + then + rm -f $OUTTREE.$x.c ; + echo " (epsilon=$v)" >&2 ; + else + tblo=$tbl; + mv $OUTTREE.$x.c $OUTTREE ; + cp $OUTTREE $BMETREE; + sed -f $TAXFILE $BMETREE > $BMETREE.tmp ; mv $BMETREE.tmp $BMETREE ; # <=> sed -f $TAXFILE -i $BMETREE ; + echo " * (epsilon=$v)" >&2 ; + echo "step $step_prev tbl=$tbl"; + eps=0; + fi + done + done - if [ $((++ct)) -eq $RATCHET_LIMIT ]; then break; fi -done + for x in $(seq 1 $NPROC); do rm -f $DMAT.$x.c ; done +fi + +############################################################################################################# +############################################################################################################# +#### REQ CONFIDENCE VALUES AT BRANCHES #### +############################################################################################################# +############################################################################################################# -### confidence value at every branch +echo -n "estimating branch supports ... " >&2 ; $REQ $BASEFILE.d $BMETREE $OUTTREE ; +echo "[ok]" >&2 ; mv $OUTTREE $BMETREE ; echo "BME tree (tbl=$tblo) with branch supports written into $BMETREE" ; -rm -f $DFILE $TAXFILE $OUTTREE ; +rm -f $DMAT $TAXFILE $OUTTREE ; exit ; diff --git a/README.md b/README.md index 3ca3cd12df36ac795de362c22b22ed589e8cc876..ed56b7132e98e12b15eb871280b2fcf772a0deb5 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ _JolyTree_ (named in memory of [Nicolas Joly](https://research.pasteur.fr/en/member/nicolas-joly/)) is a command line script written in [Bash](https://www.gnu.org/software/bash/) that allows a distance-based phylogenetic tree with branch supports to be quickly inferred from non-aligned genome sequences. _JolyTree_ runs on UNIX, Linux and most OS X operating systems. -For more details, see the associated publication (Criscuolo 2019). +For more details, see the associated publication ([Criscuolo 2019](https://riojournal.com/article/36178/)). ## Installation and execution @@ -22,7 +22,7 @@ For more details, see the associated publication (Criscuolo 2019). git clone https://gitlab.pasteur.fr/GIPhy/JolyTree.git ``` -**C.** If at least one of the four required binaries (step A) is not available on your `$PATH` variable, edit the file `JolyTree.sh` and indicate the local path to the `mash`, `gawk`, `FastME` and/or `REQ` binary(ies) (approximately between lines 100 and 200): +**C.** If at least one of the four required binaries (step A) is not available on your `$PATH` variable, edit the file `JolyTree.sh` and indicate the local path to the `mash`, `gawk`, `FastME` and/or `REQ` binary(ies) (approximately between lines 130 and 230): ```bash ############################################################################################################# @@ -95,19 +95,26 @@ chmod +x JolyTree.sh Launch _JolyTree_ without option to read the following documentation: ``` - USAGE: - JolyTree.sh [options] - where: + USAGE: JolyTree.sh -i <directory> -b <basename> [options] + + OPTIONS: + -i <directory> directory name containing FASTA-formatted contig files; only files ending with .fa, .fna, .fas or .fasta will be considered (mandatory) -b <basename> basename of every written output file (mandatory) - -s <int> sketch size (default: 25% of the largest genome size) - -q <double> probability of observing a random k-mer (default: 0.0001) - -k <int> k-mer size (default: estimated from the average genome size with the + -q <real> probability of observing a random k-mer (default: 0.000000001) + -k <int> k-mer size (default: estimated from the largest genome size with the probability set by option -q) + -s <real|int> sketch size estimated as the proportion (up to 1.00) of the average + genome size; the sketch size (integer > 2) can also be directly set + using this option (default: 0.25) -c <real> if at least one of the estimated p-distances is above this specified - cutoff, then a F81 correction is performed (default: 0.1) - -n no BME tree inference (only pairwise distance estimation) + cutoff, then a F81/EI correction is performed (default: 0.1) + -a <real> F81/EI transformation gamma shape parameter (default: 1.5) + -f using the four nucleotide frequencies in F81/EI transformations + (default: to deal with multiple contig files, only the two + frequencies A+T and C+G are used) + -n no BME tree inference (only pairwise distance estimates) -r <int> number of steps when performing the ratchet-based BME tree search (default: 100) -t <int> number of threads (default: 2) @@ -115,17 +122,32 @@ Launch _JolyTree_ without option to read the following documentation: ## Notes -* It is not recommended to modify the option -k. The optimal value of _k_ is automatically estimated by equation (2) in [Ondov et al. (2016)](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-016-0997-x) from the desired probability _q_ of observing a random _k_-mer (option -q). Increasing _q_ (e.g. > 0.001) is not recommended, especially when dealing distantly-related genomes. Lowering _q_ (e.g. < 0.00001) leads to larger _k_-mer size that increases the variance of the estimated evolutionary distances. +* In short, _JolyTree_ uses [_mash_](http://mash.readthedocs.io/en/latest/) to decompose each genome into a sketch of _k_-mers (options `-k`, `-q`, `-s`) and quickly estimate the _p_-distance between each pair of genomes. If required, every _p_-distance is transformed into an evolutionary distance (options `-c`, `-a`, `-f`). A ratchet-based optimal phylogenetic tree search is performed from the pairwise evolutionary distances using [_FastME_](http://www.atgc-montpellier.fr/fastme/usersguide.php) (option `-r`). Branch supports are finally estimated using [_REQ_](https://research.pasteur.fr/en/tool/r%ce%b5q-assessing-branch-supports-o%c6%92-a-distance-based-phylogenetic-tree-with-the-rate-o%c6%92-elementary-quartets/). + +* It is not recommended to modify the option `-k`. The optimal value of _k_ is automatically estimated by equation (2) in [Ondov et al. (2016)](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-016-0997-x) from the desired probability _q_ of observing a random _k_-mer (option `-q`). Since _JolyTree_ v2.0, the default probability is _q_ = 0.000000001, as it enables pairwise distances to be conveniently estimated in many cases. Increasing _q_ is not recommended, but can be useful to minimize the number of unknown distances when dealing with distantly-related genome sequences. Lowering _q_ leads to larger _k_-mer size that can be useful when dealing with very closely-related genome sequences. + +* Default sketch size is 25% of the average genome size, which enables pairwise distances to be efficiently estimated with fast running times. Increasing the sketch size (option `-s`) yields more accurate estimates (especially with distantly-related genomes), but requires important disk space to store the sketch files. + +* Lowering the cutoff value for correcting the evolutionary distances (option `-c`) does generally not modify the inferred phylogenetic tree; on the other side, it is generally not recommended to increase this cutoff value. + +* The option `-c` allows multiple substitutions per character to be accurately estimated when an observed _p_-distance is quite large (e.g. > 0.1; see [Figure 3.1](https://books.google.fr/books?id=3Xc8DwAAQBAJ&pg=PA41) in Nei and Kumar 2000). In such cases, all the _p_-distances _p_ estimated by _mash_ are transformed into proper evolutionary distances _d_. Since _JolyTree_ v2.0, four _p_-distance transformations can be obtained using options `-c` and/or `-a`: +<div align="center"> -* Increasing the sketch size (option -s) does not generally modify the inferred phylogenetic tree; on the other side, it is not recommended to set a sketch size lower than 10,000 (except for small genomic sequences, e.g. plasmids, viruses) +| transformation | formula | options | +|:------------------------|:-----------------------------------|:-----------------------------------| +| none | _d_ = _p_ | `-c 1 -a $a` with any `$a` > 0 | | +| Poisson correction | _d_ = - log<sub>_e_</sub>(1 - _p_) | `-c 1 -a 0` | +| F81/EI correction | _d_ = -_b_<sub>1</sub> log<sub>_e_</sub>(1 - _p_/_b_<sub>2</sub>) | `-a 0` | +| F81/EI gamma correction | _d_ = -_ab_<sub>1</sub>[(1 - _p_/_b_<sub>2</sub>)<sup>-1/_a_</sup> - 1] | `-a $a` with specified gamma shape parameter | -* Lowering the cutoff value for correcting the evolutionary distances (option -c) does generally not modify the inferred phylogenetic tree; on the other side, it is strongly not recommended to increase this cutoff value. +</div> -* The option -c allows multiple substitutions per character to be accurately estimated when an observed _p_-distance is quite large (e.g.> 0.1; see [Figure 3.1](https://books.google.fr/books?id=3Xc8DwAAQBAJ&pg=PA41) in Nei and Kumar 2000). In such cases, the F81 correction is performed by using the equation (4) in [Tamura and Kumar (2002)](https://academic.oup.com/mbe/article/19/10/1727/1258975) that allows estimating the pairwise distance based on the Equal-Input model of evolution ([Felsenstein 1981](https://link.springer.com/article/10.1007/BF01734359); [Tajima and Nei 1982](https://link.springer.com/article/10.1007/BF01810830), [1984](https://academic.oup.com/mbe/article/1/3/269/1244029)). This transformation was chosen because it could be directly computed from a _p_-distance value, and it takes into account putative unequal base frequencies and heterogeneous base composition among lineages. +* The F81 corrections estimate pairwise distances based on the Equal-Input (EI) model of nucleotide substitution ([Felsenstein 1981](https://link.springer.com/article/10.1007/BF01734359); [Tajima and Nei 1982](https://link.springer.com/article/10.1007/BF01810830), [1984](https://academic.oup.com/mbe/article/1/3/269/1244029), [Tamura and Kumar 2002](https://academic.oup.com/mbe/article/19/10/1727/1258975)). These transformations were chosen because they can be directly computed from _p_-distances, and take into account putative unequal base frequencies and heterogeneous base composition among lineages (option `-f`; for details about the values _b_<sub>1</sub> and _b_<sub>2</sub>, see e.g. [Tamura and Kumar 2002](https://academic.oup.com/mbe/article/19/10/1727/1258975), [Criscuolo 2019](https://riojournal.com/article/36178/)). Thanks to the use of the supplementary gamma shape parameter (option `-a`), F81/EI gamma distance allows approximating evolutionary distances derived from complex nucleotide substitution models (the default parameter _a_ = 1.5 enables pairwise distances to be conveniently estimated in many cases). -* Fast running times will be observed when using multiple threads; of note, only pairwise distance estimation step benefits from a large number of threads (other steps are quite fast). +* Fast running times will be observed when using multiple threads. Since _JolyTree_ v2.0, almost all steps benefit from a large number of threads, i.e. genome parsing and sketching, _p_-distance estimates, and ratchet-based BME tree search. + +* The verbosity of _JolyTree_ can be reduced by ending the command line by `2>/dev/null` -* The verbosity of _JolyTree_ could be reduced by ending the command line by `2>/dev/null` * To launch _JolyTree_ on multiple cores on a cluster managed by [SLURM](https://slurm.schedmd.com), edit the file `JolyTree.sh` and read the subsection [3] of the _Installation_ section (approximately line 200). @@ -176,7 +198,7 @@ echo -e "oxytoca.ATCC13182\tCAAHFW01\naerogenes.ATCC13048\tQVMZ01\ngrimontii.06D while read -r s a; do echo $t.$s;([[ $a =~ $A1 ]]&&$EUTILS$a||$NCBIFTP${a:0:2}/${a:2:2}/$([[ $a =~ $A2 ]]&&echo ${a:4:2}/)$a/$a$Z|zcat)>genomes/$t.$s.fa; done ``` -##### Launching _jolyTree_ +##### Running _JolyTree_ The following command line allows the script `jolyTree.sh` to be launched with default options on 8 threads: ```bash @@ -212,3 +234,31 @@ Tajima F, Nei M (1982) Biases of the estimates of DNA divergence obtained by the Tajima F, Nei M (1984) Estimation of evolutionary distance between nucleotide sequences. Molecular Biology and Evolution, 1(3):269-285. [doi:10.1093/oxfordjournals.molbev.a040317](https://academic.oup.com/mbe/article/1/3/269/1244029). Tamura K, Kumar S (2002) Evolutionary distance estimation under heterogeneous substitution pattern among lineages. Molecular Biology and Evolution, 19(10):1727-1736. [doi:10.1093/oxfordjournals.molbev.a003995](https://academic.oup.com/mbe/article/19/10/1727/1258975). + + + +## Gallery + +Below is a list of some phylogenetic trees inferred using _JolyTree_: + +* [_Aggregatibacter_](https://riojournal.com/article/36178/zoom/fig/4969692/), [_Borreliella_](https://riojournal.com/article/36178/zoom/fig/4969672/), [_Elizabethkingia_](https://riojournal.com/article/36178/zoom/fig/4969676/), [_Lactococcus_](https://riojournal.com/article/36178/zoom/fig/4969680/), [_Providencia_](https://riojournal.com/article/36178/zoom/fig/4969684/), and [_Ralstonia_](https://riojournal.com/article/36178/zoom/fig/4969688/) genera ([Criscuolo 2019](https://riojournal.com/article/36178/)) + +* [_Corynebacteria_ genus](https://www.microbiologyresearch.org/docserver/fulltext/ijsem/68/12/ijsem003069-f1.gif) ([Dazas et al. 2018](https://doi.org/10.1099/ijsem.0.003069)) + +* [_Mucor circinelloides_ f. _circinelloides_](https://mbio.asm.org/content/mbio/9/2/e00573-18/F3.large.jpg) ([Garcia-Hermoso et al. 2018](https://doi.org/10.1128/mBio.00573-18)) + +* [_Escherichia coli_](https://wwwnc.cdc.gov/eid/article/25/1/18-0534-f2) ([Nadimpali et al. 2019](https://doi.org/10.3201/eid2501.180534)) + +* [_Klebsiella pneumoniae_](https://aem.asm.org/content/aem/86/7/e02711-19/F1.large.jpg) ([Barbier et al. 2020](https://doi.org/10.1128/AEM.02711-19)) + +* [_Rathayibacter_ genus](https://mra.asm.org/content/ga/9/22/e00316-20/F1.large.jpg) ([Tarlachkov et al. 2020](https://doi.org/10.1128/MRA.00316-20)) + +* [_Corynebacterium diphteriae_ complex](https://ars.els-cdn.com/content/image/1-s2.0-S092325082030019X-gr1_lrg.jpg) ([Badell et al. 2020](https://doi.org/10.1016/j.resmic.2020.02.003)) + +* [_Klebsiella pneumoniae_](https://www.tandfonline.com/na101/home/literatum/publisher/tandf/journals/content/kgmi20/2020/kgmi20.v011.i05/19490976.2020.1748257/20200625/images/medium/kgmi_a_1748257_f0002_c.jpg) ([Huynh et al. 2020](https://doi.org/10.1080/19490976.2020.1748257 )) + +* [_Stromatolite bacterial communities_](https://www.biorxiv.org/content/biorxiv/early/2020/03/14/818625/F5.large.jpg) ([Waterworth et al. 2020](https://doi.org/10.1101/818625)) + +* [_Campylobacter_ genus](https://figshare.com/articles/Data_Sheet_1_Taking_Control_Campylobacter_jejuni_Binding_to_Fibronectin_Sets_the_Stage_for_Cellular_Adherence_and_Invasion_pdf/12103260) ([Konkel et al. 2020](https://doi.org/10.3389/fmicb.2020.00564.s001)) + +