diff --git a/JolyTree.sh b/JolyTree.sh index 0dcf55e6207840fd4a69b43fce72b3ecc7257b91..362a8315a8d476f07dab09c6bf1bf3e440ac0beb 100755 --- a/JolyTree.sh +++ b/JolyTree.sh @@ -4,7 +4,7 @@ # # # JolyTree: fast distance-based phylogenetic inference from unaligned genome sequences # # # - COPYRIGHT="Copyright (C) 2017-2020 Institut Pasteur" # + COPYRIGHT="Copyright (C) 2017-2021 Institut Pasteur" # # # # This program is free software: you can redistribute it and/or modify it under the terms of the GNU # # General Public License as published by the Free Software Foundation, either version 3 of the License, or # @@ -33,7 +33,13 @@ # = VERSIONS = # # ============ # # # - VERSION=2.0.190926ac # + VERSION=2.1.211019ac # +# + commenting line 576, as some Linux distributions incorrectly interpret "trap [arg] signal_spec" with # +# empty arg # +# + adding some conditions to deal with some FastME crashes observed when inferring large trees # +# + new option -x to prevent the estimation of branch supports # +# # +# VERSION=2.0.190926ac # # + new F81/EI transformation formula using gamma shape parameter (option -a = 1.5 by default) # # + option -f to to use the 4 nucleotide frequencies in F81/EI transformation; by default, to deal with # # multiple contig files, JolyTree sets f(A)=f(T)=0.5*(A+T)/(A+C+G+T) and f(C)=f(G)=0.5*(C+G)/(A+C+G+T) # @@ -95,6 +101,10 @@ Criscuolo A (2019) A fast alignment-free bioinformatics procedure to infer accurate distance-based phylogenetic trees from genome assemblies. RIO. doi:10.3897/rio.5.e36178 + Criscuolo A (2020) On the transformation of MinHash-based uncorrected distances into + proper evolutionary distances for phylogenetic inference. F1000Research. 
+ doi:10.12688/f1000research.26930.1 + USAGE: JolyTree.sh -i <directory> -b <basename> [options] OPTIONS: @@ -117,6 +127,7 @@ -n no BME tree inference (only pairwise distance estimates) -r <int> number of steps when performing the ratchet-based BME tree search (default: 100) + -x no branch support -t <int> number of threads (default: 2) EOF @@ -249,11 +260,13 @@ INFERTREE=true; # -n (none) RATCHET=100; # -r (100) RATCHET_LIMIT=200; # (static) +BRANCH_SUPPORT=true; # -x (none) + NPROC=2; # -t (2) CHUNK=20; # -h (20) WAITIME=0.5; # (auto from -t) -while getopts :i:b:s:q:k:c:a:d:r:t:h:nf option +while getopts :i:b:s:q:k:c:a:d:r:t:h:nfx option do case $option in i) DATADIR="$OPTARG" ;; @@ -265,6 +278,7 @@ do a) ALPHA="$($GAWK -v x=$OPTARG 'BEGIN{printf "%.20f", x+0}' | sed 's/0*$//g')" ;; f) NFQ=4 ;; n) INFERTREE=false ;; + x) BRANCH_SUPPORT=false ;; r) RATCHET=$OPTARG ;; h) CHUNK=$OPTARG ;; t) NPROC=$OPTARG ;; @@ -559,7 +573,7 @@ if ! $INFERTREE ; then exit 0 ; fi ############################################################################################################# ############################################################################################################# -trap INT ; +# trap INT ; function ctrl_c() { echo -n " process interrupted: deleting files ... 
" ; sleep 5 ; @@ -588,8 +602,14 @@ OUTTREE=$BASEFILE.tt; $FASTME -i $DMAT -o $OUTTREE -s -f 12 -T 1 &> /dev/null ; tblo=$(grep -B1 "Performed" $BASEFILE.dd_fastme_stat.txt | sed -n 1p | sed 's/.* //g' | sed 's/\.$//g'); [ -z "$tblo" ] && tblo=$(grep -o ":[0-9\.-]*" $OUTTREE | tr -d :- | paste -sd+ | bc -l | sed 's/^\./0./'); -echo " step 0 $tblo" >&2 ; -echo "step 0 tbl=$tblo" ; +if [ -z "$tblo" ] +then + tblo=999999; + echo " step 0 NaN" >&2 ; +else + echo " step 0 $tblo" >&2 ; + echo "step 0 tbl=$tblo" ; +fi cp $OUTTREE $BMETREE; sed -f $TAXFILE $BMETREE > $BMETREE.tmp ; mv $BMETREE.tmp $BMETREE ; # <=> sed -f $TAXFILE -i $BMETREE ; rm -f $BASEFILE.dd_fastme_stat.txt ; @@ -615,7 +635,7 @@ then END {print" "n;i=0;while(++i<=n){printf lbl[i];j=0;while(++j<=n){printf(" %.8f",d[i][j])}print""}}' $DMAT.$x.c > $DMAT.$x.n ; ### ratchet-search tree search ######################################################################## - $EXEC "$FASTME -i $DMAT.$x.n -u $OUTTREE -o $OUTTREE.$x.n -nB -s -T 1 ; sed 's/:-/:/g' $OUTTREE.$x.n > $OUTTREE.$x.m ; $FASTME -i $DMAT.$x.c -u $OUTTREE.$x.m -o $OUTTREE.$x.c -s -T 1 ; rm -f $DMAT.$x.n $DMAT.$x.m $DMAT.$x.n_fastme_stat.txt $OUTTREE.$x.n $OUTTREE.$x.m ;" &> /dev/null & + $EXEC "$FASTME -i $DMAT.$x.n -u $OUTTREE -o $OUTTREE.$x.n -nB -s -T 1 ; sed 's/:-/:/g' $OUTTREE.$x.n > $OUTTREE.$x.m ; $FASTME -i $DMAT.$x.c -u $OUTTREE.$x.m -o $OUTTREE.$x.c -s -T 1 -f 12 ; rm -f $DMAT.$x.n $DMAT.$x.m $DMAT.$x.n_fastme_stat.txt $OUTTREE.$x.n $OUTTREE.$x.m ;" &> /dev/null & done while [ $(jobs -r | wc -l) -gt 0 ]; do sleep $WAITIME ; done @@ -629,8 +649,9 @@ then rm -f $DMAT.$x.c_fastme_stat.txt ; out=" "; [ -z "$tbl" ] && tbl=$(grep -o ":[0-9\.-]*" $OUTTREE.$x.c | tr -d :- | paste -sd+ | bc | sed 's/^\./0./') && out="+"; + [ -z "$tbl" ] && tbl="NaN"; echo -n "$out step $step_prev $tbl" >&2 ; - if [ $(echo "$tbl<$tblo" | bc) -eq 0 ] + if [ "$tbl" == "NaN" ] || [ $(echo "$tbl<$tblo" | bc) -eq 0 ] then rm -f $OUTTREE.$x.c ; echo " (epsilon=$v)" >&2 
; @@ -655,13 +676,18 @@ fi ############################################################################################################# ############################################################################################################# -echo -n "estimating branch supports ... " >&2 ; -$REQ $BASEFILE.d $BMETREE $OUTTREE ; -echo "[ok]" >&2 ; -mv $OUTTREE $BMETREE ; -echo "BME tree (tbl=$tblo) with branch supports written into $BMETREE" ; -rm -f $DMAT $TAXFILE $OUTTREE ; +if $BRANCH_SUPPORT +then + echo -n "estimating branch supports ... " >&2 ; + $REQ $BASEFILE.d $BMETREE $OUTTREE ; + echo "[ok]" >&2 ; + mv $OUTTREE $BMETREE ; + echo "BME tree (tbl=$tblo) with branch supports written into $BMETREE" ; +else + echo "BME tree (tbl=$tblo) written into $BMETREE" ; +fi +rm -f $DMAT $TAXFILE $OUTTREE ; exit ;