diff --git a/wGRR b/wGRR index 25f662e7e9668bd64993178da109cd0847a0b9a9..f84cfc6b6ff30571025422e077e17b580fdaf4fd 100755 --- a/wGRR +++ b/wGRR @@ -7,7 +7,7 @@ trap 'rm -rf "$tmp"' EXIT export LC_ALL=C SECONDS=0 -readonly VERSION=0.5 +readonly VERSION=0.6 bold=$(tput bold) normal=$(tput sgr0) @@ -43,7 +43,7 @@ display_usage() { echo " -m <integer> Max number of simulteaneous tasks." echo " Only applicable to Maestro, for the wGRR calculation step." echo " default: not set" - echo " -T Test run. Useful to get stats on the input file and correctly set the -a parameter." + echo " -T Test run. Useful to get stats on the input file and correctly set the -a parameter." echo "" echo "${bold}DESCRIPTION:${normal}" echo "This pipeline will do all proteins pairwise comparisons using the MMseqs2 software and then process " @@ -156,69 +156,67 @@ if [[ $SLURM_JOBID != "" ]] ; then BATCHFLAG=$(squeue -j $SLURM_JOBID --Format=batchflag -h | awk '{print $1}') fi -echo "[INFO] -- wGRR version ${VERSION}" +printf "%-10s -- %s\n" "[INFO]" "wGRR version ${VERSION}" | tee -a ${OUT}.wgrr.log if [[ $BATCHFLAG == 0 ]] ; then - echo "[INFO] -- The command is:" - echo "[INFO] -- ./wGRR ${ALL_ARGS}" - if [[ $THREADS -gt $(nproc) ]] ; then - echo "[ERROR] -- You required $THREADS threads but your machine has $(nproc) cores." + printf "%-10s -- %s\n" "[ERROR]" "You required $THREADS threads but your machine has $(nproc) cores." | tee -a ${OUT}.wgrr.log exit 1 fi + printf "%-10s -- %s\n" "[INFO]" "The command is:" >> ${OUT}.wgrr.log + printf "%-10s -- %s\n" "[INFO]" "./wGRR ${ALL_ARGS}" >> ${OUT}.wgrr.log else - echo "[INFO] -- The command is:" - echo "[INFO] -- $(scontrol show job ${SLURM_JOBID} | awk '/ +Command/{n=split($0,a,"=");print a[n]}')" + printf "%-10s -- %s\n" "[INFO]" "The command is:" >> ${OUT}.wgrr.log + printf "%-10s -- %s\n" "[INFO]" "$(scontrol show job ${SLURM_JOBID} | awk '/ +Command/{n=split($0,a,"=");print a[n]}')" >> ${OUT}.wgrr.log fi if [[ ! -f wGRR.awk ]] ; then - echo "[ERROR] -- awk script wGRR.awk not found." + printf "%-10s -- %s\n" "[ERROR]" "awk script wGRR.awk not found." | tee -a ${OUT}.wgrr.log exit 1 fi if [[ ! -f wGRR_worker.zsh ]] ; then - echo "[ERROR] -- awk script wGRR_worker.zsh not found." + printf "%-10s -- %s\n" "[ERROR]" "awk script wGRR_worker.zsh not found." | tee -a ${OUT}.wgrr.log exit 1 fi ## Check awk if ! AWKEXE=$(command -v gawk) ; then if ! AWKEXE=$(command -v awk) ; then - echo "[ERROR] -- Could not find awk or gawk executable." + printf "%-10s -- %s\n" "[ERROR]" "Could not find awk or gawk executable." | tee -a ${OUT}.wgrr.log exit 1 fi fi if ! echo "" | $AWKEXE '{a[1][2]=3}' &> /dev/null ; then - echo "[ERROR] -- Your awk version is too old. Please install a more recent version (gawk is recommended)" + printf "%-10s -- %s\n" "[ERROR]" "Your awk version is too old. Please install a more recent version (gawk is recommended)." | tee -a ${OUT}.wgrr.log exit 1 fi ## Check MMseqs if [[ $MMPATH == "N.O.P.A.T.H" ]] ; then if ! MMSEQS=$(command -v mmseqs) ; then - echo "[ERROR] -- MMseqs binary not found." + printf "%-10s -- %s\n" "[ERROR]" "MMseqs binary not found." | tee -a ${OUT}.wgrr.log exit 1 fi else if ! MMSEQS=$(command -v $MMPATH/mmseqs) ; then if ! MMSEQS=$(command -v mmseqs) ; then - echo "[ERROR] -- MMseqs binary not found." + printf "%-10s -- %s\n" "[ERROR]" "MMseqs binary not found in your PATH." | tee -a ${OUT}.wgrr.log exit 1 else - echo "[WARNING] -- MMseqs binary not found at given location" - echo "[WARNING] -- Using mmseqs binary found in your path" + printf "%-10s -- %s\n" "[ERROR]" "MMseqs binary not found in ${MMPATH}." | tee -a ${OUT}.wgrr.log fi fi fi STATS=($($AWKEXE '/^>/{p++;g=substr($1,2);gsub(/_[0-9]+$/,"",g);if(!a[g]++){c++};LNR=NR}{if(NR>LNR+1){n=1}}END{if(n!=1){n=0}print c"\t"p"\t"n"\t"p/c}' $PRT)) -echo "[INFO] -- Input file has "$STATS[1]" genomes and a total of "$STATS[2]" proteins." -echo "[INFO] -- Mean number of proteins per genome: $STATS[4]" +printf "%-10s -- %s %s %s %s %s\n" "[INFO]" "Input file has" $STATS[1] "genomes and a total of" $STATS[2] "proteins" | tee -a ${OUT}.wgrr.log +printf "%-10s -- %s %s\n" "[INFO]" "Mean number of proteins per genome:" $STATS[4] | tee -a ${OUT}.wgrr.log if [[ $STATS[3] -eq 1 ]]; then - echo "[INFO] -- Converting fasta file to sequential fasta" + printf "%-10s -- %s\n" "[INFO]" "Converting fasta file to sequential fasta" | tee -a ${OUT}.wgrr.log OPRT=$PRT $AWKEXE '!/^>/{s=s$0;next}(s!=""){print s;s=""}{print}END{print s}' $OPRT > $OPRT:t:r_seq.prt PRT=$OPRT:t:r_seq.prt @@ -226,111 +224,119 @@ fi duration=$SECONDS if [[ -f $OUT.m8 ]] ; then - echo "[INFO] -- Using existing MMseqs output file $OUT.m8" + printf "%-10s -- %s %s%s\n" "[INFO]" "Using existing MMseqs output file" $OUT ".m8" | tee -a ${OUT}.wgrr.log elif [[ $TESTRUN == 0 ]] ; then MIDENT=$($AWKEXE '!/^>/{a[$0]++}END{for(i in a){if(a[i]>m){m=a[i]}}print m}' $PRT) - printf "%-10s -- %s\n" "[TIME]" $(textifyDuration $duration) + printf "%-10s -- %s\n" "[ELAPSED]" $(textifyDuration $duration) | tee -a ${OUT}.wgrr.log if [[ $((MIDENT*2)) -gt "$MMS_DEF_MAX_SEQS" ]] ; then MMS_MAX_SEQ_PARAM=($(echo $((MIDENT*2)) | awk '{c=substr($1,2);gsub(/[0-9]/,0,c);print "--max-seqs "substr($1,1,1)+1""c}')) - echo "[INFO] -- Setting MMseqs parameter $MMS_MAX_SEQ_PARAM" + printf "%-10s -- %s %s\n" "[INFO]" "Setting MMseqs parameter" "$MMS_MAX_SEQ_PARAM" | tee -a ${OUT}.wgrr.log fi if [[ $BATCHFLAG == 1 ]] ; then - printf "%-10s -- %s\n" "[INFO]" "Submitting MMseqs search to Maestro" + printf "%-10s -- %s\n" "[INFO]" "Submitting MMseqs search to Maestro" | tee -a ${OUT}.wgrr.log + printf "%-10s -- %s\n" "[INFO]" "The command is:" >> ${OUT}.wgrr.log + printf "%-10s -- %s\n" "[INFO]" "sbatch --wait --parsable -o ${OUT}.mmseqs.log -p ${PARTITION} -c ${THREADS} -J \"wGRR_MMSeqs\" --wrap=\"${MMSEQS} easy-search ${PRT} ${PRT} ${OUT}.m8 ${tmp} --threads ${THREADS} --format-output \"query,target,qcov,tcov,fident,evalue,bits\" --add-self-matches ${MMS_MAX_SEQ_PARAM}" >> ${OUT}.wgrr.log JID=$(sbatch --wait --parsable -o "$OUT".mmseqs.log -p $PARTITION -c $THREADS -J "wGRR_MMSeqs" --wrap="$MMSEQS easy-search $PRT $PRT ${OUT}.m8 ${tmp} --threads $THREADS --format-output \"query,target,qcov,tcov,fident,evalue,bits\" --add-self-matches $MMS_MAX_SEQ_PARAM") PQT=$(sacct -X -j $JID -o Reserved -n | awk '{n=split($1,a,"-");if(n>1){t=t+a[1]*86400};split(a[n],b,":");t=t+b[1]*3600+b[2]*60+b[3];print t}') - echo "[INFO] -- The job $JID has been "$(textifyDuration $PQT) "in queue" + printf "%-10s -- %s %s %s %s %s\n" "[INFO]" "The job" $JID "has been" $(textifyDuration $PQT) "in queue" | tee -a ${OUT}.wgrr.log QT=$((QT+PQT)) else - printf "%-10s -- %s\n" "[INFO]" "Running MMseqs search" + printf "%-10s -- %s\n" "[INFO]" "Running MMseqs search" | tee -a ${OUT}.wgrr.log + printf "%-10s -- %s\n" "[INFO]" "The command is:" >> ${OUT}.wgrr.log + printf "%-10s -- %s\n" "[INFO]" "${MMSEQS} easy-search ${PRT} ${PRT} ${OUT}.m8 ${tmp} --threads ${THREADS} --format-output \"query,target,qcov,tcov,fident,evalue,bits\" --add-self-matches ${MMS_MAX_SEQ_PARAM} > ${OUT}.mmseqs.search.log" >> ${OUT}.wgrr.log $MMSEQS easy-search $PRT $PRT $OUT.m8 $tmp --threads $THREADS --format-output "query,target,qcov,tcov,fident,evalue,bits" --add-self-matches $MMS_MAX_SEQ_PARAM > $OUT.mmseqs.search.log fi fi if [[ $TESTRUN == 0 ]] && [[ ! -f $OUT.m8 ]] ; then - echo "[ERROR] -- Something went wrong during the MMseqs step." + printf "%-10s -- %s\n" "[ERROR]" "Something went wrong during the MMseqs step." | tee -a ${OUT}.wgrr.log exit 1 fi duration=$SECONDS if [[ -f $OUT.allpairs.txt ]] ; then - echo "[INFO] -- Using existing file $OUT.allpairs.txt" + printf "%-10s -- %s %s%s\n" "[INFO]" "Using existing file" $OUT ".allpairs.txt" | tee -a ${OUT}.wgrr.log else - printf "%-10s -- %s\n" "[TIME]" $(textifyDuration $duration) - printf "%-10s -- %s\n" "[INFO]" "Writing genomes pairs" + printf "%-10s -- %s\n" "[ELAPSED]" $(textifyDuration $duration) | tee -a ${OUT}.wgrr.log + printf "%-10s -- %s\n" "[INFO]" "Writing genomes pairs" | tee -a ${OUT}.wgrr.log $AWKEXE 'BEGIN{x=1}/^>/{g=substr($1,2);gsub(/_[0-9]+$/,"",g);if(FNR==1){a[x]=g;++x;currg=g;next}if(g!=currg){a[x]=g;x++;currg=g}}END{i=0;while(++i in a){j=i;while(++j in a){print a[i]"\t"a[j]}}}' $PRT > $OUT.allpairs.txt - printf "%-10s -- %s\n" "[INFO]" "$(wc -l $OUT.allpairs.txt | awk '{print $1}') pairs written" + printf "%-10s -- %s\n" "[INFO]" "$(wc -l $OUT.allpairs.txt | awk '{print $1}') pairs written" | tee -a ${OUT}.wgrr.log fi NPAIRS=$(wc -l $OUT.allpairs.txt | $AWKEXE '{print $1}') if [[ ! -f $OUT.allpairs.txt ]] || [[ $NPAIRS < 1 ]] ; then - echo "[ERROR] -- An error occurred when writing the $OUT.allpairs.txt file." + printf "%-10s -- %s\n" "[ERROR]" "An error occurred when writing the $OUT.allpairs.txt file." | tee -a ${OUT}.wgrr.log exit 1 fi +NJOBS=$(( (NPAIRS+ARRAYSIZE-1)/ARRAYSIZE)) + if [[ $TESTRUN == 1 ]] ; then + printf "%-10s -- %s\n" "[INFO]" "With the current -a parameter (${ARRAYSIZE}) ${NJOBS} workers are required" | tee -a ${OUT}.wgrr.log rm -rf "$OUT".logs - printf "%-10s -- %s\n" "[INFO]" "Done" + printf "%-10s -- %s\n" "[INFO]" "Done." | tee -a ${OUT}.wgrr.log duration=$SECONDS - printf "%-10s -- %s\n" "[TIME]" $(textifyDuration $duration) + printf "%-10s -- %s\n" "[ELAPSED]" $(textifyDuration $duration) | tee -a ${OUT}.wgrr.log exit 0 fi duration=$SECONDS -NJOBS=$(( (NPAIRS+ARRAYSIZE-1)/ARRAYSIZE)) if [[ $BATCHFLAG == 0 ]] ; then if [[ $NJOBS -le 1 ]] ; then NJOBS=1 if [[ $THREADS > 1 ]] ; then - echo "[INFO] -- $THREADS threads required, but only 1 job to execute" - echo "[INFO] -- Using 1 thread" + printf "%-10s -- %s\n" "[INFO]" "$THREADS threads required, but only 1 job to execute" | tee -a ${OUT}.wgrr.log fi - echo "[INFO] -- Starting 1 job on 1 thread for wGRR calculation" + printf "%-10s -- %s\n" "[INFO]" "Starting 1 job on 1 thread for wGRR calculation" | tee -a ${OUT}.wgrr.log ARRAYSIZE=$NPAIRS ./wGRR_worker.zsh $ARRAYSIZE $OUT $NJOBS 1 $PRT $tmp elif [[ $THREADS -eq 1 ]] ; then - echo "[INFO] -- Starting 1 job on 1 thread for wGRR calculation" - echo "[WARNING] -- Running wGRR with a large dataset on 1 thread can lead to memory issues" + printf "%-10s -- %s\n" "[INFO]" "Starting 1 job on 1 thread for wGRR calculation" | tee -a ${OUT}.wgrr.log + printf "%-10s -- %s\n" "[WARNING]" "Running wGRR with a large dataset on 1 thread can lead to memory issues" | tee -a ${OUT}.wgrr.log ARRAYSIZE=$NPAIRS ./wGRR_worker.zsh $ARRAYSIZE $OUT $NJOBS 1 $PRT $tmp else - echo "[INFO] -- Starting $NJOBS jobs on $THREADS threads for wGRR calculation" - printf "%-10s -- [%-40s]" "[PROGRESS]" "" + if [[ $THREADS -gt $NJOBS ]] ; then + printf "%-10s -- %s\n" "[INFO]" "$THREADS threads required, but only $NJOBS jobs to execute" | tee -a ${OUT}.wgrr.log + fi + printf "%-10s -- %s\n" "[INFO]" "Starting $NJOBS jobs on $THREADS threads for wGRR calculation" | tee -a ${OUT}.wgrr.log + printf "%-10s -- [%-40s] 0/%s" "[PROGRESS]" "" $NJOBS for i in {1..$NJOBS} ; do echo $i - done | xargs -n 1 -P $THREADS sh -c 'arg=$5; NJOBS=$2 ; printf "\r[PROGRESS] -- [%-40s] %s/%s" $(head -c $((arg*40/NJOBS+1)) < /dev/zero | tr "\0" "=") $arg $NJOBS ; ./wGRR_worker.zsh $0 $1 $NJOBS $arg $3 $4' $ARRAYSIZE $OUT $NJOBS $PRT $tmp + done | xargs -n 1 -P $THREADS sh -c 'arg=$5; NJOBS=$2 ; printf "\r[PROGRESS] -- [%-40s] %s/%s" $(head -c $((arg*40/NJOBS+1)) < /dev/zero | tr "\0" "=") $arg $NJOBS ; ./wGRR_worker.zsh $0 $1 $NJOBS $arg $3 $4 ' $ARRAYSIZE $OUT $NJOBS $PRT $tmp printf "\n" fi else - printf "%-10s -- %s\n" "[TIME]" $(textifyDuration $duration) + printf "%-10s -- %s\n" "[ELAPSED]" $(textifyDuration $duration) | tee -a ${OUT}.wgrr.log REQMEM="" - echo "[INFO] -- Estimating required memory" + printf "%-10s -- %s\n" "[INFO]" "Estimating required memory" | tee -a ${OUT}.wgrr.log M1=$(awk -f wGRR.awk -v MINP=1 -v MAXP=10 -v OUT=$OUT -v MEM=1 $OUT.allpairs.txt $PRT $OUT.m8) REQMEM=$(bc -l <<< $(numfmt --from=iec $M1)*($ARRAYSIZE/10) | numfmt --to=iec | awk '{U=$0;gsub(/[^A-Za-z]/,"",U);V=$0;gsub(/[A-Za-z]+$/,"",V);split(V,a,".");n=split(a[1],b,"");c=b[1]+1;i=1;while(++i<=n){c=c"0"}print c""U}') - echo "[INFO] -- $REQMEM per job required" + printf "%-10s -- %s\n" "[INFO]" "$REQMEM per job required" | tee -a ${OUT}.wgrr.log duration=$SECONDS - echo "[INFO] -- Submitting $NJOBS jobs for wGRR calculation" + printf "%-10s -- %s\n" "[INFO]" "Submitting $NJOBS jobs for wGRR calculation" | tee -a ${OUT}.wgrr.log if [[ $MAXJOBS -gt 0 ]] && [[ $MAXJOBS -lt $NJOBS ]] ; then - echo "[INFO] -- Limiting the number of simultaneous jobs to $MAXJOBS" + printf "%-10s -- %s\n" "[INFO]" "Limiting the number of simultaneous jobs to $MAXJOBS" | tee -a ${OUT}.wgrr.log JOBARRAY="1-${NJOBS}%${MAXJOBS}" else JOBARRAY="1-${NJOBS}" fi JID=$(sbatch --parsable --wait -p "$PARTITION" --array="$JOBARRAY" -c 1 -J "wGRR_worker" --mem=$REQMEM --wrap="./wGRR_worker.zsh $ARRAYSIZE $OUT $NJOBS 1 $PRT $tmp") PQT=$(sacct -X -j $JID -o Reserved -n | awk 'NR==1{prevt=0}{t=0;n=split($1,a,"-");if(n>1){t=t+a[1]*86400};split(a[n],b,":");t=t+b[1]*3600+b[2]*60+b[3];if(t<prevt){tt=tt+prevt}prevt=t}END{print tt+t}') - echo "[INFO] -- The job $JID has been "$(textifyDuration $PQT) "in queue" + printf "%-10s -- %s %s %s %s %s\n" "[INFO]" "The job" $JID "has been" $(textifyDuration $PQT) "in queue" | tee -a ${OUT}.wgrr.log QT=$((QT+PQT)) mkdir -p "$OUT".logs mv slurm-"$JID"_*.out "$OUT".logs/ fi duration=$SECONDS -printf "%-10s -- %s\n" "[TIME]" $(textifyDuration $duration) -printf "%-10s -- %s\n" "[INFO]" "Sorting results" +printf "%-10s -- %s\n" "[ELAPSED]" $(textifyDuration $duration) | tee -a ${OUT}.wgrr.log +printf "%-10s -- %s\n" "[INFO]" "Sorting results" | tee -a ${OUT}.wgrr.log sort -m "$tmp"/$OUT.wgrr_part.* | sort -u -k1,1V -k2,2V | $AWKEXE 'BEGIN{print "GenomeA\tGenomeB\twGRR1\tCommon1\tRealLengthA\tRealLengthB\twGRR2\tCommon2\tEffectiveLength1A\tEffectiveLength1B\twGRR3\tCommon3\tEffectiveLength2A\tEffectiveLength2B"}1' > $OUT.wgrr.txt if [[ ! -f "$OUT".wgrr.txt ]] ; then - echo "[ERROR] -- Failed to sort the wGRR table." - echo "[ERROR] -- Saving the partial files in ${OUT}.wgrr_part directory." + printf "%-10s -- %s\n" "[ERROR]" "Failed to sort the wGRR table." | tee -a ${OUT}.wgrr.log + printf "%-10s -- %s\n" "[ERROR]" "Saving the partial files in ${OUT}.wgrr_part directory." | tee -a ${OUT}.wgrr.log rm -rf "$OUT".wgrr_part mkdir "$OUT".wgrr_part mv $tmp/"$OUT".wgrr_part.* "$OUT".wgrr_part/ @@ -340,24 +346,24 @@ fi NLINES=$(wc -l "$OUT".wgrr.txt | awk '{print $1}') NWGRR=$((STATS[1]*STATS[1]+1)) if [[ $NLINES -lt $NWGRR ]] ; then - echo "[ERROR] -- An error occurred during wGRR calculation:" - echo "[ERROR] -- less lines than expected in the final file." - echo "[ERROR] -- Saving the partial files in ${OUT}.wgrr_part directory." + printf "%-10s -- %s\n" "[ERROR]" "An error occurred during wGRR calculation:" | tee -a ${OUT}.wgrr.log + printf "%-10s -- %s\n" "[ERROR]" "less lines than expected in the final file." | tee -a ${OUT}.wgrr.log + printf "%-10s -- %s\n" "[ERROR]" "Saving the partial files in ${OUT}.wgrr_part directory." | tee -a ${OUT}.wgrr.log rm -rf "$OUT".wgrr_part mkdir "$OUT".wgrr_part mv $tmp/"$OUT".wgrr_part.* "$OUT".wgrr_part/ exit 1 elif [[ $NLINES -gt $NWGRR ]] ; then - echo "[WARNING] -- Too many lines in the output - some genome pairs are probably duplicated" + printf "%-10s -- %s\n" "[WARNING]" "Too many lines in the output - some genome pairs are probably duplicated" | tee -a ${OUT}.wgrr.log fi rm -rf "$OUT".logs -printf "%-10s -- %s\n" "[INFO]" "Done" +printf "%-10s -- %s\n" "[INFO]" "Done." | tee -a ${OUT}.wgrr.log if [[ $QT != 0 ]] ; then - printf "%-10s -- %s\n" "[INFO]" "Total time in queue: $(textifyDuration $QT)" + printf "%-10s -- %s\n" "[INFO]" "Total time in queue: $(textifyDuration $QT)" | tee -a ${OUT}.wgrr.log fi duration=$SECONDS -printf "%-10s -- %s\n" "[TIME]" $(textifyDuration $duration) +printf "%-10s -- %s\n" "[ELAPSED]" $(textifyDuration $duration) | tee -a ${OUT}.wgrr.log exit 0