diff --git a/wGRR b/wGRR index e0800090d4c36924c16f32beb157d538494e0c1d..e78e9ae522cb9d489ca38ed983e4563ecbb345ec 100755 --- a/wGRR +++ b/wGRR @@ -226,21 +226,19 @@ if [[ -f $OUT.m8 ]] ; then echo "[INFO] -- Using existing MMseqs output file $OUT.m8" else MIDENT=$($AWKEXE '!/^>/{a[$0]++}END{for(i in a){if(a[i]>m){m=a[i]}}print m}' $PRT) + printf "%-10s -- %s\n" "[TIME]" $(textifyDuration $duration) if [[ $((MIDENT*2)) -gt "$MMS_DEF_MAX_SEQS" ]] ; then MMS_MAX_SEQ_PARAM=($(echo $((MIDENT*2)) | awk '{c=substr($1,2);gsub(/[0-9]/,0,c);print "--max-seqs "substr($1,1,1)+1""c}')) echo "[INFO] -- Setting MMseqs parameter $MMS_MAX_SEQ_PARAM" fi - if [[ $BATCHFLAG == 1 ]] ; then printf "%-10s -- %s\n" "[INFO]" "Submitting MMseqs search to Maestro" - printf "%-10s -- %s\n" "[TIME]" $(textifyDuration $duration) JID=$(sbatch --wait --parsable -o "$OUT".mmseqs.log -p $PARTITION -c $THREADS -J "wGRR_MMSeqs" --wrap="$MMSEQS easy-search $PRT $PRT ${OUT}.m8 ${tmp} --threads $THREADS --format-output \"query,target,qcov,tcov,fident,evalue,bits\" --add-self-matches $MMS_MAX_SEQ_PARAM") PQT=$(sacct -X -j $JID -o Reserved -n | awk '{n=split($1,a,"-");if(n>1){t=t+a[1]*86400};split(a[n],b,":");t=t+b[1]*3600+b[2]*60+b[3];print t}') echo "[INFO] -- The job $JID has been "$(textifyDuration $PQT) "in queue" QT=$((QT+PQT)) else printf "%-10s -- %s\n" "[INFO]" "Running MMseqs search" - printf "%-10s -- %s\n" "[TIME]" $(textifyDuration $duration) $MMSEQS easy-search $PRT $PRT $OUT.m8 $tmp --threads $THREADS --format-output "query,target,qcov,tcov,fident,evalue,bits" --add-self-matches $MMS_MAX_SEQ_PARAM > $OUT.mmseqs.search.log fi fi @@ -254,8 +252,8 @@ duration=$SECONDS if [[ -f $OUT.allpairs.txt ]] ; then echo "[INFO] -- Using existing file $OUT.allpairs.txt" else - printf "%-10s -- %s\n" "[INFO]" "Writing genomes pairs" printf "%-10s -- %s\n" "[TIME]" $(textifyDuration $duration) + printf "%-10s -- %s\n" "[INFO]" "Writing genomes pairs" $AWKEXE 'BEGIN{x=1}/^>/{g=substr($1,2);gsub(/_[0-9]+$/,"",g);if(FNR==1){a[x]=g;++x;currg=g;next}if(g!=currg){a[x]=g;x++;currg=g}}END{i=0;while(++i in a){j=i;while(++j in a){print a[i]"\t"a[j]}}}' $PRT > $OUT.allpairs.txt printf "%-10s -- %s\n" "[INFO]" "$(wc -l $OUT.allpairs.txt | awk '{print $1}') pairs written" fi @@ -292,13 +290,13 @@ if [[ $BATCHFLAG == 0 ]] ; then printf "\n" fi else + printf "%-10s -- %s\n" "[TIME]" $(textifyDuration $duration) REQMEM="" echo "[INFO] -- Estimating required memory" M1=$(awk -f wGRR.awk -v MINP=1 -v MAXP=10 -v OUT=$OUT -v MEM=1 $OUT.allpairs.txt $PRT $OUT.m8) REQMEM=$(bc -l <<< $(numfmt --from=iec $M1)*($ARRAYSIZE/10) | numfmt --to=iec | awk '{U=$0;gsub(/[^A-Za-z]/,"",U);V=$0;gsub(/[A-Za-z]+$/,"",V);split(V,a,".");n=split(a[1],b,"");c=b[1]+1;i=1;while(++i<=n){c=c"0"}print c""U}') echo "[INFO] -- $REQMEM per job required" duration=$SECONDS - printf "%-10s -- %s\n" "[TIME]" $(textifyDuration $duration) echo "[INFO] -- Submitting $NJOBS jobs for wGRR calculation" if [[ $MAXJOBS -gt 0 ]] && [[ $MAXJOBS -lt $NJOBS ]] ; then echo "[INFO] -- Limiting the number of simultaneous jobs to $MAXJOBS" @@ -315,8 +313,8 @@ else fi duration=$SECONDS -printf "%-10s -- %s\n" "[INFO]" "Sorting results" printf "%-10s -- %s\n" "[TIME]" $(textifyDuration $duration) +printf "%-10s -- %s\n" "[INFO]" "Sorting results" sort -m "$tmp"/$OUT.wgrr_part.* | sort -u -k1,1V -k2,2V | $AWKEXE 'BEGIN{print "GenomeA\tGenomeB\twGRR1\tCommon1\tRealLengthA\tRealLengthB\twGRR2\tCommon2\tEffectiveLength1A\tEffectiveLength1B\twGRR3\tCommon3\tEffectiveLength2A\tEffectiveLength2B"}1' > $OUT.wgrr.txt if [[ ! -f "$OUT".wgrr.txt ]] ; then