Skip to content
Snippets Groups Projects
Commit bc3a5064 authored by Julien  GUGLIELMINI's avatar Julien GUGLIELMINI
Browse files

Improved the estimation of required memory

parent f3757e83
No related branches found
No related tags found
No related merge requests found
...@@ -98,5 +98,6 @@ wGRR3 also uses the protein families. But this time, if two BBH pairs are found ...@@ -98,5 +98,6 @@ wGRR3 also uses the protein families. But this time, if two BBH pairs are found
Common3 is the number of protein families containing at least one BBH divided by the denominator of wGRR3. Common3 is the number of protein families containing at least one BBH divided by the denominator of wGRR3.
![wgrr.png](https://gitlab.pasteur.fr/jgugliel/wgrr/wgrr.png)
[1]: https://mmseqs.com/ [1]: https://mmseqs.com/
...@@ -7,7 +7,7 @@ trap 'rm -rf "$tmp"' EXIT ...@@ -7,7 +7,7 @@ trap 'rm -rf "$tmp"' EXIT
export LC_ALL=C export LC_ALL=C
SECONDS=0 SECONDS=0
readonly VERSION=0.8 readonly VERSION=1.0
bold=$(tput bold) bold=$(tput bold)
normal=$(tput sgr0) normal=$(tput sgr0)
...@@ -233,13 +233,71 @@ if [[ $STATS[3] -eq 1 ]]; then ...@@ -233,13 +233,71 @@ if [[ $STATS[3] -eq 1 ]]; then
fi fi
duration=$SECONDS duration=$SECONDS
if [[ -f $OUT.allpairs.txt ]] ; then
printf "%-10s -- %s %s%s\n" "[INFO]" "Using existing file" $OUT ".allpairs.txt" | tee -a ${OUT}.wgrr.log
else
printf "%-10s -- %s\n" "[ELAPSED]" $(textifyDuration $duration) | tee -a ${OUT}.wgrr.log
printf "%-10s -- %s\n" "[INFO]" "Writing genomes pairs" | tee -a ${OUT}.wgrr.log
$AWKEXE 'BEGIN{x=1}/^>/{g=substr($1,2);gsub(/_[^_]+$/,"",g);if(FNR==1){a[x]=g;++x;currg=g;next}if(g!=currg){a[x]=g;x++;currg=g}}END{i=0;while(++i in a){j=i;while(++j in a){print a[i]"\t"a[j]}}}' $PRT > $OUT.allpairs.txt
printf "%-10s -- %s\n" "[INFO]" "$(wc -l $OUT.allpairs.txt | $AWKEXE '{print $1}') pairs written" | tee -a ${OUT}.wgrr.log
fi
NPAIRS=$(wc -l $OUT.allpairs.txt | $AWKEXE '{print $1}')
if [[ ! -f $OUT.allpairs.txt ]] || [[ $NPAIRS < 1 ]] ; then
printf "%-10s -- %s\n" "[ERROR]" "An error occurred when writing the $OUT.allpairs.txt file." | tee -a ${OUT}.wgrr.log
exit 1
fi
NJOBS=$(( (NPAIRS+ARRAYSIZE-1)/ARRAYSIZE))
duration=$SECONDS
if [[ $BATCHFLAG == 1 || $TESTRUN == 1 ]] ; then
if [[ $BATCHFLAG == 1 && $TESTRUN == 1 ]] ; then
printf "%-10s -- %s\n" "[WARNING]" "Test run (option -T) is not available with sbatch runs" | tee -a ${OUT}.wgrr.log
printf "%-10s -- %s\n" "[WARNING]" "The run will continue ignoring the -T option" | tee -a ${OUT}.wgrr.log
printf "%-10s -- %s\n" "[INFO]" "Estimating required memory" | tee -a ${OUT}.wgrr.log
elif [[ $TESTRUN == 1 ]] ; then
printf "%-10s -- %s\n" "[INFO]" "This is a test run (option -T)" | tee -a ${OUT}.wgrr.log
elif [[ $BATCHFLAG == 1 ]] ; then
printf "%-10s -- %s\n" "[INFO]" "Estimating required memory" | tee -a ${OUT}.wgrr.log
fi
if [[ $STATS[1] -gt 5 ]] ; then
$AWKEXE '/^>/{g=$1;gsub(/_[0-9]+$/,"",g);a[g]++;if(length(a)>5){exit};print;getline;print}' $PRT > $OUT.testrun.prt
else
cp $PRT $OUT.testrun.prt
fi
$AWKEXE 'BEGIN{x=1}/^>/{g=substr($1,2);gsub(/_[^_]+$/,"",g);if(FNR==1){a[x]=g;++x;currg=g;next}if(g!=currg){a[x]=g;x++;currg=g}}END{i=0;while(++i in a){j=i;while(++j in a){print a[i]"\t"a[j]}}}' $OUT.testrun.prt > $OUT.testrun.allpairs.txt
if [[ $TESTRUN == 1 && $BATCHFLAG == 0 ]] ; then
printf "%-10s -- %s\n" "[INFO]" "Running MMseqs on a sample file" | tee -a ${OUT}.wgrr.log
fi
$MMSEQS easy-search $OUT.testrun.prt $OUT.testrun.prt $OUT.testrun.m8 $tmp --threads $THREADS --format-output "query,target,qcov,tcov,fident,evalue,bits" --add-self-matches > $OUT.testrun.mmseqs.search.log
M2=$($AWKEXE -f wGRR.awk -v MINP=1 -v MAXP=10 -v OUT=$OUT -v MEM=1 $OUT.testrun.allpairs.txt $OUT.testrun.prt $OUT.testrun.m8)
REQMEM=$(bc -l <<< $(numfmt --from=iec $M2)*($ARRAYSIZE*0.15) | numfmt --to=iec | $AWKEXE '{U=$0;gsub(/[^A-Za-z]/,"",U);V=$0;gsub(/[A-Za-z]+$/,"",V);split(V,a,".");n=split(a[1],b,"");c=b[1]+1;i=1;while(++i<=n){c=c"0"}print c""U}')
REQMEMT=$(bc -l <<< $(numfmt --from=iec $REQMEM)*$THREADS | numfmt --to=iec | $AWKEXE '{U=$0;gsub(/[^A-Za-z]/,"",U);V=$0;gsub(/[A-Za-z]+$/,"",V);split(V,a,".");n=split(a[1],b,"");c=b[1]+1;i=1;while(++i<=n){c=c"0"}print c""U}')
if [[ $TESTRUN == 1 && $BATCHFLAG == 0 ]] ; then
printf "%-10s -- %s\n" "[INFO]" "With the current -a parameter (${ARRAYSIZE}) ${NJOBS} workers are required" | tee -a ${OUT}.wgrr.log
printf "%-10s -- %s\n" "[INFO]" "Each worker will require approximately $REQMEM" | tee -a ${OUT}.wgrr.log
printf "%-10s -- %s\n" "[INFO]" "With the number of threads you specified (-t $THREADS) you will need a total of $REQMEMT" | tee -a ${OUT}.wgrr.log
printf "%-10s -- %s\n" "[INFO]" "Testrun is done. Now cleaning" | tee -a ${OUT}.wgrr.log
elif [[ $BATCHFLAG == 1 ]] ; then
printf "%-10s -- %s\n" "[INFO]" "$REQMEM required per worker" | tee -a ${OUT}.wgrr.log
fi
rm $OUT.testrun.*
if [[ $TESTRUN == 1 && $BATCHFLAG == 0 ]] ; then
printf "%-10s -- %s\n" "[INFO]" "Done." | tee -a ${OUT}.wgrr.log
duration=$SECONDS
printf "%-10s -- %s\n" "[ELAPSED]" $(textifyDuration $duration) | tee -a ${OUT}.wgrr.log
exit 0
fi
fi
if [[ -f $OUT.m8 ]] ; then if [[ -f $OUT.m8 ]] ; then
printf "%-10s -- %s %s%s\n" "[INFO]" "Using existing MMseqs output file" $OUT ".m8" | tee -a ${OUT}.wgrr.log printf "%-10s -- %s %s%s\n" "[INFO]" "Using existing MMseqs output file" $OUT ".m8" | tee -a ${OUT}.wgrr.log
elif [[ $TESTRUN == 0 ]] ; then else
MIDENT=$($AWKEXE '!/^>/{a[$0]++}END{for(i in a){if(a[i]>m){m=a[i]}}print m}' $PRT) MIDENT=$($AWKEXE '!/^>/{a[$0]++}END{for(i in a){if(a[i]>m){m=a[i]}}print m}' $PRT)
printf "%-10s -- %s\n" "[ELAPSED]" $(textifyDuration $duration) | tee -a ${OUT}.wgrr.log printf "%-10s -- %s\n" "[ELAPSED]" $(textifyDuration $duration) | tee -a ${OUT}.wgrr.log
if [[ $((MIDENT*2)) -gt "$MMS_DEF_MAX_SEQS" ]] ; then if [[ $((MIDENT*2)) -gt "$MMS_DEF_MAX_SEQS" ]] ; then
MMS_MAX_SEQ_PARAM=($(echo $((MIDENT*2)) | awk '{c=substr($1,2);gsub(/[0-9]/,0,c);print "--max-seqs "substr($1,1,1)+1""c}')) MMS_MAX_SEQ_PARAM=($(echo $((MIDENT*2)) | $AWKEXE '{c=substr($1,2);gsub(/[0-9]/,0,c);print "--max-seqs "substr($1,1,1)+1""c}'))
printf "%-10s -- %s %s\n" "[INFO]" "Setting MMseqs parameter" "$MMS_MAX_SEQ_PARAM" | tee -a ${OUT}.wgrr.log printf "%-10s -- %s %s\n" "[INFO]" "Setting MMseqs parameter" "$MMS_MAX_SEQ_PARAM" | tee -a ${OUT}.wgrr.log
fi fi
if [[ $BATCHFLAG == 1 ]] ; then if [[ $BATCHFLAG == 1 ]] ; then
...@@ -247,7 +305,7 @@ elif [[ $TESTRUN == 0 ]] ; then ...@@ -247,7 +305,7 @@ elif [[ $TESTRUN == 0 ]] ; then
printf "%-10s -- %s\n" "[INFO]" "The command is:" >> ${OUT}.wgrr.log printf "%-10s -- %s\n" "[INFO]" "The command is:" >> ${OUT}.wgrr.log
printf "%-10s -- %s\n" "[INFO]" "sbatch --wait --parsable -o ${OUT}.mmseqs.log -p ${PARTITION} -c ${THREADS} -J \"wGRR_MMSeqs\" --wrap=\"${MMSEQS} easy-search ${PRT} ${PRT} ${OUT}.m8 ${tmp} --threads ${THREADS} --format-output \"query,target,qcov,tcov,fident,evalue,bits\" --add-self-matches ${MMS_MAX_SEQ_PARAM}" >> ${OUT}.wgrr.log printf "%-10s -- %s\n" "[INFO]" "sbatch --wait --parsable -o ${OUT}.mmseqs.log -p ${PARTITION} -c ${THREADS} -J \"wGRR_MMSeqs\" --wrap=\"${MMSEQS} easy-search ${PRT} ${PRT} ${OUT}.m8 ${tmp} --threads ${THREADS} --format-output \"query,target,qcov,tcov,fident,evalue,bits\" --add-self-matches ${MMS_MAX_SEQ_PARAM}" >> ${OUT}.wgrr.log
JID=$(sbatch --wait --parsable -o "$OUT".mmseqs.log -p $PARTITION -c $THREADS -J "wGRR_MMSeqs" --wrap="$MMSEQS easy-search $PRT $PRT ${OUT}.m8 ${tmp} --threads $THREADS --format-output \"query,target,qcov,tcov,fident,evalue,bits\" --add-self-matches $MMS_MAX_SEQ_PARAM") JID=$(sbatch --wait --parsable -o "$OUT".mmseqs.log -p $PARTITION -c $THREADS -J "wGRR_MMSeqs" --wrap="$MMSEQS easy-search $PRT $PRT ${OUT}.m8 ${tmp} --threads $THREADS --format-output \"query,target,qcov,tcov,fident,evalue,bits\" --add-self-matches $MMS_MAX_SEQ_PARAM")
PQT=$(sacct -X -j $JID -o Reserved -n | awk '{n=split($1,a,"-");if(n>1){t=t+a[1]*86400};split(a[n],b,":");t=t+b[1]*3600+b[2]*60+b[3];print t}') PQT=$(sacct -X -j $JID -o Reserved -n | $AWKEXE '{n=split($1,a,"-");if(n>1){t=t+a[1]*86400};split(a[n],b,":");t=t+b[1]*3600+b[2]*60+b[3];print t}')
printf "%-10s -- %s %s %s %s %s\n" "[INFO]" "The job" $JID "has been" $(textifyDuration $PQT) "in queue" | tee -a ${OUT}.wgrr.log printf "%-10s -- %s %s %s %s %s\n" "[INFO]" "The job" $JID "has been" $(textifyDuration $PQT) "in queue" | tee -a ${OUT}.wgrr.log
QT=$((QT+PQT)) QT=$((QT+PQT))
else else
...@@ -258,38 +316,11 @@ elif [[ $TESTRUN == 0 ]] ; then ...@@ -258,38 +316,11 @@ elif [[ $TESTRUN == 0 ]] ; then
fi fi
fi fi
if [[ $TESTRUN == 0 ]] && [[ ! -f $OUT.m8 ]] ; then if [[ ! -f $OUT.m8 ]] ; then
printf "%-10s -- %s\n" "[ERROR]" "Something went wrong during the MMseqs step." | tee -a ${OUT}.wgrr.log printf "%-10s -- %s\n" "[ERROR]" "Something went wrong during the MMseqs step." | tee -a ${OUT}.wgrr.log
exit 1 exit 1
fi fi
duration=$SECONDS
if [[ -f $OUT.allpairs.txt ]] ; then
printf "%-10s -- %s %s%s\n" "[INFO]" "Using existing file" $OUT ".allpairs.txt" | tee -a ${OUT}.wgrr.log
else
printf "%-10s -- %s\n" "[ELAPSED]" $(textifyDuration $duration) | tee -a ${OUT}.wgrr.log
printf "%-10s -- %s\n" "[INFO]" "Writing genomes pairs" | tee -a ${OUT}.wgrr.log
$AWKEXE 'BEGIN{x=1}/^>/{g=substr($1,2);gsub(/_[^_]+$/,"",g);if(FNR==1){a[x]=g;++x;currg=g;next}if(g!=currg){a[x]=g;x++;currg=g}}END{i=0;while(++i in a){j=i;while(++j in a){print a[i]"\t"a[j]}}}' $PRT > $OUT.allpairs.txt
printf "%-10s -- %s\n" "[INFO]" "$(wc -l $OUT.allpairs.txt | awk '{print $1}') pairs written" | tee -a ${OUT}.wgrr.log
fi
NPAIRS=$(wc -l $OUT.allpairs.txt | $AWKEXE '{print $1}')
if [[ ! -f $OUT.allpairs.txt ]] || [[ $NPAIRS < 1 ]] ; then
printf "%-10s -- %s\n" "[ERROR]" "An error occurred when writing the $OUT.allpairs.txt file." | tee -a ${OUT}.wgrr.log
exit 1
fi
NJOBS=$(( (NPAIRS+ARRAYSIZE-1)/ARRAYSIZE))
if [[ $TESTRUN == 1 ]] ; then
printf "%-10s -- %s\n" "[INFO]" "With the current -a parameter (${ARRAYSIZE}) ${NJOBS} workers are required" | tee -a ${OUT}.wgrr.log
rm -rf "$OUT".logs
printf "%-10s -- %s\n" "[INFO]" "Done." | tee -a ${OUT}.wgrr.log
duration=$SECONDS
printf "%-10s -- %s\n" "[ELAPSED]" $(textifyDuration $duration) | tee -a ${OUT}.wgrr.log
exit 0
fi
duration=$SECONDS duration=$SECONDS
if [[ $BATCHFLAG == 0 ]] ; then if [[ $BATCHFLAG == 0 ]] ; then
if [[ $NJOBS -le 1 ]] ; then if [[ $NJOBS -le 1 ]] ; then
...@@ -318,11 +349,6 @@ if [[ $BATCHFLAG == 0 ]] ; then ...@@ -318,11 +349,6 @@ if [[ $BATCHFLAG == 0 ]] ; then
fi fi
else else
printf "%-10s -- %s\n" "[ELAPSED]" $(textifyDuration $duration) | tee -a ${OUT}.wgrr.log printf "%-10s -- %s\n" "[ELAPSED]" $(textifyDuration $duration) | tee -a ${OUT}.wgrr.log
REQMEM=""
printf "%-10s -- %s\n" "[INFO]" "Estimating required memory" | tee -a ${OUT}.wgrr.log
M1=$(awk -f wGRR.awk -v MINP=1 -v MAXP=10 -v OUT=$OUT -v MEM=1 $OUT.allpairs.txt $PRT $OUT.m8)
REQMEM=$(bc -l <<< $(numfmt --from=iec $M1)*($ARRAYSIZE/10) | numfmt --to=iec | awk '{U=$0;gsub(/[^A-Za-z]/,"",U);V=$0;gsub(/[A-Za-z]+$/,"",V);split(V,a,".");n=split(a[1],b,"");c=b[1]+1;i=1;while(++i<=n){c=c"0"}print c""U}')
printf "%-10s -- %s\n" "[INFO]" "$REQMEM per job required" | tee -a ${OUT}.wgrr.log
duration=$SECONDS duration=$SECONDS
printf "%-10s -- %s\n" "[INFO]" "Submitting $NJOBS jobs for wGRR calculation" | tee -a ${OUT}.wgrr.log printf "%-10s -- %s\n" "[INFO]" "Submitting $NJOBS jobs for wGRR calculation" | tee -a ${OUT}.wgrr.log
if [[ $MAXJOBS -gt 0 ]] && [[ $MAXJOBS -lt $NJOBS ]] ; then if [[ $MAXJOBS -gt 0 ]] && [[ $MAXJOBS -lt $NJOBS ]] ; then
...@@ -334,6 +360,10 @@ else ...@@ -334,6 +360,10 @@ else
if [[ $FAST == 1 ]] ; then if [[ $FAST == 1 ]] ; then
PARTITION=("common,dedicated" "-q" "fast") PARTITION=("common,dedicated" "-q" "fast")
fi fi
if [[ $REQMEM == "" ]] ; then
printf "%-10s -- %s\n" "[ERROR]" "Something went wrong when estimating the required memory." | tee -a ${OUT}.wgrr.log
exit 1
fi
JID=$(sbatch --parsable --wait -p ${PARTITION} --array="$JOBARRAY" -c 1 -J "wGRR_worker" --mem=$REQMEM --wrap="./wGRR_worker.zsh $ARRAYSIZE $OUT $NJOBS 1 $PRT $tmp") JID=$(sbatch --parsable --wait -p ${PARTITION} --array="$JOBARRAY" -c 1 -J "wGRR_worker" --mem=$REQMEM --wrap="./wGRR_worker.zsh $ARRAYSIZE $OUT $NJOBS 1 $PRT $tmp")
if [[ `sacct -j $JID | grep "TIMEOUT"` ]] ; then if [[ `sacct -j $JID | grep "TIMEOUT"` ]] ; then
printf "%-10s -- %s\n" "[ERROR]" "Some workers encountered a TIMEOUT." | tee -a ${OUT}.wgrr.log printf "%-10s -- %s\n" "[ERROR]" "Some workers encountered a TIMEOUT." | tee -a ${OUT}.wgrr.log
...@@ -343,7 +373,7 @@ else ...@@ -343,7 +373,7 @@ else
mv $tmp/"$OUT".wgrr_part.* "$OUT".wgrr_part/ mv $tmp/"$OUT".wgrr_part.* "$OUT".wgrr_part/
exit 1 exit 1
fi fi
PQT=$(sacct -X -j $JID -o Reserved -n | awk 'NR==1{prevt=0}{t=0;n=split($1,a,"-");if(n>1){t=t+a[1]*86400};split(a[n],b,":");t=t+b[1]*3600+b[2]*60+b[3];if(t<prevt){tt=tt+prevt}prevt=t}END{print tt+t}') PQT=$(sacct -X -j $JID -o Reserved -n | $AWKEXE 'NR==1{prevt=0}{t=0;n=split($1,a,"-");if(n>1){t=t+a[1]*86400};split(a[n],b,":");t=t+b[1]*3600+b[2]*60+b[3];if(t<prevt){tt=tt+prevt}prevt=t}END{print tt+t}')
printf "%-10s -- %s %s %s %s %s\n" "[INFO]" "The job" $JID "has been" $(textifyDuration $PQT) "in queue" | tee -a ${OUT}.wgrr.log printf "%-10s -- %s %s %s %s %s\n" "[INFO]" "The job" $JID "has been" $(textifyDuration $PQT) "in queue" | tee -a ${OUT}.wgrr.log
QT=$((QT+PQT)) QT=$((QT+PQT))
mkdir -p "$OUT".logs mkdir -p "$OUT".logs
...@@ -364,7 +394,7 @@ if [[ ! -f "$OUT".wgrr.txt ]] ; then ...@@ -364,7 +394,7 @@ if [[ ! -f "$OUT".wgrr.txt ]] ; then
exit 1 exit 1
fi fi
NLINES=$(wc -l "$OUT".wgrr.txt | awk '{print $1}') NLINES=$(wc -l "$OUT".wgrr.txt | $AWKEXE '{print $1}')
NWGRR=$((STATS[1]*STATS[1]+1)) NWGRR=$((STATS[1]*STATS[1]+1))
if [[ $NLINES -lt $NWGRR ]] ; then if [[ $NLINES -lt $NWGRR ]] ; then
printf "%-10s -- %s\n" "[ERROR]" "An error occurred during wGRR calculation:" | tee -a ${OUT}.wgrr.log printf "%-10s -- %s\n" "[ERROR]" "An error occurred during wGRR calculation:" | tee -a ${OUT}.wgrr.log
......
wgrr.png 0 → 100755
wgrr.png

550 KiB

0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment