diff --git a/README.md b/README.md index 435c6440a60c906f82efa6ed31e5ed805475e117..bc02db7dc945d82a19ac885a2e0b6ec9f143fcd8 100644 --- a/README.md +++ b/README.md @@ -98,5 +98,6 @@ wGRR3 also uses the protein families. But this time, if two BBH pairs are found Common3 is the number of protein families containing at least one BBH divided by the denominator of wGRR3. + [1]: https://mmseqs.com/ diff --git a/wGRR b/wGRR index cb61446642b99b78ece55eaaed49ece831113f82..622d08e674a57bca590704cba83c4934e48f75fb 100755 --- a/wGRR +++ b/wGRR @@ -7,7 +7,7 @@ trap 'rm -rf "$tmp"' EXIT export LC_ALL=C SECONDS=0 -readonly VERSION=0.8 +readonly VERSION=1.0 bold=$(tput bold) normal=$(tput sgr0) @@ -233,13 +233,71 @@ if [[ $STATS[3] -eq 1 ]]; then fi duration=$SECONDS +if [[ -f $OUT.allpairs.txt ]] ; then + printf "%-10s -- %s %s%s\n" "[INFO]" "Using existing file" $OUT ".allpairs.txt" | tee -a ${OUT}.wgrr.log +else + printf "%-10s -- %s\n" "[ELAPSED]" $(textifyDuration $duration) | tee -a ${OUT}.wgrr.log + printf "%-10s -- %s\n" "[INFO]" "Writing genomes pairs" | tee -a ${OUT}.wgrr.log + $AWKEXE 'BEGIN{x=1}/^>/{g=substr($1,2);gsub(/_[^_]+$/,"",g);if(FNR==1){a[x]=g;++x;currg=g;next}if(g!=currg){a[x]=g;x++;currg=g}}END{i=0;while(++i in a){j=i;while(++j in a){print a[i]"\t"a[j]}}}' $PRT > $OUT.allpairs.txt + printf "%-10s -- %s\n" "[INFO]" "$(wc -l $OUT.allpairs.txt | $AWKEXE '{print $1}') pairs written" | tee -a ${OUT}.wgrr.log +fi + +NPAIRS=$(wc -l $OUT.allpairs.txt | $AWKEXE '{print $1}') +if [[ ! -f $OUT.allpairs.txt ]] || [[ $NPAIRS < 1 ]] ; then + printf "%-10s -- %s\n" "[ERROR]" "An error occurred when writing the $OUT.allpairs.txt file." | tee -a ${OUT}.wgrr.log + exit 1 +fi + +NJOBS=$(( (NPAIRS+ARRAYSIZE-1)/ARRAYSIZE)) + +duration=$SECONDS +if [[ $BATCHFLAG == 1 || $TESTRUN == 1 ]] ; then + if [[ $BATCHFLAG == 1 && $TESTRUN == 1 ]] ; then + printf "%-10s -- %s\n" "[WARNING]" "Test run (option -T) is not available with sbatch runs" | tee -a ${OUT}.wgrr.log + printf "%-10s -- %s\n" "[WARNING]" "The run will continue ignoring the -T option" | tee -a ${OUT}.wgrr.log + printf "%-10s -- %s\n" "[INFO]" "Estimating required memory" | tee -a ${OUT}.wgrr.log + elif [[ $TESTRUN == 1 ]] ; then + printf "%-10s -- %s\n" "[INFO]" "This is a test run (option -T)" | tee -a ${OUT}.wgrr.log + elif [[ $BATCHFLAG == 1 ]] ; then + printf "%-10s -- %s\n" "[INFO]" "Estimating required memory" | tee -a ${OUT}.wgrr.log + fi + if [[ $STATS[1] -gt 5 ]] ; then + $AWKEXE '/^>/{g=$1;gsub(/_[0-9]+$/,"",g);a[g]++;if(length(a)>5){exit};print;getline;print}' $PRT > $OUT.testrun.prt + else + cp $PRT $OUT.testrun.prt + fi + $AWKEXE 'BEGIN{x=1}/^>/{g=substr($1,2);gsub(/_[^_]+$/,"",g);if(FNR==1){a[x]=g;++x;currg=g;next}if(g!=currg){a[x]=g;x++;currg=g}}END{i=0;while(++i in a){j=i;while(++j in a){print a[i]"\t"a[j]}}}' $OUT.testrun.prt > $OUT.testrun.allpairs.txt + if [[ $TESTRUN == 1 && $BATCHFLAG == 0 ]] ; then + printf "%-10s -- %s\n" "[INFO]" "Running MMseqs on a sample file" | tee -a ${OUT}.wgrr.log + fi + $MMSEQS easy-search $OUT.testrun.prt $OUT.testrun.prt $OUT.testrun.m8 $tmp --threads $THREADS --format-output "query,target,qcov,tcov,fident,evalue,bits" --add-self-matches > $OUT.testrun.mmseqs.search.log + M2=$($AWKEXE -f wGRR.awk -v MINP=1 -v MAXP=10 -v OUT=$OUT -v MEM=1 $OUT.testrun.allpairs.txt $OUT.testrun.prt $OUT.testrun.m8) + REQMEM=$(bc -l <<< $(numfmt --from=iec $M2)*($ARRAYSIZE*0.15) | numfmt --to=iec | $AWKEXE '{U=$0;gsub(/[^A-Za-z]/,"",U);V=$0;gsub(/[A-Za-z]+$/,"",V);split(V,a,".");n=split(a[1],b,"");c=b[1]+1;i=1;while(++i<=n){c=c"0"}print c""U}') + REQMEMT=$(bc -l <<< $(numfmt --from=iec $REQMEM)*$THREADS | numfmt --to=iec | $AWKEXE '{U=$0;gsub(/[^A-Za-z]/,"",U);V=$0;gsub(/[A-Za-z]+$/,"",V);split(V,a,".");n=split(a[1],b,"");c=b[1]+1;i=1;while(++i<=n){c=c"0"}print c""U}') + if [[ $TESTRUN == 1 && $BATCHFLAG == 0 ]] ; then + printf "%-10s -- %s\n" "[INFO]" "With the current -a parameter (${ARRAYSIZE}) ${NJOBS} workers are required" | tee -a ${OUT}.wgrr.log + printf "%-10s -- %s\n" "[INFO]" "Each worker will require approximately $REQMEM" | tee -a ${OUT}.wgrr.log + printf "%-10s -- %s\n" "[INFO]" "With the number of threads you specified (-t $THREADS) you will need a total of $REQMEMT" | tee -a ${OUT}.wgrr.log + printf "%-10s -- %s\n" "[INFO]" "Testrun is done. Now cleaning" | tee -a ${OUT}.wgrr.log + elif [[ $BATCHFLAG == 1 ]] ; then + printf "%-10s -- %s\n" "[INFO]" "$REQMEM required per worker" | tee -a ${OUT}.wgrr.log + fi + rm $OUT.testrun.* + if [[ $TESTRUN == 1 && $BATCHFLAG == 0 ]] ; then + printf "%-10s -- %s\n" "[INFO]" "Done." | tee -a ${OUT}.wgrr.log + duration=$SECONDS + printf "%-10s -- %s\n" "[ELAPSED]" $(textifyDuration $duration) | tee -a ${OUT}.wgrr.log + exit 0 + fi +fi + if [[ -f $OUT.m8 ]] ; then printf "%-10s -- %s %s%s\n" "[INFO]" "Using existing MMseqs output file" $OUT ".m8" | tee -a ${OUT}.wgrr.log -elif [[ $TESTRUN == 0 ]] ; then +else MIDENT=$($AWKEXE '!/^>/{a[$0]++}END{for(i in a){if(a[i]>m){m=a[i]}}print m}' $PRT) printf "%-10s -- %s\n" "[ELAPSED]" $(textifyDuration $duration) | tee -a ${OUT}.wgrr.log if [[ $((MIDENT*2)) -gt "$MMS_DEF_MAX_SEQS" ]] ; then - MMS_MAX_SEQ_PARAM=($(echo $((MIDENT*2)) | awk '{c=substr($1,2);gsub(/[0-9]/,0,c);print "--max-seqs "substr($1,1,1)+1""c}')) + MMS_MAX_SEQ_PARAM=($(echo $((MIDENT*2)) | $AWKEXE '{c=substr($1,2);gsub(/[0-9]/,0,c);print "--max-seqs "substr($1,1,1)+1""c}')) printf "%-10s -- %s %s\n" "[INFO]" "Setting MMseqs parameter" "$MMS_MAX_SEQ_PARAM" | tee -a ${OUT}.wgrr.log fi if [[ $BATCHFLAG == 1 ]] ; then @@ -247,7 +305,7 @@ elif [[ $TESTRUN == 0 ]] ; then printf "%-10s -- %s\n" "[INFO]" "The command is:" >> ${OUT}.wgrr.log printf "%-10s -- %s\n" "[INFO]" "sbatch --wait --parsable -o ${OUT}.mmseqs.log -p ${PARTITION} -c ${THREADS} -J \"wGRR_MMSeqs\" --wrap=\"${MMSEQS} easy-search ${PRT} ${PRT} ${OUT}.m8 ${tmp} --threads ${THREADS} --format-output \"query,target,qcov,tcov,fident,evalue,bits\" --add-self-matches ${MMS_MAX_SEQ_PARAM}" >> ${OUT}.wgrr.log JID=$(sbatch --wait --parsable -o "$OUT".mmseqs.log -p $PARTITION -c $THREADS -J "wGRR_MMSeqs" --wrap="$MMSEQS easy-search $PRT $PRT ${OUT}.m8 ${tmp} --threads $THREADS --format-output \"query,target,qcov,tcov,fident,evalue,bits\" --add-self-matches $MMS_MAX_SEQ_PARAM") - PQT=$(sacct -X -j $JID -o Reserved -n | awk '{n=split($1,a,"-");if(n>1){t=t+a[1]*86400};split(a[n],b,":");t=t+b[1]*3600+b[2]*60+b[3];print t}') + PQT=$(sacct -X -j $JID -o Reserved -n | $AWKEXE '{n=split($1,a,"-");if(n>1){t=t+a[1]*86400};split(a[n],b,":");t=t+b[1]*3600+b[2]*60+b[3];print t}') printf "%-10s -- %s %s %s %s %s\n" "[INFO]" "The job" $JID "has been" $(textifyDuration $PQT) "in queue" | tee -a ${OUT}.wgrr.log QT=$((QT+PQT)) else @@ -258,38 +316,11 @@ elif [[ $TESTRUN == 0 ]] ; then fi fi -if [[ $TESTRUN == 0 ]] && [[ ! -f $OUT.m8 ]] ; then +if [[ ! -f $OUT.m8 ]] ; then printf "%-10s -- %s\n" "[ERROR]" "Something went wrong during the MMseqs step." | tee -a ${OUT}.wgrr.log exit 1 fi -duration=$SECONDS -if [[ -f $OUT.allpairs.txt ]] ; then - printf "%-10s -- %s %s%s\n" "[INFO]" "Using existing file" $OUT ".allpairs.txt" | tee -a ${OUT}.wgrr.log -else - printf "%-10s -- %s\n" "[ELAPSED]" $(textifyDuration $duration) | tee -a ${OUT}.wgrr.log - printf "%-10s -- %s\n" "[INFO]" "Writing genomes pairs" | tee -a ${OUT}.wgrr.log - $AWKEXE 'BEGIN{x=1}/^>/{g=substr($1,2);gsub(/_[^_]+$/,"",g);if(FNR==1){a[x]=g;++x;currg=g;next}if(g!=currg){a[x]=g;x++;currg=g}}END{i=0;while(++i in a){j=i;while(++j in a){print a[i]"\t"a[j]}}}' $PRT > $OUT.allpairs.txt - printf "%-10s -- %s\n" "[INFO]" "$(wc -l $OUT.allpairs.txt | awk '{print $1}') pairs written" | tee -a ${OUT}.wgrr.log -fi - -NPAIRS=$(wc -l $OUT.allpairs.txt | $AWKEXE '{print $1}') -if [[ ! -f $OUT.allpairs.txt ]] || [[ $NPAIRS < 1 ]] ; then - printf "%-10s -- %s\n" "[ERROR]" "An error occurred when writing the $OUT.allpairs.txt file." | tee -a ${OUT}.wgrr.log - exit 1 -fi - -NJOBS=$(( (NPAIRS+ARRAYSIZE-1)/ARRAYSIZE)) - -if [[ $TESTRUN == 1 ]] ; then - printf "%-10s -- %s\n" "[INFO]" "With the current -a parameter (${ARRAYSIZE}) ${NJOBS} workers are required" | tee -a ${OUT}.wgrr.log - rm -rf "$OUT".logs - printf "%-10s -- %s\n" "[INFO]" "Done." | tee -a ${OUT}.wgrr.log - duration=$SECONDS - printf "%-10s -- %s\n" "[ELAPSED]" $(textifyDuration $duration) | tee -a ${OUT}.wgrr.log - exit 0 -fi - duration=$SECONDS if [[ $BATCHFLAG == 0 ]] ; then if [[ $NJOBS -le 1 ]] ; then @@ -318,11 +349,6 @@ if [[ $BATCHFLAG == 0 ]] ; then fi else printf "%-10s -- %s\n" "[ELAPSED]" $(textifyDuration $duration) | tee -a ${OUT}.wgrr.log - REQMEM="" - printf "%-10s -- %s\n" "[INFO]" "Estimating required memory" | tee -a ${OUT}.wgrr.log - M1=$(awk -f wGRR.awk -v MINP=1 -v MAXP=10 -v OUT=$OUT -v MEM=1 $OUT.allpairs.txt $PRT $OUT.m8) - REQMEM=$(bc -l <<< $(numfmt --from=iec $M1)*($ARRAYSIZE/10) | numfmt --to=iec | awk '{U=$0;gsub(/[^A-Za-z]/,"",U);V=$0;gsub(/[A-Za-z]+$/,"",V);split(V,a,".");n=split(a[1],b,"");c=b[1]+1;i=1;while(++i<=n){c=c"0"}print c""U}') - printf "%-10s -- %s\n" "[INFO]" "$REQMEM per job required" | tee -a ${OUT}.wgrr.log duration=$SECONDS printf "%-10s -- %s\n" "[INFO]" "Submitting $NJOBS jobs for wGRR calculation" | tee -a ${OUT}.wgrr.log if [[ $MAXJOBS -gt 0 ]] && [[ $MAXJOBS -lt $NJOBS ]] ; then @@ -334,6 +360,10 @@ else if [[ $FAST == 1 ]] ; then PARTITION=("common,dedicated" "-q" "fast") fi + if [[ $REQMEM == "" ]] ; then + printf "%-10s -- %s\n" "[ERROR]" "Something went wrong when estimating the required memory." | tee -a ${OUT}.wgrr.log + exit 1 + fi JID=$(sbatch --parsable --wait -p ${PARTITION} --array="$JOBARRAY" -c 1 -J "wGRR_worker" --mem=$REQMEM --wrap="./wGRR_worker.zsh $ARRAYSIZE $OUT $NJOBS 1 $PRT $tmp") if [[ `sacct -j $JID | grep "TIMEOUT"` ]] ; then printf "%-10s -- %s\n" "[ERROR]" "Some workers encountered a TIMEOUT." | tee -a ${OUT}.wgrr.log @@ -343,7 +373,7 @@ else mv $tmp/"$OUT".wgrr_part.* "$OUT".wgrr_part/ exit 1 fi - PQT=$(sacct -X -j $JID -o Reserved -n | awk 'NR==1{prevt=0}{t=0;n=split($1,a,"-");if(n>1){t=t+a[1]*86400};split(a[n],b,":");t=t+b[1]*3600+b[2]*60+b[3];if(t<prevt){tt=tt+prevt}prevt=t}END{print tt+t}') + PQT=$(sacct -X -j $JID -o Reserved -n | $AWKEXE 'NR==1{prevt=0}{t=0;n=split($1,a,"-");if(n>1){t=t+a[1]*86400};split(a[n],b,":");t=t+b[1]*3600+b[2]*60+b[3];if(t<prevt){tt=tt+prevt}prevt=t}END{print tt+t}') printf "%-10s -- %s %s %s %s %s\n" "[INFO]" "The job" $JID "has been" $(textifyDuration $PQT) "in queue" | tee -a ${OUT}.wgrr.log QT=$((QT+PQT)) mkdir -p "$OUT".logs @@ -364,7 +394,7 @@ if [[ ! -f "$OUT".wgrr.txt ]] ; then exit 1 fi -NLINES=$(wc -l "$OUT".wgrr.txt | awk '{print $1}') +NLINES=$(wc -l "$OUT".wgrr.txt | $AWKEXE '{print $1}') NWGRR=$((STATS[1]*STATS[1]+1)) if [[ $NLINES -lt $NWGRR ]] ; then printf "%-10s -- %s\n" "[ERROR]" "An error occurred during wGRR calculation:" | tee -a ${OUT}.wgrr.log diff --git a/wgrr.png b/wgrr.png new file mode 100755 index 0000000000000000000000000000000000000000..233182431cb97f1625d3c199965e9e45286dc9fa Binary files /dev/null and b/wgrr.png differ