From 3c67806c54dcaba914a978addde8d20a6e4cc3be Mon Sep 17 00:00:00 2001
From: jgugliel <julien.guglielmini@pasteur.fr>
Date: Mon, 14 Mar 2022 14:19:13 +0100
Subject: [PATCH] Check for timeout

---
 README.md |  4 +++-
 wGRR      | 10 +++++++++-
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 95ebe3c..84c22c8 100644
--- a/README.md
+++ b/README.md
@@ -26,7 +26,7 @@ chmod +x wGRR*
 ## Usage
 ### On a local machine
 ```bash
-./wGRR -f $fasta [-p $mmseqs2_path -o $output_prefix -t $threads -a $comparisons -T]
+./wGRR -i $fasta [-p $mmseqs2_path -o $output_prefix -t $threads -a $comparisons -T -f]
 ```
 
 ### On an interactive session on Maestro
@@ -47,6 +47,8 @@ sbatch -p hubbioit ./wGRR -f test_2.prt -t 30
 This will run wGRR on the file test_2.prt on the hubbioit partition. The MMseqs job will be submitted to the cluster's scheduler with 30 CPUs. Then for the actual wGRR calculation, the required amount of jobs (depending on the value passed with the `-a` option) will be submitted to the queue. If 100 jobs (1 CPU each) are necessary, a job array of 100 jobs will be submitted to the scheduler.
 You can adjust the number of maximum jobs running simultaneously (to avoid using 100% of your partition's CPUs) by using the `-m` option.
 
+If you do not have access to a dedicated partition, or if there are not enough free CPUs on your partition, you can try turning on the `-f` flag. By doing so, the wGRR workers will be submitted to the common and dedicated machines of Maestro, on the "fast" Quality of Service (QoS). Jobs running on the fast QoS have a higher priority (so the workers will start faster) but are limited to 2 hours. Also, using the `-m` parameter is less necessary because you will use a lot of different common resources. But you need to be sure that each worker will finish in less than 2 hours - otherwise the run will fail.
+
 ### Mandatory parameter
 `$fasta` is a fasta file containing all the proteins of all the elements you want to compare. The protein names **must** follow the "gembase" convention:
 ```
diff --git a/wGRR b/wGRR
index ae3a77d..97ef578 100755
--- a/wGRR
+++ b/wGRR
@@ -7,7 +7,7 @@ trap 'rm -rf "$tmp"' EXIT
 export LC_ALL=C
 SECONDS=0
 
-readonly VERSION=0.6
+readonly VERSION=0.7
 
 bold=$(tput bold)
 normal=$(tput sgr0)
@@ -334,6 +334,14 @@ else
 		PARTITION=("common,dedicated" "-q" "fast")
 	fi
 	JID=$(sbatch --parsable --wait -p ${PARTITION} --array="$JOBARRAY" -c 1 -J "wGRR_worker" --mem=$REQMEM --wrap="./wGRR_worker.zsh $ARRAYSIZE $OUT $NJOBS 1 $PRT $tmp")
+	if sacct -j "$JID" | grep -q "TIMEOUT" ; then  # the array has finished (sbatch --wait); look for a TIMEOUT entry in its accounting output
+		printf "%-10s  --  %s\n" "[ERROR]" "Some workers encountered a TIMEOUT." | tee -a "${OUT}.wgrr.log"
+		printf "%-10s  --  %s\n" "[ERROR]" "Saving the partial files in ${OUT}.wgrr_part directory." | tee -a "${OUT}.wgrr.log"
+		rm -rf "$OUT".wgrr_part  # recreate the destination directory empty
+		mkdir "$OUT".wgrr_part
+		mv "$tmp"/"$OUT".wgrr_part.* "$OUT".wgrr_part/  # rescue partial results before the EXIT trap removes "$tmp"
+		exit 1
+	fi
 	PQT=$(sacct -X -j $JID -o Reserved -n | awk 'NR==1{prevt=0}{t=0;n=split($1,a,"-");if(n>1){t=t+a[1]*86400};split(a[n],b,":");t=t+b[1]*3600+b[2]*60+b[3];if(t<prevt){tt=tt+prevt}prevt=t}END{print tt+t}')
 	printf "%-10s  --  %s %s %s %s %s\n" "[INFO]" "The job" $JID "has been" $(textifyDuration $PQT) "in queue" | tee -a ${OUT}.wgrr.log
 	QT=$((QT+PQT))
-- 
GitLab