From efcd56eb2a78c3c7a423584586f058dea0515707 Mon Sep 17 00:00:00 2001
From: jgugliel <julien.guglielmini@pasteur.fr>
Date: Tue, 21 Feb 2023 17:21:06 +0100
Subject: [PATCH] Major update.

wGRR calculations based on clustering and protein families have been removed. Instead, an option to calculate the Jaccard index has been added.
Now calculates the Sorensen-Dice index.
Improved the screen output.
Now the valid awk path is passed correctly to the worker.
Added an option to restrict the analysis to a subset of elements via a file containing a list of elements id.
Set the MMSeqs2 sensitivity parameter to 7.5
---
 README.md       |  58 ++++++-----
 wGRR            | 254 +++++++++++++++++++++++++++---------------------
 wGRR.awk        | 144 +++------------------------
 wGRR_worker.zsh |  21 ++--
 4 files changed, 204 insertions(+), 273 deletions(-)
 mode change 100755 => 100644 wGRR.awk

diff --git a/README.md b/README.md
index bb9eb29..9ca942a 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
 
 ## Description
 ### Introduction
-wGRR is a set of scripts to calculate the weighted Gene Repertoire Relatedness (wGRR) of protein sequences of a set of genetic elements (genomes, plasmids, prophages, etc.). wGRR relies on Best Bi-directional Hits (BBH) which are couples of proteins that are reciprocal best matches in a BLAST-like analysis. Once the BBH have been defined for a pair of genetic elements, we can write the wGRR as:
+`wGRR` is a set of scripts to calculate the weighted Gene Repertoire Relatedness (wGRR) index among pairs of proteins sets. wGRR relies on Best Bi-directional Hits (BBH) which are couples of proteins that are reciprocal best matches in a BLAST-like analysis. Once the BBH have been defined for two proteins sets, we can write the wGRR as
 ```math
 wGRR(A,B) = \frac{\sum_{i}{id(A_i,B_i)}}{min(P_A,P_B)}
 ```
@@ -11,15 +11,25 @@ where
 * $`id(A_i,B_i)`$ is the identity score of BBH $`i`$
 * $`P_A`$ and $`P_B`$ are the numbers of proteins of $`A`$ and $`B`$ respectively
 
-**NOTE**  
-`wGRR` calculates 3 versions of the wGRR score depending on which BBH are considered (numerator) and what proteins should be counted (denominator). See the `Output` section of this manual for more explanations.
+`wGRR` also calculates another similar index, the [Sørensen-Dice (SD) index][1]. Given the same BBH pairs, it is defined as
+```math
+SD(A,B) = \frac{2\times\sum_{i}{id(A_i,B_i)}}{P_A+P_B}
+```
+
+Finally `wGRR` optionnally calculates the [Jaccard index][2]. For this the BBH pairs are ignored, but a clustering step is added to build families for all proteins of all the input sets. Then it is defined as
+```math
+J(A,B) =  \frac{\left | A \cup B \right |}{\left | A \cap B \right |}
+```
+where $`A`$ and $`B`$ represent two sets of protein families.
 
 ### Dependencies
-BBH are defined by all versus all protein comparisons using [MMseqs2][1].
+BBH are defined by all versus all protein comparisons using [MMseqs2][3].
+
+Clustering (for the Jaccard index calculation) is performed by the linclust algorithm of MMseqs2.
 
-The scripts are written in `zsh` and `awk` and as such wGRR should run on any Unix system. A recent version of `awk` is necessary (`gawk`) is recommended.
+The scripts are written in `zsh` and `awk` and as such wGRR should run on any Unix system. A recent version of `awk` is necessary, `gawk` is recommended.
 
-wGRR can also be run on Institut Pasteur's cluster Maestro. In this case, all dependencies will be automatically loaded upon execution.
+wGRR can also be run on Institut Pasteur's cluster "Maestro". In this case, all dependencies will be automatically loaded upon execution.
 
 ## Installation
 Download the files and make them executable (if necessary):
@@ -37,10 +47,11 @@ chmod +x wGRR*.zsh
 
 ### On a local machine
 ```bash
-./wGRR -i $fasta [-p $mmseqs2_path -o $output_prefix -t $threads -a $comparisons -T -f]
+./wGRR -i $fasta [-p $mmseqs2_path -o $output_prefix -t $threads -a $comparisons -l $id_file -T -f -j -s]
 ```
 **WARNING**  
-Memory consumption can be really high and running `wGRR` might exhaust your system. It is advised to perform a test run (with the `-T` flag) first.
+Memory consumption can be really high and running `wGRR` might exhaust your system. By default, `wGRR` will estimate the required memory and stop if your system does not fulfill this requirement.  
+It is also possible to perform a test run with the `-T` flag.
 
 ### On an interactive session on Maestro
 wGRR can be used interactively on Maestro. First you need to allocate resources. For example
@@ -63,6 +74,11 @@ To avoid using 100% of your partition's CPUs you can adjust the number of maximu
 
 If you do not have access to a dedicated partition, or if there is not enough free CPUs on your partition, you can try to turn on the `-f` flag. By doing so the wGRR workers will be submitted to the common and dedicated machines of Maestro, on the "fast" QoS. Jobs running on the fast QoS have a higher priority (so the workers will start faster) but are limited to 2 hours. Also, using the `-m` parameter is not necessary because in this case you will use a lot of different common resources. But you need to be sure that each worker will end in less than 2 hours otherwise the run will fail.
 
+### Using previously generated MMSeqs2 output
+It is possible to provide a MMSeqs2 file, with conditions.
+1. The name of the file must be $prefix.m8, $prefix being the string passed via the `-o` option.
+2. The format of the file must be: query,target,qcov,tcov,fident,evalue,bits
+
 ### Mandatory parameter
 `$fasta` is a fasta file containing all the proteins of all the elements you want to compare. The protein names **must** be formatted as:
 ```
@@ -88,30 +104,28 @@ This corresponds to the protein `00141` of the element `ESCO001.0321.00001.P001`
 
 * `$comparisons` is the number of genetic elements to be compared simultaneously on each thread. A value of 1,000 means that, on each thread, groups of 1,000 genetic elements will be compared simultaneously. Increasing this number will decrease the total number of tasks to be performed but will increase RAM usage. By default, a reasonable value is automatically determined. This value should result in workers running in less than 2 hours and which is perfectly adapted to the use of the `-f` flag (see the "sbatch" section above).
 
+* `$id_file` is a file containing one `Element_id` per line. If specified, `wGRR` will run only on these elements.
+
+* Jaccard index caculation. By using the `-j` flag, `wGRR` will perform a protein clustering and calculate the Jaccard index.
+
 * test run. By using the `-T` flag, a test run is executed, which omits MMseqs and wGRR calculations, and outputs some statistics of your input file to help you set up the run properly.
 
 * If `wGRR` runs interactively (not run with `sbatch` on Maestro), the total amount of required memory will be estimated and compared to the total memory of your system. To skip this and speed up the run, you can set the `-s` flag.  
 **WARNING**: if your run is not properly configured, you can completely exhaust your system.
 
 ## Output
-wGRR produces a table with several metrics corresponding to 3 different ways of calculating the wGRR.
+wGRR produces a table with the following columns:
 
 The first two columns indicate the pair of elements for this row.
 
-wGRR1 is the easiest way of calculating the wGRR. It corresponds to the sum of the identity scores for all BBH between the two elements, divided by the smallest total number of protein of the two elements (columns RealLengthA and RealLengthB).
-
-Common1 is the proportion of common proteins, _i.e._ the number of BBH pairs divided by the mean number of proteins of the two elements.
-
-wGRR2 uses the same numerator than wGRR1. For the denominator calculation, an extra single linkage clustering step is perfomed to build protein families. If a protein belongs to a family containing a BBH but this protein is not part of a BBH, then it is not counted in the denominator, resulting sometimes in a greater wGRR. This results in a new number of proteins (nProt2) per element, and the denominator is the smallest nProt2.  
-wGRR2 is especially interesting when lots of paralogs are expected - the presence of paralogs will artificially lower the wGRR1.
-
-Common2 is the number of BBH pairs divided by the mean nProt2 of two elements.
-
-wGRR3 also uses the protein families. But this time, if two BBH pairs are found in the same family, only one will be counted (the one with the highest identity score) for the numerator. nProt3 is the number of protein families + the number of proteins that are not part of a family. The denominator is the smallest nProt3.
+The third column contain the wGRR.
 
-Common3 is the number of protein families containing at least one BBH divided by the mean nProt3 of two elements.
+The fourth column contain the SD.
 
-![wgrr illustration](https://uc52613d33249883095daf20ba85.previews.dropboxusercontent.com/p/thumb/ABxsbX56g7gu9O6aHG_bcmjSvGPaCPrGehK9lNcYPlCYgyVns25GgzDvh5OaI88YVEu8C1HchxeBL6Ki6HnHofyEgTRkH9FLN0N1GtTsPxh_nQsbW-10DmDFhEk0tVGaguC52NNSjZJrRCTGHXILp7hn4DE4kvair4S437l4jFOpPgJom_8fsgYiM_2IGQUw1jr-QqvD1WY4xoHgUKprTpOPbb8Xe7lQ8PdXDfF-5zehEmKf2z4EKi7Al_6xKFg4ZsgfdgSSDpHcoQ-UzNWWflhSnPj5kiocSR96BIRjnL1H-oxovYXVHZT8RloLvWikxROuA3LffOyQKCBt5vf68pp-81tiRo1Z0qfUTJHNt4OcmWUIzWWLFG7-pmjNOtBYSmsxNMNe0fAKmrDNa8TbdqkRPasn2k7bT6IyCbn-IPojwA/p.png)
+The fifth column contain the proportion of common proteins, _i.e._ the number of BBH pairs divided by the mean number of proteins of the two elements.
 
+The last two columns contain the number of proteins of the two elements.
 
-[1]: https://mmseqs.com/
+[1]: https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient
+[2]: https://en.wikipedia.org/wiki/Jaccard_index
+[3]: https://mmseqs.com/
diff --git a/wGRR b/wGRR
index efa7339..22c7474 100755
--- a/wGRR
+++ b/wGRR
@@ -7,9 +7,13 @@ trap 'rm -rf "$tmp"' EXIT
 export LC_ALL=C
 SECONDS=0
 
-readonly VERSION=1.1
+readonly VERSION=1.2a
 
 bold=$(tput bold)
+green=$(tput setaf 2)
+yellow=$(tput setaf 3)
+red=$(tput setaf 1)
+
 normal=$(tput sgr0)
 
 ALL_ARGS=($@)
@@ -44,6 +48,11 @@ display_usage() {
 	echo "   -m <integer>      Max number of simulteaneous tasks."
 	echo "                        Only applicable to Maestro, for the wGRR calculation step."
 	echo "                        default: not set"
+	echo "   -l <file>         Restrict the analysis to the elements listed in the provided file."
+	echo "                        The file must contain one element ID per line."
+	echo "                        default: not set"
+	echo "   -j                Compute Jaccard index."
+	echo "                        default: off"
 	echo "   -T                Test run. Useful to get stats on the input file and correctly set the -a parameter."
 	echo "                        default: off"
 	echo "   -s                Skip memory check."
@@ -78,8 +87,7 @@ fi
 
 ## Functions
 textifyDuration() {
-   local duration=$1
-   local shiff=$duration
+   local shiff=$1
    local secs
    secs=$((shiff % 60));  shiff=$((shiff / 60));
    local mins
@@ -105,6 +113,7 @@ MMPATH="N.O.P.A.T.H"      ## -p
 OUT="N.O.O.U.T"           ## -o
 THREADS=1                 ## -t
 ARRAYSIZE="AUTO"          ## -a
+IDLIST="N.O.L.I.S.T"		  ## -l
 BATCHFLAG=0               ## Are we in a sbatch job?
 QT=0                      ## Queing time (for Maestro)
 MAXJOBS=0                 ## -m
@@ -114,21 +123,24 @@ MMS_MAX_SEQ_PARAM=""
 TESTRUN=0                 ## -T
 FAST=0                    ## -f
 SKIP=0                    ## -s
+JACCARD=0                 ## -j
 
 ## catch option values
-while getopts :fTsi:p:o:t:a:m: option ; do
+while getopts :fTsji:p:o:t:a:m:l: option ; do
    case $option in
-   i) PRT="$OPTARG";     if [ ! -f $PRT ]; then echo "[ERROR]     --  fasta file '$PRT' not found (option -f)." ; exit 1 ; fi ;;
+   i) PRT="$OPTARG";       if [[ ! -s $PRT ]]; then printf "${red}%-17s  --  %s\n${normal}"  "[ERROR]" "Fasta file '$PRT' not found or empty (option -i)." ; exit 1 ; fi ;;
    p) MMPATH="$OPTARG"          ;;
-   o) OUT="$OPTARG";     if [ -f "$OUT".wgrr.final.txt ]; then echo "[ERROR]     --  file $OUT.wgrr.final.txt already exists. Change option -o or remove this file and relaunch." ; exit 1 ; fi ;;
-	t) THREADS="$OPTARG"; if [[ ! $THREADS =~ ^[0-9]+$ ]]; then echo "[ERROR]     --  number of threads $THREADS must be an integer (option -t)." ; exit 1 ; fi ;;
-	a) ARRAYSIZE="$OPTARG"; if [[ ! $ARRAYSIZE =~ ^[0-9]+$ ]]; then if [[ $ARRAYSIZE != "AUTO" ]] ; then echo "[ERROR]     --  number of genomes comparisons $ARRAYSIZE must be an integer (option -a)." ; exit 1 ; fi ; fi ;;
-	m) MAXJOBS="$OPTARG"; if [[ ! $MAXJOBS =~ ^[0-9]+$ ]]; then echo "[ERROR]     --  max number of simulteaneous jobs $MAXJOBS must be an integer (option -m)." ; exit 1 ; fi ;;
+   o) OUT="$OPTARG";       if [[ -f "$OUT".wgrr.final.txt ]]; then printf "${red}%-17s  --  %s\n${normal}"  "[ERROR]" "File $OUT.wgrr.final.txt already exists. Change option -o or remove this file and relaunch." ; exit 1 ; fi ;;
+	t) THREADS="$OPTARG";   if [[ ! $THREADS =~ ^[0-9]+$ ]]; then printf "${red}%-17s  --  %s\n${normal}"  "[ERROR]" "Number of threads $THREADS must be an integer (option -t)." ; exit 1 ; fi ;;
+	a) ARRAYSIZE="$OPTARG"; if [[ ! $ARRAYSIZE =~ ^[0-9]+$ ]]; then if [[ $ARRAYSIZE != "AUTO" ]] ; then printf "${red}%-17s  --  %s\n${normal}" "[ERROR]" "Number of genomes comparisons $ARRAYSIZE must be an integer (option -a)." ; exit 1 ; fi ; fi ;;
+	l) IDLIST="$OPTARG";    if [[ ! -s ${IDLIST} ]] ; then printf "${red}%-17s  --  %s\n${normal}" "[ERROR]" "File ${IDLIST} not found or empty (option -l)." ; exit 1 ; fi ;;
+	m) MAXJOBS="$OPTARG";   if [[ ! $MAXJOBS =~ ^[0-9]+$ ]]; then printf "${red}%-17s  --  %s\n${normal}"  "[ERROR]" "Max number of simulteaneous jobs $MAXJOBS must be an integer (option -m)." ; exit 1 ; fi ;;
 	T) TESTRUN=1 ;;
 	f) FAST=1 ;;
 	s) SKIP=1 ;;
-   :) echo "option $OPTARG : missing argument" ; exit 1  ;;
-   \?) echo "$OPTARG : invalid option" ; exit 1  ;;
+	j) JACCARD=1 ;;
+   :) printf "${red}%-17s  --  %s\n${normal}" "[ERROR]" "option $OPTARG : missing argument" ; exit 1  ;;
+   \?) printf "${red}%-17s  --  %s\n${normal}" "[ERROR]" "$OPTARG : invalid option" ; exit 1  ;;
    esac
 done
 shift "$((OPTIND - 1))"
@@ -140,7 +152,7 @@ unalias -a
 STIME=$(date +%s)
 
 if [[ -e ${OUT}.wgrr.log ]] ; then
-	printf "%-10s  --  %s\n" "[WARNING]" "Appending to existing log"
+	printf "${yellow}%-17s  --  %s\n${normal}" "[WARNING]" "Appending to existing log"
 	printf "\n####################################\n" >> ${OUT}.wgrr.log
 fi
 
@@ -179,8 +191,8 @@ fi
 # Get system total memory if not in sbatch
 TOTALMEM=0
 if [[ $BATCHFLAG==0 ]] ; then
-	if [[ $SLURM_MEM_PER_CPU != "" ]] ; then
-		TOTALMEM=$SLURM_MEM_PER_CPU
+	if [[ $SLURM_JOBID != "" ]] ; then
+		TOTALMEM=$(sacct -j $SLURM_JOBID -o ReqMem | awk 'NR==3{print $1;exit}' | numfmt --from=iec)
 	else
 		free -b > /dev/null 2>&1
 		if [[ $? -eq 0 ]] ; then
@@ -194,67 +206,72 @@ if [[ $BATCHFLAG==0 ]] ; then
 	fi
 fi
 
-printf "%-10s  --  %s\n" "[INFO]" "wGRR version ${VERSION}" | tee -a ${OUT}.wgrr.log
+printf "%-17s  --  %s\n" "["$(textifyDuration $SECONDS)"]" "wGRR version ${VERSION}" | tee -a ${OUT}.wgrr.log
 
 if [[ $BATCHFLAG == 0 ]] ; then
 	if [[ $FAST == 1 ]] ; then
-		printf "%-10s  --  %s\n" "[WARNING]" "The fast flag (-f) is useless when not using sbatch"  | tee -a ${OUT}.wgrr.log
+		printf "${yellow}%-17s  --  %s\n${normal}" "[WARNING]" "The fast flag (-f) is useless when not using sbatch"  | tee -a ${OUT}.wgrr.log
 	fi
 	if [[ $THREADS -gt $(nproc) ]] ; then
-		printf "%-10s  --  %s\n" "[ERROR]" "You requested $THREADS threads but your machine has $(nproc) cores."  | tee -a ${OUT}.wgrr.log
+		printf "${red}%-17s  --  %s\n${normal}" "[ERROR]" "You requested $THREADS threads but your machine has $(nproc) cores."  | tee -a ${OUT}.wgrr.log
 		exit 1
 	fi
-	printf "%-10s  --  %s\n" "[INFO]" "The command is:"  >> ${OUT}.wgrr.log
-	printf "%-10s  --  %s\n" "[INFO]" "./wGRR ${ALL_ARGS}" >> ${OUT}.wgrr.log
+	printf "%-17s  --  %s\n" "["$(textifyDuration $SECONDS)"]" "The command is:"  >> ${OUT}.wgrr.log
+	printf "%-17s  --  %s\n" "["$(textifyDuration $SECONDS)"]" "./wGRR ${ALL_ARGS}" >> ${OUT}.wgrr.log
 else
-	printf "%-10s  --  %s\n" "[INFO]" "The command is:"  >> ${OUT}.wgrr.log
-	printf "%-10s  --  %s\n" "[INFO]" "$(scontrol show job ${SLURM_JOBID} | awk '/ +Command/{n=split($0,a,"=");print a[n]}')"  >> ${OUT}.wgrr.log
+	printf "%-17s  --  %s\n" "["$(textifyDuration $SECONDS)"]" "The command is:"  >> ${OUT}.wgrr.log
+	printf "%-17s  --  %s\n" "["$(textifyDuration $SECONDS)"]" "$(scontrol show job ${SLURM_JOBID} | awk '/ +Command/{n=split($0,a,"=");print a[n]}')"  >> ${OUT}.wgrr.log
 fi
 
 if [[ ! -f wGRR.awk ]] ; then
-	printf "%-10s  --  %s\n" "[ERROR]" "awk script wGRR.awk not found."  | tee -a ${OUT}.wgrr.log
+	printf "${red}%-17s  --  %s\n${normal}" "[ERROR]" "awk script wGRR.awk not found."  | tee -a ${OUT}.wgrr.log
 	exit 1
 fi
 
 if [[ ! -f wGRR_worker.zsh ]] ; then
-	printf "%-10s  --  %s\n" "[ERROR]" "awk script wGRR_worker.zsh not found."  | tee -a ${OUT}.wgrr.log
+	printf "${red}%-17s  --  %s\n${normal}" "[ERROR]" "awk script wGRR_worker.zsh not found."  | tee -a ${OUT}.wgrr.log
 	exit 1
 fi
 
 ## Check awk
 if ! AWKEXE=$(command -v gawk) ; then
 	if ! AWKEXE=$(command -v awk) ; then
-		printf "%-10s  --  %s\n" "[ERROR]" "Could not find awk or gawk executable."  | tee -a ${OUT}.wgrr.log
+		printf "${red}%-17s  --  %s\n${normal}" "[ERROR]" "Could not find awk or gawk executable."  | tee -a ${OUT}.wgrr.log
 		exit 1
 	fi
 fi
 
 if ! echo "" | $AWKEXE '{a[1][2]=3}' &> /dev/null ; then
-	printf "%-10s  --  %s\n" "[ERROR]" "Your awk version is too old. Please install a more recent version (gawk is recommended)."  | tee -a ${OUT}.wgrr.log
+	printf "${red}%-17s  --  %s\n${normal}" "[ERROR]" "Your awk version is too old. Please install a more recent version (gawk is recommended)."  | tee -a ${OUT}.wgrr.log
 	exit 1
 fi
 
 ## Check MMseqs
 if [[ $MMPATH == "N.O.P.A.T.H" ]] ; then
 	if ! MMSEQS=$(command -v mmseqs)  ; then
-		printf "%-10s  --  %s\n" "[ERROR]" "MMseqs binary not found."  | tee -a ${OUT}.wgrr.log
+		printf "${red}%-17s  --  %s\n${normal}" "[ERROR]" "MMseqs binary not found."  | tee -a ${OUT}.wgrr.log
 		exit 1
 	fi
 else
 	if ! MMSEQS=$(command -v $MMPATH/mmseqs) ; then
 		if ! MMSEQS=$(command -v mmseqs) ; then
-			printf "%-10s  --  %s\n" "[ERROR]" "MMseqs binary not found in your PATH."  | tee -a ${OUT}.wgrr.log
+			printf "${red}%-17s  --  %s\n${normal}" "[ERROR]" "MMseqs binary not found in your PATH."  | tee -a ${OUT}.wgrr.log
 			exit 1
 		else
-			printf "%-10s  --  %s\n" "[ERROR]" "MMseqs binary not found in ${MMPATH}."  | tee -a ${OUT}.wgrr.log
+			printf "${red}%-17s  --  %s\n${normal}" "[ERROR]" "MMseqs binary not found in ${MMPATH}."  | tee -a ${OUT}.wgrr.log
 		fi
 	fi
 fi
 
+if [[ ${IDLIST} != "N.O.L.I.S.T" ]] ; then
+	printf "%-17s  --  %s %s\n" "["$(textifyDuration $SECONDS)"]" "Sampling the input file ${PRT} according to the ${IDLIST} file"  | tee -a ${OUT}.wgrr.log
+	$AWKEXE 'NR==FNR{a[$1]++;next}s!=""{print s;s=""}/^>/{k=0;g=substr($1,2);gsub(/_[^_]+$/,"",g);if(g in a){print;k=1;s=""}next}k{s=s""$0}END{if(s!=""){print s}}' ${IDLIST} ${PRT} > ${PRT:t:r}.sample.prt
+	PRT=${PRT:t:r}.sample.prt
+fi
 
 STATS=($($AWKEXE '/^>/{p++;g=substr($1,2);gsub(/_[^_]+$/,"",g);if(!a[g]++){c++};LNR=NR}{if(NR>LNR+1){n=1}}END{if(n!=1){n=0}print c"\t"p"\t"n"\t"p/c}' $PRT))
-printf "%-10s  --  %s %s %s %s %s\n" "[INFO]" "Input file has" $STATS[1] "genomes and a total of" $STATS[2] "proteins"  | tee -a ${OUT}.wgrr.log
-printf "%-10s  --  %s %s\n" "[INFO]" "Mean number of proteins per genome:" $STATS[4]  | tee -a ${OUT}.wgrr.log
+printf "%-17s  --  %s %s %s %s %s\n" "["$(textifyDuration $SECONDS)"]" "Input file has" $STATS[1] "genomes and a total of" $STATS[2] "proteins"  | tee -a ${OUT}.wgrr.log
+printf "%-17s  --  %s %s\n" "["$(textifyDuration $SECONDS)"]" "Mean number of proteins per genome:" $STATS[4]  | tee -a ${OUT}.wgrr.log
 
 if [[ $ARRAYSIZE == "AUTO" ]] ; then
 	if [[ ${STATS[4]} -le 100 ]] ; then
@@ -272,47 +289,47 @@ if [[ $ARRAYSIZE == "AUTO" ]] ; then
 	else
 		ARRAYSIZE=50
 	fi
-	printf "%-10s  --  %s %s\n" "[INFO]" "Setting automatically the -a parameter to" $ARRAYSIZE  | tee -a ${OUT}.wgrr.log
+	printf "%-17s  --  %s %s\n" "["$(textifyDuration $SECONDS)"]" "Setting automatically the -a parameter to" $ARRAYSIZE  | tee -a ${OUT}.wgrr.log
 fi
 
 if [[ $STATS[3] -eq 1 ]]; then
-	printf "%-10s  --  %s\n" "[INFO]" "Converting fasta file to sequential fasta"  | tee -a ${OUT}.wgrr.log
+	printf "%-17s  --  %s\n" "["$(textifyDuration $SECONDS)"]" "Converting fasta file to sequential fasta"  | tee -a ${OUT}.wgrr.log
 	OPRT=$PRT
 	$AWKEXE '!/^>/{s=s$0;next}(s!=""){print s;s=""}{print}END{print s}' $OPRT > $OPRT:t:r_seq.prt
 	PRT=$OPRT:t:r_seq.prt
 fi
 
-duration=$SECONDS
 if [[ -f $OUT.allpairs.txt ]] ; then
-	printf "%-10s  --  %s %s%s\n" "[INFO]" "Using existing file" $OUT ".allpairs.txt" | tee -a ${OUT}.wgrr.log
+	printf "%-17s  --  %s %s%s\n" "["$(textifyDuration $SECONDS)"]" "Using existing file" $OUT ".allpairs.txt" | tee -a ${OUT}.wgrr.log
 else
-	printf "%-10s  --  %s\n" "[INFO]" "Writing genomes pairs" | tee -a ${OUT}.wgrr.log
+	printf "%-17s  --  %s\n" "["$(textifyDuration $SECONDS)"]" "Writing genomes pairs" | tee -a ${OUT}.wgrr.log
 	$AWKEXE 'BEGIN{x=1}/^>/{g=substr($1,2);gsub(/_[^_]+$/,"",g);if(FNR==1){a[x]=g;++x;currg=g;next}if(g!=currg){a[x]=g;x++;currg=g}}END{i=0;while(++i in a){j=i;while(++j in a){print a[i]"\t"a[j]}}}' $PRT > $OUT.allpairs.txt
-	printf "%-10s  --  %s\n" "[INFO]" "$(wc -l $OUT.allpairs.txt | $AWKEXE '{print $1}') pairs written" | tee -a ${OUT}.wgrr.log
+	printf "%-17s  --  %s\n" "["$(textifyDuration $SECONDS)"]" "$(wc -l $OUT.allpairs.txt | $AWKEXE '{print $1}') pairs written" | tee -a ${OUT}.wgrr.log
 fi
 
 NPAIRS=$(wc -l $OUT.allpairs.txt | $AWKEXE '{print $1}')
 if [[ ! -f $OUT.allpairs.txt ]] || [[ $NPAIRS < 1 ]] ; then
-	printf "%-10s  --  %s\n" "[ERROR]" "An error occurred when writing the $OUT.allpairs.txt file." | tee -a ${OUT}.wgrr.log
+	printf "${red}%-17s  --  %s\n${normal}" "[ERROR]" "An error occurred when writing the $OUT.allpairs.txt file." | tee -a ${OUT}.wgrr.log
 	exit 1
 fi
 
 NJOBS=$(( (NPAIRS+ARRAYSIZE-1)/ARRAYSIZE))
 
-duration=$SECONDS
 if [[ $BATCHFLAG == 1 && $SKIP == 1 ]] ; then
-	printf "%-10s  --  %s\n" "[WARNING]" "Skipping estimation of required memory (option -s) is not available with sbatch runs" | tee -a ${OUT}.wgrr.log
-	printf "%-10s  --  %s\n" "[WARNING]" "The run will continue ignoring the -s option" | tee -a ${OUT}.wgrr.log
+	printf "${yellow}%-17s  --  %s\n${normal}" "[WARNING]" "Skipping estimation of required memory (option -s) is not available with sbatch runs" | tee -a ${OUT}.wgrr.log
+	printf "${yellow}%-17s  --  %s\n${normal}" "[WARNING]" "The run will continue ignoring the -s option" | tee -a ${OUT}.wgrr.log
+elif [[ $SKIP == 1 ]] ; then
+	printf "${yellow}%-17s  --  %s\n${normal}" "[WARNING]" "Skipping estimation of required memory (option -s) can lead to exhaustion of system resources!"${normal} | tee -a ${OUT}.wgrr.log
 fi
 
 if [[ $BATCHFLAG == 1 && $TESTRUN == 1 ]] ; then
-	printf "%-10s  --  %s\n" "[WARNING]" "Test run (option -T) is not available with sbatch runs" | tee -a ${OUT}.wgrr.log
-	printf "%-10s  --  %s\n" "[WARNING]" "The run will continue ignoring the -T option" | tee -a ${OUT}.wgrr.log
-	printf "%-10s  --  %s\n" "[INFO]" "Estimating required memory" | tee -a ${OUT}.wgrr.log
+	printf "${yellow}%-17s  --  %s\n${normal}" "[WARNING]" "Test run (option -T) is not available with sbatch runs" | tee -a ${OUT}.wgrr.log
+	printf "${yellow}%-17s  --  %s\n${normal}" "[WARNING]" "The run will continue ignoring the -T option" | tee -a ${OUT}.wgrr.log
+	printf "%-17s  --  %s\n" "["$(textifyDuration $SECONDS)"]" "Estimating required memory" | tee -a ${OUT}.wgrr.log
 elif [[ $TESTRUN == 1 ]] ; then
-	printf "%-10s  --  %s\n" "[INFO]" "This is a test run (option -T)" | tee -a ${OUT}.wgrr.log
+	printf "%-17s  --  %s\n" "["$(textifyDuration $SECONDS)"]" "This is a test run (option -T)" | tee -a ${OUT}.wgrr.log
 elif [[ $SKIP == 0 ]] ; then
-	printf "%-10s  --  %s\n" "[INFO]" "Estimating required memory" | tee -a ${OUT}.wgrr.log
+	printf "%-17s  --  %s\n" "["$(textifyDuration $SECONDS)"]" "Estimating required memory" | tee -a ${OUT}.wgrr.log
 fi
 
 if [[ $SKIP == 0 || $BATCHFLAG == 1 ]] ; then
@@ -325,110 +342,120 @@ if [[ $SKIP == 0 || $BATCHFLAG == 1 ]] ; then
 
 	$AWKEXE 'BEGIN{x=1}/^>/{g=substr($1,2);gsub(/_[^_]+$/,"",g);if(FNR==1){a[x]=g;++x;currg=g;next}if(g!=currg){a[x]=g;x++;currg=g}}END{i=0;while(++i in a){j=i;while(++j in a){print a[i]"\t"a[j]}}}' $OUT.testrun.prt > $OUT.testrun.allpairs.txt
 
-	printf "%-10s  --  %s\n" "[INFO]" "Running MMseqs on a sample file" | tee -a ${OUT}.wgrr.log
-	$MMSEQS easy-search $OUT.testrun.prt $OUT.testrun.prt $OUT.testrun.m8 $tmp --threads $THREADS --format-output "query,target,qcov,tcov,fident,evalue,bits" --add-self-matches > $OUT.testrun.mmseqs.search.log
+	printf "%-17s  --  %s\n" "["$(textifyDuration $SECONDS)"]" "Running MMseqs on a sample file" | tee -a ${OUT}.wgrr.log
+	$MMSEQS easy-search $OUT.testrun.prt $OUT.testrun.prt $OUT.testrun.m8 $tmp -s 7.5 --threads $THREADS --format-output "query,target,qcov,tcov,fident,evalue,bits" --add-self-matches > $OUT.testrun.mmseqs.search.log
 
 	M2=$($AWKEXE -f wGRR.awk -v MINP=1 -v MAXP=10 -v OUT=$OUT -v MEM=1 $OUT.testrun.allpairs.txt $OUT.testrun.prt $OUT.testrun.m8)
 	REQMEM=$(bc -l <<< $(numfmt --from=iec $M2)*($ARRAYSIZE*0.15) | numfmt --to=iec | $AWKEXE '{U=$0;gsub(/[^A-Za-z]/,"",U);V=$0;gsub(/[A-Za-z]+$/,"",V);split(V,a,".");n=split(a[1],b,"");c=b[1]+1;i=1;while(++i<=n){c=c"0"}print c""U}')
 	REQMEMT=$(bc -l <<< $(numfmt --from=iec $REQMEM)*$THREADS | numfmt --to=iec | $AWKEXE '{U=$0;gsub(/[^A-Za-z]/,"",U);V=$0;gsub(/[A-Za-z]+$/,"",V);split(V,a,".");n=split(a[1],b,"");c=b[1]+1;i=1;while(++i<=n){c=c"0"}print c""U}')
 
 	if [[ $TESTRUN == 1 && $BATCHFLAG == 0 ]] ; then
-		printf "%-10s  --  %s\n" "[INFO]" "With the current -a parameter (${ARRAYSIZE}) ${NJOBS} workers are required" | tee -a ${OUT}.wgrr.log
-		printf "%-10s  --  %s\n" "[INFO]" "Each worker will require approximately $REQMEM" | tee -a ${OUT}.wgrr.log
-		printf "%-10s  --  %s\n" "[INFO]" "With the number of threads you specified (-t $THREADS) you will need a total of $REQMEMT" | tee -a ${OUT}.wgrr.log
+		printf "%-17s  --  %s\n" "["$(textifyDuration $SECONDS)"]" "With the current -a parameter (${ARRAYSIZE}) ${NJOBS} workers are required" | tee -a ${OUT}.wgrr.log
+		printf "%-17s  --  %s\n" "["$(textifyDuration $SECONDS)"]" "Each worker will require approximately $REQMEM" | tee -a ${OUT}.wgrr.log
+		printf "%-17s  --  %s\n" "["$(textifyDuration $SECONDS)"]" "With the number of threads you specified (-t $THREADS) you will need a total of $REQMEMT" | tee -a ${OUT}.wgrr.log
 		if [[ $TOTALMEM == 0 ]] ; then
-			printf "%-10s  --  %s\n" "[WARNING]" "Total memory of your system could not be determined" | tee -a ${OUT}.wgrr.log
+			printf "${yellow}%-17s  --  %s\n${normal}" "[WARNING]" "Total memory of your system could not be determined" | tee -a ${OUT}.wgrr.log
 		else
-			printf "%-10s  --  %s\n" "[INFO]" "Total memory of your system: $(numfmt --to=iec $TOTALMEM)" | tee -a ${OUT}.wgrr.log
+			printf "%-17s  --  %s\n" "["$(textifyDuration $SECONDS)"]" "Total memory of your system: $(numfmt --to=iec $TOTALMEM)" | tee -a ${OUT}.wgrr.log
 		fi
-		printf "%-10s  --  %s\n" "[INFO]" "Testrun is done. Now cleaning" | tee -a ${OUT}.wgrr.log
+		printf "%-17s  --  %s\n" "["$(textifyDuration $SECONDS)"]" "Testrun is done. Now cleaning" | tee -a ${OUT}.wgrr.log
 	elif [[ $BATCHFLAG == 1 ]] ; then
-		printf "%-10s  --  %s\n" "[INFO]" "$REQMEM required per worker" | tee -a ${OUT}.wgrr.log
+		printf "%-17s  --  %s\n" "["$(textifyDuration $SECONDS)"]" "$REQMEM required per worker" | tee -a ${OUT}.wgrr.log
 	fi
 
 	rm $OUT.testrun.*
 
 	if [[ $TESTRUN == 0 && $BATCHFLAG == 0 ]] ; then
 		if [[ $TOTALMEM -lt $(bc -l <<< $(numfmt --from=iec $REQMEM)*$THREADS) ]] ; then
-			printf "%-10s  --  %s\n" "[ERROR]" "Your system does not have enough memory (${REQMEMT} required). Try decreasing the -a parameter and/or the number of threads." | tee -a ${OUT}.wgrr.log
+			printf "${red}%-17s  --  %s\n${normal}" "[ERROR]" "Your system does not have enough memory (${REQMEMT} required). Try decreasing the -a parameter and/or the number of threads." | tee -a ${OUT}.wgrr.log
 			exit 1
 		fi
 	fi
 fi
 
 if [[ $TESTRUN == 1 && $BATCHFLAG == 0 ]] ; then
-	printf "%-10s  --  %s\n" "[INFO]" "Done." | tee -a ${OUT}.wgrr.log
-	duration=$SECONDS
-	printf "%-10s  --  %s\n" "[ELAPSED]" $(textifyDuration $duration) | tee -a ${OUT}.wgrr.log
+	printf "${green}%-17s  --  %s\n${normal}" "["$(textifyDuration $SECONDS)"]" "Done." | tee -a ${OUT}.wgrr.log
 	exit 0
 fi
 
-duration=$SECONDS
-printf "%-10s  --  %s\n" "[ELAPSED]" $(textifyDuration $duration) | tee -a ${OUT}.wgrr.log
+rm -rf ${OUT}.bbh_part.*(N)
 
 if [[ -f $OUT.m8 ]] ; then
-	printf "%-10s  --  %s %s%s\n" "[INFO]" "Using existing MMseqs output file" $OUT ".m8"  | tee -a ${OUT}.wgrr.log
+	printf "%-17s  --  %s %s%s\n" "["$(textifyDuration $SECONDS)"]" "Using existing MMseqs output file" $OUT ".m8"  | tee -a ${OUT}.wgrr.log
 else
 	MIDENT=$($AWKEXE '!/^>/{a[$0]++}END{for(i in a){if(a[i]>m){m=a[i]}}print m}' $PRT)
 	if [[ $((MIDENT*2)) -gt "$MMS_DEF_MAX_SEQS" ]] ; then
 		MMS_MAX_SEQ_PARAM=($(echo $((MIDENT*2)) | $AWKEXE '{c=substr($1,2);gsub(/[0-9]/,0,c);print "--max-seqs "substr($1,1,1)+1""c}'))
-		printf "%-10s  --  %s %s\n" "[INFO]" "Setting MMseqs parameter" "$MMS_MAX_SEQ_PARAM" | tee -a ${OUT}.wgrr.log
+		printf "%-17s  --  %s %s\n" "["$(textifyDuration $SECONDS)"]" "Setting MMseqs parameter" "$MMS_MAX_SEQ_PARAM" | tee -a ${OUT}.wgrr.log
 	fi
 	if [[ $BATCHFLAG == 1 ]] ; then
-		printf "%-10s  --  %s\n" "[INFO]" "Submitting MMseqs search to Maestro" | tee -a ${OUT}.wgrr.log
-		printf "%-10s  --  %s\n" "[INFO]" "The command is:" >> ${OUT}.wgrr.log
-		printf "%-10s  --  %s\n" "[INFO]" "sbatch --wait --parsable -o ${OUT}.mmseqs.log -p ${PARTITION} -c ${THREADS} -J \"wGRR_MMSeqs\" --wrap=\"${MMSEQS} easy-search ${PRT} ${PRT} ${OUT}.m8 ${tmp} --threads ${THREADS} --format-output \"query,target,qcov,tcov,fident,evalue,bits\" --add-self-matches ${MMS_MAX_SEQ_PARAM}" >> ${OUT}.wgrr.log
-		JID=$(sbatch --wait --parsable -o "$OUT".mmseqs.log -p $PARTITION -c $THREADS -J "wGRR_MMSeqs" --wrap="$MMSEQS easy-search $PRT $PRT ${OUT}.m8 ${tmp} --threads $THREADS --format-output \"query,target,qcov,tcov,fident,evalue,bits\" --add-self-matches $MMS_MAX_SEQ_PARAM")
-		PQT=$(sacct -X -j $JID -o Reserved -n | $AWKEXE '{n=split($1,a,"-");if(n>1){t=t+a[1]*86400};split(a[n],b,":");t=t+b[1]*3600+b[2]*60+b[3];print t}')
-		printf "%-10s  --  %s %s %s %s %s\n" "[INFO]" "The job" $JID "has been" $(textifyDuration $PQT) "in queue" | tee -a ${OUT}.wgrr.log
-		QT=$((QT+PQT))
+		if [[ $JACCARD == 0 ]] ; then
+			printf "%-17s  --  %s\n" "["$(textifyDuration $SECONDS)"]" "Submitting MMseqs search to Maestro" | tee -a ${OUT}.wgrr.log
+			printf "%-17s  --  %s\n" "["$(textifyDuration $SECONDS)"]" "The command is:" >> ${OUT}.wgrr.log
+			printf "%-17s  --  %s\n" "["$(textifyDuration $SECONDS)"]" "sbatch --wait --parsable -o ${OUT}.mmseqs.log -p ${PARTITION} -c ${THREADS} -J \"wGRR_MMSeqs\" --wrap=\"${MMSEQS} easy-search ${PRT} ${PRT} ${OUT}.m8 ${tmp} -s 7.5 --threads ${THREADS} --format-output \"query,target,qcov,tcov,fident,evalue,bits\" --add-self-matches ${MMS_MAX_SEQ_PARAM}" >> ${OUT}.wgrr.log
+			JID=$(sbatch --wait --parsable -o "$OUT".mmseqs.log -p $PARTITION -c $THREADS -J "wGRR_MMSeqs" --wrap="$MMSEQS easy-search $PRT $PRT ${OUT}.m8 ${tmp} -s 7.5 --threads $THREADS --format-output \"query,target,qcov,tcov,fident,evalue,bits\" --add-self-matches $MMS_MAX_SEQ_PARAM")
+			PQT=$(sacct -X -j $JID -o Reserved -n | $AWKEXE '{n=split($1,a,"-");if(n>1){t=t+a[1]*86400};split(a[n],b,":");t=t+b[1]*3600+b[2]*60+b[3];print t}')
+			printf "%-17s  --  %s %s %s %s %s\n" "["$(textifyDuration $SECONDS)"]" "The job" $JID "has been" $(textifyDuration $PQT) "in queue" | tee -a ${OUT}.wgrr.log
+			QT=$((QT+PQT))
+		else
+			printf "%-17s  --  %s\n" "["$(textifyDuration $SECONDS)"]" "Submitting MMseqs search and linclust to Maestro" | tee -a ${OUT}.wgrr.log
+			printf "%-17s  --  %s\n" "["$(textifyDuration $SECONDS)"]" "The command is:" >> ${OUT}.wgrr.log
+			printf "%-17s  --  %s\n" "["$(textifyDuration $SECONDS)"]" "sbatch --wait --parsable -o ${OUT}.mmseqs.log -p ${PARTITION} -c ${THREADS} -J \"wGRR_MMSeqs\" --wrap=\"${MMSEQS} easy-search ${PRT} ${PRT} ${OUT}.m8 ${tmp} -s 7.5 --threads ${THREADS} --format-output \"query,target,qcov,tcov,fident,evalue,bits\" --add-self-matches ${MMS_MAX_SEQ_PARAM} ; ${MMSEQS} easy-linclust ${PRT} ${OUT} ${tmp} --threads ${THREADS}" >> ${OUT}.wgrr.log
+			JID=$(sbatch --wait --parsable -o "$OUT".mmseqs.log -p $PARTITION -c $THREADS -J "wGRR_MMSeqs" --wrap="$MMSEQS easy-search $PRT $PRT ${OUT}.m8 ${tmp} -s 7.5 --threads $THREADS --format-output \"query,target,qcov,tcov,fident,evalue,bits\" --add-self-matches $MMS_MAX_SEQ_PARAM ; ${MMSEQS} easy-linclust ${PRT} ${OUT} ${tmp} --threads ${THREADS}")
+			PQT=$(sacct -X -j $JID -o Reserved -n | $AWKEXE '{n=split($1,a,"-");if(n>1){t=t+a[1]*86400};split(a[n],b,":");t=t+b[1]*3600+b[2]*60+b[3];print t}')
+			printf "%-17s  --  %s %s %s %s %s\n" "["$(textifyDuration $SECONDS)"]" "The job" $JID "has been" $(textifyDuration $PQT) "in queue" | tee -a ${OUT}.wgrr.log
+			QT=$((QT+PQT))
+		fi
 	else
-		printf "%-10s  --  %s\n" "[INFO]" "Running MMseqs search" | tee -a ${OUT}.wgrr.log
-		printf "%-10s  --  %s\n" "[INFO]" "The command is:" >> ${OUT}.wgrr.log
-		printf "%-10s  --  %s\n" "[INFO]" "${MMSEQS} easy-search ${PRT} ${PRT} ${OUT}.m8 ${tmp} --threads ${THREADS} --format-output \"query,target,qcov,tcov,fident,evalue,bits\" --add-self-matches ${MMS_MAX_SEQ_PARAM} > ${OUT}.mmseqs.search.log" >> ${OUT}.wgrr.log
-		$MMSEQS easy-search $PRT $PRT $OUT.m8 $tmp --threads $THREADS --format-output "query,target,qcov,tcov,fident,evalue,bits" --add-self-matches $MMS_MAX_SEQ_PARAM > $OUT.mmseqs.search.log
+		printf "%-17s  --  %s\n" "["$(textifyDuration $SECONDS)"]" "Running MMseqs search" | tee -a ${OUT}.wgrr.log
+		printf "%-17s  --  %s\n" "["$(textifyDuration $SECONDS)"]" "The command is:" >> ${OUT}.wgrr.log
+		printf "%-17s  --  %s\n" "["$(textifyDuration $SECONDS)"]" "${MMSEQS} easy-search ${PRT} ${PRT} ${OUT}.m8 ${tmp} -s 7.5 --threads ${THREADS} --format-output \"query,target,qcov,tcov,fident,evalue,bits\" --add-self-matches ${MMS_MAX_SEQ_PARAM} > ${OUT}.mmseqs.search.log" >> ${OUT}.wgrr.log
+		$MMSEQS easy-search $PRT $PRT $OUT.m8 $tmp -s 7.5 --threads $THREADS --format-output "query,target,qcov,tcov,fident,evalue,bits" --add-self-matches $MMS_MAX_SEQ_PARAM > $OUT.mmseqs.search.log
+
+		if [[ $JACCARD == 1 ]] ; then
+			printf "%-17s  --  %s\n" "["$(textifyDuration $SECONDS)"]" "Running MMseqs linclust" | tee -a ${OUT}.wgrr.log
+			printf "%-17s  --  %s\n" "["$(textifyDuration $SECONDS)"]" "The command is:" >> ${OUT}.wgrr.log
+			printf "%-17s  --  %s\n" "["$(textifyDuration $SECONDS)"]" "${MMSEQS} easy-linclust ${PRT} ${OUT} ${tmp} --threads ${THREADS} > ${OUT}.mmseqs.linclust.log" >> ${OUT}.wgrr.log
+			$MMSEQS easy-linclust $PRT $OUT $tmp --threads $THREADS > $OUT.mmseqs.linclust.log
+		fi
 	fi
 fi
 
 if [[ ! -f $OUT.m8 ]] ; then
-	printf "%-10s  --  %s\n" "[ERROR]" "Something went wrong during the MMseqs step." | tee -a ${OUT}.wgrr.log
+	printf "${red}%-17s  --  %s\n${normal}" "Something went wrong during the MMseqs step." | tee -a ${OUT}.wgrr.log
 	exit 1
 fi
 
-duration=$SECONDS
-printf "%-10s  --  %s\n" "[ELAPSED]" $(textifyDuration $duration) | tee -a ${OUT}.wgrr.log
-
 if [[ $BATCHFLAG == 0 ]] ; then
 	if [[ $NJOBS -le 1 ]] ; then
 		NJOBS=1
 		if [[ $THREADS > 1 ]] ; then
-			printf "%-10s  --  %s\n" "[INFO]" "$THREADS threads requested, but only 1 job to execute" | tee -a ${OUT}.wgrr.log
+			printf "%-17s  --  %s\n" "["$(textifyDuration $SECONDS)"]" "$THREADS threads requested, but only 1 job to execute" | tee -a ${OUT}.wgrr.log
 		fi
-		printf "%-10s  --  %s\n" "[INFO]" "Starting 1 job on 1 thread for wGRR calculation" | tee -a ${OUT}.wgrr.log
+		printf "%-17s  --  %s\n" "["$(textifyDuration $SECONDS)"]" "Starting 1 job on 1 thread for wGRR calculation" | tee -a ${OUT}.wgrr.log
 		ARRAYSIZE=$NPAIRS
-		./wGRR_worker.zsh $ARRAYSIZE $OUT $NJOBS 1 $PRT $tmp
+		./wGRR_worker.zsh $AWKEXE $ARRAYSIZE $OUT $NJOBS 1 $PRT $tmp
 	elif [[ $THREADS -eq 1 ]] ; then
-		printf "%-10s  --  %s\n" "[INFO]" "Starting 1 job on 1 thread for wGRR calculation" | tee -a ${OUT}.wgrr.log
-		printf "%-10s  --  %s\n" "[WARNING]" "Running wGRR with a large dataset on 1 thread can lead to memory issues" | tee -a ${OUT}.wgrr.log
+		printf "%-17s  --  %s\n" "["$(textifyDuration $SECONDS)"]" "Starting 1 job on 1 thread for wGRR calculation" | tee -a ${OUT}.wgrr.log
+		printf "%-17s  --  %s\n" "[WARNING]" "Running wGRR with a large dataset on 1 thread can lead to memory issues" | tee -a ${OUT}.wgrr.log
 		ARRAYSIZE=$NPAIRS
-		./wGRR_worker.zsh $ARRAYSIZE $OUT $NJOBS 1 $PRT $tmp
+		./wGRR_worker.zsh $AWKEXE $ARRAYSIZE $OUT $NJOBS 1 $PRT $tmp
 	else
 		if [[ $THREADS -gt $NJOBS ]] ; then
-			printf "%-10s  --  %s\n" "[INFO]" "$THREADS threads requested, but only $NJOBS jobs to execute" | tee -a ${OUT}.wgrr.log
+			printf "%-17s  --  %s\n" "["$(textifyDuration $SECONDS)"]" "$THREADS threads requested, but only $NJOBS jobs to execute" | tee -a ${OUT}.wgrr.log
 		fi
-		printf "%-10s  --  %s\n" "[INFO]" "Starting $NJOBS jobs on $THREADS threads for wGRR calculation" | tee -a ${OUT}.wgrr.log
-		printf "%-10s  --  [%-40s] 0/%s" "[PROGRESS]" "" $NJOBS
+		printf "%-17s  --  %s\n" "["$(textifyDuration $SECONDS)"]" "Starting $NJOBS jobs on $THREADS threads for wGRR calculation" | tee -a ${OUT}.wgrr.log
+		printf "%-17s  --  [%-50s]  0/%s  0s" "[PROGRESS]" "" $NJOBS
 		for i in {1..$NJOBS} ; do
 			echo $i
-		done | xargs -n 1 -P $THREADS sh -c 'arg=$6; NJOBS=$2 ; ./wGRR_worker.zsh $0 $1 $NJOBS $arg $3 $4 $5 ' $ARRAYSIZE $OUT $NJOBS $PRT $tmp $STIME
+		done | xargs -n 1 -P $THREADS sh -c 'arg=$7 ; ./wGRR_worker.zsh $0 $1 $2 $3 $arg $4 $5 $6 ' $AWKEXE $ARRAYSIZE $OUT $NJOBS $PRT $tmp $STIME
+		printf "\r\033[K%-17s  --  [%-50s]  %s/%s"  "[PROGRESS]" $( head -c 50 < /dev/zero | tr "\0" "=" ) $NJOBS $NJOBS 
 		printf "\n"
 	fi
 else
-	printf "%-10s  --  %s\n" "[ELAPSED]" $(textifyDuration $duration) | tee -a ${OUT}.wgrr.log
-	duration=$SECONDS
-	printf "%-10s  --  %s\n" "[INFO]" "Submitting $NJOBS jobs for wGRR calculation" | tee -a ${OUT}.wgrr.log
+	printf "%-17s  --  %s\n" "["$(textifyDuration $SECONDS)"]" "Submitting $NJOBS jobs for wGRR calculation" | tee -a ${OUT}.wgrr.log
 	if [[ $MAXJOBS -gt 0 ]] && [[ $MAXJOBS -lt $NJOBS ]] ; then
-		printf "%-10s  --  %s\n" "[INFO]" "Limiting the number of simultaneous jobs to $MAXJOBS" | tee -a ${OUT}.wgrr.log
+		printf "%-17s  --  %s\n" "["$(textifyDuration $SECONDS)"]" "Limiting the number of simultaneous jobs to $MAXJOBS" | tee -a ${OUT}.wgrr.log
 		JOBARRAY="1-${NJOBS}%${MAXJOBS}"
 	else
 		JOBARRAY="1-${NJOBS}"
@@ -437,60 +464,67 @@ else
 		PARTITION=("common,dedicated" "-q" "fast")
 	fi
 	if [[ $REQMEM == "" ]] ; then
-		printf "%-10s  --  %s\n" "[ERROR]" "Something went wrong when estimating the required memory." | tee -a ${OUT}.wgrr.log
+		printf "%-17s  --  %s\n" "[ERROR]" "Something went wrong when estimating the required memory." | tee -a ${OUT}.wgrr.log
 		exit 1
 	fi
-	JID=$(sbatch --parsable --wait -p ${PARTITION} --array="$JOBARRAY" -c 1 -J "wGRR_worker" --mem=$REQMEM --wrap="./wGRR_worker.zsh $ARRAYSIZE $OUT $NJOBS 1 $PRT $tmp")
+	JID=$(sbatch --parsable --wait -p ${PARTITION} --array="$JOBARRAY" -c 1 -J "wGRR_worker" --mem=$REQMEM --wrap="./wGRR_worker.zsh $AWKEXE $ARRAYSIZE $OUT $NJOBS 1 $PRT $tmp")
 	if [[ `sacct -j $JID | grep "TIMEOUT"` ]] ; then
-		printf "%-10s  --  %s\n" "[ERROR]" "Some workers encountered a TIMEOUT." | tee -a ${OUT}.wgrr.log
-		printf "%-10s  --  %s\n" "[ERROR]" "Saving the partial files in ${OUT}.wgrr_part directory." | tee -a ${OUT}.wgrr.log
+		printf "${red}%-17s  --  %s\n${normal}" "[ERROR]" "Some workers encountered a TIMEOUT." | tee -a ${OUT}.wgrr.log
+		printf "${red}%-17s  --  %s\n${normal}" "[ERROR]" "Saving the partial files in ${OUT}.wgrr_part directory." | tee -a ${OUT}.wgrr.log
 		rm -rf "$OUT".wgrr_part
 		mkdir "$OUT".wgrr_part
 		mv $tmp/"$OUT".wgrr_part.* "$OUT".wgrr_part/
 		exit 1
 	fi
 	PQT=$(sacct -X -j $JID -o Reserved -n | $AWKEXE 'NR==1{prevt=0}{t=0;n=split($1,a,"-");if(n>1){t=t+a[1]*86400};split(a[n],b,":");t=t+b[1]*3600+b[2]*60+b[3];if(t<prevt){tt=tt+prevt}prevt=t}END{print tt+t}')
-	printf "%-10s  --  %s %s %s %s %s\n" "[INFO]" "The job" $JID "has been" $(textifyDuration $PQT) "in queue" | tee -a ${OUT}.wgrr.log
+	printf "%-17s  --  %s %s %s %s %s\n" "["$(textifyDuration $SECONDS)"]" "The job" $JID "has been" $(textifyDuration $PQT) "in queue" | tee -a ${OUT}.wgrr.log
 	QT=$((QT+PQT))
 	mkdir -p "$OUT".logs
 	mv slurm-"$JID"_*.out "$OUT".logs/
 fi
 
-duration=$SECONDS
-printf "%-10s  --  %s\n" "[ELAPSED]" $(textifyDuration $duration) | tee -a ${OUT}.wgrr.log
-printf "%-10s  --  %s\n" "[INFO]" "Sorting results" | tee -a ${OUT}.wgrr.log
-sort -m "$tmp"/$OUT.wgrr_part.* | sort -u -k1,1V -k2,2V | $AWKEXE 'BEGIN{print "GenomeA\tGenomeB\twGRR1\tCommon1\tRealLengthA\tRealLengthB\twGRR2\tCommon2\tEffectiveLength1A\tEffectiveLength1B\twGRR3\tCommon3\tEffectiveLength2A\tEffectiveLength2B"}1' > $OUT.wgrr.txt
+printf "%-17s  --  %s\n" "["$(textifyDuration $SECONDS)"]" "Sorting results" | tee -a ${OUT}.wgrr.log
+sort -m "$tmp"/$OUT.wgrr_part.* | sort -u -k1,1V -k2,2V | $AWKEXE 'BEGIN{print "GenomeA\tGenomeB\twGRR\tSørensen-Dice\tCommon\tNprotA\tNprotB"}1' > $OUT.wgrr.txt
 
 if [[ ! -f "$OUT".wgrr.txt ]] ; then
-	printf "%-10s  --  %s\n" "[ERROR]" "Failed to sort the wGRR table." | tee -a ${OUT}.wgrr.log
-	printf "%-10s  --  %s\n" "[ERROR]" "Saving the partial files in ${OUT}.wgrr_part directory." | tee -a ${OUT}.wgrr.log
+	printf "${red}%-17s  --  %s\n${normal}" "[ERROR]" "Failed to sort the wGRR table." | tee -a ${OUT}.wgrr.log
+	printf "${red}%-17s  --  %s\n${normal}" "[ERROR]" "Saving the partial files in ${OUT}.wgrr_part directory." | tee -a ${OUT}.wgrr.log
 	rm -rf "$OUT".wgrr_part
 	mkdir "$OUT".wgrr_part
 	mv $tmp/"$OUT".wgrr_part.* "$OUT".wgrr_part/
 	exit 1
 fi
 
+${AWKEXE} '!a[$0]++' ${OUT}.bbh_part.* | sort -k1,1V -k2,2V > ${OUT}.bbh.txt
+rm -rf ${OUT}.bbh_part.*(N)
+if [[ ! -s ${OUT}.bbh.txt ]] ; then
+	printf "%-17s  --  %s\n" "[WARNING]" "Failed to produce the BBH output file." | tee -a ${OUT}.wgrr.log
+fi
+
 NLINES=$(wc -l "$OUT".wgrr.txt | $AWKEXE '{print $1}')
 NWGRR=$((STATS[1]*STATS[1]+1))
 if [[ $NLINES -lt $NWGRR ]] ; then
-	printf "%-10s  --  %s\n" "[ERROR]" "An error occurred during wGRR calculation:" | tee -a ${OUT}.wgrr.log
-	printf "%-10s  --  %s\n" "[ERROR]" "less lines than expected in the final file." | tee -a ${OUT}.wgrr.log
-	printf "%-10s  --  %s\n" "[ERROR]" "Saving the partial files in ${OUT}.wgrr_part directory." | tee -a ${OUT}.wgrr.log
+	printf "${red}%-17s  --  %s\n${normal}" "[ERROR]" "An error occurred during wGRR calculation:" | tee -a ${OUT}.wgrr.log
+	printf "${red}%-17s  --  %s\n${normal}" "[ERROR]" "less lines than expected in the final file." | tee -a ${OUT}.wgrr.log
+	printf "${red}%-17s  --  %s\n${normal}" "[ERROR]" "Saving the partial files in ${OUT}.wgrr_part directory." | tee -a ${OUT}.wgrr.log
 	rm -rf "$OUT".wgrr_part
 	mkdir "$OUT".wgrr_part
 	mv $tmp/"$OUT".wgrr_part.* "$OUT".wgrr_part/
 	exit 1
 elif [[ $NLINES -gt $NWGRR ]] ; then
-	printf "%-10s  --  %s\n" "[WARNING]" "Too many lines in the output - some genome pairs are probably duplicated" | tee -a ${OUT}.wgrr.log
+	printf "%-17s  --  %s\n" "[WARNING]" "Too many lines in the output - some genome pairs are probably duplicated" | tee -a ${OUT}.wgrr.log
+fi
+
+if [[ $JACCARD == 1 ]] ; then
+	printf "%-17s  --  %s\n" "["$(textifyDuration $SECONDS)"]" "Jaccard index calculation" | tee -a ${OUT}.wgrr.log
+	$AWKEXE 'BEGIN{c=0;OFS="\t"}NR==FNR{p=substr($1,2);g=p;gsub(/_[^_]++$/,"",g);G[g][p]++;next}FILENAME==ARGV[2]{if(!($1 in A)){A[$1]++;c++}C[$2]=c}FILENAME==ARGV[3]{split("",C1,"");split("",C2,"");common=0;orphan1=0;orphan2=0;for(p in G[$1]){if(!(p in C)){orphan1++}else{C1[C[p]]++};for(q in G[$2]){if(!(q in C)){orphan2++}else{C2[C[q]]++}}}for(x in C1){if(x in C2){common++}}if(!($1 in O)){O[$1]++;print $1,$1,length(C1),length(C1),length(C1),"1"};if(!($2 in O)){O[$2]++;print $2,$2,length(C2),length(C2),length(C2),"1"}val=common/(length(C1)+length(C2)+orphan1+orphan2-common);print $1,$2,length(C1),length(C2),common,val;print $2,$1,length(C2),length(C1),common,val}' <(grep ">" $PRT) ${OUT}_cluster.tsv ${OUT}.allpairs.txt | sort -k1,1V -k2,2V > ${OUT}.jaccard.txt
 fi
 
 rm -rf "$OUT".logs
 
-printf "%-10s  --  %s\n" "[INFO]" "Done." | tee -a ${OUT}.wgrr.log
+printf "${green}%-17s  --  %s\n${normal}" "["$(textifyDuration $SECONDS)"]" "Done." | tee -a ${OUT}.wgrr.log
 if [[ $QT != 0 ]] ; then
-	printf "%-10s  --  %s\n" "[INFO]" "Total time in queue: $(textifyDuration $QT)" | tee -a ${OUT}.wgrr.log
+	printf "%-17s  --  %s\n" "["$(textifyDuration $SECONDS)"]" "Total time in queue: $(textifyDuration $QT)" | tee -a ${OUT}.wgrr.log
 fi
-duration=$SECONDS
-printf "%-10s  --  %s\n" "[ELAPSED]" $(textifyDuration $duration) | tee -a ${OUT}.wgrr.log
 
 exit 0
diff --git a/wGRR.awk b/wGRR.awk
old mode 100755
new mode 100644
index 94c0c71..390238b
--- a/wGRR.awk
+++ b/wGRR.awk
@@ -24,10 +24,6 @@ FILENAME==ARGV[2] && /^>/{
 	gsub(/_[^_]+$/,"",g)
 	getline
 	if(g in glist){
-		# For default MMseqs format, manual calculation of coverage
-		#getline
-		#PROTLIST[g][p]=length($0)
-		# For custom MMseqs output, no need to store prot length
 		PROTLIST[g][p]++
 	}
 }
@@ -50,23 +46,10 @@ FILENAME==ARGV[3] {
 		next
 	}
 
-#	Default MMseqs format
-#	Turn these on to use with Jorge's output
-#	Turn off my values below
-#	And more below, turn on the cov calculations
-#	fid=$3
-#	alilen=$4
-#	evalue=$11
-#	bits=$12
-#	MMseqs default format does not report coverage - needs to be calculated
-#	cov1=alilen/PROTLIST[g1][p1]
-#	cov2=alilen/PROTLIST[g2][p2]
-
 #	My values
 	cov1=$3
 	cov2=$4
 	fid=$5
-#	evalue=$6
 	bits=$7
 
 	# Fix for some evalues below awk precision
@@ -77,13 +60,13 @@ FILENAME==ARGV[3] {
 		}
 	}
 
-#	if(cov1<=COV || cov2<=COV || fid<=ID || evalue>=EVAL){
 	if(cov1<=COV || cov2<=COV || fid<=ID){
 		next
 	}
 
 	# Store id of all pairs
 	idpair[p1][p2]=fid
+	covpair[p1,p2]=cov1","cov2
 
 	if(g1==g2 && p1==p2){
 		BBS[p1][g2]=bits
@@ -151,55 +134,10 @@ END {
 		for(gj in PROTLIST){
 			if(gi" "gj in pairlist){
 				li=length(PROTLIST[gi])
-				lj=length(PROTLIST[gj])
-
-				# Create clusters based on the hits
-				# cluster ids are integers
-				# Find clusters inside genomes gi and gj
-				# The rationale is that if two proteins from the same genome belong to the same cluster, they shoud be counted only once in the wgrr calculation
-				# length(cluster) is the number of clusters - each cluster counts for 1 irrespective of its size
-				idx=0
-				split("",ref,"")
-				split("",cluster,"")
-				li_singleton=li
-				lj_singleton=lj
-				for(pi in PROTLIST[gi]){
-					for(pj in PROTLIST[gj]){
-						if((pi in idpair && pj in idpair[pi]) || (pj in idpair && pi in idpair[pj])) {
-							if(!(pi in ref) && !(pj in ref)){
-								idx++
-								ref[pi]=idx
-								li_singleton--
-								ref[pj]=idx
-								lj_singleton--
-								cluster[idx]=pi" "pj
-							}
-							else if(!(pi in ref)){
-								ref[pi]=ref[pj]
-								li_singleton--
-								cluster[ref[pj]]=cluster[ref[pj]]" "pi
-							}
-							else if(!(pj in ref)){
-								ref[pj]=ref[pi]
-								lj_singleton--
-								cluster[ref[pi]]=cluster[ref[pi]]" "pj
-							}
-							else if(ref[pi] != ref[pj]){
-								oldref=ref[pj]
-								ref[pj]=ref[pi]
-								cluster[ref[pi]]=cluster[ref[pi]]" "cluster[oldref]
-								delete cluster[oldref]
-							}
-						}
-					}
-				}
-				
-				sumidC=0
+				lj=length(PROTLIST[gj])				
 				sumidR=0
 				ncommonR=0
 				split("",bbh,"")
-				# keep track of bbh per cluster
-				split("",bbh_clust,"")
 				for(pi in PROTLIST[gi]){
 					# pi has a BUH and pi has a BUH in gj
 					if(pi in BUH && gj in BUH[pi]){
@@ -207,100 +145,44 @@ END {
 						# pj has a BUH, pj has a BUH in gi, pi is the BUH of pj
 						if(pj in BUH && gi in BUH[pj] && BUH[pj][gi]==pi){
 							sumidR=sumidR+idpair[pi][pj]
-							if(pi in ref && ref[pi] in bbh_clust){
-								if(idpair[pi][pj] > bbh_clust[ref[pi]]){
-									sumidC=sumidC-bbh_clust[ref[pi]]+idpair[pi][pj]
-									bbh_clust[ref[pi]]=idpair[pi][pj]
-								}
-							}
-							else if(pi in ref && !(ref[pi] in bbh_clust)){
-								sumidC=sumidC+idpair[pi][pj]
-								bbh_clust[ref[pi]]=idpair[pi][pj]
-							}
 							bbh[pi]++
 							bbh[pj]++
 							ncommonR++
-						}
-					}
-				}
-
-				ci=0
-				cj=0
-				# For the real BBH pairs, look at the cluster - if the cluster contains other proteins, the c value
-				# of the corresponding genome is increased
-				# In the end this value is deduced from the number of prot to account for the clusters
-				split("",counted,"")
-				# Loop through all proteins (of gi and gj) that belong to a bbh
-				# Consider all members of the cluster of each of these proteins
-				for(p in bbh){
-					# Get the cluster index
-					idx=ref[p]
-					split(cluster[idx],cluster_array," ")
-					for(clust in cluster_array){
-						# Other prot of the same cluster
-						pp=cluster_array[clust]
-						# Genome of the corresponding prot 
-						g=pp
-						gsub(/_[^_]+$/,"",g)
-						# Prot in a cluster but not in a bbh pair --> should not be counted
-						# Because we only compare proteins that are common to both genomes, i.e. bbh
-						# So proteins that do not have a bbh in the other genomes do not count
-						if(!(pp in bbh)){
-							if(!(pp in counted)){
-								g==gi ? ci=ci+1 : cj=cj+1
+							if(OBBH!=""){
+								print pi,pj,idpair[pi][pj],covpair[pi,pj] >> OBBH
 							}
-							counted[pp]++
 						}
 					}
 				}
 
-				# genome length ll is the visible genome length l minus the proteins that are in a cluster but not in a bbh
-				lli=li-ci
-				llj=lj-cj
-				lli < llj ? minprot=lli : minprot=llj
-
 				li < lj ? mp=li : mp=lj
 
-				# length(clusters) version. We count prot that are not in a cluster (singletons) + nb clusters
-				lci=li_singleton+length(cluster)
-				lcj=lj_singleton+length(cluster)
-				lci<lcj ? lc=lci : lc=lcj
-
 				if(mp==0){
 					print "bad mp for genomes "gi" "gj
 				}
-				if(minprot==0){
-					print "bad minprot for genomes"gi" "gj
-				}
-				if(lc==0){
-					print "bad lc for genomes "gi" "gj
-				}
 
 				if(gi==gj){
-					lc = length(cluster)
 					if(!MEM){
-						print gi,gj,sumidR/mp,ncommonR/li,li,lj,sumidR/minprot,ncommonR/((lli+llj)/2),lli,llj,sumidC/lc,length(bbh_clust)/lc,lc,lc
+						print gi,gj,sumidR/mp,2*sumidR/(li+lj),ncommonR/li,li,lj
 					}
 				}
 				else {
-					if(gj":"gi in part_wgrr1){
-						avg_wgrr1=(part_wgrr1[gj":"gi]+sumidR/mp)/2
-						avg_wgrr2=(part_wgrr2[gj":"gi]+sumidR/minprot)/2
-						avg_wgrr3=(part_wgrr3[gj":"gi]+sumidC/lc)/2
+					if(gj":"gi in part_wgrr){
+						avg_wgrr=(part_wgrr[gj":"gi]+sumidR/mp)/2
+						avg_SD=(part_SD[gj":"gi]+(2*sumidR/(li+lj)))/2
 						split(final_out[gj":"gi],ff,"\t")
 						# Print twice for both directions of the wGRR
 						if(!MEM){
-							print ff[1],ff[2],avg_wgrr1,ff[4],ff[5],ff[6],avg_wgrr2,ff[8],ff[9],ff[10],avg_wgrr3,ff[12],ff[13],ff[14]
-							print gi,gj,avg_wgrr1,ncommonR/((li+lj)/2),li,lj,avg_wgrr2,ncommonR/((lli+llj)/2),lli,llj,avg_wgrr3,length(bbh_clust)/((lci+lcj)/2),lci,lcj
+							print ff[1],ff[2],avg_wgrr,avg_SD,ff[5],ff[6],ff[7]
+							print gi,gj,avg_wgrr,avg_SD,ncommonR/((li+lj)/2),li,lj
 						}
 						delete part_wgrr[gj":"gi]
 						delete final_out[gj":"gi]
 					}
 					else{
-						part_wgrr1[gi":"gj]=sumidR/mp
-						part_wgrr2[gi":"gj]=sumidR/minprot
-						part_wgrr3[gi":"gj]=sumidC/lc
-						final_out[gi":"gj]=gi"\t"gj"\t"sumidR/mp"\t"ncommonR/((li+lj)/2)"\t"li"\t"lj"\t"sumidR/minprot"\t"ncommonR/((lli+llj)/2)"\t"lli"\t"llj"\t"sumidC/lc"\t"length(bbh_clust)/((lci+lcj)/2)"\t"lci"\t"lcj
+						part_wgrr[gi":"gj]=sumidR/mp
+						part_SD[gi":"gj]=2*sumidR/(li+lj)
+						final_out[gi":"gj]=gi"\t"gj"\t"sumidR/mp"\t"2*sumidR/(li+lj)"\t"ncommonR/((li+lj)/2)"\t"li"\t"lj
 					}
 				}
 			}
diff --git a/wGRR_worker.zsh b/wGRR_worker.zsh
index df1451d..93349ae 100755
--- a/wGRR_worker.zsh
+++ b/wGRR_worker.zsh
@@ -22,16 +22,17 @@ textifyDuration() {
    echo "$txt"
 }
 
-ARRAYSIZE=$1
-OUT=$2
-NJOBS=$3
-arg=$4
-PRT=$5
-TMP=$6
-STIME=$7
+AWKEXE=$1
+ARRAYSIZE=$2
+OUT=$3
+NJOBS=$4
+arg=$5
+PRT=$6
+TMP=$7
+STIME=$8
 
 if [[ `hostname` != "maestro-"* ]] || [[ ${SLURM_ARRAY_TASK_ID} == "" ]] ; then
-	SLURM_ARRAY_TASK_ID=$4
+	SLURM_ARRAY_TASK_ID=$5
 fi
 
 OUTFILE="$TMP"/$OUT.wgrr_part.${SLURM_ARRAY_TASK_ID}
@@ -41,9 +42,9 @@ MINP=$((MAXP-ARRAYSIZE+1))
 
 if [[ $STIME != "" ]] ; then
 	CTIME=$(date +%s)
-	printf "\r\033[K[PROGRESS]  --  [%-40s]  %s/%s  %s"  $(head -c $((arg*40/NJOBS)) < /dev/zero | tr "\0" "=") $arg $NJOBS $(textifyDuration $((CTIME-STIME))) 
+	printf "\r\033[K%-17s  --  [%-50s]  %s/%s  %s"  "[PROGRESS]" $(C=$((arg*50/NJOBS)) ; if [ $C -eq 0 ] ; then printf "=" ; else head -c $C < /dev/zero | tr "\0" "=" ; fi) $arg $NJOBS $(textifyDuration $((CTIME-STIME))) 
 fi
 
-awk -v MINP=$MINP -v MAXP=$MAXP -f wGRR.awk $OUT.allpairs.txt $PRT $OUT.m8 | sort -k1,1V -k2,2V > $OUTFILE
+$AWKEXE -v MINP=$MINP -v MAXP=$MAXP -v OBBH=${OUT}.bbh_part.${SLURM_ARRAY_TASK_ID} -f wGRR.awk $OUT.allpairs.txt $PRT $OUT.m8 | sort -k1,1V -k2,2V > $OUTFILE
 
 exit 0
-- 
GitLab