From 34cb42c47567f8c277b358e96b89c7105ad17bb4 Mon Sep 17 00:00:00 2001
From: jgugliel <julien.guglielmini@pasteur.fr>
Date: Mon, 13 Mar 2023 11:26:59 +0100
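Subject: [PATCH] A few simplifications

Alias `awk` (wGRR, wGRR_worker.zsh) and `mmseqs` (wGRR) to the selected
executables so the scripts can invoke them by their plain names instead
of going through $AWKEXE / $MMSEQS at every call site.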

---
 wGRR            | 52 ++++++++++++++++++++++++++-----------------------
 wGRR_worker.zsh |  4 ++--
 2 files changed, 30 insertions(+), 26 deletions(-)

diff --git a/wGRR b/wGRR
index 2c75c1c..b4b12fe 100755
--- a/wGRR
+++ b/wGRR
@@ -246,6 +246,8 @@ if ! echo "" | $AWKEXE '{a[1][2]=3}' &> /dev/null ; then
 	exit 1
 fi
 
+alias awk=$AWKEXE
+
 ## Check MMseqs
 if [[ $MMPATH == "N.O.P.A.T.H" ]] ; then
 	if ! MMSEQS=$(command -v mmseqs)  ; then
@@ -263,13 +265,15 @@ else
 	fi
 fi
 
+alias mmseqs=$MMSEQS
+
 if [[ ${IDLIST} != "N.O.L.I.S.T" ]] ; then
 	printf "%-17s  --  %s %s\n" "["$(textifyDuration $SECONDS)"]" "Sampling the input file ${PRT} according to the ${IDLIST} file"  | tee -a ${OUT}.wgrr.log
-	$AWKEXE 'NR==FNR{a[$1]++;next}s!=""{print s;s=""}/^>/{k=0;g=substr($1,2);gsub(/_[^_]+$/,"",g);if(g in a){print;k=1;s=""}next}k{s=s""$0}END{if(s!=""){print s}}' ${IDLIST} ${PRT} > ${PRT:t:r}.sample.prt
+	awk 'NR==FNR{a[$1]++;next}s!=""{print s;s=""}/^>/{k=0;g=substr($1,2);gsub(/_[^_]+$/,"",g);if(g in a){print;k=1;s=""}next}k{s=s""$0}END{if(s!=""){print s}}' ${IDLIST} ${PRT} > ${PRT:t:r}.sample.prt
 	PRT=${PRT:t:r}.sample.prt
 fi
 
-STATS=($($AWKEXE '/^>/{p++;g=substr($1,2);gsub(/_[^_]+$/,"",g);if(!a[g]++){c++};LNR=NR}{if(NR>LNR+1){n=1}}END{if(n!=1){n=0}print c"\t"p"\t"n"\t"p/c}' $PRT))
+STATS=($(awk '/^>/{p++;g=substr($1,2);gsub(/_[^_]+$/,"",g);if(!a[g]++){c++};LNR=NR}{if(NR>LNR+1){n=1}}END{if(n!=1){n=0}print c"\t"p"\t"n"\t"p/c}' $PRT))
 printf "%-17s  --  %s %s %s %s %s\n" "["$(textifyDuration $SECONDS)"]" "Input file has" $STATS[1] "genomes and a total of" $STATS[2] "proteins"  | tee -a ${OUT}.wgrr.log
 printf "%-17s  --  %s %s\n" "["$(textifyDuration $SECONDS)"]" "Mean number of proteins per genome:" $STATS[4]  | tee -a ${OUT}.wgrr.log
 
@@ -295,7 +299,7 @@ fi
 if [[ $STATS[3] -eq 1 ]]; then
 	printf "%-17s  --  %s\n" "["$(textifyDuration $SECONDS)"]" "Converting fasta file to sequential fasta"  | tee -a ${OUT}.wgrr.log
 	OPRT=$PRT
-	$AWKEXE '!/^>/{s=s$0;next}(s!=""){print s;s=""}{print}END{print s}' $OPRT > $OPRT:t:r_seq.prt
+	awk '!/^>/{s=s$0;next}(s!=""){print s;s=""}{print}END{print s}' $OPRT > $OPRT:t:r_seq.prt
 	PRT=$OPRT:t:r_seq.prt
 fi
 
@@ -303,11 +307,11 @@ if [[ -f $OUT.allpairs.txt ]] ; then
 	printf "%-17s  --  %s %s%s\n" "["$(textifyDuration $SECONDS)"]" "Using existing file" $OUT ".allpairs.txt" | tee -a ${OUT}.wgrr.log
 else
 	printf "%-17s  --  %s\n" "["$(textifyDuration $SECONDS)"]" "Writing genomes pairs" | tee -a ${OUT}.wgrr.log
-	$AWKEXE 'BEGIN{x=1}/^>/{g=substr($1,2);gsub(/_[^_]+$/,"",g);if(FNR==1){a[x]=g;++x;currg=g;next}if(g!=currg){a[x]=g;x++;currg=g}}END{i=0;while(++i in a){j=i;while(++j in a){print a[i]"\t"a[j]}}}' $PRT > $OUT.allpairs.txt
-	printf "%-17s  --  %s\n" "["$(textifyDuration $SECONDS)"]" "$(wc -l $OUT.allpairs.txt | $AWKEXE '{print $1}') pairs written" | tee -a ${OUT}.wgrr.log
+	awk 'BEGIN{x=1}/^>/{g=substr($1,2);gsub(/_[^_]+$/,"",g);if(FNR==1){a[x]=g;++x;currg=g;next}if(g!=currg){a[x]=g;x++;currg=g}}END{i=0;while(++i in a){j=i;while(++j in a){print a[i]"\t"a[j]}}}' $PRT > $OUT.allpairs.txt
+	printf "%-17s  --  %s\n" "["$(textifyDuration $SECONDS)"]" "$(wc -l $OUT.allpairs.txt | awk '{print $1}') pairs written" | tee -a ${OUT}.wgrr.log
 fi
 
-NPAIRS=$(wc -l $OUT.allpairs.txt | $AWKEXE '{print $1}')
+NPAIRS=$(wc -l $OUT.allpairs.txt | awk '{print $1}')
 if [[ ! -f $OUT.allpairs.txt ]] || [[ $NPAIRS < 1 ]] ; then
 	printf "${red}%-17s  --  %s\n${normal}" "[ERROR]" "An error occurred when writing the $OUT.allpairs.txt file." | tee -a ${OUT}.wgrr.log
 	exit 1
@@ -335,19 +339,19 @@ fi
 if [[ $SKIP == 0 || $BATCHFLAG == 1 ]] ; then
 
 	if [[ $STATS[1] -gt 5 ]] ; then
-		$AWKEXE '/^>/{g=$1;gsub(/_[0-9]+$/,"",g);a[g]++;if(length(a)>5){exit};print;getline;print}' $PRT > $OUT.testrun.prt
+		awk '/^>/{g=$1;gsub(/_[0-9]+$/,"",g);a[g]++;if(length(a)>5){exit};print;getline;print}' $PRT > $OUT.testrun.prt
 	else
 		cp $PRT $OUT.testrun.prt
 	fi
 
-	$AWKEXE 'BEGIN{x=1}/^>/{g=substr($1,2);gsub(/_[^_]+$/,"",g);if(FNR==1){a[x]=g;++x;currg=g;next}if(g!=currg){a[x]=g;x++;currg=g}}END{i=0;while(++i in a){j=i;while(++j in a){print a[i]"\t"a[j]}}}' $OUT.testrun.prt > $OUT.testrun.allpairs.txt
+	awk 'BEGIN{x=1}/^>/{g=substr($1,2);gsub(/_[^_]+$/,"",g);if(FNR==1){a[x]=g;++x;currg=g;next}if(g!=currg){a[x]=g;x++;currg=g}}END{i=0;while(++i in a){j=i;while(++j in a){print a[i]"\t"a[j]}}}' $OUT.testrun.prt > $OUT.testrun.allpairs.txt
 
 	printf "%-17s  --  %s\n" "["$(textifyDuration $SECONDS)"]" "Running MMseqs on a sample file" | tee -a ${OUT}.wgrr.log
-	$MMSEQS easy-search $OUT.testrun.prt $OUT.testrun.prt $OUT.testrun.m8 $tmp -s 7.5 --threads $THREADS --format-output "query,target,qcov,tcov,fident,evalue,bits" --add-self-matches > $OUT.testrun.mmseqs.search.log
+	mmseqs easy-search $OUT.testrun.prt $OUT.testrun.prt $OUT.testrun.m8 $tmp -s 7.5 --threads $THREADS --format-output "query,target,qcov,tcov,fident,evalue,bits" --add-self-matches > $OUT.testrun.mmseqs.search.log
 
-	M2=$($AWKEXE -f wGRR.awk -v MINP=1 -v MAXP=10 -v OUT=$OUT -v MEM=1 $OUT.testrun.allpairs.txt $OUT.testrun.prt $OUT.testrun.m8)
-	REQMEM=$(bc -l <<< $(numfmt --from=iec $M2)*($ARRAYSIZE*0.15) | numfmt --to=iec | $AWKEXE '{U=$0;gsub(/[^A-Za-z]/,"",U);V=$0;gsub(/[A-Za-z]+$/,"",V);split(V,a,".");n=split(a[1],b,"");c=b[1]+1;i=1;while(++i<=n){c=c"0"}print c""U}')
-	REQMEMT=$(bc -l <<< $(numfmt --from=iec $REQMEM)*$THREADS | numfmt --to=iec | $AWKEXE '{U=$0;gsub(/[^A-Za-z]/,"",U);V=$0;gsub(/[A-Za-z]+$/,"",V);split(V,a,".");n=split(a[1],b,"");c=b[1]+1;i=1;while(++i<=n){c=c"0"}print c""U}')
+	M2=$(awk -f wGRR.awk -v MINP=1 -v MAXP=10 -v OUT=$OUT -v MEM=1 $OUT.testrun.allpairs.txt $OUT.testrun.prt $OUT.testrun.m8)
+	REQMEM=$(bc -l <<< $(numfmt --from=iec $M2)*($ARRAYSIZE*0.15) | numfmt --to=iec | awk '{U=$0;gsub(/[^A-Za-z]/,"",U);V=$0;gsub(/[A-Za-z]+$/,"",V);split(V,a,".");n=split(a[1],b,"");c=b[1]+1;i=1;while(++i<=n){c=c"0"}print c""U}')
+	REQMEMT=$(bc -l <<< $(numfmt --from=iec $REQMEM)*$THREADS | numfmt --to=iec | awk '{U=$0;gsub(/[^A-Za-z]/,"",U);V=$0;gsub(/[A-Za-z]+$/,"",V);split(V,a,".");n=split(a[1],b,"");c=b[1]+1;i=1;while(++i<=n){c=c"0"}print c""U}')
 
 	if [[ $TESTRUN == 1 && $BATCHFLAG == 0 ]] ; then
 		printf "%-17s  --  %s\n" "["$(textifyDuration $SECONDS)"]" "With the current -a parameter (${ARRAYSIZE}) ${NJOBS} workers are required" | tee -a ${OUT}.wgrr.log
@@ -383,9 +387,9 @@ rm -rf ${OUT}.bbh_part.*(N)
 if [[ -f $OUT.m8 ]] ; then
 	printf "%-17s  --  %s %s%s\n" "["$(textifyDuration $SECONDS)"]" "Using existing MMseqs output file" $OUT ".m8"  | tee -a ${OUT}.wgrr.log
 else
-	MIDENT=$($AWKEXE '!/^>/{a[$0]++}END{for(i in a){if(a[i]>m){m=a[i]}}print m}' $PRT)
+	MIDENT=$(awk '!/^>/{a[$0]++}END{for(i in a){if(a[i]>m){m=a[i]}}print m}' $PRT)
 	if [[ $((MIDENT*2)) -gt "$MMS_DEF_MAX_SEQS" ]] ; then
-		MMS_MAX_SEQ_PARAM=($(echo $((MIDENT*2)) | $AWKEXE '{c=substr($1,2);gsub(/[0-9]/,0,c);print "--max-seqs "substr($1,1,1)+1""c}'))
+		MMS_MAX_SEQ_PARAM=($(echo $((MIDENT*2)) | awk '{c=substr($1,2);gsub(/[0-9]/,0,c);print "--max-seqs "substr($1,1,1)+1""c}'))
 		printf "%-17s  --  %s %s\n" "["$(textifyDuration $SECONDS)"]" "Setting MMseqs parameter" "$MMS_MAX_SEQ_PARAM" | tee -a ${OUT}.wgrr.log
 	fi
 	if [[ $BATCHFLAG == 1 ]] ; then
@@ -393,16 +397,16 @@ else
 			printf "%-17s  --  %s\n" "["$(textifyDuration $SECONDS)"]" "Submitting MMseqs search to Maestro" | tee -a ${OUT}.wgrr.log
 			printf "%-17s  --  %s\n" "["$(textifyDuration $SECONDS)"]" "The command is:" >> ${OUT}.wgrr.log
 			printf "%-17s  --  %s\n" "["$(textifyDuration $SECONDS)"]" "sbatch --wait --parsable -o ${OUT}.mmseqs.log -p ${PARTITION} -c ${THREADS} -J \"wGRR_MMSeqs\" --wrap=\"${MMSEQS} easy-search ${PRT} ${PRT} ${OUT}.m8 ${tmp} -s 7.5 --threads ${THREADS} --format-output \"query,target,qcov,tcov,fident,evalue,bits\" --add-self-matches ${MMS_MAX_SEQ_PARAM}" >> ${OUT}.wgrr.log
-			JID=$(sbatch --wait --parsable -o "$OUT".mmseqs.log -p $PARTITION -c $THREADS -J "wGRR_MMSeqs" --wrap="$MMSEQS easy-search $PRT $PRT ${OUT}.m8 ${tmp} -s 7.5 --threads $THREADS --format-output \"query,target,qcov,tcov,fident,evalue,bits\" --add-self-matches $MMS_MAX_SEQ_PARAM")
-			PQT=$(sacct -X -j $JID -o Reserved -n | $AWKEXE '{n=split($1,a,"-");if(n>1){t=t+a[1]*86400};split(a[n],b,":");t=t+b[1]*3600+b[2]*60+b[3];print t}')
+			JID=$(sbatch --wait --parsable -o "$OUT".mmseqs.log -p $PARTITION -c $THREADS -J "wGRR_MMSeqs" --wrap="$MMSEQS easy-search $PRT $PRT ${OUT}.m8 ${tmp} -s 7.5 --threads $THREADS --format-output \"query,target,qcov,tcov,fident,evalue,bits\" --add-self-matches $MMS_MAX_SEQ_PARAM") # keep $MMSEQS here: the mmseqs alias is not expanded inside the --wrap string, which runs in the batch job's own shell
+			PQT=$(sacct -X -j $JID -o Reserved -n | awk '{n=split($1,a,"-");if(n>1){t=t+a[1]*86400};split(a[n],b,":");t=t+b[1]*3600+b[2]*60+b[3];print t}')
 			printf "%-17s  --  %s %s %s %s %s\n" "["$(textifyDuration $SECONDS)"]" "The job" $JID "has been" $(textifyDuration $PQT) "in queue" | tee -a ${OUT}.wgrr.log
 			QT=$((QT+PQT))
 		else
 			printf "%-17s  --  %s\n" "["$(textifyDuration $SECONDS)"]" "Submitting MMseqs search and linclust to Maestro" | tee -a ${OUT}.wgrr.log
 			printf "%-17s  --  %s\n" "["$(textifyDuration $SECONDS)"]" "The command is:" >> ${OUT}.wgrr.log
 			printf "%-17s  --  %s\n" "["$(textifyDuration $SECONDS)"]" "sbatch --wait --parsable -o ${OUT}.mmseqs.log -p ${PARTITION} -c ${THREADS} -J \"wGRR_MMSeqs\" --wrap=\"${MMSEQS} easy-search ${PRT} ${PRT} ${OUT}.m8 ${tmp} -s 7.5 --threads ${THREADS} --format-output \"query,target,qcov,tcov,fident,evalue,bits\" --add-self-matches ${MMS_MAX_SEQ_PARAM} ; ${MMSEQS} easy-linclust ${PRT} ${OUT} ${tmp} --threads ${THREADS}" >> ${OUT}.wgrr.log
-			JID=$(sbatch --wait --parsable -o "$OUT".mmseqs.log -p $PARTITION -c $THREADS -J "wGRR_MMSeqs" --wrap="$MMSEQS easy-search $PRT $PRT ${OUT}.m8 ${tmp} -s 7.5 --threads $THREADS --format-output \"query,target,qcov,tcov,fident,evalue,bits\" --add-self-matches $MMS_MAX_SEQ_PARAM ; ${MMSEQS} easy-linclust ${PRT} ${OUT} ${tmp} --threads ${THREADS}")
-			PQT=$(sacct -X -j $JID -o Reserved -n | $AWKEXE '{n=split($1,a,"-");if(n>1){t=t+a[1]*86400};split(a[n],b,":");t=t+b[1]*3600+b[2]*60+b[3];print t}')
+			JID=$(sbatch --wait --parsable -o "$OUT".mmseqs.log -p $PARTITION -c $THREADS -J "wGRR_MMSeqs" --wrap="$MMSEQS easy-search $PRT $PRT ${OUT}.m8 ${tmp} -s 7.5 --threads $THREADS --format-output \"query,target,qcov,tcov,fident,evalue,bits\" --add-self-matches $MMS_MAX_SEQ_PARAM ; ${MMSEQS} easy-linclust ${PRT} ${OUT} ${tmp} --threads ${THREADS}") # keep $MMSEQS here: the mmseqs alias is not expanded inside the --wrap string, which runs in the batch job's own shell
+			PQT=$(sacct -X -j $JID -o Reserved -n | awk '{n=split($1,a,"-");if(n>1){t=t+a[1]*86400};split(a[n],b,":");t=t+b[1]*3600+b[2]*60+b[3];print t}')
 			printf "%-17s  --  %s %s %s %s %s\n" "["$(textifyDuration $SECONDS)"]" "The job" $JID "has been" $(textifyDuration $PQT) "in queue" | tee -a ${OUT}.wgrr.log
 			QT=$((QT+PQT))
 		fi
@@ -410,13 +414,13 @@ else
 		printf "%-17s  --  %s\n" "["$(textifyDuration $SECONDS)"]" "Running MMseqs search" | tee -a ${OUT}.wgrr.log
 		printf "%-17s  --  %s\n" "["$(textifyDuration $SECONDS)"]" "The command is:" >> ${OUT}.wgrr.log
 		printf "%-17s  --  %s\n" "["$(textifyDuration $SECONDS)"]" "${MMSEQS} easy-search ${PRT} ${PRT} ${OUT}.m8 ${tmp} -s 7.5 --threads ${THREADS} --format-output \"query,target,qcov,tcov,fident,evalue,bits\" --add-self-matches ${MMS_MAX_SEQ_PARAM} > ${OUT}.mmseqs.search.log" >> ${OUT}.wgrr.log
-		$MMSEQS easy-search $PRT $PRT $OUT.m8 $tmp -s 7.5 --threads $THREADS --format-output "query,target,qcov,tcov,fident,evalue,bits" --add-self-matches $MMS_MAX_SEQ_PARAM > $OUT.mmseqs.search.log
+		mmseqs easy-search $PRT $PRT $OUT.m8 $tmp -s 7.5 --threads $THREADS --format-output "query,target,qcov,tcov,fident,evalue,bits" --add-self-matches $MMS_MAX_SEQ_PARAM > $OUT.mmseqs.search.log
 
 		if [[ $JACCARD == 1 ]] ; then
 			printf "%-17s  --  %s\n" "["$(textifyDuration $SECONDS)"]" "Running MMseqs linclust" | tee -a ${OUT}.wgrr.log
 			printf "%-17s  --  %s\n" "["$(textifyDuration $SECONDS)"]" "The command is:" >> ${OUT}.wgrr.log
 			printf "%-17s  --  %s\n" "["$(textifyDuration $SECONDS)"]" "${MMSEQS} easy-linclust ${PRT} ${OUT} ${tmp} --threads ${THREADS} > ${OUT}.mmseqs.linclust.log" >> ${OUT}.wgrr.log
-			$MMSEQS easy-linclust $PRT $OUT $tmp --threads $THREADS > $OUT.mmseqs.linclust.log
+			mmseqs easy-linclust $PRT $OUT $tmp --threads $THREADS > $OUT.mmseqs.linclust.log
 		fi
 	fi
 fi
@@ -476,7 +480,7 @@ else
 		mv $tmp/"$OUT".wgrr_part.* "$OUT".wgrr_part/
 		exit 1
 	fi
-	PQT=$(sacct -X -j $JID -o Reserved -n | $AWKEXE 'NR==1{prevt=0}{t=0;n=split($1,a,"-");if(n>1){t=t+a[1]*86400};split(a[n],b,":");t=t+b[1]*3600+b[2]*60+b[3];if(t<prevt){tt=tt+prevt}prevt=t}END{print tt+t}')
+	PQT=$(sacct -X -j $JID -o Reserved -n | awk 'NR==1{prevt=0}{t=0;n=split($1,a,"-");if(n>1){t=t+a[1]*86400};split(a[n],b,":");t=t+b[1]*3600+b[2]*60+b[3];if(t<prevt){tt=tt+prevt}prevt=t}END{print tt+t}')
 	printf "%-17s  --  %s %s %s %s %s\n" "["$(textifyDuration $SECONDS)"]" "The job" $JID "has been" $(textifyDuration $PQT) "in queue" | tee -a ${OUT}.wgrr.log
 	QT=$((QT+PQT))
 	mkdir -p "$OUT".logs
@@ -484,7 +488,7 @@ else
 fi
 
 printf "%-17s  --  %s\n" "["$(textifyDuration $SECONDS)"]" "Sorting results" | tee -a ${OUT}.wgrr.log
-sort -m "$tmp"/$OUT.wgrr_part.* | sort -u -k1,1V -k2,2V | $AWKEXE 'BEGIN{print "GenomeA\tGenomeB\twGRR\tSørensen-Dice\tCommon\tNprotA\tNprotB"}1' > $OUT.wgrr.txt
+sort -m "$tmp"/$OUT.wgrr_part.* | sort -u -k1,1V -k2,2V | awk 'BEGIN{print "GenomeA\tGenomeB\twGRR\tSørensen-Dice\tCommon\tNprotA\tNprotB"}1' > $OUT.wgrr.txt
 
 if [[ ! -f "$OUT".wgrr.txt ]] ; then
 	printf "${red}%-17s  --  %s\n${normal}" "[ERROR]" "Failed to sort the wGRR table." | tee -a ${OUT}.wgrr.log
@@ -501,7 +505,7 @@ if [[ ! -s ${OUT}.bbh.txt ]] ; then
 	printf "%-17s  --  %s\n" "[WARNING]" "Failed to produce the BBH output file." | tee -a ${OUT}.wgrr.log
 fi
 
-NLINES=$(wc -l "$OUT".wgrr.txt | $AWKEXE '{print $1}')
+NLINES=$(wc -l "$OUT".wgrr.txt | awk '{print $1}')
 NWGRR=$((STATS[1]*STATS[1]+1))
 if [[ $NLINES -lt $NWGRR ]] ; then
 	printf "${red}%-17s  --  %s\n${normal}" "[ERROR]" "An error occurred during wGRR calculation:" | tee -a ${OUT}.wgrr.log
@@ -517,7 +521,7 @@ fi
 
 if [[ $JACCARD == 1 ]] ; then
 	printf "%-17s  --  %s\n" "["$(textifyDuration $SECONDS)"]" "Jaccard index calculation" | tee -a ${OUT}.wgrr.log
-	$AWKEXE 'BEGIN{c=0;OFS="\t"}NR==FNR{p=substr($1,2);g=p;gsub(/_[^_]++$/,"",g);G[g][p]++;next}FILENAME==ARGV[2]{if(!($1 in A)){A[$1]++;c++}C[$2]=c}FILENAME==ARGV[3]{split("",C1,"");split("",C2,"");common=0;orphan1=0;orphan2=0;for(p in G[$1]){if(!(p in C)){orphan1++}else{C1[C[p]]++};for(q in G[$2]){if(!(q in C)){orphan2++}else{C2[C[q]]++}}}for(x in C1){if(x in C2){common++}}if(!($1 in O)){O[$1]++;print $1,$1,length(C1),length(C1),length(C1),"1"};if(!($2 in O)){O[$2]++;print $2,$2,length(C2),length(C2),length(C2),"1"}val=common/(length(C1)+length(C2)+orphan1+orphan2-common);print $1,$2,length(C1),length(C2),common,val;print $2,$1,length(C2),length(C1),common,val}' <(grep ">" $PRT) ${OUT}_cluster.tsv ${OUT}.allpairs.txt | sort -k1,1V -k2,2V > ${OUT}.jaccard.txt
+	awk 'BEGIN{c=0;OFS="\t"}NR==FNR{p=substr($1,2);g=p;gsub(/_[^_]++$/,"",g);G[g][p]++;next}FILENAME==ARGV[2]{if(!($1 in A)){A[$1]++;c++}C[$2]=c}FILENAME==ARGV[3]{split("",C1,"");split("",C2,"");common=0;orphan1=0;orphan2=0;for(p in G[$1]){if(!(p in C)){orphan1++}else{C1[C[p]]++};for(q in G[$2]){if(!(q in C)){orphan2++}else{C2[C[q]]++}}}for(x in C1){if(x in C2){common++}}if(!($1 in O)){O[$1]++;print $1,$1,length(C1),length(C1),length(C1),"1"};if(!($2 in O)){O[$2]++;print $2,$2,length(C2),length(C2),length(C2),"1"}val=common/(length(C1)+length(C2)+orphan1+orphan2-common);print $1,$2,length(C1),length(C2),common,val;print $2,$1,length(C2),length(C1),common,val}' <(grep ">" $PRT) ${OUT}_cluster.tsv ${OUT}.allpairs.txt | sort -k1,1V -k2,2V > ${OUT}.jaccard.txt
 fi
 
 rm -rf "$OUT".logs
diff --git a/wGRR_worker.zsh b/wGRR_worker.zsh
index 93349ae..3047985 100755
--- a/wGRR_worker.zsh
+++ b/wGRR_worker.zsh
@@ -22,7 +22,7 @@ textifyDuration() {
    echo "$txt"
 }
 
-AWKEXE=$1
+alias awk=$1
 ARRAYSIZE=$2
 OUT=$3
 NJOBS=$4
@@ -45,6 +45,6 @@ if [[ $STIME != "" ]] ; then
 	printf "\r\033[K%-17s  --  [%-50s]  %s/%s  %s"  "[PROGRESS]" $(C=$((arg*50/NJOBS)) ; if [ $C -eq 0 ] ; then printf "=" ; else head -c $C < /dev/zero | tr "\0" "=" ; fi) $arg $NJOBS $(textifyDuration $((CTIME-STIME))) 
 fi
 
-$AWKEXE -v MINP=$MINP -v MAXP=$MAXP -v OBBH=${OUT}.bbh_part.${SLURM_ARRAY_TASK_ID} -f wGRR.awk $OUT.allpairs.txt $PRT $OUT.m8 | sort -k1,1V -k2,2V > $OUTFILE
+awk -v MINP=$MINP -v MAXP=$MAXP -v OBBH=${OUT}.bbh_part.${SLURM_ARRAY_TASK_ID} -f wGRR.awk $OUT.allpairs.txt $PRT $OUT.m8 | sort -k1,1V -k2,2V > $OUTFILE
 
 exit 0
-- 
GitLab