From d57d1088fa2619f1154cec10d465a6d428af83b6 Mon Sep 17 00:00:00 2001 From: jgugliel <julien.guglielmini@pasteur.fr> Date: Mon, 15 May 2023 11:40:56 +0200 Subject: [PATCH] Added coverage and identity thresholds as parameters --- README.md | 6 +++++- wGRR | 14 +++++++++++--- wGRR.awk | 2 -- wGRR_worker.zsh | 4 +++- 4 files changed, 19 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index cdcad89..2452af6 100644 --- a/README.md +++ b/README.md @@ -47,7 +47,7 @@ chmod +x wGRR*.zsh ### On a local machine ```bash -./wGRR -i $fasta [-p $mmseqs2_path -o $output_prefix -t $threads -a $comparisons -l $id_file -T -f -j -s] +./wGRR -i $fasta [-C $coverage_threshold -I $identity_threshold -p $mmseqs2_path -o $output_prefix -t $threads -a $comparisons -l $id_file -T -f -j -s] ``` **WARNING** Memory consumption can be really high and running `wGRR` might exhaust your system. By default, `wGRR` will estimate the required memory and stop if your system does not fulfill this requirement. @@ -100,6 +100,10 @@ This corresponds to the protein `00141` of the element `ESCO001.0321.00001.P001` * `$output_prefix` is a prefix that will be used for each output files. By default, a random string is used. +* `$coverage_threshold` is the coverage threshold for defining the BBH pairs (float number, between 0 and 1). The default value is 0.5. + +* `$identity_threshold` is the sequence identity threshold for defining the BBH pairs (float number, between 0 and 1). The default value is 0.35. + * `$threads` is the number of threads to use. Using more than one is highly recommended. * `$comparisons` is the number of genetic elements to be compared simultaneously on each thread. A value of 1,000 means that, on each thread, groups of 1,000 genetic elements will be compared simultaneously. Increasing this number will decrease the total number of tasks to be performed but will increase RAM usage. By default, a reasonable value is automatically determined. This value should result in workers running in less than 2 hours and which is perfectly adapted to the use of the `-f` flag (see the "sbatch" section above). diff --git a/wGRR b/wGRR index b0d91cb..a719527 100755 --- a/wGRR +++ b/wGRR @@ -7,7 +7,7 @@ trap 'rm -rf "$tmp"' EXIT export LC_ALL=C SECONDS=0 -readonly VERSION=1.3 +readonly VERSION=1.4 bold=$(tput bold) green=$(tput setaf 2) @@ -36,6 +36,10 @@ display_usage() { echo " If wGRR is used on Maestro, MMSeqs will be automatically loaded." echo " -o <string> Base name for the output files." echo " By default a random string will be used." + echo " -C <float> Coverage threshold for the BBH pairs." + echo " default: 0.5" + echo " -I <float> Identity threshold for the BBH pairs." + echo " default: 0.35" echo " -t <integer> Number of threads to use." echo " default: 1" echo " recommended: at least 4" @@ -124,9 +128,11 @@ TESTRUN=0 ## -T FAST=0 ## -f SKIP=0 ## -s JACCARD=0 ## -j +COVT=0.5 ## -C +IDT=0.35 ## -I ## catch option values -while getopts :fTsji:p:o:t:a:m:l: option ; do +while getopts :fTsji:p:o:t:a:m:l:C:I: option ; do case $option in i) PRT="$OPTARG"; if [[ ! -s $PRT ]]; then printf "${bold}${red}%-17s -- %s\n${normal}" "[ERROR]" "Fasta file '$PRT' not found or empty (option -i)." ; exit 1 ; fi ;; p) MMPATH="$OPTARG" ;; @@ -139,6 +145,8 @@ while getopts :fTsji:p:o:t:a:m:l: option ; do f) FAST=1 ;; s) SKIP=1 ;; j) JACCARD=1 ;; + C) COVT="$OPTARG"; if [[ ${COVT} -gt 1 ]] || [[ ${COVT} -lt 0 ]] ; then printf "${bold}${red}%-17s -- %s\n${normal}" "[ERROR]" "Coverage threshold must be between 0 and 1 (option -C)." ; exit 1 ; fi ;; + I) IDT="$OPTARG"; if [[ ${IDT} -gt 1 ]] || [[ ${IDT} -lt 0 ]] ; then printf "${bold}${red}%-17s -- %s\n${normal}" "[ERROR]" "Identity threshold must be between 0 and 1 (option -C)." ; exit 1 ; fi ;; :) printf "${bold}${red}%-17s -- %s\n${normal}" "[ERROR]" "option $OPTARG : missing argument" ; exit 1 ;; \?) printf "${bold}${red}%-17s -- %s\n${normal}" "[ERROR]" "$OPTARG : invalid option" ; exit 1 ;; esac @@ -349,7 +357,7 @@ if [[ $SKIP == 0 || $BATCHFLAG == 1 ]] ; then printf "%-17s -- %s\n" "["$(textifyDuration $SECONDS)"]" "Running MMseqs on a sample file" | tee -a ${OUT}.wgrr.log mmseqs easy-search $OUT.testrun.prt $OUT.testrun.prt $OUT.testrun.m8 $tmp -s 7.5 --threads $THREADS --format-output "query,target,qcov,tcov,fident,evalue,bits" --add-self-matches > $OUT.testrun.mmseqs.search.log - M2=$(awk -f wGRR.awk -v MINP=1 -v MAXP=10 -v OUT=$OUT -v MEM=1 $OUT.testrun.allpairs.txt $OUT.testrun.prt $OUT.testrun.m8) + M2=$(awk -f wGRR.awk -v MINP=1 -v MAXP=10 -v OUT=$OUT -v MEM=1 -v COV=${COVT} -v ID=${IDT} $OUT.testrun.allpairs.txt $OUT.testrun.prt $OUT.testrun.m8) REQMEM=$(bc -l <<< $(numfmt --from=iec $M2)*($ARRAYSIZE*0.15) | numfmt --to=iec | awk '{U=$0;gsub(/[^A-Za-z]/,"",U);V=$0;gsub(/[A-Za-z]+$/,"",V);split(V,a,".");n=split(a[1],b,"");c=b[1]+1;i=1;while(++i<=n){c=c"0"}print c""U}') REQMEMT=$(bc -l <<< $(numfmt --from=iec $REQMEM)*$THREADS | numfmt --to=iec | awk '{U=$0;gsub(/[^A-Za-z]/,"",U);V=$0;gsub(/[A-Za-z]+$/,"",V);split(V,a,".");n=split(a[1],b,"");c=b[1]+1;i=1;while(++i<=n){c=c"0"}print c""U}') diff --git a/wGRR.awk b/wGRR.awk index 390238b..3b3b87e 100644 --- a/wGRR.awk +++ b/wGRR.awk @@ -1,7 +1,5 @@ BEGIN { FS=OFS="\t" - COV=0.5 - ID=0.35 EVAL=1e-4 } diff --git a/wGRR_worker.zsh b/wGRR_worker.zsh index 3047985..3a2034a 100755 --- a/wGRR_worker.zsh +++ b/wGRR_worker.zsh @@ -30,6 +30,8 @@ arg=$5 PRT=$6 TMP=$7 STIME=$8 +COVT=$9 +IDT=${10} if [[ `hostname` != "maestro-"* ]] || [[ ${SLURM_ARRAY_TASK_ID} == "" ]] ; then SLURM_ARRAY_TASK_ID=$5 @@ -45,6 +47,6 @@ if [[ $STIME != "" ]] ; then printf "\r\033[K%-17s -- [%-50s] %s/%s %s" "[PROGRESS]" $(C=$((arg*50/NJOBS)) ; if [ $C -eq 0 ] ; then printf "=" ; else head -c $C < /dev/zero | tr "\0" "=" ; fi) $arg $NJOBS $(textifyDuration $((CTIME-STIME))) fi -awk -v MINP=$MINP -v MAXP=$MAXP -v OBBH=${OUT}.bbh_part.${SLURM_ARRAY_TASK_ID} -f wGRR.awk $OUT.allpairs.txt $PRT $OUT.m8 | sort -k1,1V -k2,2V > $OUTFILE +awk -v COV=${COVT} -v ID=${IDT} -v MINP=$MINP -v MAXP=$MAXP -v OBBH=${OUT}.bbh_part.${SLURM_ARRAY_TASK_ID} -f wGRR.awk $OUT.allpairs.txt $PRT $OUT.m8 | sort -k1,1V -k2,2V > $OUTFILE exit 0 -- GitLab