Skip to content
Snippets Groups Projects
Commit d57d1088 authored by Julien  GUGLIELMINI's avatar Julien GUGLIELMINI
Browse files

Added coverage and identity thresholds as parameters

parent 2a4329f4
No related branches found
No related tags found
No related merge requests found
......@@ -47,7 +47,7 @@ chmod +x wGRR*.zsh
### On a local machine
```bash
./wGRR -i $fasta [-p $mmseqs2_path -o $output_prefix -t $threads -a $comparisons -l $id_file -T -f -j -s]
./wGRR -i $fasta [-C $coverage_threshold -I $identity_threshold -p $mmseqs2_path -o $output_prefix -t $threads -a $comparisons -l $id_file -T -f -j -s]
```
**WARNING**
Memory consumption can be really high and running `wGRR` might exhaust your system. By default, `wGRR` will estimate the required memory and stop if your system does not fulfill this requirement.
......@@ -100,6 +100,10 @@ This corresponds to the protein `00141` of the element `ESCO001.0321.00001.P001`
* `$output_prefix` is a prefix that will be used for each output files. By default, a random string is used.
* `$coverage_threshold` is the coverage threshold for defining the BBH pairs (float number, between 0 and 1). The default value is 0.5.
* `$identity_threshold` is the sequence identity threshold for defining the BBH pairs (float number, between 0 and 1). The default value is 0.35.
* `$threads` is the number of threads to use. Using more than one is highly recommended.
* `$comparisons` is the number of genetic elements to be compared simultaneously on each thread. A value of 1,000 means that, on each thread, groups of 1,000 genetic elements will be compared simultaneously. Increasing this number will decrease the total number of tasks to be performed but will increase RAM usage. By default, a reasonable value is automatically determined. This value should result in workers running in less than 2 hours and which is perfectly adapted to the use of the `-f` flag (see the "sbatch" section above).
......
......@@ -7,7 +7,7 @@ trap 'rm -rf "$tmp"' EXIT
export LC_ALL=C
SECONDS=0
readonly VERSION=1.3
readonly VERSION=1.4
bold=$(tput bold)
green=$(tput setaf 2)
......@@ -36,6 +36,10 @@ display_usage() {
echo " If wGRR is used on Maestro, MMSeqs will be automatically loaded."
echo " -o <string> Base name for the output files."
echo " By default a random string will be used."
echo " -C <float> Coverage threshold for the BBH pairs."
echo " default: 0.5"
echo " -I <float> Identity threshold for the BBH pairs."
echo " default: 0.35"
echo " -t <integer> Number of threads to use."
echo " default: 1"
echo " recommended: at least 4"
......@@ -124,9 +128,11 @@ TESTRUN=0 ## -T
FAST=0 ## -f
SKIP=0 ## -s
JACCARD=0 ## -j
COVT=0.5 ## -C
IDT=0.35 ## -I
## catch option values
while getopts :fTsji:p:o:t:a:m:l: option ; do
while getopts :fTsji:p:o:t:a:m:l:C:I: option ; do
case $option in
i) PRT="$OPTARG"; if [[ ! -s $PRT ]]; then printf "${bold}${red}%-17s -- %s\n${normal}" "[ERROR]" "Fasta file '$PRT' not found or empty (option -i)." ; exit 1 ; fi ;;
p) MMPATH="$OPTARG" ;;
......@@ -139,6 +145,8 @@ while getopts :fTsji:p:o:t:a:m:l: option ; do
f) FAST=1 ;;
s) SKIP=1 ;;
j) JACCARD=1 ;;
C) COVT="$OPTARG"; if [[ ${COVT} -gt 1 ]] || [[ ${COVT} -lt 0 ]] ; then printf "${bold}${red}%-17s -- %s\n${normal}" "[ERROR]" "Coverage threshold must be between 0 and 1 (option -C)." ; exit 1 ; fi ;;
I) IDT="$OPTARG"; if [[ ${IDT} -gt 1 ]] || [[ ${IDT} -lt 0 ]] ; then printf "${bold}${red}%-17s -- %s\n${normal}" "[ERROR]" "Identity threshold must be between 0 and 1 (option -C)." ; exit 1 ; fi ;;
:) printf "${bold}${red}%-17s -- %s\n${normal}" "[ERROR]" "option $OPTARG : missing argument" ; exit 1 ;;
\?) printf "${bold}${red}%-17s -- %s\n${normal}" "[ERROR]" "$OPTARG : invalid option" ; exit 1 ;;
esac
......@@ -349,7 +357,7 @@ if [[ $SKIP == 0 || $BATCHFLAG == 1 ]] ; then
printf "%-17s -- %s\n" "["$(textifyDuration $SECONDS)"]" "Running MMseqs on a sample file" | tee -a ${OUT}.wgrr.log
mmseqs easy-search $OUT.testrun.prt $OUT.testrun.prt $OUT.testrun.m8 $tmp -s 7.5 --threads $THREADS --format-output "query,target,qcov,tcov,fident,evalue,bits" --add-self-matches > $OUT.testrun.mmseqs.search.log
M2=$(awk -f wGRR.awk -v MINP=1 -v MAXP=10 -v OUT=$OUT -v MEM=1 $OUT.testrun.allpairs.txt $OUT.testrun.prt $OUT.testrun.m8)
M2=$(awk -f wGRR.awk -v MINP=1 -v MAXP=10 -v OUT=$OUT -v MEM=1 -v COV=${COVT} -v ID=${IDT} $OUT.testrun.allpairs.txt $OUT.testrun.prt $OUT.testrun.m8)
REQMEM=$(bc -l <<< $(numfmt --from=iec $M2)*($ARRAYSIZE*0.15) | numfmt --to=iec | awk '{U=$0;gsub(/[^A-Za-z]/,"",U);V=$0;gsub(/[A-Za-z]+$/,"",V);split(V,a,".");n=split(a[1],b,"");c=b[1]+1;i=1;while(++i<=n){c=c"0"}print c""U}')
REQMEMT=$(bc -l <<< $(numfmt --from=iec $REQMEM)*$THREADS | numfmt --to=iec | awk '{U=$0;gsub(/[^A-Za-z]/,"",U);V=$0;gsub(/[A-Za-z]+$/,"",V);split(V,a,".");n=split(a[1],b,"");c=b[1]+1;i=1;while(++i<=n){c=c"0"}print c""U}')
......
BEGIN {
FS=OFS="\t"
COV=0.5
ID=0.35
EVAL=1e-4
}
......
......@@ -30,6 +30,8 @@ arg=$5
PRT=$6
TMP=$7
STIME=$8
COVT=$9
IDT=${10}
if [[ `hostname` != "maestro-"* ]] || [[ ${SLURM_ARRAY_TASK_ID} == "" ]] ; then
SLURM_ARRAY_TASK_ID=$5
......@@ -45,6 +47,6 @@ if [[ $STIME != "" ]] ; then
printf "\r\033[K%-17s -- [%-50s] %s/%s %s" "[PROGRESS]" $(C=$((arg*50/NJOBS)) ; if [ $C -eq 0 ] ; then printf "=" ; else head -c $C < /dev/zero | tr "\0" "=" ; fi) $arg $NJOBS $(textifyDuration $((CTIME-STIME)))
fi
awk -v MINP=$MINP -v MAXP=$MAXP -v OBBH=${OUT}.bbh_part.${SLURM_ARRAY_TASK_ID} -f wGRR.awk $OUT.allpairs.txt $PRT $OUT.m8 | sort -k1,1V -k2,2V > $OUTFILE
awk -v COV=${COVT} -v ID=${IDT} -v MINP=$MINP -v MAXP=$MAXP -v OBBH=${OUT}.bbh_part.${SLURM_ARRAY_TASK_ID} -f wGRR.awk $OUT.allpairs.txt $PRT $OUT.m8 | sort -k1,1V -k2,2V > $OUTFILE
exit 0
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment