diff --git a/README.md b/README.md index 8edc271c3debbd22d853f0076e55f005e2dba00b..dacd65bd1244567f46b7b78591127a00eaf0fa28 100644 --- a/README.md +++ b/README.md @@ -22,9 +22,11 @@ The scripts are written in `zsh` and `awk` and as such wGRR should run on any Un wGRR can also be run on Institut Pasteur's cluster Maestro. In this case, all dependencies will be automatically loaded upon execution. ## Installation -Download the files, then (if necessary): +Download the files and make them executable (if necessary): ```bash -chmod +x wGRR* +git clone git@gitlab.pasteur.fr:jgugliel/wGRR.git wGRR +cd wGRR +chmod +x wGRR*.zsh ``` ## Usage @@ -84,9 +86,12 @@ This corresponds to the protein `00141` of the element `ESCO001.0321.00001.P001` * `$threads` is the number of threads to use. Using more than one is highly recommended. -* `$comparisons` is the number of genetic elements to be compared simultaneously on each thread. The default value, 10,000, means that on each thread groups of 10,000 genetic elements will be compared simultaneously. Increasing this number will decrease the total number of tasks to be performed but will increase RAM usage. +* `$comparisons` is the number of genetic elements to be compared simultaneously on each thread. A value of 1,000 means that, on each thread, groups of 1,000 genetic elements will be compared simultaneously. Increasing this number will decrease the total number of tasks to be performed but will increase RAM usage. By default, a reasonable value is automatically determined. This value should result in workers running in less than 2 hours and which is perfectly adapted to the use of the `-f` flag (see the "sbatch" section above). -* test run. By using the `-T` flag, a test run is executed, which omits MMseqs and wGRR calculations, and outputs some statistics of your input file to help you set up the analysis properly. +* test run. By using the `-T` flag, a test run is executed, which omits MMseqs and wGRR calculations, and outputs some statistics of your input file to help you set up the run properly. + +* If `wGRR` runs interactively (not run with `sbatch` on Maestro), the total amount of required memory will be estimated and compared to the total memory of your system. To skip this and speed up the run, you can set the `-s` flag. +**WARNING**: if your run is not properly configured, you can completely exhaust your system. ## Output wGRR produces a table with several metrics corresponding to 3 different ways of calculating the wGRR. diff --git a/wGRR b/wGRR index 4767a926013ac190f75861217d474be8fa11d52d..ecbaeb9a21083041117ae6aabea81229e730e5e0 100755 --- a/wGRR +++ b/wGRR @@ -7,7 +7,7 @@ trap 'rm -rf "$tmp"' EXIT export LC_ALL=C SECONDS=0 -readonly VERSION=1.0 +readonly VERSION=1.1 bold=$(tput bold) normal=$(tput sgr0) @@ -40,13 +40,19 @@ display_usage() { echo " -a <integer> Number of genomes comparisons to be performed in each thread." echo " Increasing this will decrease the number of tasks to be performed" echo " but each task will take longer and use more RAM." - echo " default: 10000" + echo " default: auto" echo " -m <integer> Max number of simulteaneous tasks." echo " Only applicable to Maestro, for the wGRR calculation step." echo " default: not set" echo " -T Test run. Useful to get stats on the input file and correctly set the -a parameter." + echo " default: off" + echo " -s Skip memory check." + echo " ${bold}WARNING:${normal} dangerous. If the analysis is not properly configured," + echo " you might exhaust your system." + echo " default: off" echo " -f In sbatch mode, will use the fast QOS (jobs limited to 2 hours)" - echo " Warning: be sure the jobs will run under 2 hours otherwise they will fail." + echo " ${bold}WARNING:${normal}: be sure the jobs will run under 2 hours otherwise they will fail." + echo " default: off" echo "" echo "${bold}DESCRIPTION:${normal}" echo "This pipeline will do all proteins pairwise comparisons using the MMseqs2 software and then process " @@ -70,6 +76,7 @@ if [ $# -lt 1 ] ; then exit 1 fi +## Functions textifyDuration() { local duration=$1 local shiff=$duration @@ -97,7 +104,7 @@ PRT="N.O.P.R.T" ## -i MMPATH="N.O.P.A.T.H" ## -p OUT="N.O.O.U.T" ## -o THREADS=1 ## -t -ARRAYSIZE=10000 ## -a +ARRAYSIZE="AUTO" ## -a BATCHFLAG=0 ## Are we in a sbatch job? QT=0 ## Queing time (for Maestro) MAXJOBS=0 ## -m @@ -106,18 +113,20 @@ MIDENT=0 MMS_MAX_SEQ_PARAM="" TESTRUN=0 ## -T FAST=0 ## -f +SKIP=0 ## -s ## catch option values -while getopts :fTi:p:o:t:a:m: option ; do +while getopts :fTsi:p:o:t:a:m: option ; do case $option in i) PRT="$OPTARG"; if [ ! -f $PRT ]; then echo "[ERROR] -- fasta file '$PRT' not found (option -f)." ; exit 1 ; fi ;; p) MMPATH="$OPTARG" ;; o) OUT="$OPTARG"; if [ -f "$OUT".wgrr.final.txt ]; then echo "[ERROR] -- file $OUT.wgrr.final.txt already exists. Change option -o or remove this file and relaunch." ; exit 1 ; fi ;; t) THREADS="$OPTARG"; if [[ ! $THREADS =~ ^[0-9]+$ ]]; then echo "[ERROR] -- number of threads $THREADS must be an integer (option -t)." ; exit 1 ; fi ;; - a) ARRAYSIZE="$OPTARG"; if [[ ! $ARRAYSIZE =~ ^[0-9]+$ ]]; then echo "[ERROR] -- number of genomes comparisons $ARRAYSIZE must be an integer (option -a)." ; exit 1 ; fi ;; + a) ARRAYSIZE="$OPTARG"; if [[ ! $ARRAYSIZE =~ ^[0-9]+$ ]]; then if [[ $ARRAYSIZE != "AUTO" ]] ; then echo "[ERROR] -- number of genomes comparisons $ARRAYSIZE must be an integer (option -a)." ; exit 1 ; fi ; fi ;; m) MAXJOBS="$OPTARG"; if [[ ! $MAXJOBS =~ ^[0-9]+$ ]]; then echo "[ERROR] -- max number of simulteaneous jobs $MAXJOBS must be an integer (option -m)." ; exit 1 ; fi ;; T) TESTRUN=1 ;; f) FAST=1 ;; + s) SKIP=1 ;; :) echo "option $OPTARG : missing argument" ; exit 1 ;; \?) echo "$OPTARG : invalid option" ; exit 1 ;; esac @@ -128,6 +137,7 @@ shift "$((OPTIND - 1))" # Remove all aliases unalias -a +STIME=$(date +%s) if [[ -e ${OUT}.wgrr.log ]] ; then printf "%-10s -- %s\n" "[WARNING]" "Appending to existing log" @@ -166,6 +176,24 @@ if [[ $SLURM_JOBID != "" ]] ; then BATCHFLAG=$(squeue -j $SLURM_JOBID --Format=batchflag -h | awk '{print $1}') fi +# Get system total memory if not in sbatch +TOTALMEM=0 +if [[ $BATCHFLAG==0 ]] ; then + if [[ $SLURM_MEM_PER_CPU != "" ]] ; then + TOTALMEM=$SLURM_MEM_PER_CPU + else + free -b > /dev/null 2>&1 + if [[ $? -eq 0 ]] ; then + TOTALMEM=$(free -b | awk 'NR==2{print $2}') + else + sysctl hw.memsize > /dev/null 2>&1 + if [[ $? -eq 0 ]] ; then + TOTALMEM=$(sysctl hw.memsize | awk '{print $NF}') + fi + fi + fi +fi + printf "%-10s -- %s\n" "[INFO]" "wGRR version ${VERSION}" | tee -a ${OUT}.wgrr.log if [[ $BATCHFLAG == 0 ]] ; then @@ -173,7 +201,7 @@ if [[ $BATCHFLAG == 0 ]] ; then printf "%-10s -- %s\n" "[WARNING]" "The fast flag (-f) is useless when not using sbatch" | tee -a ${OUT}.wgrr.log fi if [[ $THREADS -gt $(nproc) ]] ; then - printf "%-10s -- %s\n" "[ERROR]" "You required $THREADS threads but your machine has $(nproc) cores." | tee -a ${OUT}.wgrr.log + printf "%-10s -- %s\n" "[ERROR]" "You requested $THREADS threads but your machine has $(nproc) cores." | tee -a ${OUT}.wgrr.log exit 1 fi printf "%-10s -- %s\n" "[INFO]" "The command is:" >> ${OUT}.wgrr.log @@ -228,6 +256,25 @@ STATS=($($AWKEXE '/^>/{p++;g=substr($1,2);gsub(/_[^_]+$/,"",g);if(!a[g]++){c++}; printf "%-10s -- %s %s %s %s %s\n" "[INFO]" "Input file has" $STATS[1] "genomes and a total of" $STATS[2] "proteins" | tee -a ${OUT}.wgrr.log printf "%-10s -- %s %s\n" "[INFO]" "Mean number of proteins per genome:" $STATS[4] | tee -a ${OUT}.wgrr.log +if [[ $ARRAYSIZE == "AUTO" ]] ; then + if [[ ${STATS[4]} -le 100 ]] ; then + ARRAYSIZE=10000 + elif [[ ${STATS[4]} -le 500 ]] ; then + ARRAYSIZE=2000 + elif [[ ${STATS[4]} -le 1000 ]] ; then + ARRAYSIZE=750 + elif [[ ${STATS[4]} -le 2000 ]] ; then + ARRAYSIZE=400 + elif [[ ${STATS[4]} -le 3000 ]] ; then + ARRAYSIZE=200 + elif [[ ${STATS[4]} -le 5000 ]] ; then + ARRAYSIZE=100 + else + ARRAYSIZE=50 + fi + printf "%-10s -- %s %s\n" "[INFO]" "Setting automatically the -a parameter to" $ARRAYSIZE | tee -a ${OUT}.wgrr.log +fi + if [[ $STATS[3] -eq 1 ]]; then printf "%-10s -- %s\n" "[INFO]" "Converting fasta file to sequential fasta" | tee -a ${OUT}.wgrr.log OPRT=$PRT @@ -253,51 +300,76 @@ fi NJOBS=$(( (NPAIRS+ARRAYSIZE-1)/ARRAYSIZE)) duration=$SECONDS -if [[ $BATCHFLAG == 1 || $TESTRUN == 1 ]] ; then - if [[ $BATCHFLAG == 1 && $TESTRUN == 1 ]] ; then - printf "%-10s -- %s\n" "[WARNING]" "Test run (option -T) is not available with sbatch runs" | tee -a ${OUT}.wgrr.log - printf "%-10s -- %s\n" "[WARNING]" "The run will continue ignoring the -T option" | tee -a ${OUT}.wgrr.log - printf "%-10s -- %s\n" "[INFO]" "Estimating required memory" | tee -a ${OUT}.wgrr.log - elif [[ $TESTRUN == 1 ]] ; then - printf "%-10s -- %s\n" "[INFO]" "This is a test run (option -T)" | tee -a ${OUT}.wgrr.log - elif [[ $BATCHFLAG == 1 ]] ; then - printf "%-10s -- %s\n" "[INFO]" "Estimating required memory" | tee -a ${OUT}.wgrr.log - fi +if [[ $BATCHFLAG == 1 && $SKIP == 1 ]] ; then + printf "%-10s -- %s\n" "[WARNING]" "Skipping estimation of required memory (option -s) is not available with sbatch runs" | tee -a ${OUT}.wgrr.log + printf "%-10s -- %s\n" "[WARNING]" "The run will continue ignoring the -s option" | tee -a ${OUT}.wgrr.log +fi + +if [[ $BATCHFLAG == 1 && $TESTRUN == 1 ]] ; then + printf "%-10s -- %s\n" "[WARNING]" "Test run (option -T) is not available with sbatch runs" | tee -a ${OUT}.wgrr.log + printf "%-10s -- %s\n" "[WARNING]" "The run will continue ignoring the -T option" | tee -a ${OUT}.wgrr.log + printf "%-10s -- %s\n" "[INFO]" "Estimating required memory" | tee -a ${OUT}.wgrr.log +elif [[ $TESTRUN == 1 ]] ; then + printf "%-10s -- %s\n" "[INFO]" "This is a test run (option -T)" | tee -a ${OUT}.wgrr.log +elif [[ $SKIP == 0 ]] ; then + printf "%-10s -- %s\n" "[INFO]" "Estimating required memory" | tee -a ${OUT}.wgrr.log +fi + +if [[ $SKIP == 0 || $BATCHFLAG == 1 ]] ; then + if [[ $STATS[1] -gt 5 ]] ; then $AWKEXE '/^>/{g=$1;gsub(/_[0-9]+$/,"",g);a[g]++;if(length(a)>5){exit};print;getline;print}' $PRT > $OUT.testrun.prt else cp $PRT $OUT.testrun.prt fi + $AWKEXE 'BEGIN{x=1}/^>/{g=substr($1,2);gsub(/_[^_]+$/,"",g);if(FNR==1){a[x]=g;++x;currg=g;next}if(g!=currg){a[x]=g;x++;currg=g}}END{i=0;while(++i in a){j=i;while(++j in a){print a[i]"\t"a[j]}}}' $OUT.testrun.prt > $OUT.testrun.allpairs.txt - if [[ $TESTRUN == 1 && $BATCHFLAG == 0 ]] ; then - printf "%-10s -- %s\n" "[INFO]" "Running MMseqs on a sample file" | tee -a ${OUT}.wgrr.log - fi + + printf "%-10s -- %s\n" "[INFO]" "Running MMseqs on a sample file" | tee -a ${OUT}.wgrr.log $MMSEQS easy-search $OUT.testrun.prt $OUT.testrun.prt $OUT.testrun.m8 $tmp --threads $THREADS --format-output "query,target,qcov,tcov,fident,evalue,bits" --add-self-matches > $OUT.testrun.mmseqs.search.log + M2=$($AWKEXE -f wGRR.awk -v MINP=1 -v MAXP=10 -v OUT=$OUT -v MEM=1 $OUT.testrun.allpairs.txt $OUT.testrun.prt $OUT.testrun.m8) REQMEM=$(bc -l <<< $(numfmt --from=iec $M2)*($ARRAYSIZE*0.15) | numfmt --to=iec | $AWKEXE '{U=$0;gsub(/[^A-Za-z]/,"",U);V=$0;gsub(/[A-Za-z]+$/,"",V);split(V,a,".");n=split(a[1],b,"");c=b[1]+1;i=1;while(++i<=n){c=c"0"}print c""U}') REQMEMT=$(bc -l <<< $(numfmt --from=iec $REQMEM)*$THREADS | numfmt --to=iec | $AWKEXE '{U=$0;gsub(/[^A-Za-z]/,"",U);V=$0;gsub(/[A-Za-z]+$/,"",V);split(V,a,".");n=split(a[1],b,"");c=b[1]+1;i=1;while(++i<=n){c=c"0"}print c""U}') + if [[ $TESTRUN == 1 && $BATCHFLAG == 0 ]] ; then printf "%-10s -- %s\n" "[INFO]" "With the current -a parameter (${ARRAYSIZE}) ${NJOBS} workers are required" | tee -a ${OUT}.wgrr.log printf "%-10s -- %s\n" "[INFO]" "Each worker will require approximately $REQMEM" | tee -a ${OUT}.wgrr.log printf "%-10s -- %s\n" "[INFO]" "With the number of threads you specified (-t $THREADS) you will need a total of $REQMEMT" | tee -a ${OUT}.wgrr.log + if [[ $TOTALMEM == 0 ]] ; then + printf "%-10s -- %s\n" "[WARNING]" "Total memory of your system could not be determined" | tee -a ${OUT}.wgrr.log + else + printf "%-10s -- %s\n" "[INFO]" "Total memory of your system: $(numfmt --to=iec $TOTALMEM)" | tee -a ${OUT}.wgrr.log + fi printf "%-10s -- %s\n" "[INFO]" "Testrun is done. Now cleaning" | tee -a ${OUT}.wgrr.log elif [[ $BATCHFLAG == 1 ]] ; then printf "%-10s -- %s\n" "[INFO]" "$REQMEM required per worker" | tee -a ${OUT}.wgrr.log fi + rm $OUT.testrun.* - if [[ $TESTRUN == 1 && $BATCHFLAG == 0 ]] ; then - printf "%-10s -- %s\n" "[INFO]" "Done." | tee -a ${OUT}.wgrr.log - duration=$SECONDS - printf "%-10s -- %s\n" "[ELAPSED]" $(textifyDuration $duration) | tee -a ${OUT}.wgrr.log - exit 0 + + if [[ $TESTRUN == 0 && $BATCHFLAG == 0 ]] ; then + if [[ $TOTALMEM -lt $(bc -l <<< $(numfmt --from=iec $REQMEM)*$THREADS) ]] ; then + printf "%-10s -- %s\n" "[ERROR]" "Your system does not have enough memory (${REQMEMT} required). Try decreasing the -a parameter and/or the number of threads." | tee -a ${OUT}.wgrr.log + exit 1 + fi fi fi +if [[ $TESTRUN == 1 && $BATCHFLAG == 0 ]] ; then + printf "%-10s -- %s\n" "[INFO]" "Done." | tee -a ${OUT}.wgrr.log + duration=$SECONDS + printf "%-10s -- %s\n" "[ELAPSED]" $(textifyDuration $duration) | tee -a ${OUT}.wgrr.log + exit 0 +fi + +duration=$SECONDS +printf "%-10s -- %s\n" "[ELAPSED]" $(textifyDuration $duration) | tee -a ${OUT}.wgrr.log + if [[ -f $OUT.m8 ]] ; then printf "%-10s -- %s %s%s\n" "[INFO]" "Using existing MMseqs output file" $OUT ".m8" | tee -a ${OUT}.wgrr.log else MIDENT=$($AWKEXE '!/^>/{a[$0]++}END{for(i in a){if(a[i]>m){m=a[i]}}print m}' $PRT) - printf "%-10s -- %s\n" "[ELAPSED]" $(textifyDuration $duration) | tee -a ${OUT}.wgrr.log if [[ $((MIDENT*2)) -gt "$MMS_DEF_MAX_SEQS" ]] ; then MMS_MAX_SEQ_PARAM=($(echo $((MIDENT*2)) | $AWKEXE '{c=substr($1,2);gsub(/[0-9]/,0,c);print "--max-seqs "substr($1,1,1)+1""c}')) printf "%-10s -- %s %s\n" "[INFO]" "Setting MMseqs parameter" "$MMS_MAX_SEQ_PARAM" | tee -a ${OUT}.wgrr.log @@ -330,7 +402,7 @@ if [[ $BATCHFLAG == 0 ]] ; then if [[ $NJOBS -le 1 ]] ; then NJOBS=1 if [[ $THREADS > 1 ]] ; then - printf "%-10s -- %s\n" "[INFO]" "$THREADS threads required, but only 1 job to execute" | tee -a ${OUT}.wgrr.log + printf "%-10s -- %s\n" "[INFO]" "$THREADS threads requested, but only 1 job to execute" | tee -a ${OUT}.wgrr.log fi printf "%-10s -- %s\n" "[INFO]" "Starting 1 job on 1 thread for wGRR calculation" | tee -a ${OUT}.wgrr.log ARRAYSIZE=$NPAIRS @@ -342,13 +414,13 @@ if [[ $BATCHFLAG == 0 ]] ; then ./wGRR_worker.zsh $ARRAYSIZE $OUT $NJOBS 1 $PRT $tmp else if [[ $THREADS -gt $NJOBS ]] ; then - printf "%-10s -- %s\n" "[INFO]" "$THREADS threads required, but only $NJOBS jobs to execute" | tee -a ${OUT}.wgrr.log + printf "%-10s -- %s\n" "[INFO]" "$THREADS threads requested, but only $NJOBS jobs to execute" | tee -a ${OUT}.wgrr.log fi printf "%-10s -- %s\n" "[INFO]" "Starting $NJOBS jobs on $THREADS threads for wGRR calculation" | tee -a ${OUT}.wgrr.log printf "%-10s -- [%-40s] 0/%s" "[PROGRESS]" "" $NJOBS for i in {1..$NJOBS} ; do echo $i - done | xargs -n 1 -P $THREADS sh -c 'arg=$5; NJOBS=$2 ; printf "\r\033[K[PROGRESS] -- [%-40s] %s/%s" $(head -c $((arg*40/NJOBS+1)) < /dev/zero | tr "\0" "=") $arg $NJOBS ; ./wGRR_worker.zsh $0 $1 $NJOBS $arg $3 $4 ' $ARRAYSIZE $OUT $NJOBS $PRT $tmp + done | xargs -n 1 -P $THREADS sh -c 'arg=$6; NJOBS=$2 ; ./wGRR_worker.zsh $0 $1 $NJOBS $arg $3 $4 $5 ' $ARRAYSIZE $OUT $NJOBS $PRT $tmp $STIME printf "\n" fi else diff --git a/wGRR_worker.zsh b/wGRR_worker.zsh index 22b0cfefe8353b811d08ba8232f38f2c8b534894..df1451db0b98e7a577056c9608bb5c04394a2073 100755 --- a/wGRR_worker.zsh +++ b/wGRR_worker.zsh @@ -1,9 +1,34 @@ #!/bin/zsh +textifyDuration() { + local duration=$1 + local shiff=$duration + local secs + secs=$((shiff % 60)); shiff=$((shiff / 60)); + local mins + mins=$((shiff % 60)); shiff=$((shiff / 60)); + local hours + hours=$((shiff % 24)); shiff=$((shiff / 24)); + local days=$shiff + if [[ $days -gt 0 ]]; then + txt="${days}D-${hours}h:${mins}m:${secs}s" + elif [[ $hours -gt 0 ]]; then + txt="${hours}h:${mins}m:${secs}s" + elif [[ $mins -gt 0 ]]; then + txt="${mins}m:${secs}s" + else + txt="${secs}s" + fi + echo "$txt" +} + ARRAYSIZE=$1 OUT=$2 +NJOBS=$3 +arg=$4 PRT=$5 TMP=$6 +STIME=$7 if [[ `hostname` != "maestro-"* ]] || [[ ${SLURM_ARRAY_TASK_ID} == "" ]] ; then SLURM_ARRAY_TASK_ID=$4 @@ -14,6 +39,11 @@ OUTFILE="$TMP"/$OUT.wgrr_part.${SLURM_ARRAY_TASK_ID} MAXP=$((ARRAYSIZE*SLURM_ARRAY_TASK_ID)) MINP=$((MAXP-ARRAYSIZE+1)) +if [[ $STIME != "" ]] ; then + CTIME=$(date +%s) + printf "\r\033[K[PROGRESS] -- [%-40s] %s/%s %s" $(head -c $((arg*40/NJOBS)) < /dev/zero | tr "\0" "=") $arg $NJOBS $(textifyDuration $((CTIME-STIME))) +fi + awk -v MINP=$MINP -v MAXP=$MAXP -f wGRR.awk $OUT.allpairs.txt $PRT $OUT.m8 | sort -k1,1V -k2,2V > $OUTFILE exit 0