diff --git a/wGRR b/wGRR index e78e9ae522cb9d489ca38ed983e4563ecbb345ec..25f662e7e9668bd64993178da109cd0847a0b9a9 100755 --- a/wGRR +++ b/wGRR @@ -43,6 +43,7 @@ display_usage() { echo " -m <integer> Max number of simulteaneous tasks." echo " Only applicable to Maestro, for the wGRR calculation step." echo " default: not set" + echo " -T Test run. Useful to get stats on the input file and correctly set the -a parameter." echo "" echo "${bold}DESCRIPTION:${normal}" echo "This pipeline will do all proteins pairwise comparisons using the MMseqs2 software and then process " @@ -97,13 +98,13 @@ ARRAYSIZE=10000 ## -a BATCHFLAG=0 ## Are we in a sbatch job? QT=0 ## Queing time (for Maestro) MAXJOBS=0 ## -m -DEFAULT_MAIN_RAM="10G" ## RAM for the main script (on Maestro) MMS_DEF_MAX_SEQS=300 MIDENT=0 MMS_MAX_SEQ_PARAM="" +TESTRUN=0 ## -T ## catch option values -while getopts :f:p:o:t:a:m: option ; do +while getopts :Tf:p:o:t:a:m: option ; do case $option in f) PRT="$OPTARG"; if [ ! -f $PRT ]; then echo "[ERROR] -- fasta file '$PRT' not found (option -f)." ; exit 1 ; fi ;; p) MMPATH="$OPTARG" ;; @@ -111,6 +112,7 @@ while getopts :f:p:o:t:a:m: option ; do t) THREADS="$OPTARG"; if [[ ! $THREADS =~ ^[0-9]+$ ]]; then echo "[ERROR] -- number of threads $THREADS must be an integer (option -t)." ; exit 1 ; fi ;; a) ARRAYSIZE="$OPTARG"; if [[ ! $ARRAYSIZE =~ ^[0-9]+$ ]]; then echo "[ERROR] -- number of genomes comparisons $ARRAYSIZE must be an integer (option -a)." ; exit 1 ; fi ;; m) MAXJOBS="$OPTARG"; if [[ ! $MAXJOBS =~ ^[0-9]+$ ]]; then echo "[ERROR] -- max number of simulteaneous jobs $MAXJOBS must be an integer (option -m)." ; exit 1 ; fi ;; + T) TESTRUN=1 ;; :) echo "option $OPTARG : missing argument" ; exit 1 ;; \?) echo "$OPTARG : invalid option" ; exit 1 ;; esac @@ -211,8 +213,9 @@ else fi -STATS=($($AWKEXE '/^>/{p++;g=substr($1,2);gsub(/_[0-9]+$/,"",g);if(!a[g]++){c++};LNR=NR}{if(NR>LNR+1){n=1}}END{if(n!=1){n=0}print c"\t"p"\t"n}' $PRT)) -echo "[INFO] -- Input file has "$STATS[1]" genomes and a total of "$STATS[2]" proteins" +STATS=($($AWKEXE '/^>/{p++;g=substr($1,2);gsub(/_[0-9]+$/,"",g);if(!a[g]++){c++};LNR=NR}{if(NR>LNR+1){n=1}}END{if(n!=1){n=0}print c"\t"p"\t"n"\t"p/c}' $PRT)) +echo "[INFO] -- Input file has "$STATS[1]" genomes and a total of "$STATS[2]" proteins." +echo "[INFO] -- Mean number of proteins per genome: $STATS[4]" if [[ $STATS[3] -eq 1 ]]; then echo "[INFO] -- Converting fasta file to sequential fasta" @@ -224,7 +227,7 @@ fi duration=$SECONDS if [[ -f $OUT.m8 ]] ; then echo "[INFO] -- Using existing MMseqs output file $OUT.m8" -else +elif [[ $TESTRUN == 0 ]] ; then MIDENT=$($AWKEXE '!/^>/{a[$0]++}END{for(i in a){if(a[i]>m){m=a[i]}}print m}' $PRT) printf "%-10s -- %s\n" "[TIME]" $(textifyDuration $duration) if [[ $((MIDENT*2)) -gt "$MMS_DEF_MAX_SEQS" ]] ; then @@ -243,7 +246,7 @@ else fi fi -if [[ ! -f $OUT.m8 ]] ; then +if [[ $TESTRUN == 0 ]] && [[ ! -f $OUT.m8 ]] ; then echo "[ERROR] -- Something went wrong during the MMseqs step." exit 1 fi @@ -264,6 +267,14 @@ if [[ ! -f $OUT.allpairs.txt ]] || [[ $NPAIRS < 1 ]] ; then exit 1 fi +if [[ $TESTRUN == 1 ]] ; then + rm -rf "$OUT".logs + printf "%-10s -- %s\n" "[INFO]" "Done" + duration=$SECONDS + printf "%-10s -- %s\n" "[TIME]" $(textifyDuration $duration) + exit 0 +fi + duration=$SECONDS NJOBS=$(( (NPAIRS+ARRAYSIZE-1)/ARRAYSIZE)) if [[ $BATCHFLAG == 0 ]] ; then