Skip to content
Snippets Groups Projects
Commit dc0aec01 authored by Julien GUGLIELMINI
Browse files

option -T added

parent 30db0561
No related branches found
No related tags found
No related merge requests found
...@@ -43,6 +43,7 @@ display_usage() { ...@@ -43,6 +43,7 @@ display_usage() {
echo " -m <integer> Max number of simulteaneous tasks." echo " -m <integer> Max number of simulteaneous tasks."
echo " Only applicable to Maestro, for the wGRR calculation step." echo " Only applicable to Maestro, for the wGRR calculation step."
echo " default: not set" echo " default: not set"
echo " -T Test run. Useful to get stats on the input file and correctly set the -a parameter."
echo "" echo ""
echo "${bold}DESCRIPTION:${normal}" echo "${bold}DESCRIPTION:${normal}"
echo "This pipeline will do all proteins pairwise comparisons using the MMseqs2 software and then process " echo "This pipeline will do all proteins pairwise comparisons using the MMseqs2 software and then process "
...@@ -97,13 +98,13 @@ ARRAYSIZE=10000 ## -a ...@@ -97,13 +98,13 @@ ARRAYSIZE=10000 ## -a
BATCHFLAG=0 ## Are we in a sbatch job? BATCHFLAG=0 ## Are we in a sbatch job?
QT=0 ## Queing time (for Maestro) QT=0 ## Queing time (for Maestro)
MAXJOBS=0 ## -m MAXJOBS=0 ## -m
DEFAULT_MAIN_RAM="10G" ## RAM for the main script (on Maestro)
MMS_DEF_MAX_SEQS=300 MMS_DEF_MAX_SEQS=300
MIDENT=0 MIDENT=0
MMS_MAX_SEQ_PARAM="" MMS_MAX_SEQ_PARAM=""
TESTRUN=0 ## -T
## catch option values ## catch option values
while getopts :f:p:o:t:a:m: option ; do while getopts :Tf:p:o:t:a:m: option ; do
case $option in case $option in
f) PRT="$OPTARG"; if [ ! -f $PRT ]; then echo "[ERROR] -- fasta file '$PRT' not found (option -f)." ; exit 1 ; fi ;; f) PRT="$OPTARG"; if [ ! -f $PRT ]; then echo "[ERROR] -- fasta file '$PRT' not found (option -f)." ; exit 1 ; fi ;;
p) MMPATH="$OPTARG" ;; p) MMPATH="$OPTARG" ;;
...@@ -111,6 +112,7 @@ while getopts :f:p:o:t:a:m: option ; do ...@@ -111,6 +112,7 @@ while getopts :f:p:o:t:a:m: option ; do
t) THREADS="$OPTARG"; if [[ ! $THREADS =~ ^[0-9]+$ ]]; then echo "[ERROR] -- number of threads $THREADS must be an integer (option -t)." ; exit 1 ; fi ;; t) THREADS="$OPTARG"; if [[ ! $THREADS =~ ^[0-9]+$ ]]; then echo "[ERROR] -- number of threads $THREADS must be an integer (option -t)." ; exit 1 ; fi ;;
a) ARRAYSIZE="$OPTARG"; if [[ ! $ARRAYSIZE =~ ^[0-9]+$ ]]; then echo "[ERROR] -- number of genomes comparisons $ARRAYSIZE must be an integer (option -a)." ; exit 1 ; fi ;; a) ARRAYSIZE="$OPTARG"; if [[ ! $ARRAYSIZE =~ ^[0-9]+$ ]]; then echo "[ERROR] -- number of genomes comparisons $ARRAYSIZE must be an integer (option -a)." ; exit 1 ; fi ;;
m) MAXJOBS="$OPTARG"; if [[ ! $MAXJOBS =~ ^[0-9]+$ ]]; then echo "[ERROR] -- max number of simulteaneous jobs $MAXJOBS must be an integer (option -m)." ; exit 1 ; fi ;; m) MAXJOBS="$OPTARG"; if [[ ! $MAXJOBS =~ ^[0-9]+$ ]]; then echo "[ERROR] -- max number of simulteaneous jobs $MAXJOBS must be an integer (option -m)." ; exit 1 ; fi ;;
T) TESTRUN=1 ;;
:) echo "option $OPTARG : missing argument" ; exit 1 ;; :) echo "option $OPTARG : missing argument" ; exit 1 ;;
\?) echo "$OPTARG : invalid option" ; exit 1 ;; \?) echo "$OPTARG : invalid option" ; exit 1 ;;
esac esac
...@@ -211,8 +213,9 @@ else ...@@ -211,8 +213,9 @@ else
fi fi
STATS=($($AWKEXE '/^>/{p++;g=substr($1,2);gsub(/_[0-9]+$/,"",g);if(!a[g]++){c++};LNR=NR}{if(NR>LNR+1){n=1}}END{if(n!=1){n=0}print c"\t"p"\t"n}' $PRT)) STATS=($($AWKEXE '/^>/{p++;g=substr($1,2);gsub(/_[0-9]+$/,"",g);if(!a[g]++){c++};LNR=NR}{if(NR>LNR+1){n=1}}END{if(n!=1){n=0}print c"\t"p"\t"n"\t"p/c}' $PRT))
echo "[INFO] -- Input file has "$STATS[1]" genomes and a total of "$STATS[2]" proteins" echo "[INFO] -- Input file has "$STATS[1]" genomes and a total of "$STATS[2]" proteins."
echo "[INFO] -- Mean number of proteins per genome: $STATS[4]"
if [[ $STATS[3] -eq 1 ]]; then if [[ $STATS[3] -eq 1 ]]; then
echo "[INFO] -- Converting fasta file to sequential fasta" echo "[INFO] -- Converting fasta file to sequential fasta"
...@@ -224,7 +227,7 @@ fi ...@@ -224,7 +227,7 @@ fi
duration=$SECONDS duration=$SECONDS
if [[ -f $OUT.m8 ]] ; then if [[ -f $OUT.m8 ]] ; then
echo "[INFO] -- Using existing MMseqs output file $OUT.m8" echo "[INFO] -- Using existing MMseqs output file $OUT.m8"
else elif [[ $TESTRUN == 0 ]] ; then
MIDENT=$($AWKEXE '!/^>/{a[$0]++}END{for(i in a){if(a[i]>m){m=a[i]}}print m}' $PRT) MIDENT=$($AWKEXE '!/^>/{a[$0]++}END{for(i in a){if(a[i]>m){m=a[i]}}print m}' $PRT)
printf "%-10s -- %s\n" "[TIME]" $(textifyDuration $duration) printf "%-10s -- %s\n" "[TIME]" $(textifyDuration $duration)
if [[ $((MIDENT*2)) -gt "$MMS_DEF_MAX_SEQS" ]] ; then if [[ $((MIDENT*2)) -gt "$MMS_DEF_MAX_SEQS" ]] ; then
...@@ -243,7 +246,7 @@ else ...@@ -243,7 +246,7 @@ else
fi fi
fi fi
if [[ ! -f $OUT.m8 ]] ; then if [[ $TESTRUN == 0 ]] && [[ ! -f $OUT.m8 ]] ; then
echo "[ERROR] -- Something went wrong during the MMseqs step." echo "[ERROR] -- Something went wrong during the MMseqs step."
exit 1 exit 1
fi fi
...@@ -264,6 +267,14 @@ if [[ ! -f $OUT.allpairs.txt ]] || [[ $NPAIRS < 1 ]] ; then ...@@ -264,6 +267,14 @@ if [[ ! -f $OUT.allpairs.txt ]] || [[ $NPAIRS < 1 ]] ; then
exit 1 exit 1
fi fi
if [[ $TESTRUN == 1 ]] ; then
rm -rf "$OUT".logs
printf "%-10s -- %s\n" "[INFO]" "Done"
duration=$SECONDS
printf "%-10s -- %s\n" "[TIME]" $(textifyDuration $duration)
exit 0
fi
duration=$SECONDS duration=$SECONDS
NJOBS=$(( (NPAIRS+ARRAYSIZE-1)/ARRAYSIZE)) NJOBS=$(( (NPAIRS+ARRAYSIZE-1)/ARRAYSIZE))
if [[ $BATCHFLAG == 0 ]] ; then if [[ $BATCHFLAG == 0 ]] ; then
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment