Skip to content
Snippets Groups Projects
Commit 568ebfc2 authored by Julien GUGLIELMINI
Browse files

Added option to deal with large m8 files; checking for remaining tmp folders from previous analyses

parent 161310a1
No related branches found
No related tags found
No related merge requests found
...@@ -7,7 +7,7 @@ trap 'rm -rf "$tmp"' EXIT ...@@ -7,7 +7,7 @@ trap 'rm -rf "$tmp"' EXIT
export LC_ALL=C export LC_ALL=C
SECONDS=0 SECONDS=0
readonly VERSION=1.4.1 readonly VERSION=1.4.2
bold=$(tput bold) bold=$(tput bold)
green=$(tput setaf 2) green=$(tput setaf 2)
...@@ -66,6 +66,8 @@ display_usage() { ...@@ -66,6 +66,8 @@ display_usage() {
echo " -f In sbatch mode, will use the fast QOS (jobs limited to 2 hours)" echo " -f In sbatch mode, will use the fast QOS (jobs limited to 2 hours)"
echo " ${bold}WARNING:${normal}: be sure the jobs will run under 2 hours otherwise they will fail." echo " ${bold}WARNING:${normal}: be sure the jobs will run under 2 hours otherwise they will fail."
echo " default: off" echo " default: off"
echo " -n If turned on, the MMseqs output file will be erased to save disk space."
echo " default: off"
echo "" echo ""
echo "${bold}DESCRIPTION:${normal}" echo "${bold}DESCRIPTION:${normal}"
echo "This pipeline will do all proteins pairwise comparisons using the MMseqs2 software and then process " echo "This pipeline will do all proteins pairwise comparisons using the MMseqs2 software and then process "
...@@ -121,18 +123,22 @@ IDLIST="N.O.L.I.S.T" ## -l ...@@ -121,18 +123,22 @@ IDLIST="N.O.L.I.S.T" ## -l
BATCHFLAG=0 ## Are we in a sbatch job? BATCHFLAG=0 ## Are we in a sbatch job?
QT=0 ## Queuing time (for Maestro) QT=0 ## Queuing time (for Maestro)
MAXJOBS=0 ## -m MAXJOBS=0 ## -m
MMS_DEF_MAX_SEQS=300
MIDENT=0
MMS_MAX_SEQ_PARAM=""
TESTRUN=0 ## -T TESTRUN=0 ## -T
FAST=0 ## -f FAST=0 ## -f
SKIP=0 ## -s SKIP=0 ## -s
JACCARD=0 ## -j JACCARD=0 ## -j
COVT=0.5 ## -C COVT=0.5 ## -C
IDT=0.35 ## -I IDT=0.35 ## -I
NOM8=0 ## -n
MMS_DEF_MAX_SEQS=300
MIDENT=0
MMS_MAX_SEQ_PARAM=""
EST_M8_SIZE=0
tmp="wgrrtmp"
## catch option values ## catch option values
while getopts :fTsji:p:o:t:a:m:l:C:I: option ; do while getopts :fTsji:p:o:t:a:m:l:C:I:c:n: option ; do
case $option in case $option in
i) PRT="$OPTARG"; if [[ ! -s $PRT ]]; then printf "${bold}${red}%-17s -- %s\n${normal}" "[ERROR]" "Fasta file '$PRT' not found or empty (option -i)." ; exit 1 ; fi ;; i) PRT="$OPTARG"; if [[ ! -s $PRT ]]; then printf "${bold}${red}%-17s -- %s\n${normal}" "[ERROR]" "Fasta file '$PRT' not found or empty (option -i)." ; exit 1 ; fi ;;
p) MMPATH="$OPTARG" ;; p) MMPATH="$OPTARG" ;;
...@@ -145,6 +151,7 @@ while getopts :fTsji:p:o:t:a:m:l:C:I: option ; do ...@@ -145,6 +151,7 @@ while getopts :fTsji:p:o:t:a:m:l:C:I: option ; do
f) FAST=1 ;; f) FAST=1 ;;
s) SKIP=1 ;; s) SKIP=1 ;;
j) JACCARD=1 ;; j) JACCARD=1 ;;
n) NOM8=1 ;;
C) COVT="$OPTARG"; if [[ ${COVT} -gt 1 ]] || [[ ${COVT} -lt 0 ]] ; then printf "${bold}${red}%-17s -- %s\n${normal}" "[ERROR]" "Coverage threshold must be between 0 and 1 (option -C)." ; exit 1 ; fi ;; C) COVT="$OPTARG"; if [[ ${COVT} -gt 1 ]] || [[ ${COVT} -lt 0 ]] ; then printf "${bold}${red}%-17s -- %s\n${normal}" "[ERROR]" "Coverage threshold must be between 0 and 1 (option -C)." ; exit 1 ; fi ;;
I) IDT="$OPTARG"; if [[ ${IDT} -gt 1 ]] || [[ ${IDT} -lt 0 ]] ; then printf "${bold}${red}%-17s -- %s\n${normal}" "[ERROR]" "Identity threshold must be between 0 and 1 (option -C)." ; exit 1 ; fi ;; I) IDT="$OPTARG"; if [[ ${IDT} -gt 1 ]] || [[ ${IDT} -lt 0 ]] ; then printf "${bold}${red}%-17s -- %s\n${normal}" "[ERROR]" "Identity threshold must be between 0 and 1 (option -C)." ; exit 1 ; fi ;;
:) printf "${bold}${red}%-17s -- %s\n${normal}" "[ERROR]" "option $OPTARG : missing argument" ; exit 1 ;; :) printf "${bold}${red}%-17s -- %s\n${normal}" "[ERROR]" "option $OPTARG : missing argument" ; exit 1 ;;
...@@ -153,6 +160,11 @@ while getopts :fTsji:p:o:t:a:m:l:C:I: option ; do ...@@ -153,6 +160,11 @@ while getopts :fTsji:p:o:t:a:m:l:C:I: option ; do
done done
shift "$((OPTIND - 1))" shift "$((OPTIND - 1))"
# Abort when PRT still holds the "N.O.P.R.T" placeholder, i.e. option -i was
# never given: print the error (also appended to the run log) and exit 1.
case ${PRT} in
"N.O.P.R.T")
  printf "${bold}${red}%-17s -- %s\n${normal}" "[ERROR]" "No input file provided (option -i)." | tee -a ${OUT}.wgrr.log
  exit 1
  ;;
esac
# Improvement: test if $PRT is a proper fasta file # Improvement: test if $PRT is a proper fasta file
# Remove all aliases # Remove all aliases
...@@ -177,20 +189,29 @@ fi ...@@ -177,20 +189,29 @@ fi
/usr/bin/mktemp --version > /dev/null 2>&1 /usr/bin/mktemp --version > /dev/null 2>&1
if [[ $? -eq 1 ]] ; then if [[ $? -eq 1 ]] ; then
tmp=$(mktemp -d tmp-XXXXXX) tmp=$(mktemp -d wgrrtmp-XXXXXX)
else else
if [[ `hostname` == "maestro-"* ]] ; then if [[ `hostname` == "maestro-"* ]] ; then
if [[ ! -d ${APPASCRATCH}/${USER} ]] ; then if [[ ! -d ${APPASCRATCH}/${USER} ]] ; then
mkdir ${APPASCRATCH}/${USER} mkdir ${APPASCRATCH}/${USER}
fi fi
tmp=$(mktemp -p ${APPASCRATCH}/${USER} -d tmp-XXXXXXXX) tmp=$(mktemp -p ${APPASCRATCH}/${USER} -d wgrrtmp-XXXXXXXX)
else
tmp=$(mktemp -d wgrrtmp-XXXXXXXX)
fi
fi
if [[ ${tmp} == "wgrrtmp" ]] ; then
if [[ -d ${tmp} ]] ; then
printf "${bold}${red}%-17s -- %s\n${normal}" "[ERROR]" "Could not create temporary directory. Please delete ${tmp} directory and relaunch the analysis." | tee -a ${OUT}.wgrr.log
exit 1
else else
tmp=$(mktemp -d tmp-XXXXXXXX) mkdir ${tmp}
fi fi
fi fi
if [[ $OUT == "N.O.O.U.T" ]] ; then if [[ $OUT == "N.O.O.U.T" ]] ; then
OUT=$(basename $tmp) OUT=$(basename $tmp | cut -d'-' -f 2)
fi fi
## Get partition (if sbatch) or check nproc (if local) / Check number of threads ## Get partition (if sbatch) or check nproc (if local) / Check number of threads
...@@ -222,6 +243,18 @@ fi ...@@ -222,6 +243,18 @@ fi
printf "%-17s -- %s\n" "["$(textifyDuration $SECONDS)"]" "wGRR version ${VERSION}" | tee -a ${OUT}.wgrr.log printf "%-17s -- %s\n" "["$(textifyDuration $SECONDS)"]" "wGRR version ${VERSION}" | tee -a ${OUT}.wgrr.log
#######################################
# Warn once when other wgrrtmp* directories sit next to the current tmp folder.
# Globals:   NTMP (written) - number of wgrrtmp* directories found, the
#            current one included
#            bold, yellow, normal (read) - terminal styling sequences
# Arguments: $1 - path of the current temporary directory
#            $2 - log file the warning lines are appended to
# Outputs:   two [WARNING] lines on stdout and in the log file when more than
#            one wgrrtmp* directory exists; nothing otherwise
#######################################
warn_leftover_tmp_dirs() {
  local cur_tmp=$1
  local log_file=$2
  local candidate parent

  NTMP=0
  for candidate in "${cur_tmp}"/../wgrrtmp*; do
    # An unmatched glob stays literal; the -d test filters it out.
    if [[ -d ${candidate} ]]; then
      NTMP=$((NTMP + 1))
    fi
  done

  # Checked after the loop so the warning is printed a single time instead of
  # once per iteration after the count exceeds 1.
  if [[ ${NTMP} -gt 1 ]]; then
    parent=$(cd "$(dirname "${cur_tmp}")" && pwd)
    printf "${bold}${yellow}%-17s -- %s\n${normal}" "[WARNING]" "Found tmp folders from previous analyses in ${parent}" | tee -a "${log_file}"
    printf "${bold}${yellow}%-17s -- %s\n${normal}" "[WARNING]" "Consider deleting them" | tee -a "${log_file}"
  fi
}
warn_leftover_tmp_dirs "${tmp}" "${OUT}.wgrr.log"
if [[ $BATCHFLAG == 0 ]] ; then if [[ $BATCHFLAG == 0 ]] ; then
if [[ $FAST == 1 ]] ; then if [[ $FAST == 1 ]] ; then
printf "${bold}${yellow}%-17s -- %s\n${normal}" "[WARNING]" "The fast flag (-f) is useless when not using sbatch" | tee -a ${OUT}.wgrr.log printf "${bold}${yellow}%-17s -- %s\n${normal}" "[WARNING]" "The fast flag (-f) is useless when not using sbatch" | tee -a ${OUT}.wgrr.log
...@@ -291,6 +324,16 @@ STATS=($(awk '/^>/{p++;g=substr($1,2);gsub(/_[^_]+$/,"",g);if(!a[g]++){c++};LNR= ...@@ -291,6 +324,16 @@ STATS=($(awk '/^>/{p++;g=substr($1,2);gsub(/_[^_]+$/,"",g);if(!a[g]++){c++};LNR=
printf "%-17s -- %s %s %s %s %s\n" "["$(textifyDuration $SECONDS)"]" "Input file has" $STATS[1] "genomes and a total of" $STATS[2] "proteins" | tee -a ${OUT}.wgrr.log printf "%-17s -- %s %s %s %s %s\n" "["$(textifyDuration $SECONDS)"]" "Input file has" $STATS[1] "genomes and a total of" $STATS[2] "proteins" | tee -a ${OUT}.wgrr.log
printf "%-17s -- %s %s\n" "["$(textifyDuration $SECONDS)"]" "Mean number of proteins per genome:" $STATS[4] | tee -a ${OUT}.wgrr.log printf "%-17s -- %s %s\n" "["$(textifyDuration $SECONDS)"]" "Mean number of proteins per genome:" $STATS[4] | tee -a ${OUT}.wgrr.log
# Rough estimate of the MMseqs output size, computed with integer arithmetic as
# STATS[1]^2 * STATS[4] / 10^7. The nearby log messages label STATS[1] as the
# genome count and STATS[4] as the mean number of proteins per genome.
# NOTE(review): the unit is presumably GB, judging by the "10GB" wording below.
EST_M8_SIZE=$(( STATS[1] * STATS[1] * STATS[4] / 10000000 ))
case ${EST_M8_SIZE} in
0)
  # A zero result is reported as "no estimate available".
  printf "${bold}${yellow}%-17s -- %s\n${normal}" "[WARNING]" "Unable to estimate the MMseqs output file size." | tee -a ${OUT}.wgrr.log
  ;;
*)
  # Suggest -n only for large estimates and only when -n is not already set.
  if (( EST_M8_SIZE > 10 )) && [[ ${NOM8} == 0 ]] ; then
    printf "${bold}${yellow}%-17s -- %s\n${normal}" "[WARNING]" "MMseqs output file could exceed 10GB. Consider using the -n option to save disk space." | tee -a ${OUT}.wgrr.log
  fi
  ;;
esac
if [[ $ARRAYSIZE == "AUTO" ]] ; then if [[ $ARRAYSIZE == "AUTO" ]] ; then
if [[ ${STATS[4]} -le 100 ]] ; then if [[ ${STATS[4]} -le 100 ]] ; then
ARRAYSIZE=10000 ARRAYSIZE=10000
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment