diff --git a/wGRR b/wGRR index 02c7a5d0314869f19c1f0a762b14eeaeefb8f4c5..03c6101a62f99ef8c0d371150ed680cc4068bbb7 100755 --- a/wGRR +++ b/wGRR @@ -7,7 +7,7 @@ trap 'rm -rf "$tmp"' EXIT export LC_ALL=C SECONDS=0 -readonly VERSION=1.4.1 +readonly VERSION=1.4.2 bold=$(tput bold) green=$(tput setaf 2) @@ -66,6 +66,8 @@ display_usage() { echo " -f In sbatch mode, will use the fast QOS (jobs limited to 2 hours)" echo " ${bold}WARNING:${normal}: be sure the jobs will run under 2 hours otherwise they will fail." echo " default: off" + echo " -n If turned on, the MMseqs output file will be erased to save disk space." + echo " default: off" echo "" echo "${bold}DESCRIPTION:${normal}" echo "This pipeline will do all proteins pairwise comparisons using the MMseqs2 software and then process " @@ -121,18 +123,22 @@ IDLIST="N.O.L.I.S.T" ## -l BATCHFLAG=0 ## Are we in a sbatch job? QT=0 ## Queing time (for Maestro) MAXJOBS=0 ## -m -MMS_DEF_MAX_SEQS=300 -MIDENT=0 -MMS_MAX_SEQ_PARAM="" TESTRUN=0 ## -T FAST=0 ## -f SKIP=0 ## -s JACCARD=0 ## -j COVT=0.5 ## -C IDT=0.35 ## -I +NOM8=0 ## -n + +MMS_DEF_MAX_SEQS=300 +MIDENT=0 +MMS_MAX_SEQ_PARAM="" +EST_M8_SIZE=0 +tmp="wgrrtmp" ## catch option values -while getopts :fTsji:p:o:t:a:m:l:C:I: option ; do +while getopts :fTsji:p:o:t:a:m:l:C:I:c:n: option ; do case $option in i) PRT="$OPTARG"; if [[ ! -s $PRT ]]; then printf "${bold}${red}%-17s -- %s\n${normal}" "[ERROR]" "Fasta file '$PRT' not found or empty (option -i)." ; exit 1 ; fi ;; p) MMPATH="$OPTARG" ;; @@ -145,6 +151,7 @@ while getopts :fTsji:p:o:t:a:m:l:C:I: option ; do f) FAST=1 ;; s) SKIP=1 ;; j) JACCARD=1 ;; + n) NOM8=1 ;; C) COVT="$OPTARG"; if [[ ${COVT} -gt 1 ]] || [[ ${COVT} -lt 0 ]] ; then printf "${bold}${red}%-17s -- %s\n${normal}" "[ERROR]" "Coverage threshold must be between 0 and 1 (option -C)." ; exit 1 ; fi ;; I) IDT="$OPTARG"; if [[ ${IDT} -gt 1 ]] || [[ ${IDT} -lt 0 ]] ; then printf "${bold}${red}%-17s -- %s\n${normal}" "[ERROR]" "Identity threshold must be between 0 and 1 (option -C)." ; exit 1 ; fi ;; :) printf "${bold}${red}%-17s -- %s\n${normal}" "[ERROR]" "option $OPTARG : missing argument" ; exit 1 ;; @@ -153,6 +160,11 @@ while getopts :fTsji:p:o:t:a:m:l:C:I: option ; do done shift "$((OPTIND - 1))" +if [[ ${PRT} == "N.O.P.R.T" ]] ; then + printf "${bold}${red}%-17s -- %s\n${normal}" "[ERROR]" "No input file provided (option -i)." | tee -a ${OUT}.wgrr.log + exit 1 +fi + # Improvement: test if $PRT is a proper fasta file # Remove all aliases @@ -177,20 +189,29 @@ fi /usr/bin/mktemp --version > /dev/null 2>&1 if [[ $? -eq 1 ]] ; then - tmp=$(mktemp -d tmp-XXXXXX) + tmp=$(mktemp -d wgrrtmp-XXXXXX) else if [[ `hostname` == "maestro-"* ]] ; then if [[ ! -d ${APPASCRATCH}/${USER} ]] ; then mkdir ${APPASCRATCH}/${USER} fi - tmp=$(mktemp -p ${APPASCRATCH}/${USER} -d tmp-XXXXXXXX) + tmp=$(mktemp -p ${APPASCRATCH}/${USER} -d wgrrtmp-XXXXXXXX) + else + tmp=$(mktemp -d wgrrtmp-XXXXXXXX) + fi +fi + +if [[ ${tmp} == "wgrrtmp" ]] ; then + if [[ -d ${tmp} ]] ; then + printf "${bold}${red}%-17s -- %s\n${normal}" "[ERROR]" "Could not create temporary directory. Please delete ${tmp} directory and relaunch the analysis." | tee -a ${OUT}.wgrr.log + exit 1 else - tmp=$(mktemp -d tmp-XXXXXXXX) + mkdir ${tmp} fi fi if [[ $OUT == "N.O.O.U.T" ]] ; then - OUT=$(basename $tmp) + OUT=$(basename $tmp | cut -d'-' -f 2) fi ## Get partition (if sbatch) or check nproc (if local/Check number of threads @@ -222,6 +243,18 @@ fi printf "%-17s -- %s\n" "["$(textifyDuration $SECONDS)"]" "wGRR version ${VERSION}" | tee -a ${OUT}.wgrr.log +NTMP=0 +for f in ${tmp}/../wgrrtmp* ; do + if [[ -d ${f} ]] ; then + NTMP=$(( NTMP+1 )) + fi + if [[ ${NTMP} -gt 1 ]] ; then + g=$(echo "$(cd "$(dirname $tmp)" ; pwd)") + printf "${bold}${yellow}%-17s -- %s\n${normal}" "[WARNING]" "Found tmp folders from previous analyses in ${g}" | tee -a ${OUT}.wgrr.log + printf "${bold}${yellow}%-17s -- %s\n${normal}" "[WARNING]" "Consider deleting them" | tee -a ${OUT}.wgrr.log + fi +done + if [[ $BATCHFLAG == 0 ]] ; then if [[ $FAST == 1 ]] ; then printf "${bold}${yellow}%-17s -- %s\n${normal}" "[WARNING]" "The fast flag (-f) is useless when not using sbatch" | tee -a ${OUT}.wgrr.log @@ -291,6 +324,16 @@ STATS=($(awk '/^>/{p++;g=substr($1,2);gsub(/_[^_]+$/,"",g);if(!a[g]++){c++};LNR= printf "%-17s -- %s %s %s %s %s\n" "["$(textifyDuration $SECONDS)"]" "Input file has" $STATS[1] "genomes and a total of" $STATS[2] "proteins" | tee -a ${OUT}.wgrr.log printf "%-17s -- %s %s\n" "["$(textifyDuration $SECONDS)"]" "Mean number of proteins per genome:" $STATS[4] | tee -a ${OUT}.wgrr.log +EST_M8_SIZE=$((STATS[1] * STATS[1] * STATS[4] / 10000000 )) + +if [[ ${EST_M8_SIZE} == 0 ]] ; then + printf "${bold}${yellow}%-17s -- %s\n${normal}" "[WARNING]" "Unable to estimate the MMseqs output file size." | tee -a ${OUT}.wgrr.log +elif [[ ${EST_M8_SIZE} -gt 10 ]] ; then + if [[ ${NOM8} == 0 ]] ; then + printf "${bold}${yellow}%-17s -- %s\n${normal}" "[WARNING]" "MMseqs output file could exceed 10GB. Consider using the -n option to save disk space." | tee -a ${OUT}.wgrr.log + fi +fi + if [[ $ARRAYSIZE == "AUTO" ]] ; then if [[ ${STATS[4]} -le 100 ]] ; then ARRAYSIZE=10000