1.1

6bac946f · Alexis CRISCUOLO · 8d6e3948 · 6bac946f · 6bac946f · 8d6e3948
Commit 6bac946f authored 1 year ago by Alexis CRISCUOLO
--- a/ASSU.sh
+++ b/ASSU.sh
@@ -31,7 +31,10 @@
 # = VERSIONS =                                                                                               #
 # ============                                                                                               #
 #                                                                                                            #
-  VERSION=1.0;                                                                                               #
+  VERSION=1.1;                                                                                               #
+# + SSUdb inside a dedicated directory ($SSUDB_DIR)                                                          #
+#                                                                                                            #
+# VERSION=1.0;                                                                                               #
 #                                                                                                            #
 ##############################################################################################################
@@ -131,7 +134,7 @@
 #                                                                                                            #
  PUNZIP="$PIGZ_BIN --decompress";
 #                                                                                                            #
-# -- agrep ------------------------------------------------------------------------------------------------  #
+# -- zgrep ------------------------------------------------------------------------------------------------  #
 #                                                                                                            #
  ZGREP="$ZGREP_BIN";
 #                                                                                                            #
@@ -162,10 +165,12 @@ mandoc() {
 USAGE:  ASSU  [options]  <infile> [<infile> ...]
 OPTIONS:
-  -d <file>    SSU databank file (default: SSUdb.gz in the same directory as ASSU.sh)
+  -d <file>    SSU databank file (default: db/SSUdb.gz in the same directory as ASSU)
-  -p <string>  restricts the SSU databank to the specified pattern (default: none)
+  -p <string>  restricts the  SSU databank  to the  specified  (extended regex)  pattern
-  -o <string>  output SSU sequence file name (default: ssu.fasta)
+               (default: none)
-  -O <string>  writes the selected reads into the specified file name (default: none)
+  -o <string>  output FASTA-formatted SSU sequence file name (default: ssu.fasta)
+  -O <string>  writes the selected  reads into the  specified FASTQ-formatted  file name
+               (default: none)
  -l <int>     minimum sequence length (default: 1000)
  -L <int>     minimum read length (default: AUTO)
  -Q <int>     minimum base Phred quality value (default: 20)
@@ -174,7 +179,7 @@ mandoc() {
  -F <float>   minimum proportion of the majority base to infer that base (default: 0.8)
  -A <float>   minimum ratio of the alternative  base(s) to the majority one to add that
               base(s) to the consensus (default: 0.2)
-  -N           set N when multiple bases at a position (default: not set)
+  -N           set N when multiple bases at a consensus position (default: not set)
  -w <dir>     path to the tmp directory (default: \$TMPDIR, otherwise /tmp)
  -t <int>     thread numbers (default: 2)
  -v           verbose mode
@@ -183,8 +188,9 @@ mandoc() {
  -h           prints this help and exit
 EXAMPLES:
-  ASSU  -t 24  -o 16s.fasta  fwd.fastq.gz  rev.fastq.gz  sgl.fastq.bz2
+  ASSU  -t 24  -o 16s.fasta  fwd.fastq.gz  rev.fastq.gz  sgl.fastq.gz
-  ASSU  -d SSUdb.gz  -O 16s.fastq  -p Clostridium  -L 75  -v  *.fastq
+  ASSU  -d SSUdb.gz  -O 16s.fastq  -p "Devosia limi"  -L 75  -v  *.fastq
+  ASSU  -p "Citrobacter|Escherichia|Shigella"  -N  -v  hts.fastq.bz2
 EOF
 } 
@@ -212,11 +218,11 @@ chrono() {
 }
 #                                                                                                            #
 # -- dcontrol ---------------------------------------------------------------------------------------------  #
-# >> controls required and optional dependancies                                                             #
+# >> controls required and optional (non-coreutils) dependancies                                             #
 #                                                                                                            #
 dcontrol() {
  fail=false;
-  for binexe in  bc  fold  grep  $ZGREP_BIN  sed  $GAWK_BIN  $BWAMEM2_BIN  $SAMTOOLS_BIN
+  for binexe in  bc  grep  $ZGREP_BIN  sed  $GAWK_BIN  $BWAMEM2_BIN  $SAMTOOLS_BIN
  do
    if [ ! $(command -v $binexe) ]; then echo "[ERROR] $binexe not found" >&2 ; fail=true; fi
  done
@@ -233,7 +239,7 @@ dcontrol() {
 }
 #                                                                                                            #
 # -- dcheck -----------------------------------------------------------------------------------------------  #
-# >> checks dependancies and exit                                                                            #
+# >> checks (non-coreutils) dependancies and exit                                                            #
 #                                                                                                            #
 dcheck() {
  echo "Checking required dependencies ..." ;
@@ -243,12 +249,6 @@ dcheck() {
  if [ ! $(command -v $binexe) ]; then echo -e "\e[31m[fail]\e[0m";
  else        echo -e -n "$(which $binexe)\t\t\e[32m[ok]\e[0m\t\t";   bc -v | head -1 ;
  fi
-  ## fold ############################
-  echo -e -n "> \e[1mfold\e[0m        >1.0       mandatory\t" ;       binexe=fold;
-  echo -e -n "$binexe    \t\t" ;
-  if [ ! $(command -v $binexe) ]; then echo -e "\e[31m[fail]\e[0m";
-  else        echo -e -n "$(which $binexe)\t\t\e[32m[ok]\e[0m\t\t";   fold --version | head -1 ;
-  fi
  ## grep ############################
  echo -e -n "> \e[1mgrep\e[0m        >1.0       mandatory\t" ;       binexe=grep;
  echo -e -n "$binexe    \t\t" ;
@@ -345,24 +345,26 @@ dispdb() {
 if [ $# -lt 1 ]; then mandoc ; exit 1 ; fi
-SSUDB=$PWD/SSUdb.gz;      # SSU databank   -d
+SSUDB_DIR=${SSUDB_DIR:-$PWD/db}
-OUTFILE=ssu.fasta;        # FASTA outfile  -o
+SSUDB=$SSUDB_DIR/SSUdb.gz; # SSU databank   -d
-OUTFQ="$NA";              # FASTQ outfile  -O
+OUTFILE=ssu.fasta;         # FASTA outfile  -o
-PATTERN="$NA";            # pattern        -p
+OUTFQ="$NA";               # FASTQ outfile  -O
-SLGT=1000;                # min seq lgt    -l
+PATTERN="$NA";             # pattern        -p
-RLGT="00";                # min read lgt   -L
+SLGT=1000;                 # min seq lgt    -l
-BQ=20;                    # min Phred      -Q
+RLGT="00";                 # min read lgt   -L
-MQ=20;                    # min map Phread -M
+BQ=20;                     # min Phred      -Q
-COV=50;                   # min cov depth  -D
+MQ=20;                     # min map Phread -M
-FQ=0.8;                   # min freq       -F
+COV=50;                    # min cov depth  -D
-FH=0.2;                   # min het        -A
+FQ=0.8;                    # min freq       -F
-AMB=true;                 # only N         -N 
+FH=0.2;                    # min het        -A
-DISPDB=false;             # display SSUdb  -s        
+AMB=true;                  # only N         -N 
-TMP_DIR=${TMPDIR:-/tmp};  # tmp directory  -w
+DISPDB=false;              # display SSUdb  -s        
-NTHREADS=2;               # no. threads    -t
+TMP_DIR=${TMPDIR:-/tmp};   # tmp directory  -w
-VERBOSE=false;            # verbose mode   -v        
+NTHREADS=2;                # no. threads    -t
+VERBOSE=false;             # verbose mode   -v        
-while getopts d:p:o:O:l:L:Q:M:D:F:A:w:t:Nvcsh option
+DEBUG=false;               # debug mode     -X
+while getopts d:p:o:O:l:L:Q:M:D:F:A:w:t:NvcshX option
 do
  case $option in
  d)  SSUDB="$OPTARG"    ;;
@@ -382,6 +384,7 @@ do
  v)  VERBOSE=true       ;;
  c)  dcheck             ;;
  s)  DISPDB=true        ;;
+  X)  DEBUG=true         ;;
  h)  mandoc ;  exit 0   ;;
  \?) mandoc ;  exit 1   ;;
  esac
@@ -391,11 +394,24 @@ shift "$(( $OPTIND - 1 ))"
 NFILE=$#;      # no. input file(s)
 FQLIST="$@";   # specified input file(s)
+$DEBUG && VERBOSE=true;
 echo "# ASSU v$VERSION" ;
 echo "# $COPYRIGHT" ;
 echo "+ https://gitlab.pasteur.fr/GIPhy/ASSU" ;
-$VERBOSE && echo "> Syst: $MACHTYPE" ;
+if $VERBOSE
-$VERBOSE && echo "> Bash: $BASH_VERSION" ;
+then
+  echo "> Syst: $MACHTYPE" ;
+  echo "> Bash: $BASH_VERSION" ;
+  echo "> SSUdb: $SSUDB" ;
+  if [ -s ${SSUDB%.*}.version.txt ]
+  then
+    $BAWK '(NR==1){d=$2}
+           (NR==2){n=$1}
+           END    {print"> SSUdb v"d" ("n" sequences)"}' ${SSUDB%.*}.version.txt
+  fi
+fi
 [ ! -e $SSUDB ]                     && echoxit "[ERROR] databank not found (option -d): $SSUDB" ;
 [   -d $SSUDB ]                     && echoxit "[ERROR] not a file (option -d): $SSUDB" ;
@@ -547,30 +563,20 @@ MESSAGE="examining SSU databank" ;
 echo -n "$(chrono) $MESSAGE ." ;
 ## indexing SSU databank #######################################################
-if [ "$PATTERN" == "$NA" ]
+if [ "$PATTERN" != "$NA" ] && [ $($ZGREP -c -E "$PATTERN" $SSUDB) -eq 0 ]
 then
-  $BWA_INDEX -p $SSU_IDX $SSUDB &>/dev/null ;
+  echo ".. [ok]" ;
-else
+  echo "> no selected sequence with the specified pattern: $PATTERN" ;
-  $ZGREP -A1 -F "$PATTERN" $SSUDB | grep -v -e "^--$" > $REF_SEQ ;
+  finalize ;
+  exit ;
-  if [ ! -s $REF_SEQ ]
-  then
-    echo ".. [ok]" ;
-    echo "> no selected sequence with the specified pattern: $PATTERN" ;
-    finalize ;
-    exit ;
-  fi
-  $BWA_INDEX -p $SSU_IDX $REF_SEQ &>/dev/null ;
-  rm -f $REF_SEQ ;
 fi
+$BWA_INDEX -p $SSU_IDX $SSUDB &>/dev/null ;
 echo -n "." ;
 ## aligning HTS reads ##########################################################
 touch $SAM ;
 [ $NTHREADS -gt 24 ] && DSRCopt="-t24" || DSRCopt="-t$NTHREADS" ;
-# BWAopt="-t $NTHREADS -k 21 -w 50 -D 1.0 -r 2.0 -T $(( $RLGT - 20 ))";
 BWAopt="-t $NTHREADS -k 21 -w 50";
 for fin in $FQLIST
 do
@@ -588,18 +594,36 @@ done
 echo " [ok]" ;
 ## getting ref #################################################################
-## maximizing the no. hits
+if $DEBUG
-# accn=$($TAWK '{++c[$3]}END{for(a in c)if(c[a]>1)print c[a]" "a}' $SAM | sort -n | tail -1 | $BAWK '{print$2}');
+then
-## maximizing the avg score among the two most hitted references (as longer sequences often wins when using the basic criterion above)
+  echo -e "no.hits\t\tsum.scores\tavg.score\tinfo" ;
-accn=$($TAWK '{print$3"\t"$14}' $SAM | 
+  $TAWK '{print$3"\t"$14}' $SAM |                                
-         sed 's/AS:i://g' | 
+    sed 's/AS:i://g' |
-           $TAWK '   {m[$1]+=$2;++n[$1]}
+      $TAWK '   {m[$1]+=$2;++n[$1]}
-                  END{for(a in m)if(n[a]>10)print n[a]" "(m[a]/n[a])" "a}' | 
+             END{for(a in m)if(n[a]>10)print n[a]"\t"m[a]"\t"(m[a]/n[a])"\t"a}' | 
-             sort -rn | head -2 | 
+        sort -rn | head |
-               sort -rgk2 | head -1 | $BAWK '{print$3}');
+          while read x y z a ; do echo -e "$x\t\t$y\t\t$z\t\t$($ZGREP -m1 -F $a $SSUDB)" ; done | tr -d '>' ;
+fi
+if [ "$PATTERN" == "$NA" ]
+then
+  # maximizing the sum of alignment scores (field 14; AS:i:)
+  accn=$($TAWK '{print$3"\t"$14}' $SAM | sed 's/AS:i://g' | 
+           $TAWK '{s[$1]+=$2}END{for(a in s)print s[a]"\t"a}' | 
+             sort -rg | $TAWK '(NR==1){print$2;exit}');
+else
+  # selecting accession(s) using $PATTERN
+  $ZGREP -E "$PATTERN" $SSUDB | $BAWK '{print$1}' | tr -d '>' > $TMP1 ; 
+  # maximizing the sum of alignment scores (AS:i:) among the selected reference sequences
+  accn=$($TAWK '{print$3"\t"$14}' $SAM | sed 's/AS:i://g' |
+             $TAWK '{s[$1]+=$2}END{for(a in s)print s[a]"\t"a}' |
+               grep -F -f $TMP1 |
+                 sort -rg | $TAWK '(NR==1){print$2;exit}');
+fi
 if [ -z "$accn" ]
 then
+  $VERBOSE && [ "$PATTERN" != "$NA" ] && echo "> selection pattern: $PATTERN" ;
  echo "> no SSU found" ;
  finalize ;
  echo "$(chrono) exit" ;
@@ -634,7 +658,6 @@ MESSAGE="building SSU sequence" ;
 echo -n "$(chrono) $MESSAGE ." ;
 ## getting HTS reads ###########################################################
-# $TAWK -v l=$RLGT '(length($10)>=l){print$10"\t"$11"\t"$12}' $SAM |  sed 's/NM:i://g' | $TAWK '($3<10){print$1"\t"$2}' |  # NOTE: <10 mismatches against the best hit
 $TAWK -v l=$RLGT '(length($10)>=l){print$10"\t"$11}' $SAM |  
  sort -u | $TAWK '{print"@SSU:"NR;
                    print$1;
@@ -676,16 +699,16 @@ $BWA_MEM $BWAopt $REF_IDX $FQTMP 2>/dev/null |
 echo -n "." ;
 ## building consensus ##########################################################
-# STCopt="--threads $NTHREADS --mode bayesian --ambig --min-MQ 2 --P-het 1.0e-100 --het-scale 1.0";
+# STCopt="--threads $NTHREADS --mode bayesian --ambig --min-MQ 2 --P-het 1.0e-10 --het-scale 1.0";
 STCopt="--threads $NTHREADS --mode simple --min-MQ $MQ --min-BQ $BQ --min-depth $COV --use-qual --call-fract $FQ --het-fract $FH";
 $AMB && STCopt="$STCopt --ambig";
 SEQ=$($SAMTOOLS_CONSENSUS $STCopt $BAM 2>/dev/null |
 	grep -v "^>" | tr -d '\n' |
-          sed -r 's/^[ACGT]{0,5}[acgtRrYySsWwKkMmBbDdHhVvNn]+//;s/[acgtRrYySsWwKkMmBbDdHhVvNn]+[ACGT]{0,5}$//;' |
+          sed -r 's/^[ACGT]{0,5}[acgtRrYySsWwKkMmBbDdHhVvNn]+//;s/[acgtRrYySsWwKkMmBbDdHhVvNn]+[ACGT]{0,5}$//;' |  #  trimming low
-          sed -r 's/^[ACGT]{0,5}[acgtRrYySsWwKkMmBbDdHhVvNn]+//;s/[acgtRrYySsWwKkMmBbDdHhVvNn]+[ACGT]{0,5}$//;' |
+          sed -r 's/^[ACGT]{0,5}[acgtRrYySsWwKkMmBbDdHhVvNn]+//;s/[acgtRrYySsWwKkMmBbDdHhVvNn]+[ACGT]{0,5}$//;' |  #  quality ends
-          sed -r 's/^[ACGT]{0,5}[acgtRrYySsWwKkMmBbDdHhVvNn]+//;s/[acgtRrYySsWwKkMmBbDdHhVvNn]+[ACGT]{0,5}$//;' |
+          sed -r 's/^[ACGT]{0,5}[acgtRrYySsWwKkMmBbDdHhVvNn]+//;s/[acgtRrYySsWwKkMmBbDdHhVvNn]+[ACGT]{0,5}$//;' |  #  of  the  SSU
-          sed -r 's/^[ACGT]{0,5}[acgtRrYySsWwKkMmBbDdHhVvNn]+//;s/[acgtRrYySsWwKkMmBbDdHhVvNn]+[ACGT]{0,5}$//;' |
+          sed -r 's/^[ACGT]{0,5}[acgtRrYySsWwKkMmBbDdHhVvNn]+//;s/[acgtRrYySsWwKkMmBbDdHhVvNn]+[ACGT]{0,5}$//;' |  #  sequence (if
-          sed -r 's/^[ACGT]{0,5}[acgtRrYySsWwKkMmBbDdHhVvNn]+//;s/[acgtRrYySsWwKkMmBbDdHhVvNn]+[ACGT]{0,5}$//;');
+          sed -r 's/^[ACGT]{0,5}[acgtRrYySsWwKkMmBbDdHhVvNn]+//;s/[acgtRrYySsWwKkMmBbDdHhVvNn]+[ACGT]{0,5}$//;');  #  any)
 echo " [ok]" ;
@@ -717,14 +740,6 @@ then
                                         while(++i<=w) switch(c[i]){ case "A": case "C": case "G": case "T": printf" "; continue; 
                                                                     default:                                printf"*"; continue; }
                                         print""; }' ;
-#  for b in $(sed 's/./& /g' <<<$"$SEQ") 
-#  do
-#    case $b in
-#    A|C|G|T) echo -n    "$b"            ;;
-#    *)       echo -n -e "\e[31m$b\e[0m" ;;
-#    esac
-#  done
-#  echo ;
 fi
 if [ $l -lt $SLGT ]
@@ -744,7 +759,7 @@ then
  exit ;
 fi
-echo ">SSU_covr_$DEPTH""_lgt_$l""_amb_$a"  > $OUTFILE ;
+echo ">contigSSU_covr_$DEPTH""_lgt_$l""_amb_$a"  > $OUTFILE ;
 echo "$SEQ"                               >> $OUTFILE ;
 if [ "$OUTFQ" == "$NA" ]
 then
@@ -763,3 +778,4 @@ echo "$(chrono) exit" ;
 exit ;
--- a/README.md
+++ b/README.md
@@ -4,9 +4,9 @@
 # ASSU
-_ASSU_ (ASSembling SSU) is a command line tool written in [Bash](https://www.gnu.org/software/bash/) to carry out the reference-guided assembly of small subunit (SSU) 16S ribosomal ribonucleic acid (rRNA) using short high-throughput sequencing (HTS) reads derived from whole genome sequencing of bacteria or archaea strains. 
+_ASSU_ (ASSembling SSU) is a command line tool written in [Bash](https://www.gnu.org/software/bash/) to carry out the reference-guided assembly of small subunit (SSU) 16S ribosomal ribonucleic acid (rRNA) using short high-throughput sequencing (HTS) reads derived from whole genome) sequencing of bacteria or archaea strains. 
-This tool was developed to compensate the failure of several _de novo_ assembly programs to assemble a non-fragmented SSU segment when the sequenced genome contains different 16S rRNA copies with sequence variation, especially when using short HTS reads.
+This tool was developed to compensate the failure of several _de novo_ assembly programs to assemble (at least one) non-fragmented SSU segment when the sequenced genome contains different 16S rRNA copies with sequence variation, especially when using short HTS reads.
@@ -14,21 +14,17 @@ This tool was developed to compensate the failure of several _de novo_ assembly
 You will need to install the required programs and tools listed in the following tables, or to verify that they are already installed with the required version.
-##### Mandatory dependencies
 <div align="center">
 | program                                               | package                                         | version      | sources                                                                                                                                                 |
 |:----------------------------------------------------- |:-----------------------------------------------:| ------------:|:------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| [_gawk_](https://www.gnu.org/software/gawk/)          | -                                               | > 4.0.0      | [ftp.gnu.org/gnu/gawk](http://ftp.gnu.org/gnu/gawk/)                                                                                                    |
-| _gunzip_<br>_zgrep_                                   | [_gzip_](https://www.gnu.org/software/gzip/)    | > 1.0        | [ftp.gnu.org/gnu/gzip](https://ftp.gnu.org/gnu/gzip/)                                                                                                   |
 | [_bwa-mem2_](https://github.com/bwa-mem2/bwa-mem2)    | -                                               | &ge; 2.2.1   | [gitlab.pasteur.fr/GIPhy/contig_info](https://gitlab.pasteur.fr/GIPhy/contig_info)                                                                      |
 | [_samtools_](http://www.htslib.org/)                  | -                                               | &ge; 1.18    | [github.com/samtools/samtools](https://github.com/samtools/samtools)<br>[sourceforge.net/projects/samtools](https://sourceforge.net/projects/samtools)  |
 </div>
-##### Optional dependencies
+##### Optional programs
 <div align="center">
@@ -41,6 +37,22 @@ You will need to install the required programs and tools listed in the following
 </div>
+##### Standard GNU packages and utilities
+<div align="center">
+| program                                                 | package                                               | version      | sources                                                            |
+|:------------------------------------------------------- |:-----------------------------------------------------:| ------------:|:------------------------------------------------------------------ |
+| _echo_<br>_head_<br>_fold_<br>_paste_<br>_tail_<br>_tr_ | [coreutils](https://www.gnu.org/software/coreutils/)  | > 8.0        | [ftp.gnu.org/gnu/coreutils](https://ftp.gnu.org/gnu/coreutils)     |
+| _gunzip_<br>_zgrep_                                     | [gzip](https://www.gnu.org/software/gzip/)            | > 1.0        | [ftp.gnu.org/gnu/gzip](https://ftp.gnu.org/gnu/gzip/)              |
+| [_bc_](https://www.gnu.org/software/bc)                 | -                                                     | > 1.0        | [ftp.gnu.org/gnu/bc](https://ftp.gnu.org/gnu/bc)                   |
+| [_gawk_](https://www.gnu.org/software/gawk)             | -                                                     | > 4.0.0      | [ftp.gnu.org/gnu/gawk](http://ftp.gnu.org/gnu/gawk/)               |
+| [_grep_](https://www.gnu.org/software/grep)             | -                                                     | > 2.0        | [ftp.gnu.org/gnu/bc](https://ftp.gnu.org/gnu/grep)                 |
+| [_sed_](https://www.gnu.org/software/sed)               | -                                                     | > 4.2        | [ftp.gnu.org/gnu/bc](https://ftp.gnu.org/gnu/sed)                  |
+</div>
 ## Installation and execution
 **A.** Clone this repository with the following command line:
@@ -86,7 +98,7 @@ For each required program, the table below reports the corresponding variable as
 ./ASSU.sh  [options]  <infile>  [<infile> ...]
 ```
-**F.** _ASSU_ also requires a databank of reference SSU sequences. By default, a version of this databank is provided as a file named `SSUdb.gz` (see details in `SSUdb.version.txt`). However, a more recent version can be quickly built using the provided script `makeSSUdb.sh` with the following command line:
+**F.** _ASSU_ also requires a databank of reference SSU sequences. By default, a version of this databank is provided inside the directory _db/_ as a file named `SSUdb.gz` (see details in `SSUdb.version.txt`). However, a more recent version can be quickly built using the provided script `makeSSUdb.sh` with the following command line:
 ```bash
 ./makeSSUdb.sh
@@ -103,10 +115,12 @@ Run _ASSU_ without option to read the following documentation:
 USAGE:  ASSU  [options]  <infile> [<infile> ...]
 OPTIONS:
-  -d <file>    SSU databank file (default: SSUdb.gz in the same directory as ASSU.sh)
+  -d <file>    SSU databank file (default: db/SSUdb.gz in the same directory as ASSU)
-  -p <string>  restricts the SSU databank to the specified pattern (default: none)
+  -p <string>  restricts the  SSU databank  to the  specified  (extended regex)  pattern
-  -o <string>  output SSU sequence file name (default: ssu.fasta)
+               (default: none)
-  -O <string>  writes the selected reads into the specified file name (default: none)
+  -o <string>  output FASTA-formatted SSU sequence file name (default: ssu.fasta)
+  -O <string>  writes the selected  reads into the  specified FASTQ-formatted  file name
+               (default: none)
  -l <int>     minimum sequence length (default: 1000)
  -L <int>     minimum read length (default: AUTO)
  -Q <int>     minimum base Phred quality value (default: 20)
@@ -115,7 +129,7 @@ Run _ASSU_ without option to read the following documentation:
  -F <float>   minimum proportion of the majority base to infer that base (default: 0.8)
  -A <float>   minimum ratio of the alternative  base(s) to the majority one to add that
               base(s) to the consensus (default: 0.2)
-  -N           set N when multiple bases at a position (default: not set)
+  -N           set N when multiple bases at a consensus position (default: not set)
  -w <dir>     path to the tmp directory (default: $TMPDIR, otherwise /tmp)
  -t <int>     thread numbers (default: 2)
  -v           verbose mode
@@ -124,28 +138,31 @@ Run _ASSU_ without option to read the following documentation:
  -h           prints this help and exit
 EXAMPLES:
-  ASSU  -t 24  -o 16s.fasta  fwd.fastq.gz  rev.fastq.gz  sgl.fastq.bz2
+  ASSU  -t 24  -o 16s.fasta  fwd.fastq.gz  rev.fastq.gz  sgl.fastq.gz
-  ASSU  -d SSUdb.gz  -O 16s.fastq  -p Clostridium  -L 75  -v  *.fastq
+  ASSU  -d SSUdb.gz  -O 16s.fastq  -p "Devosia limi"  -L 75  -v  *.fastq
+  ASSU  -p "Citrobacter|Escherichia|Shigella"  -N  -v  hts.fastq.bz2
 ```
 ## Notes
-* In brief, _ASSU_ first quickly aligns the specified HTS reads against all the reference sequences available in the SSU databank using _bwa-mem2_. This first step enables to determine the most suited reference sequence (called model), as well as the subset of HTS reads that arise from SSU genome region(s). Next, every HTS read from the subset is accurately aligned against the model sequence, and the resulting alignments are processed by _samtools_ to build a final (consensus) sequence.
+* In brief, _ASSU_ first quickly aligns the specified HTS reads against all the reference sequences available in the SSU databank using _bwa-mem2_. This first step enables to determine the most suited reference sequence (called model), as well as the subset of HTS reads that arise from SSU genome regions. Next, every HTS read from the subset is accurately aligned against the model sequence, and the resulting alignments are processed by _samtools_ to build a final (consensus) sequence.
 * _ASSU_ requires at least one HTS read file. Input file(s) should be in FASTQ format and can be compressed using [_gzip_](https://www.gnu.org/software/gzip/), [_bzip2_](https://sourceware.org/bzip2/) or [_DSRC_](https://github.com/refresh-bio/DSRC) (Roguski and Deorowicz 2014). Note that input files compressed using [_bzip2_](https://sourceware.org/bzip2/) or [_DSRC_](https://github.com/refresh-bio/DSRC) require the associated decompression tool to be read (see [Dependencies](#dependencies)).
 * _ASSU_ is not working with long HTS reads, as _bwa-mem2_ is not developed to align HTS reads on significantly shorter reference sequences. The source code of _ASSU_ can be easily modified (on request) to deal with such a case, but long HTS reads generally lead to complete SSU segments via _de novo_ assembly.
-* By default, _ASSU_ expects that the SSU databank file `SSUdb.gz` is located in the same directory. However, an alternative SSU databank file (e.g. different version, different file name) can be specified using option `-d`. The content of the specified SSU databank can be summarized using option `-s`.
+* By default, _ASSU_ expects that the SSU databank file `SSUdb.gz` is located in the directory `db/`. However, an alternative SSU databank file (e.g. different version, different file name) can be specified using option `-d`. The content of the specified SSU databank can be summarized using option `-s`.
-* The running time of _ASSU_ is very dependent on the size of the input files, but faster running times can be obtained using multiple threads (option `-t`) and/or a temporary directory located on a hard drive with high speed (option `-w`). Faster running times can be also observed by restricting the reference SSU sequences using a specified pattern (e.g. sequence accession, genus name; option `-p`).
+* The running time of _ASSU_ is very dependent on the size of the input files, but faster running times can be obtained using multiple threads (option `-t`) and/or a temporary directory located on a hard drive with high speed (option `-w`). 
 * The assembled sequence is written in FASTA format into an output file (option `-o`; default name: `ssu.fasta`). Optionally, the selected HTS reads can be written in FASTQ format into a specified output file (option `-O`).
+* The selection of the model sequence can be oriented/forced by using the option `-p` to set a(n extended-regex) pattern (e.g. accessions, genus, species). It is recommended to specify the pattern between quotation marks.
 * As the assembled SSU sequence is often the consensus of several copies with sequence variation within the sequenced genome (e.g. Větrovský and Baldrian 2013), it may contain ambiguous positions resulting from the consensus of different sequenced bases at those positions. In such cases, degenerated nucleotides are used to represent the consensus of different character states (see e.g. [Table 1](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2865858/table/T1/) in Johnson 2010), or lowercase characters when a deletion (i.e. gap) is involved in the consensus. Note that every degenerated nucleotide can be replaced by the character state `N` using option `-N`.
-* The (number of) ambiguous positions can be slightly modified by considering shorter HTS reads (option `-L`), putative sequencing errors (option `-Q`), weak alignments (option `-M`) or low coverage depth (option `-D`). The consensus definition can be modified by tuning the two options `-F` and `-A` (corresponding to the options `--call-fract` and `--het-fract` of [samtools consensus](http://www.htslib.org/doc/samtools-consensus.html), respectively).
+* The (number of) ambiguous positions can be slightly modified by considering shorter HTS reads (option `-L`), putative sequencing errors (option `-Q`), weak alignments (option `-M`), low coverage depth (option `-D`) or alternative model sequence (option `-p`). The consensus definition can be modified by tuning the two options `-F` and `-A`, corresponding to the options `--call-fract` and `--het-fract` of [samtools consensus](http://www.htslib.org/doc/samtools-consensus.html) (mode simple), respectively.
 * No output file is written in several situations: <br> &emsp; &bullet; insufficient coverage depth (default: at least 50&times;; option `-D`), <br> &emsp; &bullet; too short assembled SSU sequence (default: at least 1,000 bps; option `-l`), <br> &emsp; &bullet; too many ambiguous positions (i.e. more than 5%). 
@@ -172,23 +189,25 @@ Use the following command line to run _ASSU_ on these two FASTQ files using 12 t
 ./ASSU.sh  -t 12  -o FWSEC0011.ssu.fasta  -v  SRR7896249_*.fastq.gz
 ```
-Note that the SSU databank used for this assembly is the version 2024-02-16 (20,404 sequences).
+Note that the SSU databank used for this assembly is the version 2024-02-18 (20,404 sequences).
 As the verbose mode was set (option `-v`), this command line leads to the following output: 
 ```
-# ASSU v1.0
+# ASSU v1.1
 # Copyright (C) 2024 Institut Pasteur
 + https://gitlab.pasteur.fr/GIPhy/ASSU
 > Syst: x86_64-redhat-linux-gnu
 > Bash: 4.4.20(1)-release
+> SSUdb: /local/bin//ASSU/db/SSUdb.gz
+> SSUdb v2024-02-18 (20404 sequences)
 [00:00] checking input files ... [ok]
 + SRR7896249_1.fastq.gz
 + SRR7896249_2.fastq.gz
 [00:00] creating tmp directory .... [ok]
-> TMP_DIR=/tmp/ASSU.f2PzH4mWpX
+> TMP_DIR=/tmp/ASSU.uYf5cUoa6R
-[00:00] examining SSU databank ...... [ok]
+[00:01] examining SSU databank ...... [ok]
 > model:  Bacteria  |  Escherichia fergusonii  |  NR_074902.1  |  1542 bps
-[00:24] building SSU sequence .... [ok]
+[00:27] building SSU sequence .... [ok]
 > 3016 selected reads (903953 bases; lgt > 269)
 > coverage depth: 586x
 > 1543 bps (ambiguous bases: 14)
@@ -226,9 +245,9 @@ As the verbose mode was set (option `-v`), this command line leads to the follow
 1501 ACAAGGTAACCGTAGGGGAACCTGCGGTTGGATCACCTCCTTA
-[00:27] writing output file ... [ok]
+[00:30] writing output file ... [ok]
 + FASTA: FWSEC0011.ssu.fasta
-[00:27] exit
+[00:30] exit
 ```
 The SSU sequence [NR_074902.1](https://www.ncbi.nlm.nih.gov/nuccore/NR_074902.1/) was selected as a model to carry out the reference-guided assembly using 3,016 HTS reads, leading to an assembled (consensus) sequence of length 1,543 bps (coverage depth: 586&times;) written into the FASTA file _FWSEC0011.ssu.fasta_. 
@@ -455,22 +474,24 @@ As the HTS reads arise from an _E. coli_ genome, _ASSU_ can also be run by using
 This command line leads to the following output:
 ```
-# ASSU v1.0
+# ASSU v1.1
 # Copyright (C) 2024 Institut Pasteur
 + https://gitlab.pasteur.fr/GIPhy/ASSU
 > Syst: x86_64-redhat-linux-gnu
 > Bash: 4.4.20(1)-release
+> SSUdb: /local/bin/ASSU/db/SSUdb.gz
+> SSUdb v2024-02-18 (20404 sequences)
 [00:00] checking input files ... [ok]
 + SRR7896249_1.fastq.gz
 + SRR7896249_2.fastq.gz
 [00:00] creating tmp directory .... [ok]
-> TMP_DIR=/tmp/ASSU.lc4DYy80KV
+> TMP_DIR=/tmp/ASSU.TxJar4O6ZP
 [00:00] examining SSU databank ...... [ok]
 > selection pattern: Escherichia coli
 > model:  Bacteria  |  Escherichia coli  |  NR_114042.1  |  1467 bps
-[00:15] building SSU sequence .... [ok]
+[00:25] building SSU sequence .... [ok]
-> 2868 selected reads (859565 bases; lgt > 269)
+> 3016 selected reads (903953 bases; lgt > 269)
-> coverage depth: 585x
+> coverage depth: 616x
 > 1468 bps (ambiguous bases: 14)
              10        20        30        40        50        60        70        80        90       100
               |         |         |         |         |         |         |         |         |         |
@@ -504,9 +525,9 @@ This command line leads to the following output:
 1401 CAAAAGAAGTAGGTAGCTTAACCTTCGGGAGGGCGCTTACCACTTTGTGATTCATGACTGGGGTGAAG
-[00:18] writing output file ... [ok]
+[00:29] writing output file ... [ok]
 + FASTA: ssu.fasta
-[00:18] exit
+[00:29] exit
 ```
 As expected, _ASSU_ assembles a similar 16S rRNA sequence using _E. coli_ as a model (e.g. same ambiguous positions).

--- a/SSUdb.gz
+++ b/SSUdb.gz
--- a/SSUdb.version.txt
+++ b/SSUdb.version.txt
-SSUdb.gz  2024-02-16 05:36:17
-20404 sequences
-built using makeSSUdb v1.0
--- a/db/SSUdb.gz
+++ b/db/SSUdb.gz
--- a/db/SSUdb.version.txt
+++ b/db/SSUdb.version.txt
+SSUdb.gz  2024-02-18 08:49:06
+20404 sequences
+built using makeSSUdb v1.1
--- a/makeSSUdb.sh
+++ b/makeSSUdb.sh
@@ -31,7 +31,10 @@
 # = VERSIONS =                                                                                               #
 # ============                                                                                               #
 #                                                                                                            #
-  VERSION=1.0;                                                                                               #
+  VERSION=1.1;                                                                                               #
+# + best compression ratio                                                                                   #
+#                                                                                                            #
+# VERSION=1.0;                                                                                               #
 #                                                                                                            #
 ##############################################################################################################
@@ -150,7 +153,7 @@ echo "... [ok]" ;
 echo -n "compressing .." ;
 n=$(grep -c "^>" $TMP_DIR/ssu.db);
 echo -n "." ;
-$GZIP -c $TMP_DIR/ssu.db > SSUdb.gz ;
+$GZIP --best -c $TMP_DIR/ssu.db > SSUdb.gz ;
 echo -n "." ;
 { echo  -n "SSUdb.gz  " ;
  date +"%Y-%m-%d %H:%M:%S" ;

--- a/example/FWSEC0011.ssu.fasta
+++ b/example/FWSEC0011.ssu.fasta
->SSU_covr_257_lgt_1543_amb_14
+>contigSSU_covr_586_lgt_1543_amb_14
 AAATTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGGCCTAACACATGCAAGTCGAACGGTAACAGRAARCAGCTTGCTGYTTYGCTGACGAGTGGCGGACGGGTGAGTAATGTCTGGGAAACTGCCTGATGGAGGGGGATAACTACTGGAAACGGTAGCTAATACCGCATAACGTCGCAAGACCAAAGAGGGGGACCTTCGGGCCTCTTGCCATCGGATGTGCCCAGATGGGATTAGCTWGTWGGTGGGGTAACGGCTCACCWAGGCGACGATCCCTAGCTGGTCTGAGAGGATGACCAGCCACACTGGAACTGAGACACGGTCCAGACTCCTACGGGAGGCAGCAGTGGGGAATATTGCACAATGGGCGCAAGCCTGATGCAGCCATGCCGCGTGTATGAAGAAGGCCTTCGGGTTGTAAAGTACTTTCAGCGGGGAGGAAGGGAGTAAAGTTAATACCTTTGCTCATTGACGTTACCCGCAGAAGAAGCACCGGCTAACTCCGTGCCAGCAGCCGCGGTAATACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGCACGCAGGCGGTTTGTTAAGTCAGATGTGAAATCCCCGGGCTCAACCTGGGAACTGCATCTGATACTGGCAAGCTTGAGTCTCGTAGAGGGGGGTAGAATTCCAGGTGTAGCGGTGAAATGCGTAGAGATCTGGAGGAATACCGGTGGCGAAGGCGGCCCCCTGGACGAAGACTGACGCTCAGGTGCGAAAGCGTGGGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGTCGACTTGGAGGTTGTGCCCTTGAGGCGTGGCTTCCGGAGCTAACGCGTTAAGTCGACCGCCTGGGGAGTACGGCCGCAAGGTTAAAACTCAAATGAATTGACGGGGGCCCGCACAAGCGGTGGAGCATGTGGTTTAATTCGATGCAACGCGAAGAACCTTACCTGGTCTTGACATCCACRGAASTTTYCAGAGATGaGAWTgGTGCCTTCGGGAACYGTGAGACAGGTGCTGCATGGCTGTCGTCAGCTCGTGTTGTGAAATGTTGGGTTAAGTCCCGCAACGAGCGCAACCCTTATCCTTTGTTGCCAGCGGTCCGGCCGGGAACTCAAAGGAGACTGCCAGTGATAAACTGGAGGAAGGTGGGGATGACGTCAAGTCATCATGGCCCTTACGACCAGGGCTACACACGTGCTACAATGGCGCATACAAAGAGAAGCGACCTCGCGAGAGCAAGCGGACCTCATAAAGTGCGTCGTAGTCCGGATTGGAGTCTGCAACTCGACTCCATGAAGTCGGAATCGCTAGTAATCGTGGATCAGAATGCCACGGTGAATACGTTCCCGGGCCTTGTACACACCGCCCGTCACACCATGGGAGTGGGTTGCAAAAGAAGTAGGTAGCTTAACCTTCGGGAGGGCGCTTACCACTTTGTGATTCATGACTGGGGTGAAGTCGTAACAAGGTAACCGTAGGGGAACCTGCGGTTGGATCACCTCCTTA