diff --git a/ASSU.sh b/ASSU.sh index f346c18cfb0c39efa686b57f57858312a68e6a97..3f7a7328d22573135c0c548d8cd4ae7bd1e8a96c 100755 --- a/ASSU.sh +++ b/ASSU.sh @@ -31,7 +31,10 @@ # = VERSIONS = # # ============ # # # - VERSION=1.0; # + VERSION=1.1; # +# + SSUdb inside a dedicated directory ($SSUDB_DIR) # +# # +# VERSION=1.0; # # # ############################################################################################################## @@ -131,7 +134,7 @@ # # PUNZIP="$PIGZ_BIN --decompress"; # # -# -- agrep ------------------------------------------------------------------------------------------------ # +# -- zgrep ------------------------------------------------------------------------------------------------ # # # ZGREP="$ZGREP_BIN"; # # @@ -162,10 +165,12 @@ mandoc() { USAGE: ASSU [options] <infile> [<infile> ...] OPTIONS: - -d <file> SSU databank file (default: SSUdb.gz in the same directory as ASSU.sh) - -p <string> restricts the SSU databank to the specified pattern (default: none) - -o <string> output SSU sequence file name (default: ssu.fasta) - -O <string> writes the selected reads into the specified file name (default: none) + -d <file> SSU databank file (default: db/SSUdb.gz in the same directory as ASSU) + -p <string> restricts the SSU databank to the specified (extended regex) pattern + (default: none) + -o <string> output FASTA-formatted SSU sequence file name (default: ssu.fasta) + -O <string> writes the selected reads into the specified FASTQ-formatted file name + (default: none) -l <int> minimum sequence length (default: 1000) -L <int> minimum read length (default: AUTO) -Q <int> minimum base Phred quality value (default: 20) @@ -174,7 +179,7 @@ mandoc() { -F <float> minimum proportion of the majority base to infer that base (default: 0.8) -A <float> minimum ratio of the alternative base(s) to the majority one to add that base(s) to the consensus (default: 0.2) - -N set N when multiple bases at a position (default: not set) + -N set N when multiple bases at a consensus position (default: not set) -w <dir> path to the tmp directory (default: \$TMPDIR, otherwise /tmp) -t <int> thread numbers (default: 2) -v verbose mode @@ -183,8 +188,9 @@ mandoc() { -h prints this help and exit EXAMPLES: - ASSU -t 24 -o 16s.fasta fwd.fastq.gz rev.fastq.gz sgl.fastq.bz2 - ASSU -d SSUdb.gz -O 16s.fastq -p Clostridium -L 75 -v *.fastq + ASSU -t 24 -o 16s.fasta fwd.fastq.gz rev.fastq.gz sgl.fastq.gz + ASSU -d SSUdb.gz -O 16s.fastq -p "Devosia limi" -L 75 -v *.fastq + ASSU -p "Citrobacter|Escherichia|Shigella" -N -v hts.fastq.bz2 EOF } @@ -212,11 +218,11 @@ chrono() { } # # # -- dcontrol --------------------------------------------------------------------------------------------- # -# >> controls required and optional dependancies # +# >> controls required and optional (non-coreutils) dependancies # # # dcontrol() { fail=false; - for binexe in bc fold grep $ZGREP_BIN sed $GAWK_BIN $BWAMEM2_BIN $SAMTOOLS_BIN + for binexe in bc grep $ZGREP_BIN sed $GAWK_BIN $BWAMEM2_BIN $SAMTOOLS_BIN do if [ ! $(command -v $binexe) ]; then echo "[ERROR] $binexe not found" >&2 ; fail=true; fi done @@ -233,7 +239,7 @@ dcontrol() { } # # # -- dcheck ----------------------------------------------------------------------------------------------- # -# >> checks dependancies and exit # +# >> checks (non-coreutils) dependancies and exit # # # dcheck() { echo "Checking required dependencies ..." ; @@ -243,12 +249,6 @@ dcheck() { if [ ! $(command -v $binexe) ]; then echo -e "\e[31m[fail]\e[0m"; else echo -e -n "$(which $binexe)\t\t\e[32m[ok]\e[0m\t\t"; bc -v | head -1 ; fi - ## fold ############################ - echo -e -n "> \e[1mfold\e[0m >1.0 mandatory\t" ; binexe=fold; - echo -e -n "$binexe \t\t" ; - if [ ! $(command -v $binexe) ]; then echo -e "\e[31m[fail]\e[0m"; - else echo -e -n "$(which $binexe)\t\t\e[32m[ok]\e[0m\t\t"; fold --version | head -1 ; - fi ## grep ############################ echo -e -n "> \e[1mgrep\e[0m >1.0 mandatory\t" ; binexe=grep; echo -e -n "$binexe \t\t" ; @@ -345,24 +345,26 @@ dispdb() { if [ $# -lt 1 ]; then mandoc ; exit 1 ; fi -SSUDB=$PWD/SSUdb.gz; # SSU databank -d -OUTFILE=ssu.fasta; # FASTA outfile -o -OUTFQ="$NA"; # FASTQ outfile -O -PATTERN="$NA"; # pattern -p -SLGT=1000; # min seq lgt -l -RLGT="00"; # min read lgt -L -BQ=20; # min Phred -Q -MQ=20; # min map Phread -M -COV=50; # min cov depth -D -FQ=0.8; # min freq -F -FH=0.2; # min het -A -AMB=true; # only N -N -DISPDB=false; # display SSUdb -s -TMP_DIR=${TMPDIR:-/tmp}; # tmp directory -w -NTHREADS=2; # no. threads -t -VERBOSE=false; # verbose mode -v - -while getopts d:p:o:O:l:L:Q:M:D:F:A:w:t:Nvcsh option +SSUDB_DIR=${SSUDB_DIR:-$PWD/db} +SSUDB=$SSUDB_DIR/SSUdb.gz; # SSU databank -d +OUTFILE=ssu.fasta; # FASTA outfile -o +OUTFQ="$NA"; # FASTQ outfile -O +PATTERN="$NA"; # pattern -p +SLGT=1000; # min seq lgt -l +RLGT="00"; # min read lgt -L +BQ=20; # min Phred -Q +MQ=20; # min map Phread -M +COV=50; # min cov depth -D +FQ=0.8; # min freq -F +FH=0.2; # min het -A +AMB=true; # only N -N +DISPDB=false; # display SSUdb -s +TMP_DIR=${TMPDIR:-/tmp}; # tmp directory -w +NTHREADS=2; # no. threads -t +VERBOSE=false; # verbose mode -v +DEBUG=false; # debug mode -X + +while getopts d:p:o:O:l:L:Q:M:D:F:A:w:t:NvcshX option do case $option in d) SSUDB="$OPTARG" ;; @@ -382,6 +384,7 @@ do v) VERBOSE=true ;; c) dcheck ;; s) DISPDB=true ;; + X) DEBUG=true ;; h) mandoc ; exit 0 ;; \?) mandoc ; exit 1 ;; esac @@ -391,11 +394,24 @@ shift "$(( $OPTIND - 1 ))" NFILE=$#; # no. input file(s) FQLIST="$@"; # specified input file(s) +$DEBUG && VERBOSE=true; + + echo "# ASSU v$VERSION" ; echo "# $COPYRIGHT" ; echo "+ https://gitlab.pasteur.fr/GIPhy/ASSU" ; -$VERBOSE && echo "> Syst: $MACHTYPE" ; -$VERBOSE && echo "> Bash: $BASH_VERSION" ; +if $VERBOSE +then + echo "> Syst: $MACHTYPE" ; + echo "> Bash: $BASH_VERSION" ; + echo "> SSUdb: $SSUDB" ; + if [ -s ${SSUDB%.*}.version.txt ] + then + $BAWK '(NR==1){d=$2} + (NR==2){n=$1} + END {print"> SSUdb v"d" ("n" sequences)"}' ${SSUDB%.*}.version.txt + fi +fi [ ! -e $SSUDB ] && echoxit "[ERROR] databank not found (option -d): $SSUDB" ; [ -d $SSUDB ] && echoxit "[ERROR] not a file (option -d): $SSUDB" ; @@ -547,30 +563,20 @@ MESSAGE="examining SSU databank" ; echo -n "$(chrono) $MESSAGE ." ; ## indexing SSU databank ####################################################### -if [ "$PATTERN" == "$NA" ] +if [ "$PATTERN" != "$NA" ] && [ $($ZGREP -c -E "$PATTERN" $SSUDB) -eq 0 ] then - $BWA_INDEX -p $SSU_IDX $SSUDB &>/dev/null ; -else - $ZGREP -A1 -F "$PATTERN" $SSUDB | grep -v -e "^--$" > $REF_SEQ ; - - if [ ! -s $REF_SEQ ] - then - echo ".. [ok]" ; - echo "> no selected sequence with the specified pattern: $PATTERN" ; - finalize ; - exit ; - fi - - $BWA_INDEX -p $SSU_IDX $REF_SEQ &>/dev/null ; - rm -f $REF_SEQ ; + echo ".. [ok]" ; + echo "> no selected sequence with the specified pattern: $PATTERN" ; + finalize ; + exit ; fi +$BWA_INDEX -p $SSU_IDX $SSUDB &>/dev/null ; echo -n "." ; ## aligning HTS reads ########################################################## touch $SAM ; [ $NTHREADS -gt 24 ] && DSRCopt="-t24" || DSRCopt="-t$NTHREADS" ; -# BWAopt="-t $NTHREADS -k 21 -w 50 -D 1.0 -r 2.0 -T $(( $RLGT - 20 ))"; BWAopt="-t $NTHREADS -k 21 -w 50"; for fin in $FQLIST do @@ -588,18 +594,36 @@ done echo " [ok]" ; ## getting ref ################################################################# -## maximizing the no. hits -# accn=$($TAWK '{++c[$3]}END{for(a in c)if(c[a]>1)print c[a]" "a}' $SAM | sort -n | tail -1 | $BAWK '{print$2}'); -## maximizing the avg score among the two most hitted references (as longer sequences often wins when using the basic criterion above) -accn=$($TAWK '{print$3"\t"$14}' $SAM | - sed 's/AS:i://g' | - $TAWK ' {m[$1]+=$2;++n[$1]} - END{for(a in m)if(n[a]>10)print n[a]" "(m[a]/n[a])" "a}' | - sort -rn | head -2 | - sort -rgk2 | head -1 | $BAWK '{print$3}'); +if $DEBUG +then + echo -e "no.hits\t\tsum.scores\tavg.score\tinfo" ; + $TAWK '{print$3"\t"$14}' $SAM | + sed 's/AS:i://g' | + $TAWK ' {m[$1]+=$2;++n[$1]} + END{for(a in m)if(n[a]>10)print n[a]"\t"m[a]"\t"(m[a]/n[a])"\t"a}' | + sort -rn | head | + while read x y z a ; do echo -e "$x\t\t$y\t\t$z\t\t$($ZGREP -m1 -F $a $SSUDB)" ; done | tr -d '>' ; +fi + +if [ "$PATTERN" == "$NA" ] +then + # maximizing the sum of alignment scores (field 14; AS:i:) + accn=$($TAWK '{print$3"\t"$14}' $SAM | sed 's/AS:i://g' | + $TAWK '{s[$1]+=$2}END{for(a in s)print s[a]"\t"a}' | + sort -rg | $TAWK '(NR==1){print$2;exit}'); +else + # selecting accession(s) using $PATTERN + $ZGREP -E "$PATTERN" $SSUDB | $BAWK '{print$1}' | tr -d '>' > $TMP1 ; + # maximizing the sum of alignment scores (AS:i:) among the selected reference sequences + accn=$($TAWK '{print$3"\t"$14}' $SAM | sed 's/AS:i://g' | + $TAWK '{s[$1]+=$2}END{for(a in s)print s[a]"\t"a}' | + grep -F -f $TMP1 | + sort -rg | $TAWK '(NR==1){print$2;exit}'); +fi if [ -z "$accn" ] then + $VERBOSE && [ "$PATTERN" != "$NA" ] && echo "> selection pattern: $PATTERN" ; echo "> no SSU found" ; finalize ; echo "$(chrono) exit" ; @@ -634,7 +658,6 @@ MESSAGE="building SSU sequence" ; echo -n "$(chrono) $MESSAGE ." ; ## getting HTS reads ########################################################### -# $TAWK -v l=$RLGT '(length($10)>=l){print$10"\t"$11"\t"$12}' $SAM | sed 's/NM:i://g' | $TAWK '($3<10){print$1"\t"$2}' | # NOTE: <10 mismatches against the best hit $TAWK -v l=$RLGT '(length($10)>=l){print$10"\t"$11}' $SAM | sort -u | $TAWK '{print"@SSU:"NR; print$1; @@ -676,16 +699,16 @@ $BWA_MEM $BWAopt $REF_IDX $FQTMP 2>/dev/null | echo -n "." ; ## building consensus ########################################################## -# STCopt="--threads $NTHREADS --mode bayesian --ambig --min-MQ 2 --P-het 1.0e-100 --het-scale 1.0"; +# STCopt="--threads $NTHREADS --mode bayesian --ambig --min-MQ 2 --P-het 1.0e-10 --het-scale 1.0"; STCopt="--threads $NTHREADS --mode simple --min-MQ $MQ --min-BQ $BQ --min-depth $COV --use-qual --call-fract $FQ --het-fract $FH"; $AMB && STCopt="$STCopt --ambig"; SEQ=$($SAMTOOLS_CONSENSUS $STCopt $BAM 2>/dev/null | grep -v "^>" | tr -d '\n' | - sed -r 's/^[ACGT]{0,5}[acgtRrYySsWwKkMmBbDdHhVvNn]+//;s/[acgtRrYySsWwKkMmBbDdHhVvNn]+[ACGT]{0,5}$//;' | - sed -r 's/^[ACGT]{0,5}[acgtRrYySsWwKkMmBbDdHhVvNn]+//;s/[acgtRrYySsWwKkMmBbDdHhVvNn]+[ACGT]{0,5}$//;' | - sed -r 's/^[ACGT]{0,5}[acgtRrYySsWwKkMmBbDdHhVvNn]+//;s/[acgtRrYySsWwKkMmBbDdHhVvNn]+[ACGT]{0,5}$//;' | - sed -r 's/^[ACGT]{0,5}[acgtRrYySsWwKkMmBbDdHhVvNn]+//;s/[acgtRrYySsWwKkMmBbDdHhVvNn]+[ACGT]{0,5}$//;' | - sed -r 's/^[ACGT]{0,5}[acgtRrYySsWwKkMmBbDdHhVvNn]+//;s/[acgtRrYySsWwKkMmBbDdHhVvNn]+[ACGT]{0,5}$//;'); + sed -r 's/^[ACGT]{0,5}[acgtRrYySsWwKkMmBbDdHhVvNn]+//;s/[acgtRrYySsWwKkMmBbDdHhVvNn]+[ACGT]{0,5}$//;' | # trimming low + sed -r 's/^[ACGT]{0,5}[acgtRrYySsWwKkMmBbDdHhVvNn]+//;s/[acgtRrYySsWwKkMmBbDdHhVvNn]+[ACGT]{0,5}$//;' | # quality ends + sed -r 's/^[ACGT]{0,5}[acgtRrYySsWwKkMmBbDdHhVvNn]+//;s/[acgtRrYySsWwKkMmBbDdHhVvNn]+[ACGT]{0,5}$//;' | # of the SSU + sed -r 's/^[ACGT]{0,5}[acgtRrYySsWwKkMmBbDdHhVvNn]+//;s/[acgtRrYySsWwKkMmBbDdHhVvNn]+[ACGT]{0,5}$//;' | # sequence (if + sed -r 's/^[ACGT]{0,5}[acgtRrYySsWwKkMmBbDdHhVvNn]+//;s/[acgtRrYySsWwKkMmBbDdHhVvNn]+[ACGT]{0,5}$//;'); # any) echo " [ok]" ; @@ -717,14 +740,6 @@ then while(++i<=w) switch(c[i]){ case "A": case "C": case "G": case "T": printf" "; continue; default: printf"*"; continue; } print""; }' ; -# for b in $(sed 's/./& /g' <<<$"$SEQ") -# do -# case $b in -# A|C|G|T) echo -n "$b" ;; -# *) echo -n -e "\e[31m$b\e[0m" ;; -# esac -# done -# echo ; fi if [ $l -lt $SLGT ] @@ -744,7 +759,7 @@ then exit ; fi -echo ">SSU_covr_$DEPTH""_lgt_$l""_amb_$a" > $OUTFILE ; +echo ">contigSSU_covr_$DEPTH""_lgt_$l""_amb_$a" > $OUTFILE ; echo "$SEQ" >> $OUTFILE ; if [ "$OUTFQ" == "$NA" ] then @@ -763,3 +778,4 @@ echo "$(chrono) exit" ; exit ; + diff --git a/README.md b/README.md index 96248428608cedc51d79e052caf8d91cb0af766c..8d9d01a923568b087e65e5d6a01329d890ea796c 100644 --- a/README.md +++ b/README.md @@ -4,9 +4,9 @@ # ASSU -_ASSU_ (ASSembling SSU) is a command line tool written in [Bash](https://www.gnu.org/software/bash/) to carry out the reference-guided assembly of small subunit (SSU) 16S ribosomal ribonucleic acid (rRNA) using short high-throughput sequencing (HTS) reads derived from whole genome sequencing of bacteria or archaea strains. +_ASSU_ (ASSembling SSU) is a command line tool written in [Bash](https://www.gnu.org/software/bash/) to carry out the reference-guided assembly of small subunit (SSU) 16S ribosomal ribonucleic acid (rRNA) using short high-throughput sequencing (HTS) reads derived from whole genome) sequencing of bacteria or archaea strains. -This tool was developed to compensate the failure of several _de novo_ assembly programs to assemble a non-fragmented SSU segment when the sequenced genome contains different 16S rRNA copies with sequence variation, especially when using short HTS reads. +This tool was developed to compensate the failure of several _de novo_ assembly programs to assemble (at least one) non-fragmented SSU segment when the sequenced genome contains different 16S rRNA copies with sequence variation, especially when using short HTS reads. @@ -14,21 +14,17 @@ This tool was developed to compensate the failure of several _de novo_ assembly You will need to install the required programs and tools listed in the following tables, or to verify that they are already installed with the required version. -##### Mandatory dependencies - <div align="center"> | program | package | version | sources | |:----------------------------------------------------- |:-----------------------------------------------:| ------------:|:------------------------------------------------------------------------------------------------------------------------------------------------------- | -| [_gawk_](https://www.gnu.org/software/gawk/) | - | > 4.0.0 | [ftp.gnu.org/gnu/gawk](http://ftp.gnu.org/gnu/gawk/) | -| _gunzip_<br>_zgrep_ | [_gzip_](https://www.gnu.org/software/gzip/) | > 1.0 | [ftp.gnu.org/gnu/gzip](https://ftp.gnu.org/gnu/gzip/) | | [_bwa-mem2_](https://github.com/bwa-mem2/bwa-mem2) | - | ≥ 2.2.1 | [gitlab.pasteur.fr/GIPhy/contig_info](https://gitlab.pasteur.fr/GIPhy/contig_info) | | [_samtools_](http://www.htslib.org/) | - | ≥ 1.18 | [github.com/samtools/samtools](https://github.com/samtools/samtools)<br>[sourceforge.net/projects/samtools](https://sourceforge.net/projects/samtools) | </div> -##### Optional dependencies +##### Optional programs <div align="center"> @@ -41,6 +37,22 @@ You will need to install the required programs and tools listed in the following </div> +##### Standard GNU packages and utilities + +<div align="center"> + +| program | package | version | sources | +|:------------------------------------------------------- |:-----------------------------------------------------:| ------------:|:------------------------------------------------------------------ | +| _echo_<br>_head_<br>_fold_<br>_paste_<br>_tail_<br>_tr_ | [coreutils](https://www.gnu.org/software/coreutils/) | > 8.0 | [ftp.gnu.org/gnu/coreutils](https://ftp.gnu.org/gnu/coreutils) | +| _gunzip_<br>_zgrep_ | [gzip](https://www.gnu.org/software/gzip/) | > 1.0 | [ftp.gnu.org/gnu/gzip](https://ftp.gnu.org/gnu/gzip/) | +| [_bc_](https://www.gnu.org/software/bc) | - | > 1.0 | [ftp.gnu.org/gnu/bc](https://ftp.gnu.org/gnu/bc) | +| [_gawk_](https://www.gnu.org/software/gawk) | - | > 4.0.0 | [ftp.gnu.org/gnu/gawk](http://ftp.gnu.org/gnu/gawk/) | +| [_grep_](https://www.gnu.org/software/grep) | - | > 2.0 | [ftp.gnu.org/gnu/bc](https://ftp.gnu.org/gnu/grep) | +| [_sed_](https://www.gnu.org/software/sed) | - | > 4.2 | [ftp.gnu.org/gnu/bc](https://ftp.gnu.org/gnu/sed) | + +</div> + + ## Installation and execution **A.** Clone this repository with the following command line: @@ -86,7 +98,7 @@ For each required program, the table below reports the corresponding variable as ./ASSU.sh [options] <infile> [<infile> ...] ``` -**F.** _ASSU_ also requires a databank of reference SSU sequences. By default, a version of this databank is provided as a file named `SSUdb.gz` (see details in `SSUdb.version.txt`). However, a more recent version can be quickly built using the provided script `makeSSUdb.sh` with the following command line: +**F.** _ASSU_ also requires a databank of reference SSU sequences. By default, a version of this databank is provided inside the directory _db/_ as a file named `SSUdb.gz` (see details in `SSUdb.version.txt`). However, a more recent version can be quickly built using the provided script `makeSSUdb.sh` with the following command line: ```bash ./makeSSUdb.sh @@ -103,10 +115,12 @@ Run _ASSU_ without option to read the following documentation: USAGE: ASSU [options] <infile> [<infile> ...] OPTIONS: - -d <file> SSU databank file (default: SSUdb.gz in the same directory as ASSU.sh) - -p <string> restricts the SSU databank to the specified pattern (default: none) - -o <string> output SSU sequence file name (default: ssu.fasta) - -O <string> writes the selected reads into the specified file name (default: none) + -d <file> SSU databank file (default: db/SSUdb.gz in the same directory as ASSU) + -p <string> restricts the SSU databank to the specified (extended regex) pattern + (default: none) + -o <string> output FASTA-formatted SSU sequence file name (default: ssu.fasta) + -O <string> writes the selected reads into the specified FASTQ-formatted file name + (default: none) -l <int> minimum sequence length (default: 1000) -L <int> minimum read length (default: AUTO) -Q <int> minimum base Phred quality value (default: 20) @@ -115,7 +129,7 @@ Run _ASSU_ without option to read the following documentation: -F <float> minimum proportion of the majority base to infer that base (default: 0.8) -A <float> minimum ratio of the alternative base(s) to the majority one to add that base(s) to the consensus (default: 0.2) - -N set N when multiple bases at a position (default: not set) + -N set N when multiple bases at a consensus position (default: not set) -w <dir> path to the tmp directory (default: $TMPDIR, otherwise /tmp) -t <int> thread numbers (default: 2) -v verbose mode @@ -124,28 +138,31 @@ Run _ASSU_ without option to read the following documentation: -h prints this help and exit EXAMPLES: - ASSU -t 24 -o 16s.fasta fwd.fastq.gz rev.fastq.gz sgl.fastq.bz2 - ASSU -d SSUdb.gz -O 16s.fastq -p Clostridium -L 75 -v *.fastq + ASSU -t 24 -o 16s.fasta fwd.fastq.gz rev.fastq.gz sgl.fastq.gz + ASSU -d SSUdb.gz -O 16s.fastq -p "Devosia limi" -L 75 -v *.fastq + ASSU -p "Citrobacter|Escherichia|Shigella" -N -v hts.fastq.bz2 ``` ## Notes -* In brief, _ASSU_ first quickly aligns the specified HTS reads against all the reference sequences available in the SSU databank using _bwa-mem2_. This first step enables to determine the most suited reference sequence (called model), as well as the subset of HTS reads that arise from SSU genome region(s). Next, every HTS read from the subset is accurately aligned against the model sequence, and the resulting alignments are processed by _samtools_ to build a final (consensus) sequence. +* In brief, _ASSU_ first quickly aligns the specified HTS reads against all the reference sequences available in the SSU databank using _bwa-mem2_. This first step enables to determine the most suited reference sequence (called model), as well as the subset of HTS reads that arise from SSU genome regions. Next, every HTS read from the subset is accurately aligned against the model sequence, and the resulting alignments are processed by _samtools_ to build a final (consensus) sequence. * _ASSU_ requires at least one HTS read file. Input file(s) should be in FASTQ format and can be compressed using [_gzip_](https://www.gnu.org/software/gzip/), [_bzip2_](https://sourceware.org/bzip2/) or [_DSRC_](https://github.com/refresh-bio/DSRC) (Roguski and Deorowicz 2014). Note that input files compressed using [_bzip2_](https://sourceware.org/bzip2/) or [_DSRC_](https://github.com/refresh-bio/DSRC) require the associated decompression tool to be read (see [Dependencies](#dependencies)). * _ASSU_ is not working with long HTS reads, as _bwa-mem2_ is not developed to align HTS reads on significantly shorter reference sequences. The source code of _ASSU_ can be easily modified (on request) to deal with such a case, but long HTS reads generally lead to complete SSU segments via _de novo_ assembly. -* By default, _ASSU_ expects that the SSU databank file `SSUdb.gz` is located in the same directory. However, an alternative SSU databank file (e.g. different version, different file name) can be specified using option `-d`. The content of the specified SSU databank can be summarized using option `-s`. +* By default, _ASSU_ expects that the SSU databank file `SSUdb.gz` is located in the directory `db/`. However, an alternative SSU databank file (e.g. different version, different file name) can be specified using option `-d`. The content of the specified SSU databank can be summarized using option `-s`. -* The running time of _ASSU_ is very dependent on the size of the input files, but faster running times can be obtained using multiple threads (option `-t`) and/or a temporary directory located on a hard drive with high speed (option `-w`). Faster running times can be also observed by restricting the reference SSU sequences using a specified pattern (e.g. sequence accession, genus name; option `-p`). +* The running time of _ASSU_ is very dependent on the size of the input files, but faster running times can be obtained using multiple threads (option `-t`) and/or a temporary directory located on a hard drive with high speed (option `-w`). * The assembled sequence is written in FASTA format into an output file (option `-o`; default name: `ssu.fasta`). Optionally, the selected HTS reads can be written in FASTQ format into a specified output file (option `-O`). +* The selection of the model sequence can be oriented/forced by using the option `-p` to set a(n extended-regex) pattern (e.g. accessions, genus, species). It is recommended to specify the pattern between quotation marks. + * As the assembled SSU sequence is often the consensus of several copies with sequence variation within the sequenced genome (e.g. VÄ›trovský and Baldrian 2013), it may contain ambiguous positions resulting from the consensus of different sequenced bases at those positions. In such cases, degenerated nucleotides are used to represent the consensus of different character states (see e.g. [Table 1](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2865858/table/T1/) in Johnson 2010), or lowercase characters when a deletion (i.e. gap) is involved in the consensus. Note that every degenerated nucleotide can be replaced by the character state `N` using option `-N`. -* The (number of) ambiguous positions can be slightly modified by considering shorter HTS reads (option `-L`), putative sequencing errors (option `-Q`), weak alignments (option `-M`) or low coverage depth (option `-D`). The consensus definition can be modified by tuning the two options `-F` and `-A` (corresponding to the options `--call-fract` and `--het-fract` of [samtools consensus](http://www.htslib.org/doc/samtools-consensus.html), respectively). +* The (number of) ambiguous positions can be slightly modified by considering shorter HTS reads (option `-L`), putative sequencing errors (option `-Q`), weak alignments (option `-M`), low coverage depth (option `-D`) or alternative model sequence (option `-p`). The consensus definition can be modified by tuning the two options `-F` and `-A`, corresponding to the options `--call-fract` and `--het-fract` of [samtools consensus](http://www.htslib.org/doc/samtools-consensus.html) (mode simple), respectively. * No output file is written in several situations: <br>   • insufficient coverage depth (default: at least 50×; option `-D`), <br>   • too short assembled SSU sequence (default: at least 1,000 bps; option `-l`), <br>   • too many ambiguous positions (i.e. more than 5%). @@ -172,23 +189,25 @@ Use the following command line to run _ASSU_ on these two FASTQ files using 12 t ./ASSU.sh -t 12 -o FWSEC0011.ssu.fasta -v SRR7896249_*.fastq.gz ``` -Note that the SSU databank used for this assembly is the version 2024-02-16 (20,404 sequences). +Note that the SSU databank used for this assembly is the version 2024-02-18 (20,404 sequences). As the verbose mode was set (option `-v`), this command line leads to the following output: ``` -# ASSU v1.0 +# ASSU v1.1 # Copyright (C) 2024 Institut Pasteur + https://gitlab.pasteur.fr/GIPhy/ASSU > Syst: x86_64-redhat-linux-gnu > Bash: 4.4.20(1)-release +> SSUdb: /local/bin//ASSU/db/SSUdb.gz +> SSUdb v2024-02-18 (20404 sequences) [00:00] checking input files ... [ok] + SRR7896249_1.fastq.gz + SRR7896249_2.fastq.gz [00:00] creating tmp directory .... [ok] -> TMP_DIR=/tmp/ASSU.f2PzH4mWpX -[00:00] examining SSU databank ...... [ok] +> TMP_DIR=/tmp/ASSU.uYf5cUoa6R +[00:01] examining SSU databank ...... [ok] > model: Bacteria | Escherichia fergusonii | NR_074902.1 | 1542 bps -[00:24] building SSU sequence .... [ok] +[00:27] building SSU sequence .... [ok] > 3016 selected reads (903953 bases; lgt > 269) > coverage depth: 586x > 1543 bps (ambiguous bases: 14) @@ -226,9 +245,9 @@ As the verbose mode was set (option `-v`), this command line leads to the follow 1501 ACAAGGTAACCGTAGGGGAACCTGCGGTTGGATCACCTCCTTA -[00:27] writing output file ... [ok] +[00:30] writing output file ... [ok] + FASTA: FWSEC0011.ssu.fasta -[00:27] exit +[00:30] exit ``` The SSU sequence [NR_074902.1](https://www.ncbi.nlm.nih.gov/nuccore/NR_074902.1/) was selected as a model to carry out the reference-guided assembly using 3,016 HTS reads, leading to an assembled (consensus) sequence of length 1,543 bps (coverage depth: 586×) written into the FASTA file _FWSEC0011.ssu.fasta_. @@ -455,22 +474,24 @@ As the HTS reads arise from an _E. coli_ genome, _ASSU_ can also be run by using This command line leads to the following output: ``` -# ASSU v1.0 +# ASSU v1.1 # Copyright (C) 2024 Institut Pasteur + https://gitlab.pasteur.fr/GIPhy/ASSU > Syst: x86_64-redhat-linux-gnu > Bash: 4.4.20(1)-release +> SSUdb: /local/bin/ASSU/db/SSUdb.gz +> SSUdb v2024-02-18 (20404 sequences) [00:00] checking input files ... [ok] + SRR7896249_1.fastq.gz + SRR7896249_2.fastq.gz [00:00] creating tmp directory .... [ok] -> TMP_DIR=/tmp/ASSU.lc4DYy80KV +> TMP_DIR=/tmp/ASSU.TxJar4O6ZP [00:00] examining SSU databank ...... [ok] > selection pattern: Escherichia coli > model: Bacteria | Escherichia coli | NR_114042.1 | 1467 bps -[00:15] building SSU sequence .... [ok] -> 2868 selected reads (859565 bases; lgt > 269) -> coverage depth: 585x +[00:25] building SSU sequence .... [ok] +> 3016 selected reads (903953 bases; lgt > 269) +> coverage depth: 616x > 1468 bps (ambiguous bases: 14) 10 20 30 40 50 60 70 80 90 100 | | | | | | | | | | @@ -504,9 +525,9 @@ This command line leads to the following output: 1401 CAAAAGAAGTAGGTAGCTTAACCTTCGGGAGGGCGCTTACCACTTTGTGATTCATGACTGGGGTGAAG -[00:18] writing output file ... [ok] +[00:29] writing output file ... [ok] + FASTA: ssu.fasta -[00:18] exit +[00:29] exit ``` As expected, _ASSU_ assembles a similar 16S rRNA sequence using _E. coli_ as a model (e.g. same ambiguous positions). diff --git a/SSUdb.gz b/SSUdb.gz deleted file mode 100644 index a4fef7b44b49e701e1a26a1fab7dc7a472d7262b..0000000000000000000000000000000000000000 Binary files a/SSUdb.gz and /dev/null differ diff --git a/SSUdb.version.txt b/SSUdb.version.txt deleted file mode 100644 index 292c9653895a04a7c3294a8ecd001d8ebe95fd2e..0000000000000000000000000000000000000000 --- a/SSUdb.version.txt +++ /dev/null @@ -1,3 +0,0 @@ -SSUdb.gz 2024-02-16 05:36:17 -20404 sequences -built using makeSSUdb v1.0 diff --git a/db/SSUdb.gz b/db/SSUdb.gz new file mode 100644 index 0000000000000000000000000000000000000000..2c515ecd221c655e0c33226be87e581f9edd2aca Binary files /dev/null and b/db/SSUdb.gz differ diff --git a/db/SSUdb.version.txt b/db/SSUdb.version.txt new file mode 100644 index 0000000000000000000000000000000000000000..27221c7c1ec4d8c477d3e110e15ba8b53b7d1a7c --- /dev/null +++ b/db/SSUdb.version.txt @@ -0,0 +1,3 @@ +SSUdb.gz 2024-02-18 08:49:06 +20404 sequences +built using makeSSUdb v1.1 diff --git a/makeSSUdb.sh b/db/makeSSUdb.sh similarity index 96% rename from makeSSUdb.sh rename to db/makeSSUdb.sh index 70aeca9224efa9883a28aa8ff7c2b4e5dba58a4a..45e5a1a77fbb06a8504355d1fe1abb4792fbf466 100755 --- a/makeSSUdb.sh +++ b/db/makeSSUdb.sh @@ -31,7 +31,10 @@ # = VERSIONS = # # ============ # # # - VERSION=1.0; # + VERSION=1.1; # +# + best compression ratio # +# # +# VERSION=1.0; # # # ############################################################################################################## @@ -150,7 +153,7 @@ echo "... [ok]" ; echo -n "compressing .." ; n=$(grep -c "^>" $TMP_DIR/ssu.db); echo -n "." ; -$GZIP -c $TMP_DIR/ssu.db > SSUdb.gz ; +$GZIP --best -c $TMP_DIR/ssu.db > SSUdb.gz ; echo -n "." ; { echo -n "SSUdb.gz " ; date +"%Y-%m-%d %H:%M:%S" ; diff --git a/example/FWSEC0011.ssu.fasta b/example/FWSEC0011.ssu.fasta index 65f738a8cf6615806efa60f544ac4a946e983d69..8382807d55298f80e04236366276b051782a7d07 100644 --- a/example/FWSEC0011.ssu.fasta +++ b/example/FWSEC0011.ssu.fasta @@ -1,2 +1,2 @@ ->SSU_covr_257_lgt_1543_amb_14 +>contigSSU_covr_586_lgt_1543_amb_14 AAATTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGGCCTAACACATGCAAGTCGAACGGTAACAGRAARCAGCTTGCTGYTTYGCTGACGAGTGGCGGACGGGTGAGTAATGTCTGGGAAACTGCCTGATGGAGGGGGATAACTACTGGAAACGGTAGCTAATACCGCATAACGTCGCAAGACCAAAGAGGGGGACCTTCGGGCCTCTTGCCATCGGATGTGCCCAGATGGGATTAGCTWGTWGGTGGGGTAACGGCTCACCWAGGCGACGATCCCTAGCTGGTCTGAGAGGATGACCAGCCACACTGGAACTGAGACACGGTCCAGACTCCTACGGGAGGCAGCAGTGGGGAATATTGCACAATGGGCGCAAGCCTGATGCAGCCATGCCGCGTGTATGAAGAAGGCCTTCGGGTTGTAAAGTACTTTCAGCGGGGAGGAAGGGAGTAAAGTTAATACCTTTGCTCATTGACGTTACCCGCAGAAGAAGCACCGGCTAACTCCGTGCCAGCAGCCGCGGTAATACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGCACGCAGGCGGTTTGTTAAGTCAGATGTGAAATCCCCGGGCTCAACCTGGGAACTGCATCTGATACTGGCAAGCTTGAGTCTCGTAGAGGGGGGTAGAATTCCAGGTGTAGCGGTGAAATGCGTAGAGATCTGGAGGAATACCGGTGGCGAAGGCGGCCCCCTGGACGAAGACTGACGCTCAGGTGCGAAAGCGTGGGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGTCGACTTGGAGGTTGTGCCCTTGAGGCGTGGCTTCCGGAGCTAACGCGTTAAGTCGACCGCCTGGGGAGTACGGCCGCAAGGTTAAAACTCAAATGAATTGACGGGGGCCCGCACAAGCGGTGGAGCATGTGGTTTAATTCGATGCAACGCGAAGAACCTTACCTGGTCTTGACATCCACRGAASTTTYCAGAGATGaGAWTgGTGCCTTCGGGAACYGTGAGACAGGTGCTGCATGGCTGTCGTCAGCTCGTGTTGTGAAATGTTGGGTTAAGTCCCGCAACGAGCGCAACCCTTATCCTTTGTTGCCAGCGGTCCGGCCGGGAACTCAAAGGAGACTGCCAGTGATAAACTGGAGGAAGGTGGGGATGACGTCAAGTCATCATGGCCCTTACGACCAGGGCTACACACGTGCTACAATGGCGCATACAAAGAGAAGCGACCTCGCGAGAGCAAGCGGACCTCATAAAGTGCGTCGTAGTCCGGATTGGAGTCTGCAACTCGACTCCATGAAGTCGGAATCGCTAGTAATCGTGGATCAGAATGCCACGGTGAATACGTTCCCGGGCCTTGTACACACCGCCCGTCACACCATGGGAGTGGGTTGCAAAAGAAGTAGGTAGCTTAACCTTCGGGAGGGCGCTTACCACTTTGTGATTCATGACTGGGGTGAAGTCGTAACAAGGTAACCGTAGGGGAACCTGCGGTTGGATCACCTCCTTA