From 66c95a4eddffb59e1d602f495b4c283dc0c1bbb3 Mon Sep 17 00:00:00 2001 From: Alexis CRISCUOLO <alexis.criscuolo@pasteur.fr> Date: Tue, 6 Apr 2021 08:18:24 +0200 Subject: [PATCH] 2.0 --- README.md | 19 ++-- wgetENAHTS.sh | 239 ++++++++++++++++++++++++++++++++++---------------- 2 files changed, 173 insertions(+), 85 deletions(-) diff --git a/README.md b/README.md index 2aace6d..4d507ab 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # wgetENAHTS _wgetENAHTS_ is a command line program written in [Bash](https://www.gnu.org/software/bash/) to download gzipped FASTQ files from the [European Nucleotide Archive](https://www.ebi.ac.uk/ena/browser/home) (ENA) [ftp repository](ftp://ftp.sra.ebi.ac.uk/vol1/fastq/). -Every download is performed by the standard tool [_wget_](https://www.gnu.org/software/wget/). +Every download is performed using the standard tool [_wget_](https://www.gnu.org/software/wget/). ## Installation and execution @@ -27,8 +27,8 @@ Execute _wgetENAHTS_ with the following command line model: Run _wgetENAHTS_ without option to read the following documentation: ``` - USAGE: wgetENAHTS.sh [[-o <dir>] [-f <infile>] [-t <nthreads>] - [-r <rate>] [-w <sec>] [-n] [-h]] [<accn> ...] + USAGE: wgetENAHTS.sh [[-o <dir>] [-f <infile>] + [-t <nthreads>] [-r <rate>] [-n] [-h]] [<accn> ...] Downloads FASTQ files corresponding to the specified DRR/ERR/SRR accession(s) Files are downloaded from the ENA ftp repository ftp.sra.ebi.ac.uk/vol1/fastq @@ -37,11 +37,9 @@ Run _wgetENAHTS_ without option to read the following documentation: -o <dir> output directory (default: .) 
-f <file> to read accession(s) from the specified file (default: all the last arguments) - -t <int> maximum number of concurrent download(s) (default: 2) + -t <int> number of thread(s) (default: 2) -r <int> maximum download rate per file (in kb per seconds; default: entire available bandwidth) - -w <int> waiting time between each successive download (in seconds; default: - same as the specified value for option -t) -n no file download, only check (default: not set) -h prints this help and exits @@ -66,17 +64,18 @@ Run _wgetENAHTS_ without option to read the following documentation: + same as above with 9 parallel downloads and 500kb/sec download rate per file: wgetENAHTS.sh -t 9 -r 500 -f accn.txt - ``` ## Notes -* The HTS read accessions should starts with DRR, ERR or SRR (specified as arguments, or via a text file using option `-f`). The output file names are identical to those available in the repository corresponding to each specified accession identifier. Every downloaded file has file extension `.fastq.gz`. +* The HTS read accessions should start with DRR, ERR or SRR (specified as final arguments, or in a text file using option `-f`). The output file names are identical to those available in the repository corresponding to each specified accession identifier. Every downloaded file has file extension `.fastq.gz`. + +* After checking the existence of a repository for each specified accession, a first step of (parallel) downloading is performed. Each downloaded file that seems incomplete (or missing) is downloaded a second time. -* For a given DRR/ERR/SRR accession, the existence of a repository within the ENA can be easily assessed using option `-n`. +* No download is performed when the output directory already contains files named with the specified accessions. -* After a first step of (parallel) downloading, the integrity of each gathered file is assessed. Each downloaded file that seems incomplete (or missing) are downloaded a second time. 
+* For a given DRR/ERR/SRR accession, the existence of a repository within the ENA can be easily assessed using option `-n` (i.e. no file download). * Fast running times are expected when running _wgetENAHTS_ on multiple threads (option `-t`). Depending on the bandwidth, the maximum download rate per file can be restricted using option `-r`. diff --git a/wgetENAHTS.sh b/wgetENAHTS.sh index b5c749d..981e4a2 100755 --- a/wgetENAHTS.sh +++ b/wgetENAHTS.sh @@ -33,7 +33,13 @@ # = VERSIONS = # # ============ # # # - VERSION=1.0.210327acjg # + VERSION=2.0.210406acjg # +# + option -w removed # +# + one thread per file, instead of one thread per accession # +# + gzip file integrity assessment is also multi-threaded # +# + modified output # +# # +# VERSION=1.0.210327acjg # # # ############################################################################################################## @@ -62,6 +68,17 @@ echoxit() { echo "$1" >&2 ; exit 1 ; } # # +# = fb() ================================================================================================= # +# prints the specified byte size $1 in rounded format # +# # +fb() { + if [ $1 -gt 1073741824 ]; then echo "$(bc -l <<<"scale=2;$1/1073741824" | sed 's/^\./0\./') Gb" ; + elif [ $1 -gt 1048576 ]; then echo "$(bc -l <<<"scale=2;$1/1048576" | sed 's/^\./0\./') Mb" ; + elif [ $1 -gt 1024 ]; then echo "$(bc -l <<<"scale=2;$1/1024" | sed 's/^\./0\./') kb" ; + else echo "$1 b" ; fi +} +typeset -fx fb ; +# # # = chrono() ============================================================================================= # # prints formatted elapsed time # # # @@ -76,8 +93,8 @@ mandoc() { echo -e "\n\033[1m wgetENAHTS v$VERSION $COPYRIGHT\033[0m"; cat <<EOF - USAGE: wgetENAHTS.sh [[-o <dir>] [-f <infile>] [-t <nthreads>] - [-r <rate>] [-w <sec>] [-n] [-h]] [<accn> ...] + USAGE: wgetENAHTS.sh [[-o <dir>] [-f <infile>] + [-t <nthreads>] [-r <rate>] [-n] [-h]] [<accn> ...] 
Downloads FASTQ files corresponding to the specified DRR/ERR/SRR accession(s) Files are downloaded from the ENA ftp repository ftp.sra.ebi.ac.uk/vol1/fastq @@ -86,11 +103,9 @@ mandoc() { -o <dir> output directory (default: .) -f <file> to read accession(s) from the specified file (default: all the last arguments) - -t <int> maximum number of concurrent download(s) (default: 2) + -t <int> number of thread(s) (default: 2) -r <int> maximum download rate per file (in kb per seconds; default: entire available bandwidth) - -w <int> waiting time between each successive download (in seconds; default: - same as the specified value for option -t) -n no file download, only check (default: not set) -h prints this help and exits @@ -155,6 +170,7 @@ if [ $# -lt 1 ]; then mandoc ; exit 1 ; fi FTPENA="ftp://ftp.sra.ebi.ac.uk/vol1/fastq"; WGET_DWNL="$WGET --no-clobber --continue --recursive --no-parent --level=1 --no-directories"; WGET_TEST="$WGET --spider"; +WGET_RESP="$WGET_TEST --server-response"; NTHREADS=2; OUTDIR="."; @@ -208,37 +224,45 @@ fi [[ $NTHREADS =~ ^[0-9]+$ ]] || echoxit "incorrect value (option -t): $NTHREADS" ; [ $NTHREADS -lt 1 ] && NTHREADS=1; [[ $WAITIME =~ ^[0-9]+$ ]] || echoxit "incorrect value (option -w): $WAITIME" ; - [ $WAITIME -eq 0 ] && WAITIME=$NTHREADS; +if [ $WAITIME -eq 0 ] +then + wt=0; while [ $(( $wt * $wt )) -lt $NTHREADS ]; do let wt++ ; done + WAITIME=$wt; +fi [ $WAITIME -lt 1 ] && WAITIME=1; -TMPF=$(mktemp -t -p ${TMPDIR:-/tmp}); + ############################################################################################################## #### #### -#### DOWNLOADING FASTQ FILES #### +#### CHECKING REPOSITORIES #### #### #### ############################################################################################################## ACCNLIST="$@"; -N=$#; +NA=$#; if [ -s $INFILE ] then ACCNLIST="$(tr -d '\r' < $INFILE) $ACCNLIST" ; - N=$(( $N + $(tr -d '\r' < $INFILE | wc -l) )); + NA=$(( $NA + $(tr -d '\r' < $INFILE | wc -l) )); fi -[ $N 
-eq 0 ] && echoxit "no accession found" ; +[ $NA -eq 0 ] && echoxit "no accession found" ; + +if [ $NA -eq 1 ]; then echo -e "$(chrono)\t\t$NA specified accession" ; +else echo -e "$(chrono)\t\t$NA specified accessions" ; fi +echo ; -trap "rm -f $TMPF;echo;for a in $(echo $ACCNLIST);do if ls $OUTDIR/\$a*.fastq.gz&>/dev/null;then for f in $OUTDIR/\$a*.fastq.gz;do if ! $GZIP -t \$f&>/dev/null;then echo removing \$f;rm -f \$f;fi;done;fi;done;exit 1;" SIGINT ; +trap "echo;echo interrupting;for a in \$(echo $ACCNLIST);do rm -f $OUTDIR/\$a.weh;done;exit 1;" SIGINT ; C=0; -DL=0; echo $DL > $TMPF ; +DL=0; for ACCN in $ACCNLIST do let C++ ; - OUTFQ=$OUTDIR/$ACCN*.fastq.gz; [ "$OUTDIR" == "." ] && OUTFQ=$ACCN*.fastq.gz; + [ "$OUTDIR" == "." ] && OUTFQ=$ACCN*.fastq.gz || OUTFQ=$OUTDIR/$ACCN*.fastq.gz; if $DWNL && ls $OUTFQ &>/dev/null then - echo -e "[$C/$N]\033[34m file(s) already exist(s) for accession $ACCN\033[0m" >&2 ; - ls -lho $OUTFQ | while read line ; do echo -e "\033[90m$line\033[0m" ; done >&2 ; + echo -e "[$C/$NA]\t\t\033[34mfile(s) already exist(s) for accession $ACCN\033[0m" >&2 ; + stat -c "%s %n" $OUTFQ | while read s n ; do echo -e "\033[90m$n\t[$(fb $s)]\033[0m" >&2 ; done continue ; fi nc=${#ACCN}; @@ -246,95 +270,160 @@ do elif [ $nc -eq 10 ]; then URL="$FTPENA/${ACCN:0:6}/00${ACCN:9:1}/$ACCN/"; else URL="$FTPENA/${ACCN:0:6}/0${ACCN:9:2}/$ACCN/"; fi - if $WGET_TEST $URL + if $WGET_TEST $URL || $WGET_TEST $URL then if $DWNL then - let DL++ ; echo $DL > $TMPF ; - ECHS="[$C/$N] downloading content of $URL"; - ECHE="[$C/$N] download completed for accession $ACCN"; - ECHP="[WARNING] download problem for accession $ACCN"; - echo "echo $ECHS; sleep $WAITIME; while true; do $WGET_DWNL $URL && break; done; if ls $OUTFQ &>/dev/null; then echo $ECHE; else echo $ECHP >&2; fi" ; - sleep $WAITIME ; + echo -n -e "[$C/$NA]\t\tchecking repository for accession $ACCN ." ; + sleep 0.5 &>/dev/null ; + echo -n "." ; + let DL++; + echo -n "." 
; + while true; do $WGET_RESP $URL 2>&1 && break; done | grep -F "fastq.gz" > $OUTDIR/$ACCN.weh ; + echo " [ok]" ; else - echo -e "[$C/$N]\033[32m existing repository for accession $ACCN: $URL\033[0m" >&2 ; - sleep 1 ; + echo -e "[$C/$NA]\t\t\033[32mexisting repository for accession $ACCN: $URL\033[0m" >&2 ; fi else - echo -e "[$C/$N]\033[31m nothing found for accession $ACCN\033[0m" >&2 ; - sleep 1 ; + echo -e "[$C/$NA]\t\t\033[31mnothing found for accession $ACCN\033[0m" >&2 ; + fi + sleep 0.5 &>/dev/null ; +done + +echo ; +if [ $DL -eq 0 ]; then exit 0 ; +elif [ $DL -eq 1 ]; then echo -e -n "$(chrono)\t\t$DL valid accession; " ; +else echo -e -n "$(chrono)\t\t$DL valid accessions; " ; fi + + +############################################################################################################## +#### #### +#### DOWNLOADING FASTQ FILES #### +#### #### +############################################################################################################## + +N=0; +for ACCN in $ACCNLIST +do + if [ ! -s $OUTDIR/$ACCN.weh ]; then continue ; fi + nf=$(grep -c -F ".fastq.gz" $OUTDIR/$ACCN.weh); + N=$(( $N + $nf )); +done + +if [ $N -eq 1 ]; then echo "$N file to download" ; +else echo "$N files to download" ; fi +echo ; + +trap "echo;echo interrupting;if ls $OUTDIR/*.fastq.gz.weh &>/dev/null;then for f in $OUTDIR/*.fastq.gz.weh;do if [ \$(cat \$f) -ne 0 ];then echo removing \${f%.*};rm -f \$f \${f%.*};fi;done;fi;rm -f $OUTDIR/*.weh;exit 1;" SIGINT ; + +C=0; +for ACCN in $ACCNLIST +do + if [ ! 
-s $OUTDIR/$ACCN.weh ]; then continue ; fi + nc=${#ACCN}; + if [ $nc -eq 9 ]; then URL="$FTPENA/${ACCN:0:6}/$ACCN/"; + elif [ $nc -eq 10 ]; then URL="$FTPENA/${ACCN:0:6}/00${ACCN:9:1}/$ACCN/"; + else URL="$FTPENA/${ACCN:0:6}/0${ACCN:9:2}/$ACCN/"; fi + for FQGZ in $(sed "s/.*$ACCN/$ACCN/g" $OUTDIR/$ACCN.weh) + do + let C++ ; + CMD1="echo -e \"[$C/$N]$'\t'$'\t'downloading$'\t'$FQGZ\" ; sleep $WAITIME"; + CMD2="while true ; do $WGET_DWNL $URL$FQGZ && break ; done"; + CMD3="$GZIP --test $OUTDIR/$FQGZ &>/dev/null ; echo \$? > $OUTDIR/$FQGZ.weh"; + CMD4="s=\$(fb \$(stat -c %s $OUTDIR/$FQGZ))"; + CMD5="echo \"[$C/$N]$'\t'$'\t'completed$'\t'$FQGZ$'\t'[\$s]\""; + echo "$CMD1 ; $CMD2 ; $CMD3 ; $CMD4 ; $CMD5" ; + sleep $WAITIME ; + done done | xargs -P $NTHREADS -I CMD bash -c CMD ; +N=0; +for ACCN in $ACCNLIST +do + if [ ! -s $OUTDIR/$ACCN.weh ]; then continue ; fi + for FQGZ in $(sed "s/.*$ACCN/$ACCN/g" $OUTDIR/$ACCN.weh) + do + [ -e $OUTDIR/$FQGZ ] && let N++; + done +done -DL=$(cat $TMPF); rm -f $TMPF ; -if [ $DL -eq 0 ]; then exit 0 ; fi ## NOTE: no download -if [ $DL -eq 1 ] -then echo "$(chrono) $DL downloaded repository" ; -else echo "$(chrono) $DL downloaded repositories" ; -fi +echo ; +if [ $N -eq 1 ]; then echo -e "$(chrono)\t\t$N downloaded file" ; +else echo -e "$(chrono)\t\t$N downloaded files" ; fi ############################################################################################################## #### #### -#### CHECKING GZIPPED FASTQ FILES AND DOWNLOADING AGAIN (IF REQUIRED) #### +#### CHECKING FASTQ FILES AND DOWNLOADING AGAIN (IF REQUIRED) #### #### #### ############################################################################################################## C=0; -NF=0; for ACCN in $ACCNLIST do - let C++ ; - OUTFQ=$OUTDIR/$ACCN*.fastq.gz; [ "$OUTDIR" == "." ] && OUTFQ=$ACCN*.fastq.gz; + if [ ! 
-s $OUTDIR/$ACCN.weh ]; then continue ; fi nc=${#ACCN}; if [ $nc -eq 9 ]; then URL="$FTPENA/${ACCN:0:6}/$ACCN/"; elif [ $nc -eq 10 ]; then URL="$FTPENA/${ACCN:0:6}/00${ACCN:9:1}/$ACCN/"; else URL="$FTPENA/${ACCN:0:6}/0${ACCN:9:2}/$ACCN/"; fi - if $WGET_TEST $URL - then - echo -n "[$C/$N] checking downloaded files for accession $ACCN ..." ; - n1=$(ls $OUTFQ 2>/dev/null | wc -l); - if [ $n1 -eq 0 ] + for FQGZ in $(sed "s/.*$ACCN/$ACCN/g" $OUTDIR/$ACCN.weh) + do + let C++ ; + if [ ! -e $OUTDIR/$FQGZ ] || [ ! -s $OUTDIR/$FQGZ.weh ] || [ "$(cat $OUTDIR/$FQGZ.weh)" -ne 0 ] then - echo -e "\n\033[31m[WARNING] problem with accession $ACCN\033[0m" >&2 ; - echo -n "[$C/$N] downloading again content of $URL ..." ; - else - for f in $OUTFQ - do - if ! $GZIP --test $f 2>/dev/null - then - echo -e "\n\033[31m[WARNING] problem with file $f\033[0m" >&2 ; - rm -f $f ; - echo -n "[$C/$N] downloading again $f ..." ; - else - echo -n "." ; - fi - done - n1=$(ls $OUTFQ 2>/dev/null | wc -l); + rm -f $OUTDIR/$FQGZ ; + echo -e "\033[31m[WARNING]\t\tproblem with file $FQGZ\033[0m" >&2 ; + CMD1="echo \"[$C/$N]$'\t'$'\t'downloading$'\t'$FQGZ\" ; sleep $WAITIME"; + CMD2="while true ; do $WGET_DWNL $URL$FQGZ && break ; done"; + CMD3="$GZIP --test $OUTDIR/$FQGZ &>/dev/null ; echo \$? 
> $OUTDIR/$FQGZ.weh"; + CMD4="s=\$(fb \$(stat -c %s $OUTDIR/$FQGZ))"; + CMD5="echo \"[$C/$N]$'\t'$'\t'completed$'\t'$FQGZ$'\t'[\$s]\""; + echo "$CMD1 ; $CMD2 ; $CMD3 ; $CMD4 ; $CMD5" ; + sleep $WAITIME ; fi - while true ; do $WGET_DWNL $URL && break ; done ; ## NOTE: does nothing when all files were downloaded - n2=$(ls $OUTFQ 2>/dev/null | wc -l); - if [ $n2 -eq 0 ] + done +done | xargs -P $NTHREADS -I CMD bash -c CMD ; + +echo ; + +############################################################################################################## +#### #### +#### FINALIZING #### +#### #### +############################################################################################################## + +C=0; +for ACCN in $ACCNLIST +do + if [ ! -s $OUTDIR/$ACCN.weh ]; then continue ; fi + nc=${#ACCN}; + if [ $nc -eq 9 ]; then URL="$FTPENA/${ACCN:0:6}/$ACCN/"; + elif [ $nc -eq 10 ]; then URL="$FTPENA/${ACCN:0:6}/00${ACCN:9:1}/$ACCN/"; + else URL="$FTPENA/${ACCN:0:6}/0${ACCN:9:2}/$ACCN/"; + fi + for FQGZ in $(sed "s/.*$ACCN/$ACCN/g" $OUTDIR/$ACCN.weh) + do + let C++ ; + if [ ! -s $OUTDIR/$FQGZ ] then - echo -e "\n\033[31m[FAIL] unable to download content of $URL\033[0m" >&2 ; - sleep $WAITIME ; + echo -e "[$C/$N]\t\t\033[31m[FAIL] unable to download $URL/$FQGZ\033[0m" >&2 ; + elif [ ! 
-s $OUTDIR/$FQGZ.weh ] + then + echo -e "[$C/$N]\t\t\033[31m[FAIL] unable to verify $OUTDIR/$FQGZ\033[0m" >&2 ; + elif [ "$(cat $OUTDIR/$FQGZ.weh)" -ne 0 ] + then + echo -e "[$C/$N]\t\t\033[31m[FAIL] incorrect gzip integrity: $OUTDIR/$FQGZ\033[0m" >&2 ; else - echo " [ok]" ; - ls -lho $OUTFQ ; - NF=$(( $NF + $n2 )); - if [ $n1 -eq $n2 ]; then sleep 0.5 ; else sleep $WAITIME ; fi - fi - else - sleep 0.5 ; - fi + echo -e "[$C/$N]\t\t$(md5sum -b $FQGZ | sed 's/ .*//')\t$FQGZ\t[$(fb $(stat -c %s $FQGZ))]" ; + fi + rm -f $OUTDIR/$FQGZ.weh ; + done done -if [ $NF -eq 1 ] -then echo "$(chrono) $NF downloaded file" ; -else echo "$(chrono) $NF downloaded files" ; -fi +echo ; +echo -e "$(chrono)\t\texiting" ; exit 0 ; -- GitLab