diff --git a/README.md b/README.md index 6870917fbb6a682d3f698d8d99643859c2557a70..a8e8e8164bd532c42386c8a725ceac29198f05aa 100644 --- a/README.md +++ b/README.md @@ -27,33 +27,34 @@ Execute _wgetENAHTS_ with the following command line model: Run _wgetENAHTS_ without option to read the following documentation: ``` - USAGE: wgetENAHTS.sh [[-o <dir>] [-f <infile>] - [-t <nthreads>] [-r <rate>] [-n] [-h]] [<accn> ...] + USAGE: wgetENAHTS.sh [[-o <dir>] [-f <infile>] [-t <nthreads>] + [-p <protocol>] [-r <rate>] [-n] [-h]] [<accn> ...] Downloads FASTQ files corresponding to the specified DRR/ERR/SRR accession(s) Files are downloaded from the ENA ftp repository ftp.sra.ebi.ac.uk/vol1/fastq OPTIONS: - -o <dir> output directory (default: .) - -f <file> to read accession(s) from the specified file (default: all the last - arguments) - -t <int> number of thread(s) (default: 2) - -r <int> maximum download rate per file (in kb per seconds; default: entire - available bandwidth) - -n no file download, only check (default: not set) - -h prints this help and exits + -o <dir> output directory (default: .) + -f <file> to read accession(s) from the specified file (default: all the last + arguments) + -t <int> number of thread(s) (default: 2) + -p <string> force the transfer protocol, either ftp or https (default: auto) + -r <int> maximum download rate per file, in kb per seconds (default: entire + available bandwidth) + -n no file download, only check (default: not set) + -h prints this help and exits EXAMPLES: + downloading the SE FASTQ file corresponding to accession DRR000003: wgetENAHTS.sh DRR000003 - + downloading the FASTQ files corresponding to accessions ERR000001 and ERR000004: + + downloading FASTQ files corresponding to accessions ERR000001 and ERR000004: wgetENAHTS.sh ERR000001 ERR000004 + assessing the repository existence for accessions SRR9870010-39: wgetENAHTS.sh -n SRR98700{10..39} - + downloading the FASTQ files (if any) corresponding to accessions SRR9870010-39: + + downloading FASTQ files (if any) corresponding to accessions SRR9870010-39: wgetENAHTS.sh SRR98700{10..39} + same as above with (at most) 6 parallel downloads and saved outputs: diff --git a/wgetENAHTS.sh b/wgetENAHTS.sh index 7e7c583b6cba15e51325f40ba5be03713619f5cd..87e6dd38e7ff178275d650968deb09eaad35cbd1 100755 --- a/wgetENAHTS.sh +++ b/wgetENAHTS.sh @@ -105,21 +105,22 @@ mandoc() { echo -e "\n\033[1m wgetENAHTS v$VERSION $COPYRIGHT\033[0m"; cat <<EOF - USAGE: wgetENAHTS.sh [[-o <dir>] [-f <infile>] - [-t <nthreads>] [-r <rate>] [-n] [-h]] [<accn> ...] + USAGE: wgetENAHTS.sh [[-o <dir>] [-f <infile>] [-t <nthreads>] + [-p <protocol>] [-r <rate>] [-n] [-h]] [<accn> ...] Downloads FASTQ files corresponding to the specified DRR/ERR/SRR accession(s) Files are downloaded from the ENA ftp repository ftp.sra.ebi.ac.uk/vol1/fastq OPTIONS: - -o <dir> output directory (default: .) - -f <file> to read accession(s) from the specified file (default: all the last - arguments) - -t <int> number of thread(s) (default: 2) - -r <int> maximum download rate per file (in kb per seconds; default: entire - available bandwidth) - -n no file download, only check (default: not set) - -h prints this help and exits + -o <dir> output directory (default: .) + -f <file> to read accession(s) from the specified file (default: all the last + arguments) + -t <int> number of thread(s) (default: 2) + -p <string> force the transfer protocol, either ftp or https (default: auto) + -r <int> maximum download rate per file, in kb per seconds (default: entire + available bandwidth) + -n no file download, only check (default: not set) + -h prints this help and exits EXAMPLES: + downloading the SE FASTQ file corresponding to accession DRR000003: @@ -158,7 +159,7 @@ EOF # # WGET_BIN=wget; [ ! $(command -v $WGET_BIN) ] && echoxit "no $WGET_BIN detected" ; - WGET_STATIC_OPTIONS="--no-check-certificate --retry-connrefused --random-wait --quiet"; + WGET_STATIC_OPTIONS="--quiet --retry-connrefused --no-check-certificate"; WGET="$WGET_BIN $WGET_STATIC_OPTIONS"; # # # -- gzip ------------------------------------------------------------------------------------------------- # @@ -180,28 +181,28 @@ EOF if [ $# -lt 1 ]; then mandoc ; exit 1 ; fi FILE_REPORT="https://www.ebi.ac.uk/ena/portal/api/filereport?download=true&result=read_run&accession="; -WGET_DWNL="$WGET --continue --no-directories"; -WGET_READ="$WGET --output-document -"; +WGET_DWNL="$WGET --read-timeout=200 --continue --no-directories"; +WGET_READ="$WGET --read-timeout=200 --output-document -"; WGET_TEST="$WGET --spider"; NTHREADS=2; OUTDIR="."; INFILE="_N.A_"; MAXRATE="NA"; -WAITIME=0; +PROTOCOL="auto"; DWNL=true; -while getopts t:o:f:r:w:nh option +while getopts t:o:f:r:p:nh option do case $option in - t) NTHREADS=$OPTARG ;; - o) OUTDIR="$OPTARG" ;; - f) INFILE="$OPTARG" ;; - r) MAXRATE=$OPTARG ;; - w) WAITIME=$OPTARG ;; - n) DWNL=false ;; - h) mandoc ; exit 0 ;; - \?) mandoc ; exit 1 ;; + t) NTHREADS=$OPTARG ;; + o) OUTDIR="$OPTARG" ;; + f) INFILE="$OPTARG" ;; + r) MAXRATE=$OPTARG ;; + p) PROTOCOL="$OPTARG" ;; + n) DWNL=false ;; + h) mandoc ; exit 0 ;; + \?) mandoc ; exit 1 ;; esac done shift "$(( $OPTIND - 1 ))" @@ -233,15 +234,16 @@ then [ $MAXRATE -lt 1 ] && MAXRATE=1; WGET_DWNL="$WGET_DWNL --limit-rate=$MAXRATE"k; fi -[[ $NTHREADS =~ ^[0-9]+$ ]] || echoxit "incorrect value (option -t): $NTHREADS" ; - [ $NTHREADS -lt 1 ] && NTHREADS=1; -[[ $WAITIME =~ ^[0-9]+$ ]] || echoxit "incorrect value (option -w): $WAITIME" ; -if [ $WAITIME -eq 0 ] +if [ "$PROTOCOL" != "auto" ] && [ "$PROTOCOL" != "ftp" ] && [ "$PROTOCOL" != "https" ] then - wt=0; while [ $(( $wt * $wt )) -lt $NTHREADS ]; do let wt++ ; done - WAITIME=$wt; + echoxit "transfer protocol should be either ftp or https (option -p): $PROTOCOL" ; fi -[ $WAITIME -lt 1 ] && WAITIME=1; +[[ $NTHREADS =~ ^[0-9]+$ ]] || echoxit "incorrect value (option -t): $NTHREADS" ; + [ $NTHREADS -lt 1 ] && NTHREADS=1; + +wt=0; while [ $(( $wt * $wt )) -lt $NTHREADS ]; do let wt++ ; done +WAITIME=$wt; +[ $WAITIME -lt 1 ] && WAITIME=1; ############################################################################################################## @@ -266,24 +268,28 @@ echo ; ############################################################################################################## #### #### -#### CHECKING PROTOCOL #### +#### ASSESSING TRANSFER PROTOCOL #### #### #### ############################################################################################################## URL="ftp.sra.ebi.ac.uk/vol1/fastq"; -echo -n -e "$(chrono)\t\tchecking protocol " ; -time_ftp=$SECONDS; -for i in {1..5} ; do echo -n "." ; timeout 2 $WGET_TEST "ftp://$URL/DRR00$i/" &>/dev/null ; rm -f wget-log ; done -time_ftp=$(( $SECONDS - $time_ftp )); -time_https=$SECONDS; -for i in {1..5} ; do echo -n "." ; timeout 2 $WGET_TEST "https://$URL/DRR00$i/" &>/dev/null ; rm -f wget-log ; done -time_https=$(( $SECONDS - $time_https )); -echo " [ok]" ; -if [ $time_ftp -lt $time_https ] -then FTPENA="ftp://$URL"; echo -e "$(chrono)\t\tselected protocol: ftp ($time_ftp:$time_https)" ; -else FTPENA="https://$URL"; echo -e "$(chrono)\t\tselected protocol: https ($time_https:$time_ftp)" ; +if [ "$PROTOCOL" == "auto" ] +then + echo -n -e "$(chrono)\t\tassessing transfer protocol " ; + PROTOCOL="ftp://"; time_ftp=$SECONDS; + for i in {1..5} ; do echo -n "." ; timeout 2 $WGET_TEST "$PROTOCOL$URL/DRR00$i/" &>/dev/null ; rm -f wget-log ; done + time_ftp=$(( $SECONDS - $time_ftp )); + PROTOCOL="https://"; time_https=$SECONDS; + for i in {1..5} ; do echo -n "." ; timeout 2 $WGET_TEST "$PROTOCOL$URL/DRR00$i/" &>/dev/null ; rm -f wget-log ; done + time_https=$(( $SECONDS - $time_https )); + echo " [ok]" ; + if [ $time_ftp -lt $time_https ] + then PROTOCOL="ftp://"; echo -e "$(chrono)\t\tselected protocol: ftp ($time_ftp<$time_https)" ; + else PROTOCOL="https://"; echo -e "$(chrono)\t\tselected protocol: https ($time_https<$time_ftp)" ; + fi + echo ; fi -echo ; +FTPENA="$PROTOCOL$URL" ; ############################################################################################################## @@ -303,7 +309,7 @@ do if $DWNL && ls $OUTFQ &>/dev/null then echo -e "[$C/$NA]\t\t\033[34mfile(s) already exist(s) for accession $ACCN\033[0m" >&2 ; - stat -c "%s %n" $OUTFQ | while read s n ; do echo -e "\033[90m$n\t[$(fb $s)]\033[0m" >&2 ; done + stat -c "%s %n" $OUTFQ | while read s n ; do echo -e "\t\t\033[90m$n\t[$(fb $s)]\033[0m" >&2 ; done continue ; fi nc=${#ACCN}; @@ -353,14 +359,6 @@ echo ; if [ $DL -eq 0 ]; then exit 0 ; elif [ $DL -eq 1 ]; then echo -e -n "$(chrono)\t\t$DL valid accession; " ; else echo -e -n "$(chrono)\t\t$DL valid accessions; " ; fi - - -############################################################################################################## -#### #### -#### DOWNLOADING FASTQ FILES #### -#### #### -############################################################################################################## - N=0; for ACCN in $ACCNLIST do @@ -368,13 +366,67 @@ do nf=$(grep -c -F ".fastq.gz" $OUTDIR/$ACCN.weh); N=$(( $N + $nf )); done - if [ $N -eq 1 ]; then echo "$N file to download" ; else echo "$N files to download" ; fi echo ; + +############################################################################################################## +#### #### +#### SORTING ACCESSIONS #### +#### #### +############################################################################################################## + ACCNSORT="$(for ACCN in $ACCNLIST ; do [ -s $OUTDIR/$ACCN.weh ] && echo -e "$(sed 's/ /\t/g' $OUTDIR/$ACCN.weh | cut -f2 | paste -sd+ | bc -l)\t$ACCN" ; done | sort -gr | cut -f2 | tr '\n' ' ')"; + +############################################################################################################## +#### #### +#### MEASURING DOWNLOAD SPEED #### +#### #### +############################################################################################################## + +# if [ "$MAXRATE" == "NA" ] +# then +# echo -e -n "$(chrono)\t\testimating download speed " ; +# for ACCN in $ACCNSORT +# do +# echo -n "." ; +# if [ ! -s $OUTDIR/$ACCN.weh ]; then continue ; fi +# nc=${#ACCN}; +# if [ $nc -eq 9 ]; then URL="$FTPENA/${ACCN:0:6}/$ACCN/"; +# elif [ $nc -eq 10 ]; then URL="$FTPENA/${ACCN:0:6}/00${ACCN:9:1}/$ACCN/"; +# else URL="$FTPENA/${ACCN:0:6}/0${ACCN:9:2}/$ACCN/"; +# fi +# echo -n "." ; +# for FQGZ in $(sed "s/.*$ACCN/$ACCN/g" $OUTDIR/$ACCN.weh) +# do +# echo -n "." ; +# timeout 30 $WGET_DWNL $URL$FQGZ &>/dev/null ; +# bytes=$(du -b $OUTDIR/$FQGZ | tail -1 | cut -f1); +# RATE=$(( $bytes / 30 )); +# rm -f wget-log ; +# break ; +# done +# break ; +# done +# echo " [ok]" ; +# if [ $N -lt $NTHREADS ]; then RATE=$(( $RATE / $N )); else RATE=$(( $RATE / $NTHREADS )); fi +# if [ $RATE -gt 10000 ] +# then +# WGET_DWNL="$WGET_DWNL --limit-rate=$RATE" +# echo -e "$(chrono)\t\tdownload speed per file: $RATE bytes/seconds" ; +# fi +# echo ; +# fi + + +############################################################################################################## +#### #### +#### DOWNLOADING FASTQ FILES #### +#### #### +############################################################################################################## + C=0; for ACCN in $ACCNSORT do @@ -388,7 +440,7 @@ do do let C++ ; CMD1="echo -e \"[$C/$N]$'\t'$'\t'downloading$'\t'$FQGZ\" ; sleep $WAITIME"; - CMD2="while true ; do $WGET_DWNL $URL$FQGZ && break ; done"; + CMD2="while true ; do nice $WGET_DWNL $URL$FQGZ && break ; done"; CMD3="md5sum -b $OUTDIR/$FQGZ > $OUTDIR/$FQGZ.weh"; CMD4="s=\$(fb \$(stat -c %s $OUTDIR/$FQGZ))"; CMD5="echo \"[$C/$N]$'\t'$'\t'completed$'\t'$FQGZ$'\t'[\$s]\""; @@ -435,7 +487,7 @@ do # rm -f $OUTDIR/$FQGZ ; echo -e "\033[31m[WARNING]\t\tproblem with file $FQGZ\033[0m" >&2 ; CMD1="echo \"[$C/$N]$'\t'$'\t'downloading$'\t'$FQGZ\" ; sleep $WAITIME"; - CMD2="while true ; do $WGET_DWNL $URL$FQGZ && break ; done"; + CMD2="while true ; do nice $WGET_DWNL $URL$FQGZ && break ; done"; CMD3="md5sum -b $OUTDIR/$FQGZ > $OUTDIR/$FQGZ.weh"; CMD4="s=\$(fb \$(stat -c %s $OUTDIR/$FQGZ))"; CMD5="echo \"[$C/$N]$'\t'$'\t'completed$'\t'$FQGZ$'\t'[\$s]\"";