From 66c95a4eddffb59e1d602f495b4c283dc0c1bbb3 Mon Sep 17 00:00:00 2001
From: Alexis  CRISCUOLO <alexis.criscuolo@pasteur.fr>
Date: Tue, 6 Apr 2021 08:18:24 +0200
Subject: [PATCH] 2.0

---
 README.md     |  19 ++--
 wgetENAHTS.sh | 239 ++++++++++++++++++++++++++++++++++----------------
 2 files changed, 173 insertions(+), 85 deletions(-)

diff --git a/README.md b/README.md
index 2aace6d..4d507ab 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
 # wgetENAHTS
 
 _wgetENAHTS_ is a command line program written in [Bash](https://www.gnu.org/software/bash/) to download gzipped FASTQ files from the [European Nucleotide Archive](https://www.ebi.ac.uk/ena/browser/home) (ENA) [ftp repository](ftp://ftp.sra.ebi.ac.uk/vol1/fastq/).
-Every download is performed by the standard tool [_wget_](https://www.gnu.org/software/wget/).
+Every download is performed using the standard tool [_wget_](https://www.gnu.org/software/wget/).
 
 
 ## Installation and execution
@@ -27,8 +27,8 @@ Execute _wgetENAHTS_ with the following command line model:
 Run _wgetENAHTS_ without option to read the following documentation:
 
 ```
- USAGE:  wgetENAHTS.sh  [[-o <dir>]  [-f <infile>]  [-t <nthreads>]
-                         [-r <rate>]  [-w <sec>]  [-n]  [-h]]  [<accn> ...] 
+ USAGE:  wgetENAHTS.sh  [[-o <dir>]  [-f <infile>]  
+                         [-t <nthreads>] [-r <rate>]  [-n]  [-h]]  [<accn> ...] 
 
  Downloads FASTQ files corresponding to the specified DRR/ERR/SRR accession(s)
  Files are downloaded from the ENA ftp repository ftp.sra.ebi.ac.uk/vol1/fastq
@@ -37,11 +37,9 @@ Run _wgetENAHTS_ without option to read the following documentation:
   -o <dir>   output directory (default: .)
   -f <file>  to read accession(s) from the specified file (default: all the last 
              arguments)
-  -t <int>   maximum number of concurrent download(s) (default: 2)
+  -t <int>   number of thread(s) (default: 2)
   -r <int>   maximum download rate per file (in kb per seconds; default: entire 
              available bandwidth)
-  -w <int>   waiting time between each successive download (in seconds; default:
-             same as the specified value for option -t)
   -n         no file download, only check (default: not set)
   -h         prints this help and exits
 
@@ -66,17 +64,18 @@ Run _wgetENAHTS_ without option to read the following documentation:
 
   + same as above with 9 parallel downloads and 500kb/sec download rate per file:
      wgetENAHTS.sh  -t 9  -r 500  -f accn.txt
-
 ```
 
 
 ## Notes
 
-* The HTS read accessions should starts with DRR, ERR or SRR (specified as arguments, or via a text file using option `-f`). The output file names are identical to those available in the repository corresponding to each specified accession identifier. Every downloaded file has file extension `.fastq.gz`.
+* The HTS read accessions should start with DRR, ERR or SRR (specified as final arguments, or in a text file using option `-f`). The output file names are identical to those available in the repository corresponding to each specified accession identifier. Every downloaded file has file extension `.fastq.gz`.
+
+* After checking the existence of a repository for each specified accession, a first step of (parallel) downloading is performed. Each downloaded file that seems incomplete (or missing) is downloaded a second time.
 
-* For a given DRR/ERR/SRR accession, the existence of a repository within the ENA  can be easily assessed using option `-n`.
+* No download is performed when the output directory already contains files named with the specified accessions.
 
-* After a first step of (parallel) downloading, the integrity of each gathered file is assessed. Each downloaded file that seems incomplete (or missing) are downloaded a second time. 
+* For a given DRR/ERR/SRR accession, the existence of a repository within the ENA can be easily assessed using option `-n` (i.e. no file download).
 
 * Fast running times are expected when running _wgetENAHTS_ on multiple threads (option `-t`). Depending on the bandwidth, the maximum download rate per file can be restricted using option `-r`.
 
diff --git a/wgetENAHTS.sh b/wgetENAHTS.sh
index b5c749d..981e4a2 100755
--- a/wgetENAHTS.sh
+++ b/wgetENAHTS.sh
@@ -33,7 +33,13 @@
 # = VERSIONS =                                                                                               #
 # ============                                                                                               #
 #                                                                                                            #
-  VERSION=1.0.210327acjg                                                                                     #
+  VERSION=2.0.210406acjg                                                                                     #
+# + option -w removed                                                                                        #
+# + one thread per file, instead of one thread per accession                                                 #
+# + gzip file integrity assessment is also multi-threaded                                                    #
+# + modified output                                                                                          #
+#                                                                                                            #
+# VERSION=1.0.210327acjg                                                                                     #
 #                                                                                                            #
 ##############################################################################################################
   
@@ -62,6 +68,17 @@ echoxit() {
   echo "$1" >&2 ; exit 1 ;
 }    
 #                                                                                                            #
+# = fb() =================================================================================================   #
+#   prints the specified byte size $1 (integer; 0 when missing or empty) in rounded format:                  #
+#   "<n> b" below 1024, else two decimals in kb, Mb or Gb; exported for the bash -c download jobs            #
+fb() {
+  if   [ "${1:-0}" -ge 1073741824 ]; then echo "$(bc -l <<<"scale=2;$1/1073741824" | sed 's/^\./0\./') Gb" ;
+  elif [ "${1:-0}" -ge 1048576 ];    then echo "$(bc -l <<<"scale=2;$1/1048576"    | sed 's/^\./0\./') Mb" ;
+  elif [ "${1:-0}" -ge 1024 ];       then echo "$(bc -l <<<"scale=2;$1/1024"       | sed 's/^\./0\./') kb" ;
+  else                                    echo "${1:-0} b" ; fi
+}
+typeset -fx fb ;
+#                                                                                                            #
 # = chrono() =============================================================================================   #
 #   prints formatted elapsed time                                                                            #
 #                                                                                                            #
@@ -76,8 +93,8 @@ mandoc() {
   echo -e "\n\033[1m wgetENAHTS v$VERSION                     $COPYRIGHT\033[0m";
   cat <<EOF
 
- USAGE:  wgetENAHTS.sh  [[-o <dir>]  [-f <infile>]  [-t <nthreads>]
-                         [-r <rate>]  [-w <sec>]  [-n]  [-h]]  [<accn> ...] 
+ USAGE:  wgetENAHTS.sh  [[-o <dir>]  [-f <infile>]  
+                         [-t <nthreads>] [-r <rate>]  [-n]  [-h]]  [<accn> ...] 
 
  Downloads FASTQ files corresponding to the specified DRR/ERR/SRR accession(s)
  Files are downloaded from the ENA ftp repository ftp.sra.ebi.ac.uk/vol1/fastq
@@ -86,11 +103,9 @@ mandoc() {
   -o <dir>   output directory (default: .)
   -f <file>  to read accession(s) from the specified file (default: all the last
              arguments)
-  -t <int>   maximum number of concurrent download(s) (default: 2)
+  -t <int>   number of thread(s) (default: 2)
   -r <int>   maximum download rate per file (in kb per seconds; default: entire 
              available bandwidth)
-  -w <int>   waiting time between each successive download (in seconds; default:
-             same as the specified value for option -t)
   -n         no file download, only check (default: not set)
   -h         prints this help and exits
 
@@ -155,6 +170,7 @@ if [ $# -lt 1 ]; then mandoc ; exit 1 ; fi
 FTPENA="ftp://ftp.sra.ebi.ac.uk/vol1/fastq";
 WGET_DWNL="$WGET --no-clobber --continue --recursive --no-parent --level=1 --no-directories";
 WGET_TEST="$WGET --spider";
+WGET_RESP="$WGET_TEST --server-response";
 
 NTHREADS=2;
 OUTDIR=".";
@@ -208,37 +224,45 @@ fi
 [[ $NTHREADS =~ ^[0-9]+$ ]]  || echoxit "incorrect value (option -t): $NTHREADS" ; 
  [ $NTHREADS -lt 1 ]         && NTHREADS=1;
 [[ $WAITIME =~ ^[0-9]+$ ]]   || echoxit "incorrect value (option -w): $WAITIME" ;
- [ $WAITIME -eq 0 ]          && WAITIME=$NTHREADS;
+if [ $WAITIME -eq 0 ]
+then
+  wt=0; while [ $(( $wt * $wt )) -lt $NTHREADS ]; do let wt++ ; done
+  WAITIME=$wt;
+fi
  [ $WAITIME -lt 1 ]          && WAITIME=1;
-TMPF=$(mktemp -t -p ${TMPDIR:-/tmp});
 
+ 
 ##############################################################################################################
 ####                                                                                                      ####
-#### DOWNLOADING FASTQ FILES                                                                              ####
+#### CHECKING REPOSITORIES                                                                                ####
 ####                                                                                                      ####
 ##############################################################################################################
 
 ACCNLIST="$@";
-N=$#;
+NA=$#;
 if [ -s $INFILE ]
 then
   ACCNLIST="$(tr -d '\r' < $INFILE) $ACCNLIST" ;
-  N=$(( $N + $(tr -d '\r' < $INFILE | wc -l) ));
+  NA=$(( $NA + $(tr -d '\r' < $INFILE | wc -l) ));
 fi
-[ $N -eq 0 ] && echoxit "no accession found" ;
+[ $NA -eq 0 ] && echoxit "no accession found" ;
+
+if [ $NA -eq 1 ]; then echo -e "$(chrono)\t\t$NA specified accession" ;
+else                   echo -e "$(chrono)\t\t$NA specified accessions" ; fi
+echo ;
 
-trap "rm -f $TMPF;echo;for a in $(echo $ACCNLIST);do if ls $OUTDIR/\$a*.fastq.gz&>/dev/null;then for f in $OUTDIR/\$a*.fastq.gz;do if ! $GZIP -t \$f&>/dev/null;then echo removing \$f;rm -f \$f;fi;done;fi;done;exit 1;" SIGINT ;
+trap "echo;echo interrupting;for a in \$(echo $ACCNLIST);do rm -f $OUTDIR/\$a.weh;done;exit 1;" SIGINT ;
 
 C=0;
-DL=0; echo $DL > $TMPF ;
+DL=0;
 for ACCN in $ACCNLIST
 do
   let C++ ;
-  OUTFQ=$OUTDIR/$ACCN*.fastq.gz; [ "$OUTDIR" == "." ] && OUTFQ=$ACCN*.fastq.gz;
+  [ "$OUTDIR" == "." ] && OUTFQ=$ACCN*.fastq.gz || OUTFQ=$OUTDIR/$ACCN*.fastq.gz; 
   if $DWNL && ls $OUTFQ &>/dev/null
   then
-    echo -e "[$C/$N]\033[34m file(s) already exist(s) for accession $ACCN\033[0m"    >&2 ;
-    ls -lho $OUTFQ | while read line ; do echo -e "\033[90m$line\033[0m" ; done      >&2 ;
+    echo -e "[$C/$NA]\t\t\033[34mfile(s) already exist(s) for accession $ACCN\033[0m"    >&2 ;
+    stat -c "%s %n" $OUTFQ | while read s n ; do echo -e "\033[90m$n\t[$(fb $s)]\033[0m" >&2 ; done 
     continue ;
   fi
   nc=${#ACCN};
@@ -246,95 +270,160 @@ do
   elif [ $nc -eq 10 ]; then URL="$FTPENA/${ACCN:0:6}/00${ACCN:9:1}/$ACCN/";
   else                      URL="$FTPENA/${ACCN:0:6}/0${ACCN:9:2}/$ACCN/";
   fi
-  if $WGET_TEST $URL
+  if $WGET_TEST $URL || $WGET_TEST $URL
   then
     if $DWNL
     then
-      let DL++ ; echo $DL > $TMPF ;
-      ECHS="[$C/$N] downloading content of $URL";
-      ECHE="[$C/$N] download completed for accession $ACCN";
-      ECHP="[WARNING] download problem for accession $ACCN";
-      echo "echo $ECHS; sleep $WAITIME; while true; do $WGET_DWNL $URL && break; done; if ls $OUTFQ &>/dev/null; then echo $ECHE; else echo $ECHP >&2; fi" ;
-      sleep $WAITIME ;
+      echo -n -e "[$C/$NA]\t\tchecking repository for accession $ACCN ." ;
+      sleep 0.5 &>/dev/null ;
+      echo -n "." ; 
+      let DL++;
+      echo -n "." ; 
+      while true; do $WGET_RESP $URL 2>&1 && break; done | grep -F "fastq.gz" > $OUTDIR/$ACCN.weh ;
+      echo " [ok]" ;
     else
-      echo -e "[$C/$N]\033[32m existing repository for accession $ACCN: $URL\033[0m" >&2 ;
-      sleep 1 ;
+      echo -e "[$C/$NA]\t\t\033[32mexisting repository for accession $ACCN: $URL\033[0m" >&2 ;
     fi	
   else
-    echo -e "[$C/$N]\033[31m nothing found for accession $ACCN\033[0m"               >&2 ;
-    sleep 1 ;
+    echo -e "[$C/$NA]\t\t\033[31mnothing found for accession $ACCN\033[0m"               >&2 ;
+  fi
+  sleep 0.5 &>/dev/null ;
+done 
+
+echo ; 
+if   [ $DL -eq 0 ]; then exit 0 ;
+elif [ $DL -eq 1 ]; then echo -e -n "$(chrono)\t\t$DL valid accession; " ;
+else                     echo -e -n "$(chrono)\t\t$DL valid accessions; " ; fi
+
+
+##############################################################################################################
+####                                                                                                      ####
+#### DOWNLOADING FASTQ FILES                                                                              ####
+####                                                                                                      ####
+##############################################################################################################
+
+N=0;
+for ACCN in $ACCNLIST
+do
+  if [ ! -s $OUTDIR/$ACCN.weh ]; then continue ; fi
+  nf=$(grep -c -F ".fastq.gz" $OUTDIR/$ACCN.weh);
+  N=$(( $N + $nf ));
+done      
+
+if [ $N -eq 1 ]; then echo "$N file to download" ;
+else                  echo "$N files to download" ; fi
+echo ;
+
+trap "echo;echo interrupting;if ls $OUTDIR/*.fastq.gz.weh &>/dev/null;then for f in $OUTDIR/*.fastq.gz.weh;do if [ \$(cat \$f) -ne 0 ];then echo removing \${f%.*};rm -f \$f \${f%.*};fi;done;fi;rm -f $OUTDIR/*.weh;exit 1;" SIGINT ;
+
+C=0;
+for ACCN in $ACCNLIST
+do
+  if [ ! -s $OUTDIR/$ACCN.weh ]; then continue ; fi
+  nc=${#ACCN};
+  if   [ $nc -eq 9 ];  then URL="$FTPENA/${ACCN:0:6}/$ACCN/";
+  elif [ $nc -eq 10 ]; then URL="$FTPENA/${ACCN:0:6}/00${ACCN:9:1}/$ACCN/";
+  else                      URL="$FTPENA/${ACCN:0:6}/0${ACCN:9:2}/$ACCN/";
   fi
+  for FQGZ in $(sed "s/.*$ACCN/$ACCN/g" $OUTDIR/$ACCN.weh)
+  do
+    let C++ ;
+    CMD1="echo -e \"[$C/$N]$'\t'$'\t'downloading$'\t'$FQGZ\" ; sleep $WAITIME";
+    CMD2="while true ; do $WGET_DWNL $URL$FQGZ && break ; done";
+    CMD3="$GZIP --test $OUTDIR/$FQGZ &>/dev/null ; echo \$? > $OUTDIR/$FQGZ.weh";
+    CMD4="s=\$(fb \$(stat -c %s $OUTDIR/$FQGZ))";
+    CMD5="echo \"[$C/$N]$'\t'$'\t'completed$'\t'$FQGZ$'\t'[\$s]\"";
+    echo "$CMD1 ; $CMD2 ; $CMD3 ; $CMD4 ; $CMD5" ;
+    sleep $WAITIME ;
+  done
 done | xargs -P $NTHREADS -I CMD bash -c CMD ;
 
+N=0;
+for ACCN in $ACCNLIST
+do
+  if [ ! -s $OUTDIR/$ACCN.weh ]; then continue ; fi
+  for FQGZ in $(sed "s/.*$ACCN/$ACCN/g" $OUTDIR/$ACCN.weh)
+  do
+    [ -e $OUTDIR/$FQGZ ] && let N++;
+  done
+done
 
-DL=$(cat $TMPF); rm -f $TMPF ;
-if [ $DL -eq 0 ]; then exit 0 ; fi ## NOTE: no download
-if [ $DL -eq 1 ] 
-then echo "$(chrono) $DL downloaded repository" ;
-else echo "$(chrono) $DL downloaded repositories" ;
-fi
+echo ; 
+if [ $N -eq 1 ]; then echo -e "$(chrono)\t\t$N downloaded file" ;
+else                  echo -e "$(chrono)\t\t$N downloaded files" ; fi
 
 
 ##############################################################################################################
 ####                                                                                                      ####
-#### CHECKING GZIPPED FASTQ FILES AND DOWNLOADING AGAIN (IF REQUIRED)                                     ####
+#### CHECKING FASTQ FILES AND DOWNLOADING AGAIN (IF REQUIRED)                                             ####
 ####                                                                                                      ####
 ##############################################################################################################
 
 C=0;
-NF=0;
 for ACCN in $ACCNLIST
 do
-  let C++ ;
-  OUTFQ=$OUTDIR/$ACCN*.fastq.gz; [ "$OUTDIR" == "." ] && OUTFQ=$ACCN*.fastq.gz;
+  if [ ! -s $OUTDIR/$ACCN.weh ]; then continue ; fi
   nc=${#ACCN};
   if   [ $nc -eq 9 ];  then URL="$FTPENA/${ACCN:0:6}/$ACCN/";
   elif [ $nc -eq 10 ]; then URL="$FTPENA/${ACCN:0:6}/00${ACCN:9:1}/$ACCN/";
   else                      URL="$FTPENA/${ACCN:0:6}/0${ACCN:9:2}/$ACCN/";
   fi
-  if $WGET_TEST $URL
-  then
-    echo -n "[$C/$N] checking downloaded files for accession $ACCN ..." ;
-    n1=$(ls $OUTFQ 2>/dev/null | wc -l);
-    if [ $n1 -eq 0 ]
+  for FQGZ in $(sed "s/.*$ACCN/$ACCN/g" $OUTDIR/$ACCN.weh)
+  do
+    let C++ ;
+    if [ ! -e $OUTDIR/$FQGZ ] || [ ! -s $OUTDIR/$FQGZ.weh ] || [ "$(cat $OUTDIR/$FQGZ.weh)" -ne 0 ]
     then
-      echo -e "\n\033[31m[WARNING] problem with accession $ACCN\033[0m"    >&2 ;
-      echo -n "[$C/$N] downloading again content of $URL ..." ;
-    else
-      for f in $OUTFQ
-      do
-        if ! $GZIP --test $f 2>/dev/null
-        then 
-          echo -e "\n\033[31m[WARNING] problem with file $f\033[0m"        >&2 ;
-          rm -f $f ;
-          echo -n "[$C/$N] downloading again $f ..." ;
-        else
-	  echo -n "." ;
-	fi
-      done
-      n1=$(ls $OUTFQ 2>/dev/null | wc -l);
+      rm -f $OUTDIR/$FQGZ ;
+      echo -e "\033[31m[WARNING]\t\tproblem with file $FQGZ\033[0m" >&2 ;
+      CMD1="echo \"[$C/$N]$'\t'$'\t'downloading$'\t'$FQGZ\" ; sleep $WAITIME";
+      CMD2="while true ; do $WGET_DWNL $URL$FQGZ && break ; done";
+      CMD3="$GZIP --test $OUTDIR/$FQGZ &>/dev/null ; echo \$? > $OUTDIR/$FQGZ.weh";
+      CMD4="s=\$(fb \$(stat -c %s $OUTDIR/$FQGZ))";
+      CMD5="echo \"[$C/$N]$'\t'$'\t'completed$'\t'$FQGZ$'\t'[\$s]\"";
+      echo "$CMD1 ; $CMD2 ; $CMD3 ; $CMD4 ; $CMD5" ;
+      sleep $WAITIME ;
     fi
-    while true ; do $WGET_DWNL $URL && break ; done ; ## NOTE: does nothing when all files were downloaded
-    n2=$(ls $OUTFQ 2>/dev/null | wc -l);
-    if [ $n2 -eq 0 ]
+  done
+done | xargs -P $NTHREADS -I CMD bash -c CMD ;
+
+echo ; 
+
+##############################################################################################################
+####                                                                                                      ####
+#### FINALIZING                                                                                           ####
+####                                                                                                      ####
+##############################################################################################################
+
+C=0;
+for ACCN in $ACCNLIST
+do
+  if [ ! -s $OUTDIR/$ACCN.weh ]; then continue ; fi
+  nc=${#ACCN};
+  if   [ $nc -eq 9 ];  then URL="$FTPENA/${ACCN:0:6}/$ACCN/";
+  elif [ $nc -eq 10 ]; then URL="$FTPENA/${ACCN:0:6}/00${ACCN:9:1}/$ACCN/";
+  else                      URL="$FTPENA/${ACCN:0:6}/0${ACCN:9:2}/$ACCN/";
+  fi
+  for FQGZ in $(sed "s/.*$ACCN/$ACCN/g" $OUTDIR/$ACCN.weh)
+  do
+    let C++ ;
+    if   [ ! -s $OUTDIR/$FQGZ ]
     then
-      echo -e "\n\033[31m[FAIL] unable to download content of $URL\033[0m" >&2 ;
-      sleep $WAITIME ;
+      echo -e "[$C/$N]\t\t\033[31m[FAIL] unable to download $URL/$FQGZ\033[0m" >&2 ;
+    elif [ ! -s $OUTDIR/$FQGZ.weh ]
+    then
+      echo -e "[$C/$N]\t\t\033[31m[FAIL] unable to verify $OUTDIR/$FQGZ\033[0m" >&2 ;
+    elif [ "$(cat $OUTDIR/$FQGZ.weh)" -ne 0 ]
+    then
+      echo -e "[$C/$N]\t\t\033[31m[FAIL] incorrect gzip integrity: $OUTDIR/$FQGZ\033[0m" >&2 ;
     else
-      echo " [ok]" ;
-      ls -lho $OUTFQ ;
-      NF=$(( $NF + $n2 ));
-      if [ $n1 -eq $n2 ]; then sleep 0.5 ; else sleep $WAITIME ; fi
-    fi
-  else
-    sleep 0.5 ;
-  fi	
+      echo -e "[$C/$N]\t\t$(md5sum -b "$OUTDIR/$FQGZ" | sed 's/ .*//')\t$FQGZ\t[$(fb $(stat -c %s "$OUTDIR/$FQGZ"))]" ; ## NOTE: files are located in $OUTDIR (option -o), not necessarily in the current directory
+     fi
+    rm -f $OUTDIR/$FQGZ.weh ;
+  done
 done
 
-if [ $NF -eq 1 ] 
-then echo "$(chrono) $NF downloaded file" ;
-else echo "$(chrono) $NF downloaded files" ;
-fi
+echo ;
+echo -e "$(chrono)\t\texiting" ;
 
 exit 0 ;
 
-- 
GitLab