Skip to content
Snippets Groups Projects
Commit 04d9f1b8 authored by Alexis  CRISCUOLO's avatar Alexis CRISCUOLO :black_circle:
Browse files

3.0

parent 66c95a4e
No related branches found
Tags 3.0
No related merge requests found
......@@ -71,7 +71,9 @@ Run _wgetENAHTS_ without option to read the following documentation:
* The HTS read accessions should start with DRR, ERR or SRR (specified as final arguments, or in a text file using option `-f`). The output file names are identical to those available in the repository corresponding to each specified accession identifier. Every downloaded file has the file extension `.fastq.gz`.
* After checking the existence of a repository for each specified accession, a first step of (parallel) downloading is performed. Each downloaded file that seems incomplete (or missing) is downloaded a second time.
* For each specified accession, a summary file (extension `.weh`) is written. This summary file contains the list of associated FASTQ file(s) together with their expected MD5 hash value.
* After checking the existence of a repository for each specified accession, a first step of (parallel) downloading is performed. Each downloaded file that seems incomplete (according to its MD5 checksum) is downloaded a second time.
* No download is performed when the output directory already contains files named with the specified accessions.
......
......@@ -33,7 +33,11 @@
# = VERSIONS = #
# ============ #
# #
VERSION=2.0.210406acjg #
VERSION=3.0.211103acjg #
# + weh files completed with original md5 checksum(s) #
# + performs MD5-based instead of gzip-based file integrity assessment #
# #
# VERSION=2.0.210406acjg #
# + option -w removed #
# + one thread per file, instead of one thread per accession #
# + gzip file integrity assessment is also multi-threaded #
......@@ -90,7 +94,7 @@ chrono() {
# prints the doc #
# #
mandoc() {
echo -e "\n\033[1m wgetENAHTS v$VERSION $COPYRIGHT\033[0m";
echo -e "\n\033[1m wgetENAHTS v$VERSION $COPYRIGHT\033[0m";
cat <<EOF
USAGE: wgetENAHTS.sh [[-o <dir>] [-f <infile>]
......@@ -113,13 +117,13 @@ mandoc() {
+ downloading the SE FASTQ file corresponding to accession DRR000003:
wgetENAHTS.sh DRR000003
+ downloading the FASTQ files corresponding to accessions ERR000001 and ERR000004:
+ downloading FASTQ files corresponding to accessions ERR000001 and ERR000004:
wgetENAHTS.sh ERR000001 ERR000004
+ assessing the repository existence for accessions SRR9870010-39:
wgetENAHTS.sh -n SRR98700{10..39}
+ downloading the FASTQ files (if any) corresponding to accessions SRR9870010-39:
+ downloading FASTQ files (if any) corresponding to accessions SRR9870010-39:
wgetENAHTS.sh SRR98700{10..39}
+ same as above with (at most) 6 parallel downloads and saved outputs:
......@@ -169,6 +173,7 @@ if [ $# -lt 1 ]; then mandoc ; exit 1 ; fi
FTPENA="ftp://ftp.sra.ebi.ac.uk/vol1/fastq";
WGET_DWNL="$WGET --no-clobber --continue --recursive --no-parent --level=1 --no-directories";
WGET_READ="$WGET --output-document -";
WGET_TEST="$WGET --spider";
WGET_RESP="$WGET_TEST --server-response";
......@@ -251,7 +256,7 @@ if [ $NA -eq 1 ]; then echo -e "$(chrono)\t\t$NA specified accession" ;
else echo -e "$(chrono)\t\t$NA specified accessions" ; fi
echo ;
trap "echo;echo interrupting;for a in \$(echo $ACCNLIST);do rm -f $OUTDIR/\$a.weh;done;exit 1;" SIGINT ;
trap "echo;echo interrupting;for a in \$(echo $ACCNLIST);do rm -f $OUTDIR/\$a.weh $OUTDIR/\$a.md5 $OUTDIR/\$a.md5weh;done;exit 1;" SIGINT ;
C=0;
DL=0;
......@@ -274,12 +279,17 @@ do
then
if $DWNL
then
echo -n -e "[$C/$NA]\t\tchecking repository for accession $ACCN ." ;
echo -n -e "[$C/$NA]\t\tchecking repository for accession $ACCN " ;
sleep 0.5 &>/dev/null ;
echo -n "." ;
let DL++;
echo -n "." ;
while true; do $WGET_RESP $URL 2>&1 && break; done | grep -F "fastq.gz" > $OUTDIR/$ACCN.weh ;
echo -n "." ;
while true; do $WGET_READ "https://www.ebi.ac.uk/ena/portal/api/filereport?accession=$ACCN&download=true&result=read_run" 2>&1 && break; done | sed -n '$p' | cut -f4 | tr ';' '\n' > $OUTDIR/$ACCN.md5 ;
paste $OUTDIR/$ACCN.md5 $OUTDIR/$ACCN.weh | tr '\t' ' ' > $OUTDIR/$ACCN.md5weh ;
mv $OUTDIR/$ACCN.md5weh $OUTDIR/$ACCN.weh ;
rm $OUTDIR/$ACCN.md5 ;
echo " [ok]" ;
else
echo -e "[$C/$NA]\t\t\033[32mexisting repository for accession $ACCN: $URL\033[0m" >&2 ;
......@@ -330,7 +340,7 @@ do
let C++ ;
CMD1="echo -e \"[$C/$N]$'\t'$'\t'downloading$'\t'$FQGZ\" ; sleep $WAITIME";
CMD2="while true ; do $WGET_DWNL $URL$FQGZ && break ; done";
CMD3="$GZIP --test $OUTDIR/$FQGZ &>/dev/null ; echo \$? > $OUTDIR/$FQGZ.weh";
CMD3="md5sum -b $OUTDIR/$FQGZ > $OUTDIR/$FQGZ.weh";
CMD4="s=\$(fb \$(stat -c %s $OUTDIR/$FQGZ))";
CMD5="echo \"[$C/$N]$'\t'$'\t'completed$'\t'$FQGZ$'\t'[\$s]\"";
echo "$CMD1 ; $CMD2 ; $CMD3 ; $CMD4 ; $CMD5" ;
......@@ -371,13 +381,13 @@ do
for FQGZ in $(sed "s/.*$ACCN/$ACCN/g" $OUTDIR/$ACCN.weh)
do
let C++ ;
if [ ! -e $OUTDIR/$FQGZ ] || [ ! -s $OUTDIR/$FQGZ.weh ] || [ "$(cat $OUTDIR/$FQGZ.weh)" -ne 0 ]
if [ ! -e $OUTDIR/$FQGZ ] || [ ! -s $OUTDIR/$FQGZ.weh ] || [ $(grep -c -F "$(sed 's/ .*//' $OUTDIR/$FQGZ.weh)" $OUTDIR/$ACCN.weh) -ne 1 ]
then
rm -f $OUTDIR/$FQGZ ;
echo -e "\033[31m[WARNING]\t\tproblem with file $FQGZ\033[0m" >&2 ;
CMD1="echo \"[$C/$N]$'\t'$'\t'downloading$'\t'$FQGZ\" ; sleep $WAITIME";
CMD2="while true ; do $WGET_DWNL $URL$FQGZ && break ; done";
CMD3="$GZIP --test $OUTDIR/$FQGZ &>/dev/null ; echo \$? > $OUTDIR/$FQGZ.weh";
CMD3="md5sum -b $OUTDIR/$FQGZ > $OUTDIR/$FQGZ.weh";
CMD4="s=\$(fb \$(stat -c %s $OUTDIR/$FQGZ))";
CMD5="echo \"[$C/$N]$'\t'$'\t'completed$'\t'$FQGZ$'\t'[\$s]\"";
echo "$CMD1 ; $CMD2 ; $CMD3 ; $CMD4 ; $CMD5" ;
......@@ -412,12 +422,12 @@ do
elif [ ! -s $OUTDIR/$FQGZ.weh ]
then
echo -e "[$C/$N]\t\t\033[31m[FAIL] unable to verify $OUTDIR/$FQGZ\033[0m" >&2 ;
elif [ "$(cat $OUTDIR/$FQGZ.weh)" -ne 0 ]
then
echo -e "[$C/$N]\t\t\033[31m[FAIL] incorrect gzip integrity: $OUTDIR/$FQGZ\033[0m" >&2 ;
elif [ $(grep -c -F "$(sed 's/ .*//' $OUTDIR/$FQGZ.weh)" $OUTDIR/$ACCN.weh) -ne 1 ]
then
# Report an MD5 mismatch on stderr in red. The colour-on sequence needs the
# \033 escape prefix (as in the other [FAIL] messages); without it a literal
# "[31m" is printed and the text is not coloured. The reported value is the
# first space-delimited field of the per-file .weh file.
echo -e "[$C/$N]\t\t\033[31m[FAIL] incorrect MD5 checksum: $(sed 's/ .*//' "$OUTDIR/$FQGZ.weh")\033[0m" >&2 ;
else
echo -e "[$C/$N]\t\t$(md5sum -b $FQGZ | sed 's/ .*//')\t$FQGZ\t[$(fb $(stat -c %s $FQGZ))]" ;
fi
echo -e "[$C/$N]\t\t$(sed 's/ .*//' $OUTDIR/$FQGZ.weh)\t$FQGZ\t[$(fb $(stat -c %s $FQGZ))]" ;
fi
rm -f $OUTDIR/$FQGZ.weh ;
done
done
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment