Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
wgetENAHTS
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Container registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
GIPhy
wgetENAHTS
Commits
94e47eec
Commit
94e47eec
authored
3 years ago
by
Alexis CRISCUOLO
Browse files
Options
Downloads
Patches
Plain Diff
4.0
parent
8f289ad4
No related branches found
No related tags found
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
README.md
+13
-12
13 additions, 12 deletions
README.md
wgetENAHTS.sh
+107
-55
107 additions, 55 deletions
wgetENAHTS.sh
with
120 additions
and
67 deletions
README.md
+
13
−
12
View file @
94e47eec
...
...
@@ -27,33 +27,34 @@ Execute _wgetENAHTS_ with the following command line model:
Run _wgetENAHTS_ without option to read the following documentation:
```
USAGE: wgetENAHTS.sh [[-o <dir>] [-f <infile>]
[-
t
<
nthreads
>] [-r <rate>] [-n] [-h]] [<accn> ...]
USAGE: wgetENAHTS.sh [[-o <dir>] [-f <infile>]
[-t <nthreads>]
[-
p
<
protocol
>]
[-r <rate>] [-n] [-h]] [<accn> ...]
Downloads FASTQ files corresponding to the specified DRR/ERR/SRR accession(s)
Files are downloaded from the ENA ftp repository ftp.sra.ebi.ac.uk/vol1/fastq
OPTIONS:
-o <dir> output directory (default: .)
-f <file> to read accession(s) from the specified file (default: all the last
arguments)
-t <int> number of thread(s) (default: 2)
-r <int> maximum download rate per file (in kb per seconds; default: entire
available bandwidth)
-n no file download, only check (default: not set)
-h prints this help and exits
-o <dir> output directory (default: .)
-f <file> to read accession(s) from the specified file (default: all the last
arguments)
-t <int> number of thread(s) (default: 2)
-p <string> force the transfer protocol, either ftp or https (default: auto)
-r <int> maximum download rate per file, in kb per seconds (default: entire
available bandwidth)
-n no file download, only check (default: not set)
-h prints this help and exits
EXAMPLES:
+ downloading the SE FASTQ file corresponding to accession DRR000003:
wgetENAHTS.sh DRR000003
+ downloading
the
FASTQ files corresponding to accessions ERR000001 and ERR000004:
+ downloading FASTQ files corresponding to accessions ERR000001 and ERR000004:
wgetENAHTS.sh ERR000001 ERR000004
+ assessing the repository existence for accessions SRR9870010-39:
wgetENAHTS.sh -n SRR98700{10..39}
+ downloading
the
FASTQ files (if any) corresponding to accessions SRR9870010-39:
+ downloading FASTQ files (if any) corresponding to accessions SRR9870010-39:
wgetENAHTS.sh SRR98700{10..39}
+ same as above with (at most) 6 parallel downloads and saved outputs:
...
...
This diff is collapsed.
Click to expand it.
wgetENAHTS.sh
+
107
−
55
View file @
94e47eec
...
...
@@ -105,21 +105,22 @@ mandoc() {
echo
-e
"
\n\0
33[1m wgetENAHTS v
$VERSION
$COPYRIGHT
\0
33[0m"
;
cat
<<
EOF
USAGE: wgetENAHTS.sh [[-o <dir>] [-f <infile>]
[-
t
<
nthreads
>] [-r <rate>] [-n] [-h]] [<accn> ...]
USAGE: wgetENAHTS.sh [[-o <dir>] [-f <infile>]
[-t <nthreads>]
[-
p
<
protocol
>]
[-r <rate>] [-n] [-h]] [<accn> ...]
Downloads FASTQ files corresponding to the specified DRR/ERR/SRR accession(s)
Files are downloaded from the ENA ftp repository ftp.sra.ebi.ac.uk/vol1/fastq
OPTIONS:
-o <dir> output directory (default: .)
-f <file> to read accession(s) from the specified file (default: all the last
arguments)
-t <int> number of thread(s) (default: 2)
-r <int> maximum download rate per file (in kb per seconds; default: entire
available bandwidth)
-n no file download, only check (default: not set)
-h prints this help and exits
-o <dir> output directory (default: .)
-f <file> to read accession(s) from the specified file (default: all the last
arguments)
-t <int> number of thread(s) (default: 2)
-p <string> force the transfer protocol, either ftp or https (default: auto)
-r <int> maximum download rate per file, in kb per seconds (default: entire
available bandwidth)
-n no file download, only check (default: not set)
-h prints this help and exits
EXAMPLES:
+ downloading the SE FASTQ file corresponding to accession DRR000003:
...
...
@@ -158,7 +159,7 @@ EOF
# #
WGET_BIN
=
wget
;
[
!
$(
command
-v
$WGET_BIN
)
]
&&
echoxit
"no
$WGET_BIN
detected"
;
WGET_STATIC_OPTIONS
=
"--
no-check-certificate
--retry-connrefused --
random-wait --quiet
"
;
WGET_STATIC_OPTIONS
=
"--
quiet
--retry-connrefused --
no-check-certificate
"
;
WGET
=
"
$WGET_BIN
$WGET_STATIC_OPTIONS
"
;
# #
# -- gzip ------------------------------------------------------------------------------------------------- #
...
...
@@ -180,28 +181,28 @@ EOF
if
[
$#
-lt
1
]
;
then
mandoc
;
exit
1
;
fi
FILE_REPORT
=
"https://www.ebi.ac.uk/ena/portal/api/filereport?download=true&result=read_run&accession="
;
WGET_DWNL
=
"
$WGET
--continue --no-directories"
;
WGET_READ
=
"
$WGET
--output-document -"
;
WGET_DWNL
=
"
$WGET
--read-timeout=200
--continue --no-directories"
;
WGET_READ
=
"
$WGET
--read-timeout=200
--output-document -"
;
WGET_TEST
=
"
$WGET
--spider"
;
NTHREADS
=
2
;
OUTDIR
=
"."
;
INFILE
=
"_N.A_"
;
MAXRATE
=
"NA"
;
WAITIME
=
0
;
PROTOCOL
=
"auto"
;
DWNL
=
true
;
while
getopts
t:o:f:r:
w
:nh option
while
getopts
t:o:f:r:
p
:nh option
do
case
$option
in
t
)
NTHREADS
=
$OPTARG
;;
o
)
OUTDIR
=
"
$OPTARG
"
;;
f
)
INFILE
=
"
$OPTARG
"
;;
r
)
MAXRATE
=
$OPTARG
;;
w
)
WAITIME
=
$OPTARG
;;
n
)
DWNL
=
false
;;
h
)
mandoc
;
exit
0
;;
\?
)
mandoc
;
exit
1
;;
t
)
NTHREADS
=
$OPTARG
;;
o
)
OUTDIR
=
"
$OPTARG
"
;;
f
)
INFILE
=
"
$OPTARG
"
;;
r
)
MAXRATE
=
$OPTARG
;;
p
)
PROTOCOL
=
"
$OPTARG
"
;;
n
)
DWNL
=
false
;;
h
)
mandoc
;
exit
0
;;
\?
)
mandoc
;
exit
1
;;
esac
done
shift
"
$((
$OPTIND
-
1
))
"
...
...
@@ -233,15 +234,16 @@ then
[
$MAXRATE
-lt
1
]
&&
MAXRATE
=
1
;
WGET_DWNL
=
"
$WGET_DWNL
--limit-rate=
$MAXRATE
"
k
;
fi
[[
$NTHREADS
=
~ ^[0-9]+
$
]]
||
echoxit
"incorrect value (option -t):
$NTHREADS
"
;
[
$NTHREADS
-lt
1
]
&&
NTHREADS
=
1
;
[[
$WAITIME
=
~ ^[0-9]+
$
]]
||
echoxit
"incorrect value (option -w):
$WAITIME
"
;
if
[
$WAITIME
-eq
0
]
if
[
"
$PROTOCOL
"
!=
"auto"
]
&&
[
"
$PROTOCOL
"
!=
"ftp"
]
&&
[
"
$PROTOCOL
"
!=
"https"
]
then
wt
=
0
;
while
[
$((
$wt
*
$wt
))
-lt
$NTHREADS
]
;
do
let
wt++
;
done
WAITIME
=
$wt
;
echoxit
"transfer protocol should be either ftp or https (option -p):
$PROTOCOL
"
;
fi
[
$WAITIME
-lt
1
]
&&
WAITIME
=
1
;
[[
$NTHREADS
=
~ ^[0-9]+
$
]]
||
echoxit
"incorrect value (option -t):
$NTHREADS
"
;
[
$NTHREADS
-lt
1
]
&&
NTHREADS
=
1
;
wt
=
0
;
while
[
$((
$wt
*
$wt
))
-lt
$NTHREADS
]
;
do
let
wt++
;
done
WAITIME
=
$wt
;
[
$WAITIME
-lt
1
]
&&
WAITIME
=
1
;
##############################################################################################################
...
...
@@ -266,24 +268,28 @@ echo ;
##############################################################################################################
#### ####
####
CHECKING PROTOCOL
####
####
ASSESSING TRANSFER PROTOCOL
####
#### ####
##############################################################################################################
URL
=
"ftp.sra.ebi.ac.uk/vol1/fastq"
;
echo
-n
-e
"
$(
chrono
)
\t\t
checking protocol "
;
time_ftp
=
$SECONDS
;
for
i
in
{
1..5
}
;
do
echo
-n
"."
;
timeout
2
$WGET_TEST
"ftp://
$URL
/DRR00
$i
/"
&>/dev/null
;
rm
-f
wget-log
;
done
time_ftp
=
$((
$SECONDS
-
$time_ftp
))
;
time_https
=
$SECONDS
;
for
i
in
{
1..5
}
;
do
echo
-n
"."
;
timeout
2
$WGET_TEST
"https://
$URL
/DRR00
$i
/"
&>/dev/null
;
rm
-f
wget-log
;
done
time_https
=
$((
$SECONDS
-
$time_https
))
;
echo
" [ok]"
;
if
[
$time_ftp
-lt
$time_https
]
then
FTPENA
=
"ftp://
$URL
"
;
echo
-e
"
$(
chrono
)
\t\t
selected protocol: ftp (
$time_ftp
:
$time_https
)"
;
else
FTPENA
=
"https://
$URL
"
;
echo
-e
"
$(
chrono
)
\t\t
selected protocol: https (
$time_https
:
$time_ftp
)"
;
if
[
"
$PROTOCOL
"
==
"auto"
]
then
echo
-n
-e
"
$(
chrono
)
\t\t
assessing transfer protocol "
;
PROTOCOL
=
"ftp://"
;
time_ftp
=
$SECONDS
;
for
i
in
{
1..5
}
;
do
echo
-n
"."
;
timeout
2
$WGET_TEST
"
$PROTOCOL$URL
/DRR00
$i
/"
&>/dev/null
;
rm
-f
wget-log
;
done
time_ftp
=
$((
$SECONDS
-
$time_ftp
))
;
PROTOCOL
=
"https://"
;
time_https
=
$SECONDS
;
for
i
in
{
1..5
}
;
do
echo
-n
"."
;
timeout
2
$WGET_TEST
"
$PROTOCOL$URL
/DRR00
$i
/"
&>/dev/null
;
rm
-f
wget-log
;
done
time_https
=
$((
$SECONDS
-
$time_https
))
;
echo
" [ok]"
;
if
[
$time_ftp
-lt
$time_https
]
then
PROTOCOL
=
"ftp://"
;
echo
-e
"
$(
chrono
)
\t\t
selected protocol: ftp (
$time_ftp
<
$time_https
)"
;
else
PROTOCOL
=
"https://"
;
echo
-e
"
$(
chrono
)
\t\t
selected protocol: https (
$time_https
<
$time_ftp
)"
;
fi
echo
;
fi
echo
;
FTPENA
=
"
$PROTOCOL$URL
"
;
##############################################################################################################
...
...
@@ -303,7 +309,7 @@ do
if
$DWNL
&&
ls
$OUTFQ
&>/dev/null
then
echo
-e
"[
$C
/
$NA
]
\t\t\0
33[34mfile(s) already exist(s) for accession
$ACCN
\0
33[0m"
>
&2
;
stat
-c
"%s %n"
$OUTFQ
|
while
read
s n
;
do
echo
-e
"
\0
33[90m
$n
\t
[
$(
fb
$s
)
]
\0
33[0m"
>
&2
;
done
stat
-c
"%s %n"
$OUTFQ
|
while
read
s n
;
do
echo
-e
"
\
t\t\
0
33[90m
$n
\t
[
$(
fb
$s
)
]
\0
33[0m"
>
&2
;
done
continue
;
fi
nc
=
${#
ACCN
}
;
...
...
@@ -353,14 +359,6 @@ echo ;
if
[
$DL
-eq
0
]
;
then
exit
0
;
elif
[
$DL
-eq
1
]
;
then
echo
-e
-n
"
$(
chrono
)
\t\t
$DL
valid accession; "
;
else
echo
-e
-n
"
$(
chrono
)
\t\t
$DL
valid accessions; "
;
fi
##############################################################################################################
#### ####
#### DOWNLOADING FASTQ FILES ####
#### ####
##############################################################################################################
N
=
0
;
for
ACCN
in
$ACCNLIST
do
...
...
@@ -368,13 +366,67 @@ do
nf
=
$(
grep
-c
-F
".fastq.gz"
$OUTDIR
/
$ACCN
.weh
)
;
N
=
$((
$N
+
$nf
))
;
done
if
[
$N
-eq
1
]
;
then
echo
"
$N
file to download"
;
else
echo
"
$N
files to download"
;
fi
echo
;
##############################################################################################################
#### ####
#### SORTING ACCESSIONS ####
#### ####
##############################################################################################################
ACCNSORT
=
"
$(
for
ACCN
in
$ACCNLIST
;
do
[
-s
$OUTDIR
/
$ACCN
.weh
]
&&
echo
-e
"
$(
sed
's/ /\t/g'
$OUTDIR
/
$ACCN
.weh |
cut
-f2
|
paste
-sd
+ | bc
-l
)
\t
$ACCN
"
;
done
|
sort
-gr
|
cut
-f2
|
tr
'\n'
' '
)
"
;
##############################################################################################################
#### ####
#### MEASURING DOWNLOAD SPEED ####
#### ####
##############################################################################################################
# if [ "$MAXRATE" == "NA" ]
# then
# echo -e -n "$(chrono)\t\testimating download speed " ;
# for ACCN in $ACCNSORT
# do
# echo -n "." ;
# if [ ! -s $OUTDIR/$ACCN.weh ]; then continue ; fi
# nc=${#ACCN};
# if [ $nc -eq 9 ]; then URL="$FTPENA/${ACCN:0:6}/$ACCN/";
# elif [ $nc -eq 10 ]; then URL="$FTPENA/${ACCN:0:6}/00${ACCN:9:1}/$ACCN/";
# else URL="$FTPENA/${ACCN:0:6}/0${ACCN:9:2}/$ACCN/";
# fi
# echo -n "." ;
# for FQGZ in $(sed "s/.*$ACCN/$ACCN/g" $OUTDIR/$ACCN.weh)
# do
# echo -n "." ;
# timeout 30 $WGET_DWNL $URL$FQGZ &>/dev/null ;
# bytes=$(du -b $OUTDIR/$FQGZ | tail -1 | cut -f1);
# RATE=$(( $bytes / 30 ));
# rm -f wget-log ;
# break ;
# done
# break ;
# done
# echo " [ok]" ;
# if [ $N -lt $NTHREADS ]; then RATE=$(( $RATE / $N )); else RATE=$(( $RATE / $NTHREADS )); fi
# if [ $RATE -gt 10000 ]
# then
# WGET_DWNL="$WGET_DWNL --limit-rate=$RATE"
# echo -e "$(chrono)\t\tdownload speed per file: $RATE bytes/seconds" ;
# fi
# echo ;
# fi
##############################################################################################################
#### ####
#### DOWNLOADING FASTQ FILES ####
#### ####
##############################################################################################################
C
=
0
;
for
ACCN
in
$ACCNSORT
do
...
...
@@ -388,7 +440,7 @@ do
do
let
C++
;
CMD1
=
"echo -e
\"
[
$C
/
$N
]
$'
\t
'
$'
\t
'downloading
$'
\t
'
$FQGZ
\"
; sleep
$WAITIME
"
;
CMD2
=
"while true ; do
$WGET_DWNL
$URL$FQGZ
&& break ; done"
;
CMD2
=
"while true ; do
nice
$WGET_DWNL
$URL$FQGZ
&& break ; done"
;
CMD3
=
"md5sum -b
$OUTDIR
/
$FQGZ
>
$OUTDIR
/
$FQGZ
.weh"
;
CMD4
=
"s=
\$
(fb
\$
(stat -c %s
$OUTDIR
/
$FQGZ
))"
;
CMD5
=
"echo
\"
[
$C
/
$N
]
$'
\t
'
$'
\t
'completed
$'
\t
'
$FQGZ$'
\t
'[
\$
s]
\"
"
;
...
...
@@ -435,7 +487,7 @@ do
# rm -f $OUTDIR/$FQGZ ;
echo
-e
"
\0
33[31m[WARNING]
\t\t
problem with file
$FQGZ
\0
33[0m"
>
&2
;
CMD1
=
"echo
\"
[
$C
/
$N
]
$'
\t
'
$'
\t
'downloading
$'
\t
'
$FQGZ
\"
; sleep
$WAITIME
"
;
CMD2
=
"while true ; do
$WGET_DWNL
$URL$FQGZ
&& break ; done"
;
CMD2
=
"while true ; do
nice
$WGET_DWNL
$URL$FQGZ
&& break ; done"
;
CMD3
=
"md5sum -b
$OUTDIR
/
$FQGZ
>
$OUTDIR
/
$FQGZ
.weh"
;
CMD4
=
"s=
\$
(fb
\$
(stat -c %s
$OUTDIR
/
$FQGZ
))"
;
CMD5
=
"echo
\"
[
$C
/
$N
]
$'
\t
'
$'
\t
'completed
$'
\t
'
$FQGZ$'
\t
'[
\$
s]
\"
"
;
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment