#!/bin/bash

#############################################################################################################
#                                                                                                           #
# wgetGenBankWGS: downloading WGS genome assembly files from NCBI                                           #
#                                                                                                           #
# Copyright (C) 2019,2020  Institut Pasteur                                                                 #
#                                                                                                           #
# This program  is free software:  you can  redistribute it  and/or modify it  under the terms  of the GNU  #
# General Public License as published by the Free Software Foundation, either version 3 of the License, or  #
# (at your option) any later version.                                                                       #
#                                                                                                           #
# This program is distributed in the hope that it will be useful,  but WITHOUT ANY WARRANTY;  without even  #
# the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public  #
# License for more details.                                                                                 #
#                                                                                                           #
# You should have received a copy of the  GNU General Public License along with this program.  If not, see  #
# <http://www.gnu.org/licenses/>.                                                                           #
#                                                                                                           #
# Contact:                                                                                                  #
#  Alexis Criscuolo                                                            alexis.criscuolo@pasteur.fr  #
#  Genome Informatics & Phylogenetics (GIPhy)                                             giphy.pasteur.fr  #
#  Bioinformatics and Biostatistics Hub                                 research.pasteur.fr/team/hub-giphy  #
#  USR 3756 IP CNRS                          research.pasteur.fr/team/bioinformatics-and-biostatistics-hub  #
#  Dpt. Biologie Computationnelle                     research.pasteur.fr/department/computational-biology  #
#  Institut Pasteur, Paris, FRANCE                                                     research.pasteur.fr  #
#                                                                                                           #
#############################################################################################################

#############################################################################################################
#                                                                                                           #
# ============                                                                                              #
# = VERSIONS =                                                                                              #
# ============                                                                                              #
#                                                                                                           #
# current version, as displayed in the usage banner                                                         #
  VERSION=0.5.201018ac                                                                                      #
# + adding flag -T- or -t- in file name for type material                                                   #
# + adding flag -w- in file name for genomes excluded from RefSeq                                           #
#                                                                                                           #
# VERSION=0.4.200504ac                                                                                      #
# + discarding option -t (type strain info)                                                                 #
# + option -t for multithread (instead of -c)                                                               #
# + adding single quote (') in the list of special characters                                               #
# + deals with wgs_master starting with 6 alphabetic characters                                             #
# + new option -f to download different file types                                                          #
# + new option -z to keep compressed format                                                                 #
#                                                                                                           #
# VERSION=0.3.190613ac                                                                                      #
# + no test between ftp and http protocols; use directly http                                               #
# + fixed bug when the specified pattern has no match                                                       #
#                                                                                                           #
# VERSION=0.2.190228ac                                                                                      #
# + option -d for downloading from either genbank or refseq                                                 #
# + option -t to get the type strain name(s) for each selected species                                      #
#                                                                                                           #
# VERSION=0.1.190124ac                                                                                      #
#                                                                                                           #
#############################################################################################################

#############################################################################################################
#                                                                                                           #
# ============                                                                                              #
# = DOC      =                                                                                              #
# ============                                                                                              #
#                                                                                                           #
# Prints the usage banner and stops (exit status 1) when help is requested (-? or -h) or when
# at most one command-line argument is given (the mandatory option -e always needs a value).
if [ "$1" = "-?" ] || [ "$1" = "-h" ] || [ $# -le 1 ]
then
  # the here-document delimiter is left unquoted on purpose: $VERSION is expanded in the banner
  cat <<EOF

 wgetGenBankWGS v.$VERSION                                 Copyright (C) 2019-2020  Institut Pasteur

 Downloading sequence files corresponding to selected entries from genome assembly report files:
   GenBank:  ftp://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/assembly_summary_genbank.txt
   RefSeq:   ftp://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/assembly_summary_refseq.txt

 Writing output files 'Species.isolate--accn--GC' with the following content (and extension):
   -f 1      genomic sequence(s) in FASTA format (.fasta)
   -f 2      genomic sequence(s) in GenBank format (.gbk)
   -f 3      annotations in GFF3 format (.gff)
   -f 4      codon CDS in FASTA format (.fasta)
   -f 5      amino acid CDS in FASTA format (.fasta)
   -f 6      RNA sequences in FASTA format (.fasta)

 USAGE:  
    wgetGenBankWGS.sh  -e <pattern>  [-v <pattern>]  [-d <string>]  [-o <outdir>]  [-f <integer>]  [-n]  [-z]  [-t <nthreads>]
  where:
    -e <pattern>  extended regexp selection pattern (mandatory) 
    -v <pattern>  extended regexp exclusion pattern (default: none)
    -d <string>   either 'genbank' or 'refseq' (default: genbank)
    -n            no download, i.e. to only print the number of selected files (default: not set)
    -f <integer>  file type identifier (see above; default: 1)
    -z            no unzip, i.e. downloaded files are compressed (default: not set)
    -o <outdir>   output directory (default: .)
    -t <nthreads> number of threads (default: 1)

 EXAMPLES:
  + getting the total number of available complete Salmonella genomes inside RefSeq:
     wgetGenBankWGS.sh -e "Salmonella.*Complete Genome" -v "phage|virus" -d refseq -n

  + getting the total number of genomes inside GenBank deposited in 1996:
     wgetGenBankWGS.sh -e "1996/[01-12]+/[01-31]+" -n
 
  + getting the total number of available SARS-CoV-2 genomes (taxid=694009) inside GenBank:
     wgetGenBankWGS.sh -e $'\t'694009$'\t' -n
 
  + downloading the full RefSeq assembly report:
      wgetGenBankWGS.sh -e "/" -d refseq -n
 
  + downloading the GenBank files with the assembly accessions GCF_900002335, GCF_000002415 and GCF_000002765:
     wgetGenBankWGS.sh -e "GCF_900002335|GCF_000002415|GCF_000002765" -d refseq

  + downloading in the directory Dermatophilaceae every available genome sequence from this family using 30 threads:
     wgetGenBankWGS.sh -e "Austwickia|Dermatophilus|Kineosphaera|Mobilicoccus|Piscicoccus|Tonsilliphilus" -o Dermatophilaceae -t 30

  + downloading the non-Listeria proteomes with the wgs_master starting with "PPP":
     wgetGenBankWGS.sh -e $'\t'"PPP.00000000" -v "Listeria" -f 5

  + downloading the genome annotation of every Klebsiella type strain in compressed gff3 format using 30 threads
     wgetGenBankWGS.sh -e "Klebsiella.*type material" -f 3 -z -t 30

EOF
  exit 1 ;
fi
#                                                                                                           #
#############################################################################################################

#############################################################################################################
#                                                                                                           #
# ===============                                                                                           #
# = CONSTANTS   =                                                                                           #
# ===============                                                                                           #
#                                                                                                           #
# = PROTOCOL can be either "ftp:" or "https:"; however, "https:" is generally faster =====================  #
#                                                                                                           #
  PROTOCOL="https:";
#                                                                                                           #
# = WGETOPT are the basic wget options ===================================================================  #
#   (several options in one string: it must be expanded unquoted so that it is split into words)            #
#                                                                                                           #
  WGETOPT="--no-check-certificate --retry-connrefused --waitretry=1 --read-timeout=20 --timeout=15 -q";
#                                                                                                           #
#                                                                                                           #
# ===============                                                                                           #
# = FUNCTIONS   =                                                                                           #
# ===============                                                                                           #
#                                                                                                           #
# = gettime() arguments: =================================================================================  #
#    1. START: the starting time in seconds                                                                 #
#   returns the elapsed time since $START                                                                   #
gettime() {
  # Prints "[MM:SS]", i.e. the time elapsed between $1 (a former value of $SECONDS; 0 when
  # not specified) and now. Minutes and seconds are zero-padded to two digits.
  # Every work variable is local, so that no global of the caller can be clobbered.
  local elapsed=$(( SECONDS - ${1:-0} )) ;
  local mm=$(( elapsed / 60 )) ss=$(( elapsed % 60 )) ;
  printf '[%02d:%02d]\n' "$mm" "$ss" ;
}
#                                                                                                           #
# = randomfile() arguments: ==============================================================================  #
#    1. PREFIX: prefix file name                                                                            #
#   returns a random file name from a given PREFIX file name                                                #
#                                                                                                           #
randomfile() {
  # Prints a file name of the form PREFIX.N (N drawn from $RANDOM) that does not exist yet.
  # The name is only tested, never created: the caller is expected to write into it.
  # All expansions are quoted, so that a PREFIX containing blank spaces or glob characters
  # is tested and printed verbatim.
  local candidate="$1.$RANDOM" ;
  while [ -e "$candidate" ]; do candidate="$1.$RANDOM" ; done
  printf '%s\n' "$candidate" ;
}
#                                                                                                           #
# = dwnl() arguments: ====================================================================================  #
#    1. URL: URL of the file to download                                                                    #
#    2. OUTFILE: output file name                                                                           #
#   downloads the file from URL and writes it into OUTFILE                                                  #
#                                                                                                           #
dwnl() {
  # Downloads URL ($1) into OUTFILE ($2).
  # Returns 1 when URL is not reachable (wget --spider fails) or when the downloaded file
  # cannot be moved to OUTFILE; returns 0 otherwise.
  local url="$1" outfile="$2" tmpfile ;
  tmpfile=$(randomfile "$outfile") ;
  # $WGETOPT is left unquoted on purpose: it holds several options that must be split
  wget $WGETOPT --spider "$url" || return 1 ;
  # URL exists: retrying (one second apart) until the transfer is successful; writing first
  # into a temporary file, so that OUTFILE never holds a partial download
  until wget $WGETOPT -O "$tmpfile" "$url"
  do
    sleep 1 ;
  done
  mv "$tmpfile" "$outfile" || return 1 ;
  return 0 ;
}
#                                                                                                           #
# = dwnlgz() arguments: ==================================================================================  #
#    1. URL: URL of the gz file to download                                                                 #
#    2. OUTFILE: output file name                                                                           #
#   downloads the file from URL and unzip it into OUTFILE                                                   #
#                                                                                                           #
dwnlgz() {
  # Downloads the gzipped file at URL ($1) and writes its uncompressed content into OUTFILE ($2).
  # Returns 1 when URL is not reachable (wget --spider fails) or when the downloaded file
  # cannot be uncompressed; returns 0 otherwise.
  local url="$1" outfile="$2" tmpfile ;
  tmpfile=$(randomfile "$outfile") ;
  # $WGETOPT is left unquoted on purpose: it holds several options that must be split
  wget $WGETOPT --spider "$url" || return 1 ;
  # URL exists: retrying (one second apart) until the transfer is successful
  until wget $WGETOPT -O "$tmpfile" "$url"
  do
    sleep 1 ;
  done
  if ! gunzip -c "$tmpfile" > "$outfile"
  then
    # a truncated OUTFILE must not be left behind, otherwise it would look like a valid result
    rm -f "$tmpfile" "$outfile" ;
    return 1 ;
  fi
  rm "$tmpfile" ;
  return 0 ;
}
#                                                                                                           #
#############################################################################################################

#############################################################################################################
####                                                                                                     ####
#### INITIALIZING PARAMETERS AND READING OPTIONS                                                         ####
####                                                                                                     ####
#############################################################################################################
### default values
INCLUDE_PATTERN="";
EXCLUDE_PATTERN="^#";
REPOSITORY="genbank";
OUTDIR=".";
NTHREADS=1;
DWNL=true;
FTYPE=1;
UNZIP=true;
WAITIME=0.5;   ### delay (seconds) between two checks of the number of running downloads

### reading options
while getopts :e:v:o:t:d:f:nz option
do
  case $option in
    e) INCLUDE_PATTERN="$OPTARG"                         ;;
    v) EXCLUDE_PATTERN="$OPTARG"                         ;;
    d) REPOSITORY="$OPTARG"                              ;;
    o) OUTDIR="$OPTARG"                                  ;;
    t) NTHREADS=$OPTARG                                  ;;
    f) FTYPE=$OPTARG                                     ;;
    n) DWNL=false                                        ;;
    z) UNZIP=false                                       ;;
    :) echo "option $OPTARG : missing argument" ; exit 1 ;;
   \?) echo "$OPTARG : option invalide" ;         exit 1 ;;
  esac
done

### checking option values
if [ -z "$INCLUDE_PATTERN" ]; then echo "no specified pattern (option -e)" ; exit 1 ; fi
case "$NTHREADS" in
  ''|*[!0-9]*) echo "incorrect number of threads (option -t): $NTHREADS" ; exit 1 ;;   ### not an integer
esac
if [ "$NTHREADS" -lt 1 ]; then echo "incorrect number of threads (option -t): $NTHREADS" ; exit 1 ; fi
if [ "$REPOSITORY" != "genbank" ] && [ "$REPOSITORY" != "refseq" ]
then
  echo "incorrect repository name (options -d): $REPOSITORY" ;
  exit 1 ;
fi

### INEXT: suffix of the remote file; OUTEXT: extension of the written file (only checked when downloading)
INEXT="_genomic.fna.gz"; OUTEXT=".fasta";
if $DWNL
then
  case "$FTYPE" in
    1) echo "file type: genomic sequence(s) in FASTA format" ;   INEXT="_genomic.fna.gz";          OUTEXT=".fasta" ;;
    2) echo "file type: genomic sequence(s) in GenBank format" ; INEXT="_genomic.gbff.gz";         OUTEXT=".gbk"   ;;
    3) echo "file type: annotations in GFF3 format" ;            INEXT="_genomic.gff.gz";          OUTEXT=".gff"   ;;
    4) echo "file type: codon CDS in FASTA format" ;             INEXT="_cds_from_genomic.fna.gz"; OUTEXT=".fasta" ;;
    5) echo "file type: amino acid CDS in FASTA format" ;        INEXT="_protein.faa.gz";          OUTEXT=".fasta" ;;
    6) echo "file type: RNA sequences in FASTA format" ;         INEXT="_rna_from_genomic.fna.gz"; OUTEXT=".fasta" ;;
    *) echo "incorrect file type identifier (option -f): $FTYPE" ; exit 1 ;;
  esac
fi

### output directory: normalizing its name (e.g. no trailing '/'), and creating it when missing
OUTDIR=$(dirname "$OUTDIR/.");
OUTDIR_CREATED=false;
if [ ! -e "$OUTDIR" ]
then
  echo "creating output directory: $OUTDIR" ;
  mkdir -p "$OUTDIR" || { echo "unable to create output directory: $OUTDIR" ; exit 1 ; }
  OUTDIR_CREATED=true;
fi

### on interruption: waiting for the running downloads, next removing the output directory only when it
### was created by this run (a directory that already existed may contain unrelated user files);
### single quotes, so that the variables are expanded when the trap is executed
trap 'echo interrupting wgetGenBankWGS ; wait ; if $OUTDIR_CREATED ; then rm -r -- "${OUTDIR:?}" ; fi ; exit 1'  INT ;


#############################################################################################################
####                                                                                                     ####
#### DOWNLOADING GENOME ASSEMBLY REPORT FILE                                                             ####
####                                                                                                     ####
#############################################################################################################
### downloading the assembly report of the selected repository into $OUTDIR/summary.txt
echo -n "downloading $REPOSITORY assembly report ... " ;
ASSEMBLY_REPORT=ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/assembly_summary_$REPOSITORY.txt;
SUMMARY=$OUTDIR/summary.txt;
### nothing can be selected without the report: stopping when it cannot be downloaded
if ! dwnl "$PROTOCOL//$ASSEMBLY_REPORT" "$SUMMARY"
then
  echo "[fail]" ;
  echo "unable to download $PROTOCOL//$ASSEMBLY_REPORT" ;
  exit 1 ;
fi
echo "[ok]" ;


#############################################################################################################
####                                                                                                     ####
#### SELECTING WGS ENTRIES                                                                               ####
####                                                                                                     ####
#############################################################################################################
echo "selection criterion: $INCLUDE_PATTERN" ;
if [ "$EXCLUDE_PATTERN" != "^#" ]; then echo "exclusion criterion: $EXCLUDE_PATTERN" ; fi

### rewriting $SUMMARY with its 2nd line (column names) followed by the selected entries, i.e. the rows that
###  + match the selection pattern,
###  + do not match the exclusion pattern,
###  + contain a path to ftp.ncbi.nlm.nih.gov (written with either the ftp or the http(s) scheme)
### each pattern is passed with -e, so that a pattern starting with '-' is not read as a grep option
tmp=$(randomfile "$SUMMARY");
mv "$SUMMARY" "$tmp" ;
sed -n '2p' "$tmp" > "$SUMMARY" ;
sed '1,2d' "$tmp" |
  grep -E -e "$INCLUDE_PATTERN" |
  grep -v -E -e "$EXCLUDE_PATTERN" |
  grep -E -e "(ftp|https?)://ftp\.ncbi\.nlm\.nih\.gov" >> "$SUMMARY" ;
rm "$tmp" ;

### number of selected entries (the line of column names starts with '#')
n=$(grep -v -c "^#" "$SUMMARY");
echo "$REPOSITORY: $n entries" ;
if [ "$n" -eq 0 ]; then exit 0 ; fi

### option -n: the selection is only reported
if ! $DWNL ; then echo "see details in the report file: $SUMMARY" ; exit 0 ;  fi


#############################################################################################################
####                                                                                                     ####
#### DOWNLOADING WGS NUCLEOTIDE SEQUENCES                                                                ####
####                                                                                                     ####
#############################################################################################################
### $FULLSUMMARY gathers the column names (with an extra first column 'file') and one row per download
FULLSUMMARY=$(randomfile "$SUMMARY");
head -1 "$SUMMARY" | sed 's/^# /# file\t/' > "$FULLSUMMARY" ;
### tabs are replaced by '|' to deal with empty entries, not well managed using IFS=$'\t'
PIPESUMMARY=$(randomfile "$SUMMARY");
tr '\t' '|' < "$SUMMARY" > "$PIPESUMMARY" ; mv "$PIPESUMMARY" "$SUMMARY" ;
START=$SECONDS;
i=-1;
SPECIALCHARS=",/\?%*:|'\"<>()[]#;";   ### characters replaced by '_' inside file names
{
  while IFS= read -r entry
  do
    i=$(( i + 1 )); if [ $i -lt 1 ]; then continue; fi   ### first line = column names

    ### the last field (_) absorbs any extra column following relation_to_type_material
    IFS="|" read -r assembly_accession _ _ wgs_master _ _ _ organism_name infraspecific_name isolate _ _ _ _ _ _ _ _ _ ftp_path excluded_from_refseq relation_to_type_material _ <<< "$entry" ;

    ### file name = organism name, completed by the strain and isolate names when not already inside
    NAME=$(echo "$organism_name" | tr "$SPECIALCHARS" '_' |
             sed -e 's/ bv\./ bv/;s/ genomosp\./ genomosp/;s/ sp\./ sp/;s/ str\./ str/;s/ subsp\./ subsp/');

    STRAIN=$(echo "$infraspecific_name" | sed 's/strain=//g' | tr "$SPECIALCHARS" '_');
    [ -n "$STRAIN" ]  && [[ "$NAME" != *"$STRAIN"* ]]  && NAME="$NAME.$STRAIN";

    ISOLATE=$(echo "$isolate" | tr "$SPECIALCHARS" '_');
    [ -n "$ISOLATE" ] && [[ "$NAME" != *"$ISOLATE"* ]] && NAME="$NAME.$ISOLATE";

    ### flags: --T = type material, --t = untrustworthy as type, --w = excluded from RefSeq (other reason)
    [ "$relation_to_type_material" == "assembly from type material" ] && NAME="$NAME""--T";
    [ "$relation_to_type_material" == "assembly from synonym type material" ] && NAME="$NAME""--T";
    [ "$excluded_from_refseq" == "untrustworthy as type" ] && NAME="$NAME""--t";
    [ -n "$excluded_from_refseq" ] && [ "$excluded_from_refseq" != "untrustworthy as type" ] && NAME="$NAME""--w";

    [ -n "$wgs_master" ] && NAME="$NAME""--""$wgs_master";
    [ -n "$assembly_accession" ] && NAME="$NAME""--""$assembly_accession";

    ### remote file: <ftp_path>/<last directory of ftp_path><INEXT>, with the scheme replaced by $PROTOCOL
    GZFILE=$(basename "$ftp_path")$INEXT;
    URL="$PROTOCOL${ftp_path#*:}/$GZFILE";

    >&2 echo -e "$(gettime $START) [$i/$n] $organism_name | $infraspecific_name | $isolate | $assembly_accession | $wgs_master | \e[31m$excluded_from_refseq\e[0m \e[34m$relation_to_type_material\e[0m | $ftp_path" ;

    ### replacing blank spaces by '.', and successive dots by only one
    OUTFILE=$(echo "$NAME" | tr ' ' '.' | sed 's/\.\.*/\./g')$OUTEXT;

    ### starting the download in background, and writing the corresponding row (file name + entry)
    if $UNZIP
    then
      dwnlgz "$URL" "$OUTDIR/$OUTFILE" &
      printf '%s\t%s\n' "$OUTFILE" "$entry" ;
    else
      dwnl "$URL" "$OUTDIR/$OUTFILE.gz" &
      printf '%s\t%s\n' "$OUTFILE.gz" "$entry" ;
    fi

    ### no new download is started while $NTHREADS downloads are still running
    while [ "$(jobs -r | wc -l)" -ge "$NTHREADS" ]; do sleep $WAITIME ; done

  done
  ### the loop runs inside a subshell (pipeline): its downloads have to be waited here
  wait ;
}  <  "$SUMMARY"  |  tr '|' '\t'  >>  "$FULLSUMMARY" ;


wait ;
#############################################################################################################
####                                                                                                     ####
#### CHECKING EXISTING FILES                                                                             ####
####                                                                                                     ####
#############################################################################################################
### rewriting $SUMMARY from $FULLSUMMARY: the first field (file name) of a row is replaced by 'na' when no
### line can be read from the corresponding file inside $OUTDIR (missing or empty file);
### each tested file is closed, so that the number of opened files does not grow with the number of rows
awk -v d="$OUTDIR" 'BEGIN{FS=OFS="\t"}
                    (NR==1){print;next}
                    {l=$0;f=d"/"$1;if((getline x < f) <= 0){$1="na";l=$0}close(f);print l}'  "$FULLSUMMARY" > "$SUMMARY" ;
rm "$FULLSUMMARY" ;

### number of rows without any downloaded file
n=$(grep -c $'^na\t' "$SUMMARY");
if [ "$n" -ne 0 ]; then echo "WARNING: $n files are not available with the specified file type (-f $FTYPE)" ; fi
echo "see details in the report file: $SUMMARY" ;

exit ;