Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
P
panacota
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container registry
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
GitLab community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Amandine PERRIN
panacota
Commits
a92e9d8c
Commit
a92e9d8c
authored
5 years ago
by
Amandine PERRIN
Browse files
Options
Downloads
Patches
Plain Diff
Adapt download script to new ngd API
parent
d326ac08
No related branches found
No related tags found
No related merge requests found
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
PanACoTA/prepare_module/download_genomes_func.py
+75
-35
75 additions, 35 deletions
PanACoTA/prepare_module/download_genomes_func.py
with
75 additions
and
35 deletions
PanACoTA/prepare_module/download_genomes_func.py
+
75
−
35
View file @
a92e9d8c
...
...
@@ -21,42 +21,66 @@ from PanACoTA import utils
logger
=
logging
.
getLogger
(
"
ddg.log_dds
"
)
def
download_from_refseq
(
s
um_file
,
NCBI_species
,
NCBI_taxid
,
outdir
,
threads
):
def
download_from_refseq
(
s
pecies_linked
,
NCBI_species
,
NCBI_taxid
,
outdir
,
threads
):
"""
Download refseq genomes of given species
Parameters
----------
species_linked : str
given NCBI species with
'
_
'
instead of spaces, or NCBI taxID if species
name not given
NCBI_species : str
name of species to download: user given NCBI species with
'
_
'
instead of spaces. None if
no species name given
NCBI_taxid : int
species taxid given in NCBI
outdir : str
Directory where downloaded sequences must be saved
threads : int
Number of threads to use to download genome sequences
Returns
-------
str :
Output filename of downloaded summary
"""
# arguments needed to download a species genomes
keyargs
=
{
"
section
"
:
"
refseq
"
,
"
file_format
"
:
"
fasta
"
,
"
output
"
:
outdir
,
# Name of summary file, with metadata for each strain:
sumfile
=
os
.
path
.
join
(
outdir
,
"
assembly_summary-{}.txt
"
.
format
(
species_linked
))
abs_sumfile
=
os
.
path
.
abspath
(
sumfile
)
# arguments needed to download all genomes of the given species
abs_outdir
=
os
.
path
.
abspath
(
outdir
)
keyargs
=
{
"
section
"
:
"
refseq
"
,
"
file_format
"
:
"
fasta
"
,
"
output
"
:
abs_outdir
,
"
parallel
"
:
threads
,
"
group
"
:
"
bacteria
"
,
"
species_taxid
"
:
NCBI_taxid
}
# summary file could not be downloaded because given species does not match
# any NCBI species. Just download genomes with the given taxID
if
not
sum_file
:
logger
.
info
(
"
Downloading refseq genomes for taxid={}
"
.
format
(
NCBI_taxid
))
else
:
with
open
(
sum_file
,
"
r
"
)
as
sum_lines
:
for
line
in
sum_lines
:
infos
=
line
.
split
()
if
len
(
infos
)
>=
6
:
try
:
number
=
int
(
infos
[
6
])
except
ValueError
:
continue
if
number
!=
int
(
NCBI_taxid
):
logger
.
error
(
"
Your NCBI_taxid ({}) does not match with your provided NCBI
"
"
species ({}). The NCBI_taxid for this species is
"
"
{}
"
.
format
(
NCBI_taxid
,
NCBI_species
,
infos
[
6
]))
sys
.
exit
(
1
)
"
species_taxid
"
:
NCBI_taxid
,
"
metadata_table
"
:
abs_sumfile
}
message
=
"
Downloading all genomes for
"
# If NCBI species given, add it to arguments to download genomes, and write it to info message
if
NCBI_species
:
keyargs
[
"
genus
"
]
=
NCBI_species
logger
.
info
(
"
Downloading refseq genomes for {} (taxid={})
"
.
format
(
NCBI_species
,
NCBI_taxid
))
max_retries
=
15
message
+=
f
"
NCBI species =
{
NCBI_species
}
"
# If NCBI species given, add it to arguments to download genomes, and write it to info message
if
NCBI_taxid
:
keyargs
[
"
species_taxid
"
]
=
NCBI_taxid
if
NCBI_species
:
message
+=
f
"
(NCBI_taxid =
{
NCBI_taxid
}
).
"
else
:
message
+=
f
"
NCBI_taxid =
{
NCBI_taxid
}
"
logger
.
info
(
f
"
Metadata for all genomes will be saved in
{
sumfile
}
"
)
logger
.
info
(
message
)
# Download genomes
max_retries
=
15
# If connection to NCBI fails, how many retry downloads must be done
error_message
=
(
"
Could not download genomes. Check that you gave valid NCBI taxid and/or
"
"
NCBI species name. If you gave both, check that given taxID and name really
"
"
correspond to the same species.
"
)
try
:
# Download genomes
ret
=
ngd
.
download
(
**
keyargs
)
except
:
logger
.
error
(
"
Could not download species taxID {}. Check that you gave the good
"
"
one.
"
.
format
(
NCBI_taxid
)
)
# Error message if crash during execution of ncbi_genome_download
logger
.
error
(
error_message
)
sys
.
exit
(
1
)
attempts
=
0
while
ret
==
75
and
attempts
<
max_retries
:
...
...
@@ -64,6 +88,12 @@ def download_from_refseq(sum_file, NCBI_species, NCBI_taxid, outdir, threads):
logging
.
error
((
'
Downloading from NCBI failed due to a connection error,
'
'
retrying. Already retried so far: %s
'
),
attempts
)
ret
=
ngd
.
download
(
**
keyargs
)
# Message if NGD did not manage to download the genomes (wrong species name/taxid)
if
ret
!=
0
:
# Error message
logger
.
error
(
error_message
)
sys
.
exit
(
1
)
sys
.
exit
(
1
)
nb_gen
,
db_dir
=
to_database
(
outdir
)
logger
.
info
(
"
Downloaded {} genomes.
"
.
format
(
nb_gen
))
return
db_dir
...
...
@@ -74,10 +104,20 @@ def download_summary(species_linked, outdir):
Get assembly_summary file for the given species if it exists. To be able to download it,
the given NCBI species name must exactly match the name given on the NCBI website.
species_linked : given NCBI species with
'
_
'
instead of spaces, or NCBI taxID if species
Parameters
----------
species_linked : str
given NCBI species with
'
_
'
instead of spaces, or NCBI taxID if species
name not given (then, assembly file won
'
t be found)
outdir: directory where downloaded assembly file must be saved
outdir : str
Directory where summary file must be saved
logger : logging.Logger
log object to add information
Returns
-------
str :
Output filename of downloaded summary
"""
logger
.
info
(
"
Retrieving assembly_summary file for {}
"
.
format
(
species_linked
))
url
=
(
"
ftp://ftp.ncbi.nih.gov/genomes/refseq/
"
...
...
@@ -86,9 +126,9 @@ def download_summary(species_linked, outdir):
try
:
urllib
.
request
.
urlretrieve
(
url
,
outfile
)
except
:
logger
.
warning
(
"
assembly_summary file cannot be downloaded.
Please check that you
"
"
provided the exact species name, as given in NCBI
"
)
return
logger
.
warning
(
f
"
assembly_summary file
for
{
species_linked
}
cannot be downloaded.
"
"
Please check that you
provided the exact species name, as given in NCBI
"
)
return
""
return
outfile
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment