Skip to content
Snippets Groups Projects
Commit a92e9d8c authored by Amandine  PERRIN's avatar Amandine PERRIN
Browse files

Adapt download script to new ngd API

parent d326ac08
No related branches found
No related tags found
No related merge requests found
......@@ -21,42 +21,66 @@ from PanACoTA import utils
logger = logging.getLogger("ddg.log_dds")
def download_from_refseq(sum_file, NCBI_species, NCBI_taxid, outdir, threads):
def download_from_refseq(species_linked, NCBI_species, NCBI_taxid, outdir, threads):
"""
Download refseq genomes of given species
Parameters
----------
species_linked : str
given NCBI species with '_' instead of spaces, or NCBI taxID if species
name not given
NCBI_species : str
name of species to download: user given NCBI species with '_' instead of spaces. None if
no species name given
NCBI_taxid : int
species taxid given in NCBI
outdir : str
Directory where downloaded sequences must be saved
threads : int
Number of threads to use to download genome sequences
Returns
-------
str :
Path to the database directory (``db_dir``, as returned by ``to_database``)
"""
# arguments needed to download a species genomes
keyargs = {"section": "refseq", "file_format": "fasta", "output": outdir,
# Name of summary file, with metadata for each strain:
sumfile = os.path.join(outdir, "assembly_summary-{}.txt".format(species_linked))
abs_sumfile = os.path.abspath(sumfile)
# arguments needed to download all genomes of the given species
abs_outdir = os.path.abspath(outdir)
keyargs = {"section": "refseq", "file_format": "fasta", "output": abs_outdir,
"parallel": threads, "group": "bacteria",
"species_taxid": NCBI_taxid}
# summary file could not be downloaded because given species does not match
# any NCBI species. Just download genomes with the given taxID
if not sum_file:
logger.info("Downloading refseq genomes for taxid={}".format(NCBI_taxid))
else:
with open(sum_file, "r") as sum_lines:
for line in sum_lines:
infos = line.split()
if len(infos)>=6:
try:
number = int(infos[6])
except ValueError:
continue
if number != int(NCBI_taxid):
logger.error("Your NCBI_taxid ({}) does not match with your provided NCBI "
"species ({}). The NCBI_taxid for this species is "
"{}".format(NCBI_taxid, NCBI_species, infos[6]))
sys.exit(1)
"species_taxid": NCBI_taxid, "metadata_table":abs_sumfile}
message = "Downloading all genomes for "
# If NCBI species given, add it to arguments to download genomes, and write it to info message
if NCBI_species:
keyargs["genus"] = NCBI_species
logger.info("Downloading refseq genomes for {} (taxid={})".format(NCBI_species,
NCBI_taxid))
max_retries = 15
message += f"NCBI species = {NCBI_species}"
# If NCBI taxid given, add it to arguments to download genomes, and write it to info message
if NCBI_taxid:
keyargs["species_taxid"] = NCBI_taxid
if NCBI_species:
message += f" (NCBI_taxid = {NCBI_taxid})."
else:
message += f" NCBI_taxid = {NCBI_taxid}"
logger.info(f"Metadata for all genomes will be saved in {sumfile}")
logger.info(message)
# Download genomes
max_retries = 15 # If connection to NCBI fails, how many retry downloads must be done
error_message = ("Could not download genomes. Check that you gave valid NCBI taxid and/or "
"NCBI species name. If you gave both, check that given taxID and name really "
"correspond to the same species.")
try:
# Download genomes
ret = ngd.download(**keyargs)
except:
logger.error("Could not download species taxID {}. Check that you gave the good "
"one.".format(NCBI_taxid))
# Error message if crash during execution of ncbi_genome_download
logger.error(error_message)
sys.exit(1)
attempts = 0
while ret == 75 and attempts < max_retries:
......@@ -64,6 +88,12 @@ def download_from_refseq(sum_file, NCBI_species, NCBI_taxid, outdir, threads):
logging.error(('Downloading from NCBI failed due to a connection error, '
'retrying. Already retried so far: %s'), attempts)
ret = ngd.download(**keyargs)
# Message if NGD did not manage to download the genomes (wrong species name/taxid)
if ret != 0:
# Error message
logger.error(error_message)
sys.exit(1)
sys.exit(1)
nb_gen, db_dir = to_database(outdir)
logger.info("Downloaded {} genomes.".format(nb_gen))
return db_dir
......@@ -74,10 +104,20 @@ def download_summary(species_linked, outdir):
Get assembly_summary file for the given species if it exists. To be able to download it,
the given NCBI species name must be exactly as the name given on NCBI website.
species_linked : given NCBI species with '_' instead of spaces, or NCBI taxID if species
Parameters
----------
species_linked : str
given NCBI species with '_' instead of spaces, or NCBI taxID if species
name not given (then, assembly file won't be found)
outdir: directory where downloaded assembly file must be saved
outdir : str
Directory where summary file must be saved
logger : logging.Logger
log object to add information
Returns
-------
str :
Output filename of downloaded summary
"""
logger.info("Retrieving assembly_summary file for {}".format(species_linked))
url = ("ftp://ftp.ncbi.nih.gov/genomes/refseq/"
......@@ -86,9 +126,9 @@ def download_summary(species_linked, outdir):
try:
urllib.request.urlretrieve(url, outfile)
except:
logger.warning("assembly_summary file cannot be downloaded. Please check that you "
"provided the exact species name, as given in NCBI")
return
logger.warning(f"assembly_summary file for {species_linked} cannot be downloaded. "
"Please check that you provided the exact species name, as given in NCBI")
return ""
return outfile
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment