Skip to content
Snippets Groups Projects
Commit fc4ec18c authored by Amandine PERRIN
Browse files

summary file is now automatically downloaded

parent 283d4177
Branches
Tags
No related merge requests found
......@@ -99,39 +99,6 @@ def download_from_refseq(species_linked, NCBI_species, NCBI_taxid, outdir, threa
return db_dir
def download_summary(species_linked, outdir):
    """
    Get the assembly_summary file for the given species, if it exists.

    To be able to download it, the given NCBI species name must be exactly the
    name given on the NCBI website. Progress and failures are reported through
    the `logger` name taken from the enclosing module scope (it is not a
    parameter of this function).

    Parameters
    ----------
    species_linked : str
        given NCBI species with '_' instead of spaces, or NCBI taxID if species
        name not given (then, assembly file won't be found)
    outdir : str
        Directory where summary file must be saved

    Returns
    -------
    str :
        Output filename of downloaded summary, or an empty string if the
        summary file could not be downloaded
    """
    logger.info(f"Retrieving assembly_summary file for {species_linked}")
    url = ("ftp://ftp.ncbi.nih.gov/genomes/refseq/"
           f"bacteria/{species_linked}/assembly_summary.txt")
    outfile = os.path.join(outdir, f"assembly_summary-{species_linked}.txt")
    try:
        urllib.request.urlretrieve(url, outfile)
    # Best-effort download: any regular error (unknown species, network or
    # filesystem problem) only triggers a warning. 'Exception' rather than a
    # bare 'except' so that KeyboardInterrupt/SystemExit still propagate
    # instead of being reported as a wrong species name.
    except Exception:
        logger.warning(f"assembly_summary file for {species_linked} cannot be downloaded. "
                       "Please check that you provided the exact species name, as given in NCBI")
        return ""
    return outfile
def to_database(outdir):
"""
Move .fna.gz files to 'database_init' folder, and uncompress them.
......
......@@ -117,24 +117,20 @@ def main(cmd, NCBI_species, NCBI_taxid, outdir, threads, no_refseq, only_mash, l
logfile_base, logger = utils.init_logger(logfile_base, level, 'prepare', details=True,
verbose=verbose, quiet=quiet)
# Message on what will be done (cmd, cores used)
logger.info("Command used\n \t > " + cmd)
message = f"'PanACoTA prepare' will run on {threads} "
message += f"cores" if threads>1 else "core"
logger.info(message)
# Start prepare step
# Run more than only mash filter :
# Run more than only mash filter (!only_mash):
# - start from QC and mash (norefseq)
# - start from genome download (!norefseq))
if not only_mash:
if no_refseq: # Do not download genomes, just do QC and mash filter on given genomes
logger.info('You asked to skip refseq downloads.')
else: # Do all steps: download, QC, mash filter
sum_file = ""
# If user specified a species name, try to download the corresponding summary file
if NCBI_species:
sum_file = dgf.download_summary(species_linked, outdir)
# Download all genomes of the given taxID
db_dir = dgf.download_from_refseq(species_linked, NCBI_species, NCBI_taxid,
outdir, threads)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment