diff --git a/PanACoTA/prepare_module/download_genomes_func.py b/PanACoTA/prepare_module/download_genomes_func.py index f16744dcd1d7b22d7ffe9cf87a071f29b0a843a9..da26ec8c38fbd74d00ff10d14da3bfd0c613f67a 100644 --- a/PanACoTA/prepare_module/download_genomes_func.py +++ b/PanACoTA/prepare_module/download_genomes_func.py @@ -99,39 +99,6 @@ def download_from_refseq(species_linked, NCBI_species, NCBI_taxid, outdir, threa return db_dir -def download_summary(species_linked, outdir): - """ - Get assembly_summary file for the given species if it exists. To be able to download it, - the given NCBI species name must be exalctly as the name given on NCBI website. - - Parameters - ---------- - species_linked : str - given NCBI species with '_' instead of spaces, or NCBI taxID if species - name not given (then, assembly file won't be found - outdir : str - Directory where summary file must be saved - logger : logging.Logger - log object to add information - - Returns - ------- - str : - Output filename of downloaded summary - """ - logger.info("Retrieving assembly_summary file for {}".format(species_linked)) - url = ("ftp://ftp.ncbi.nih.gov/genomes/refseq/" - "bacteria/{}/assembly_summary.txt").format(species_linked) - outfile = os.path.join(outdir, "assembly_summary-{}.txt".format(species_linked)) - try: - urllib.request.urlretrieve(url, outfile) - except: - logger.warning(f"assembly_summary file for {species_linked} cannot be downloaded. " - "Please check that you provided the exact species name, as given in NCBI") - return "" - return outfile - - def to_database(outdir): """ Move .fna.gz files to 'database_init' folder, and uncompress them. diff --git a/PanACoTA/subcommands/prepare.py b/PanACoTA/subcommands/prepare.py index d9989ef72dc4eb6979c9245fbf894c8835564bb3..cdd09f3a48b122efe991d96212174c26f91cb34a 100644 --- a/PanACoTA/subcommands/prepare.py +++ b/PanACoTA/subcommands/prepare.py @@ -117,24 +117,20 @@ def main(cmd, NCBI_species, NCBI_taxid, outdir, threads, no_refseq, only_mash, l logfile_base, logger = utils.init_logger(logfile_base, level, 'prepare', details=True, verbose=verbose, quiet=quiet) + # Message on what will be done (cmd, cores used) logger.info("Command used\n \t > " + cmd) message = f"'PanACoTA prepare' will run on {threads} " message += f"cores" if threads>1 else "core" logger.info(message) # Start prepare step - # Run more than only mash filter : + # Run more than only mash filter (!only_mash): # - start from QC and mash (norefseq) # - start from genome download (!norefseq)) if not only_mash: if no_refseq: # Do not download genomes, just do QC and mash filter on given genomes logger.info('You asked to skip refseq downloads.') - else: # Do all steps: download, QC, mash filter - sum_file = "" - # If user specified a species name, try to download the corresponding summary file - if NCBI_species: - sum_file = dgf.download_summary(species_linked, outdir) # Download all genomes of the given taxID db_dir = dgf.download_from_refseq(species_linked, NCBI_species, NCBI_taxid, outdir, threads)