summary file is now automatically downloaded

fc4ec18c · Amandine PERRIN · 283d4177 · fc4ec18c · fc4ec18c
Commit fc4ec18c authored 5 years ago by Amandine PERRIN
--- a/PanACoTA/prepare_module/download_genomes_func.py
+++ b/PanACoTA/prepare_module/download_genomes_func.py
@@ -99,39 +99,6 @@ def download_from_refseq(species_linked, NCBI_species, NCBI_taxid, outdir, threa
    return db_dir


-def download_summary(species_linked, outdir):
-    """
-    Get assembly_summary file for the given species if it exists. To be able to download it,
-    the given NCBI species name must be exactly the name given on the NCBI website.
-
-    Parameters
-    ----------
-    species_linked : str
-        given NCBI species with '_' instead of spaces, or NCBI taxID if species
-        name not given (then, assembly file won't be found)
-    outdir : str
-        Directory where summary file must be saved
-    logger : logging.Logger
-        log object to add information
-
-    Returns
-    -------
-    str :
-        Output filename of downloaded summary
-    """
-    logger.info("Retrieving assembly_summary file for {}".format(species_linked))
-    url = ("ftp://ftp.ncbi.nih.gov/genomes/refseq/"
-           "bacteria/{}/assembly_summary.txt").format(species_linked)
-    outfile = os.path.join(outdir, "assembly_summary-{}.txt".format(species_linked))
-    try:
-        urllib.request.urlretrieve(url, outfile)
-    except:
-        logger.warning(f"assembly_summary file for {species_linked} cannot be downloaded. "
-                        "Please check that you provided the exact species name, as given in NCBI")
-        return ""
-    return outfile
-
-
 def to_database(outdir):
    """
    Move .fna.gz files to 'database_init' folder, and uncompress them.

--- a/PanACoTA/subcommands/prepare.py
+++ b/PanACoTA/subcommands/prepare.py
@@ -117,24 +117,20 @@ def main(cmd, NCBI_species, NCBI_taxid, outdir, threads, no_refseq, only_mash, l
    logfile_base, logger = utils.init_logger(logfile_base, level, 'prepare', details=True,
                                             verbose=verbose, quiet=quiet)

+    # Message on what will be done (cmd, cores used)
    logger.info("Command used\n \t > " + cmd)
    message = f"'PanACoTA prepare' will run on {threads} "
    message += f"cores" if threads>1 else "core"
    logger.info(message)

    # Start prepare step
-    # Run more than only mash filter :
+    # Run more than only mash filter (!only_mash):
    # - start from QC and mash (norefseq)
    # - start from genome download (!norefseq)
    if not only_mash:
        if no_refseq:   # Do not download genomes, just do QC and mash filter on given genomes
            logger.info('You asked to skip refseq downloads.')
-
        else:  # Do all steps: download, QC, mash filter
-            sum_file = ""
-            # If user specified a species name, try to download the corresponding summary file
-            if NCBI_species:
-                sum_file = dgf.download_summary(species_linked, outdir)
            # Download all genomes of the given taxID
            db_dir = dgf.download_from_refseq(species_linked, NCBI_species, NCBI_taxid,
                                              outdir, threads)