Adapt download script to new ngd API

a92e9d8c · Amandine PERRIN · d326ac08 · a92e9d8c
Commit a92e9d8c authored 5 years ago by Amandine PERRIN
--- a/PanACoTA/prepare_module/download_genomes_func.py
+++ b/PanACoTA/prepare_module/download_genomes_func.py
@@ -21,42 +21,66 @@ from PanACoTA import utils
 logger = logging.getLogger("ddg.log_dds")


-def download_from_refseq(sum_file, NCBI_species, NCBI_taxid, outdir, threads):
+def download_from_refseq(species_linked, NCBI_species, NCBI_taxid, outdir, threads):
    """
    Download refseq genomes of given species
+
+    Parameters
+    ----------
+    species_linked : str
+        given NCBI species with '_' instead of spaces, or NCBI taxID if species
+        name not given
+    NCBI_species : str
+        name of species to download: user given NCBI species with '_' instead of spaces. None if
+        no species name given
+    NCBI_taxid : int
+        species taxid given in NCBI
+    outdir : str
+        Directory where downloaded sequences must be saved
+    threads : int
+        Number of threads to use to download genome sequences
+
+    Returns
+    -------
+    str :
+        Database directory (db_dir) returned by to_database(outdir) for the downloaded genomes
+
    """
-    # arguments needed to download a species genomes
-    keyargs = {"section": "refseq", "file_format": "fasta", "output": outdir,
+    # Name of summary file, with metadata for each strain:
+    sumfile = os.path.join(outdir, "assembly_summary-{}.txt".format(species_linked))
+    abs_sumfile = os.path.abspath(sumfile)
+
+    # arguments needed to download all genomes of the given species
+    abs_outdir = os.path.abspath(outdir)
+    keyargs = {"section": "refseq", "file_format": "fasta", "output": abs_outdir,
               "parallel": threads, "group": "bacteria",
-               "species_taxid": NCBI_taxid}
-    # summary file could not be downloaded because given species does not match
-    #  any NCBI species. Just download genomes with the given taxID
-    if not sum_file:
-        logger.info("Downloading refseq genomes for taxid={}".format(NCBI_taxid))
-    else:
-        with open(sum_file, "r") as sum_lines:
-            for line in sum_lines:
-                infos = line.split()
-                if len(infos)>=6:
-                    try:
-                        number = int(infos[6])
-                    except ValueError:
-                        continue
-                if number != int(NCBI_taxid):
-                    logger.error("Your NCBI_taxid ({}) does not match with your provided NCBI "
-                                 "species ({}). The NCBI_taxid for this species is "
-                                 "{}".format(NCBI_taxid, NCBI_species, infos[6]))
-                    sys.exit(1)
+               "species_taxid": NCBI_taxid, "metadata_table":abs_sumfile}
+    message = "Downloading all genomes for "
+    # If NCBI species given, add it to arguments to download genomes, and write it to info message
+    if NCBI_species:
        keyargs["genus"] = NCBI_species
-        logger.info("Downloading refseq genomes for {} (taxid={})".format(NCBI_species,
-                                                                          NCBI_taxid))
-    max_retries = 15
-
+        message += f"NCBI species = {NCBI_species}"
+    # If NCBI taxid given, add it to arguments to download genomes, and write it to info message
+    if NCBI_taxid:
+        keyargs["species_taxid"] = NCBI_taxid
+        if NCBI_species:
+            message += f" (NCBI_taxid = {NCBI_taxid})."
+        else:
+            message += f" NCBI_taxid = {NCBI_taxid}"
+    logger.info(f"Metadata for all genomes will be saved in {sumfile}")
+    logger.info(message)
+
+    # Download genomes
+    max_retries = 15 # If connection to NCBI fails, how many retry downloads must be done
+    error_message = ("Could not download genomes. Check that you gave valid NCBI taxid and/or "
+                     "NCBI species name. If you gave both, check that given taxID and name really "
+                     "correspond to the same species.")
    try:
+        # Download genomes
        ret = ngd.download(**keyargs)
    except:
-        logger.error("Could not download species taxID {}. Check that you gave the good "
-                     "one.".format(NCBI_taxid))
+        # Error message if crash during execution of ncbi_genome_download
+        logger.error(error_message)
        sys.exit(1)
    attempts = 0
    while ret == 75 and attempts < max_retries:
@@ -64,6 +88,12 @@ def download_from_refseq(sum_file, NCBI_species, NCBI_taxid, outdir, threads):
        logging.error(('Downloading from NCBI failed due to a connection error, '
                       'retrying. Already retried so far: %s'), attempts)
        ret = ngd.download(**keyargs)
+    # Message if NGD did not manage to download the genomes (wrong species name/taxid)
+    if ret != 0:
+        # Error message
+        logger.error(error_message)
+        sys.exit(1)
+    sys.exit(1)
    nb_gen, db_dir = to_database(outdir)
    logger.info("Downloaded {} genomes.".format(nb_gen))
    return db_dir
@@ -74,10 +104,20 @@ def download_summary(species_linked, outdir):
    Get assembly_summary file for the given species if it exists. To be able to download it,
    the given NCBI species name must be exactly as the name given on NCBI website.

-    species_linked : given NCBI species with '_' instead of spaces, or NCBI taxID if species
+    Parameters
+    ----------
+    species_linked : str
+        given NCBI species with '_' instead of spaces, or NCBI taxID if species
        name not given (then, assembly file won't be found)
-
-    outdir: directory where downloaded assembly file must be saved
+    outdir : str
+        Directory where summary file must be saved
+    logger : logging.Logger
+        log object to add information
+
+    Returns
+    -------
+    str :
+        Output filename of downloaded summary
    """
    logger.info("Retrieving assembly_summary file for {}".format(species_linked))
    url = ("ftp://ftp.ncbi.nih.gov/genomes/refseq/"
@@ -86,9 +126,9 @@ def download_summary(species_linked, outdir):
    try:
        urllib.request.urlretrieve(url, outfile)
    except:
-        logger.warning("assembly_summary file cannot be downloaded. Please check that you "
-                       "provided the exact species name, as given in NCBI")
-        return
+        logger.warning(f"assembly_summary file for {species_linked} cannot be downloaded. "
+                        "Please check that you provided the exact species name, as given in NCBI")
+        return ""
    return outfile