Add possibility to choose assembly level while downloading genomes from NCBI

f6510f0d · Amandine PERRIN · 0b90c4b7 · f6510f0d · f6510f0d
Commit f6510f0d authored 4 years ago by Amandine PERRIN
--- a/PanACoTA/prepare_module/download_genomes_func.py
+++ b/PanACoTA/prepare_module/download_genomes_func.py
@@ -53,7 +53,7 @@ from PanACoTA import utils
 logger = logging.getLogger("prepare.dds")


-def download_from_refseq(species_linked, NCBI_species, NCBI_taxid, outdir, threads):
+def download_from_refseq(species_linked, NCBI_species, NCBI_taxid, levels, outdir, threads):
    """
    Download refseq genomes of given species

@@ -79,7 +79,7 @@ def download_from_refseq(species_linked, NCBI_species, NCBI_taxid, outdir, threa

    """
    # Name of summary file, with metadata for each strain:
-    sumfile = os.path.join(outdir, "assembly_summary-{}.txt".format(species_linked))
+    sumfile = os.path.join(outdir, f"assembly_summary-{species_linked}.txt")
    abs_sumfile = os.path.abspath(sumfile)

    # arguments needed to download all genomes of the given species
@@ -99,6 +99,10 @@ def download_from_refseq(species_linked, NCBI_species, NCBI_taxid, outdir, threa
            message += f" (NCBI_taxid = {NCBI_taxid})."
        else:
            message += f" NCBI_taxid = {NCBI_taxid}"
+    # If assembly level(s) are given, add them to the arguments and mention them in the info message
+    if levels:
+        keyargs["assembly_levels"] = levels
+        message += f" (Only those assembly levels: {levels}). "
    logger.info(f"Metadata for all genomes will be saved in {sumfile}")
    logger.info(message)


--- a/test/test_unit/test_prepare/test_download.py
+++ b/test/test_unit/test_prepare/test_download.py
@@ -204,9 +204,10 @@ def test_download():
    NCBI_taxid = "104099"
    outdir = os.path.join(DATA_TEST_DIR, "test_download_refseq")
    threads = 1
+    levels = ""

-    db_dir, nb_gen = downg.download_from_refseq(species_linked, NCBI_species, NCBI_taxid,
-                                        outdir, threads)
+    db_dir, nb_gen = downg.download_from_refseq(species_linked, NCBI_species, NCBI_taxid, levels,
+                                                outdir, threads)
    # Check path to uncompressed files is as expected
    assert db_dir == os.path.join(outdir, "Database_init")
    # Check number of genomes downloaded. We cannot know the exact value, as it is updated every day. But in Nov. 2019, there were 4 genomes, so there must be at least those 4 genomes.
@@ -241,8 +242,9 @@ def test_download_noSpeName():
    NCBI_taxid = "104099"
    outdir = os.path.join(DATA_TEST_DIR, "test_download_refseq_noSpe")
    threads = 1
+    levels = ""

-    db_dir, nb_gen = downg.download_from_refseq(species_linked, NCBI_species, NCBI_taxid,
+    db_dir, nb_gen = downg.download_from_refseq(species_linked, NCBI_species, NCBI_taxid, levels,
                                                outdir, threads)

    # Check path to uncompressed files is as expected
@@ -277,8 +279,9 @@ def test_download_wrongTaxID(caplog):
    NCBI_taxid = "10409"
    outdir = os.path.join(DATA_TEST_DIR, "test_download_refseq_wrongTaxID")
    threads = 1
+    levels = ""
    with pytest.raises(SystemExit):
-        downg.download_from_refseq(species_linked, NCBI_species, NCBI_taxid,
+        downg.download_from_refseq(species_linked, NCBI_species, NCBI_taxid, levels,
                                   outdir, threads)

    # Check path to uncompressed files does not exist
@@ -311,8 +314,9 @@ def test_download_diffSpeTaxID(caplog):
    NCBI_taxid = "104099"
    outdir = os.path.join(DATA_TEST_DIR, "test_download_refseq_wrongTaxID")
    threads = 1
+    levels = ""
    with pytest.raises(SystemExit):
-        downg.download_from_refseq(species_linked, NCBI_species, NCBI_taxid,
+        downg.download_from_refseq(species_linked, NCBI_species, NCBI_taxid, levels,
                                   outdir, threads)

    # Check path to uncompressed files does not exist