Skip to content
Snippets Groups Projects
Select Git revision
  • f6510f0d1f15cce5dcf36b8f538183aad0639ccf
  • master default protected
  • dev
  • install
  • new_master
  • protein_ortho
  • documentation
  • pr18
  • dev-licence
  • docker
  • prodigal_train
  • containers
  • module_all
  • functional_tests
  • opti
  • helpers
  • v1.4.1
  • v1.4.0
  • v1.3.1
  • v1.3.0
  • v1.2.0
  • v1.1.0
  • v1.0.1
  • v1.0
24 results

test_download.py

Blame
  • test_download.py 13.32 KiB
    #!/usr/bin/env python3
    # coding: utf-8
    
    """
    Unit tests for the download_genomes_func submodule in prepare module
    """
    import os
    import logging
    import glob
    import shutil
    import pytest
    
    import PanACoTA.prepare_module.download_genomes_func as downg
    
    
    # Relative path to the data used by these tests: every test below builds its
    # input and output paths from this folder.
    DATA_TEST_DIR = os.path.join("test", "data", "prepare")
    
    
    def test_to_database():
        """
        Test that all fna.gz files are uncompressed and moved to a created Database_init folder

        The 3 genomes found in 'genomes/refseq/bacteria' must each give one '.fna' file in
        'genomes/Database_init', and to_database must return (3, path to Database_init).
        """
        out_dir = os.path.join(DATA_TEST_DIR, "genomes")
        db_dir = os.path.join(out_dir, "Database_init")
        try:
            nb_gen, db_init_dir = downg.to_database(out_dir)
            assert os.path.isdir(db_dir)
            files_all = glob.glob(os.path.join(db_dir, "*"))
            files_fna = glob.glob(os.path.join(db_dir, "*.fna"))
            # Check that the result database only contains .fna files
            assert len(files_all) == len(files_fna)
            # And that there are exactly 3 of them
            assert len(files_fna) == 3
            # Check that we have as many genomes as expected, and that the output database
            # has the expected name
            assert nb_gen == 3
            assert db_init_dir == db_dir
            # Each of the 3 genomes must have its own uncompressed file
            for genome in ("ACOR001", "ACOR002", "ACOR003"):
                assert os.path.isfile(os.path.join(db_dir, genome + ".0519.fna"))
        finally:
            # Remove database created, even if a check failed, so that the other tests
            # start from a clean 'genomes' folder
            shutil.rmtree(db_dir, ignore_errors=True)
    
    
    def test_to_database_nofolder_refseq(caplog):
        """
        to_database must stop when the folder expected to contain the genomes downloaded
        from refseq does not exist.

        It is called on DATA_TEST_DIR: the call must raise SystemExit, and the log must
        contain an error message naming the missing 'refseq/bacteria' folder.
        """
        caplog.set_level(logging.DEBUG)
        with pytest.raises(SystemExit):
            downg.to_database(DATA_TEST_DIR)

        logged = caplog.text
        expected_messages = (
            "ERROR",
            ("The folder containing genomes downloaded from NCBI refseq "
             "(test/data/prepare/refseq/bacteria) does not exist."),
            ("Check that you really downloaded sequences (fna.gz) and that they are "
             "in this folder"),
        )
        # Every expected piece of text must appear in the captured log
        for message in expected_messages:
            assert message in logged
    
    
    def test_to_database_nofolder_per_genome(caplog):
        """
        Test behavior when the folder refseq/bacteria exists, but there are no folders inside
        -> should exit with error message
        """
        empty_dir = os.path.join(DATA_TEST_DIR, "refseq", "bacteria")
        os.makedirs(empty_dir)
        try:
            caplog.set_level(logging.DEBUG)
            with pytest.raises(SystemExit):
                downg.to_database(DATA_TEST_DIR)
            # Check error message is as expected
            assert "ERROR" in caplog.text
            assert ("The folder supposed to contain genomes downloaded from NCBI refseq "
                    "(test/data/prepare/refseq/bacteria) exists but is empty") in caplog.text
            assert ("Check that you really downloaded sequences (fna.gz)") in caplog.text
        finally:
            # Remove files/folders specific to test. Done even when a check fails:
            # a leftover 'refseq' folder would make os.makedirs fail at the next run,
            # and would break the test expecting this folder to be missing.
            shutil.rmtree(os.path.join(DATA_TEST_DIR, "refseq"), ignore_errors=True)
    
    
    def test_to_database_1empty_genome_folder(caplog):
        """
        Test behavior when the folder refseq/bacteria exists, there are subfolders inside,
        but 1 of them is empty: warning message informing that this genome will be ignored
        """
        # Set capture level before running the tested function, otherwise it has no effect
        # on what is recorded
        caplog.set_level(logging.DEBUG)
        out_dir = os.path.join(DATA_TEST_DIR, "genomes")
        gz_genomes_folder = os.path.join(out_dir, "refseq", "bacteria")
        expected_db_dir = os.path.join(out_dir, "Database_init")

        # Empty 1 directory: move its file to 'out_dir'
        to_move_filename = "ACOR002.0519.fna.gz"  # File that must be moved
        to_empty_dir = "ACOR002"  # Directory containing file to move
        to_move_file = os.path.join(gz_genomes_folder, to_empty_dir, to_move_filename)
        saved_file = os.path.join(out_dir, to_move_filename)
        shutil.move(to_move_file, saved_file)

        try:
            # Run to_database
            nb_gen, db_dir = downg.to_database(out_dir)
            assert nb_gen == 2
            assert db_dir == expected_db_dir

            # Check that a warning message was raised, indicating that genome is ignored
            assert "WARNING" in caplog.text
            assert ("Problem with genome in ACOR002: no compressed fasta file downloaded. "
                    "This genome will be ignored.") in caplog.text
            assert not os.path.isfile(os.path.join(db_dir, "ACOR002.0519.fna"))
            assert os.path.isfile(os.path.join(db_dir, "ACOR001.0519.fna"))
            assert os.path.isfile(os.path.join(db_dir, "ACOR003.0519.fna"))
        finally:
            # Put the genome back in its folder and remove the database, even if a check
            # failed: the other tests rely on the 3 genomes being in place
            shutil.move(saved_file, to_move_file)
            shutil.rmtree(expected_db_dir, ignore_errors=True)
    
    
    def test_to_database_several_genomes(caplog):
        """
        Test behavior when the folder refseq/bacteria exists, there are subfolders inside,
        but 1 of them contains more than 1 genome: warning message informing that this
        genome will be ignored
        """
        # Set capture level before running the tested function, otherwise it has no effect
        # on what is recorded
        caplog.set_level(logging.DEBUG)
        out_dir = os.path.join(DATA_TEST_DIR, "genomes")
        gz_genomes_folder = os.path.join(out_dir, "refseq", "bacteria")
        expected_db_dir = os.path.join(out_dir, "Database_init")

        # Create a new gz file in one of the genome directories
        to_create_filename = "ACOR002.0519.bis.fna.gz"  # Name of file that must be created
        to_fill_dir = "ACOR002"  # Directory containing file to create
        to_create_path = os.path.join(gz_genomes_folder, to_fill_dir, to_create_filename)
        # Create empty gz file
        with open(to_create_path, "w"):
            pass

        try:
            # Run to_database, and check that only 2 genomes were considered
            nb_gen, db_dir = downg.to_database(out_dir)
            assert nb_gen == 2
            assert db_dir == expected_db_dir

            # Check that a warning message was raised, indicating that genome is ignored
            assert "WARNING" in caplog.text
            assert ("Problem with genome in ACOR002: several compressed fasta files found. "
                    "This genome will be ignored.") in caplog.text
            assert not os.path.isfile(os.path.join(db_dir, "ACOR002.0519.fna"))
            assert os.path.isfile(os.path.join(db_dir, "ACOR001.0519.fna"))
            assert os.path.isfile(os.path.join(db_dir, "ACOR003.0519.fna"))
        finally:
            # Remove test files/folders, even if a check failed: the extra gz file would
            # make ACOR002 be ignored in all other tests
            os.remove(to_create_path)
            shutil.rmtree(expected_db_dir, ignore_errors=True)
    
    
    def test_to_database_1genome_wrong_format(caplog):
        """
        Test behavior when the folder refseq/bacteria exists, there is 1 genome per subfolder,
        but 1 genome cannot be unzipped
        """
        # Set capture level before running the tested function, otherwise it has no effect
        # on what is recorded
        caplog.set_level(logging.DEBUG)
        out_dir = os.path.join(DATA_TEST_DIR, "genomes")
        gz_genomes_folder = os.path.join(out_dir, "refseq", "bacteria")
        expected_db_dir = os.path.join(out_dir, "Database_init")

        # Name of directory directly containing the original gz file
        to_corrupt_dir = "ACOR001"
        # Name of original gz file that must be moved to be saved
        to_empty_filename = "ACOR001.0519.fna.gz"
        # Complete path to this original gz file
        to_empty_path = os.path.join(gz_genomes_folder, to_corrupt_dir, to_empty_filename)
        # Where the real gz file is saved while the test runs
        saved_path = os.path.join(out_dir, to_empty_filename)
        # copy real gz genome file to outdir to save it, and create a fake one in place of it
        shutil.copy(to_empty_path, saved_path)

        try:
            # Create fake gz file (txt file)
            with open(to_empty_path, "w") as false_gz:
                false_gz.write("This is not a gz file")

            # Run to_database
            nb_gen, db_dir = downg.to_database(out_dir)
            assert nb_gen == 2
            assert db_dir == expected_db_dir

            # Check that a error message was raised, indicating that genome is ignored
            assert "ERROR" in caplog.text
            assert ("Error while trying to uncompress "
                    "test/data/prepare/genomes/Database_init/ACOR001.0519.fna.gz. "
                    "This genome will be ignored") in caplog.text
            # Check that there are only 2 files in the database, and that they correspond
            # to uncompressed gz files
            list_db = os.listdir(db_dir)
            assert len(list_db) == 2
            assert not os.path.isfile(os.path.join(db_dir, to_empty_filename))
            assert os.path.isfile(os.path.join(db_dir, "ACOR002.0519.fna"))
            assert os.path.isfile(os.path.join(db_dir, "ACOR003.0519.fna"))
        finally:
            # Put the real gz file back (overwriting the fake one) and remove the database,
            # even if a check failed: otherwise the test data would stay corrupted
            shutil.move(saved_path, to_empty_path)
            shutil.rmtree(expected_db_dir, ignore_errors=True)
    
    
    def test_download():
        """
        Test that, given a taxid, and a species name,
        it downloads genomes in .gz, and uncompress them in the
        db folder (which is named as expected)

        We cannot compare log, as it is already caught by NCBI_genome_download
        """
        species_linked = "Acetobacter_orleanensis"
        NCBI_species = "Acetobacter orleanensis"
        NCBI_taxid = "104099"
        outdir = os.path.join(DATA_TEST_DIR, "test_download_refseq")
        threads = 1
        levels = ""

        try:
            db_dir, nb_gen = downg.download_from_refseq(species_linked, NCBI_species,
                                                        NCBI_taxid, levels, outdir, threads)
            # Check path to uncompressed files is as expected
            assert db_dir == os.path.join(outdir, "Database_init")
            # Check number of genomes downloaded. We cannot know the exact value, as it is
            # updated everyday. But in nov. 2019, there are 4 genomes. So, there must be at
            # least those 4 genomes
            assert nb_gen >= 4
            # And that db_dir exists and contains nb_gen files
            assert os.path.isdir(db_dir)
            assert len(os.listdir(db_dir)) == nb_gen

            # Check that assembly summary file was downloaded as expected
            sum_file = os.path.join(outdir, "assembly_summary-Acetobacter_orleanensis.txt")
            assert os.path.isfile(sum_file)

            # Check that the NCBI_genome_download output directory exists
            ngd_outdir = os.path.join(outdir, "refseq", "bacteria")
            # And that it contains folders
            assert os.path.isdir(ngd_outdir)
            assert len(os.listdir(ngd_outdir)) >= 3
        finally:
            # Remove test output dir, even if the download or a check failed, so that
            # downloaded files do not pile up in the test data
            shutil.rmtree(outdir, ignore_errors=True)
    
    
    def test_download_noSpeName():
        """
        Test that, given a taxid, it downloads genomes in .gz, and uncompress them in the
        db folder (which is named as expected)

        We cannot compare log, as it is already caught by NCBI_genome_download
        """
        species_linked = "toto"
        NCBI_species = None
        NCBI_taxid = "104099"
        outdir = os.path.join(DATA_TEST_DIR, "test_download_refseq_noSpe")
        threads = 1
        levels = ""

        try:
            db_dir, nb_gen = downg.download_from_refseq(species_linked, NCBI_species,
                                                        NCBI_taxid, levels, outdir, threads)

            # Check path to uncompressed files is as expected
            assert db_dir == os.path.join(outdir, "Database_init")
            # Check number of genomes downloaded. We cannot know the exact value, as it is
            # updated everyday. But in nov. 2019, there are 4 genomes. So, there must be at
            # least those 4 genomes
            assert nb_gen >= 4
            # And that db_dir exists and contains nb_gen files
            assert os.path.isdir(db_dir)
            assert len(os.listdir(db_dir)) == nb_gen

            # Check that assembly summary file was downloaded as expected. As no species
            # name is given, the file is expected to be named after 'species_linked'
            sum_file = os.path.join(outdir, "assembly_summary-toto.txt")
            assert os.path.isfile(sum_file)

            # Check that the NCBI_genome_download output directory exists
            ngd_outdir = os.path.join(outdir, "refseq", "bacteria")
            # And that it contains folders
            assert os.path.isdir(ngd_outdir)
            assert len(os.listdir(ngd_outdir)) >= 3
        finally:
            # Remove test output dir, even if the download or a check failed, so that
            # downloaded files do not pile up in the test data
            shutil.rmtree(outdir, ignore_errors=True)
    
    
    def test_download_wrongTaxID(caplog):
        """
        Test that, when a non existing taxid is given, it exits (with error message)

        The test checks that nothing is created in the output directory, and that the
        final error message (asking to check the given taxid/species name) is logged.
        """
        # Set capture level before running the tested function, otherwise it has no effect
        # on what is recorded
        caplog.set_level(logging.DEBUG)
        species_linked = "Acetobacter_orleanensis"
        NCBI_species = None
        NCBI_taxid = "10409"
        outdir = os.path.join(DATA_TEST_DIR, "test_download_refseq_wrongTaxID")
        threads = 1
        levels = ""
        try:
            with pytest.raises(SystemExit):
                downg.download_from_refseq(species_linked, NCBI_species, NCBI_taxid, levels,
                                           outdir, threads)

            # Check path to uncompressed files does not exist
            assert not os.path.isdir(os.path.join(outdir, "Database_init"))

            # Check that the NCBI_genome_download output directory was not created
            ngd_outdir = os.path.join(outdir, "refseq", "bacteria")
            assert not os.path.isdir(ngd_outdir)

            # Check logs
            assert "ERROR" in caplog.text
            assert ("Could not download genomes. Check that you gave valid NCBI taxid and/or "
                    "NCBI species name. If you gave both, check that given taxID and name "
                    "really correspond to the same species.") in caplog.text

            # Check that output directory was not created
            assert not os.path.isdir(outdir)
        finally:
            # If the output directory was created anyway (test failure), remove it so
            # that the next run does not fail because of this leftover
            shutil.rmtree(outdir, ignore_errors=True)
    
    
    def test_download_diffSpeTaxID(caplog):
        """
        Test that, when a taxID and a species name are given, but those 2 elements do not
        match with the same genomes, it exits with error message

        The test checks that nothing is created in the output directory, and that the
        final error message (asking to check the given taxid/species name) is logged.
        """
        # Set capture level before running the tested function, otherwise it has no effect
        # on what is recorded
        caplog.set_level(logging.DEBUG)
        species_linked = "Acetobacter_orleanensis"
        NCBI_species = "Acetobacter fabarum"
        NCBI_taxid = "104099"
        # Output directory specific to this test, so that a directory left by another
        # test cannot make the 'not created' checks below fail
        outdir = os.path.join(DATA_TEST_DIR, "test_download_refseq_diffSpeTaxID")
        threads = 1
        levels = ""
        try:
            with pytest.raises(SystemExit):
                downg.download_from_refseq(species_linked, NCBI_species, NCBI_taxid, levels,
                                           outdir, threads)

            # Check path to uncompressed files does not exist
            assert not os.path.isdir(os.path.join(outdir, "Database_init"))

            # Check that the NCBI_genome_download output directory was not created
            ngd_outdir = os.path.join(outdir, "refseq", "bacteria")
            assert not os.path.isdir(ngd_outdir)

            # Check logs
            assert "ERROR" in caplog.text
            assert ("Could not download genomes. Check that you gave valid NCBI taxid and/or "
                    "NCBI species name. If you gave both, check that given taxID and name "
                    "really correspond to the same species.") in caplog.text

            # Check that output directory was not created
            assert not os.path.isdir(outdir)
        finally:
            # If the output directory was created anyway (test failure), remove it so
            # that the next run does not fail because of this leftover
            shutil.rmtree(outdir, ignore_errors=True)