diff --git a/PanACoTA/subcommands/prepare.py b/PanACoTA/subcommands/prepare.py
index b88cd882e10c59265c39bb4cbdf2d9f80a573d05..135628d52eca8b7864bbed4b56c48796daf9b5bf 100644
--- a/PanACoTA/subcommands/prepare.py
+++ b/PanACoTA/subcommands/prepare.py
@@ -66,7 +66,7 @@ def main_from_parse(arguments):
 
     """
     cmd = "PanACoTA " + ' '.join(arguments.argv)
-    main(cmd, arguments.NCBI_species, arguments.NCBI_species_taxid, arguments.level,
+    main(cmd, arguments.NCBI_species, arguments.NCBI_species_taxid, arguments.levels,
          arguments.outdir, arguments.tmp_dir, arguments.parallel, arguments.no_refseq,
          arguments.db_dir, arguments.only_mash, arguments.from_info, arguments.l90,
          arguments.nbcont, arguments.cutn, arguments.min_dist,
@@ -226,15 +226,10 @@ def main(cmd, NCBI_species, NCBI_taxid, levels, outdir, tmp_dir, threads, no_ref
                                  "output folder called 'new_outdir', make sure you have "
                                  "'-o new_outdir' option, "
                                  "and you specified where the uncompressed sequences to "
-                                 "use are ('-d sequence_database_path' -> "
-                                 "my_outdir/Database_init). ")
+                                 "use are ('-d sequence_database_path'). ")
                     sys.exit(1)
                 # add genomes from refseq/bacteria folder to Database_init
                 nb_gen, _ = dgf.to_database(outdir)
-                # If no genome found, error -> nothing to analyse
-                if nb_gen == 0:
-                    logger.error(f"There is no genome in {refseqdir}.")
-                    sys.exit(1)
         # No sequence: Do all steps -> download, QC, mash filter
         else:
             # Download all genomes of the given taxID
diff --git a/test/test_functional/test_prepare.py b/test/test_functional/test_prepare.py
new file mode 100644
index 0000000000000000000000000000000000000000..449303017e69ac49c83566faabc290002a0b224e
--- /dev/null
+++ b/test/test_functional/test_prepare.py
@@ -0,0 +1,443 @@
+#!/usr/bin/env python3
+# coding: utf-8
+
+"""
+Functional tests for the PanACoTA prepare subcommand
+"""
+
+from PanACoTA.subcommands import prepare
+import test.test_unit.utilities_for_tests as tutil
+
+import pytest
+import os
+import subprocess
+import shutil
+import time
+import argparse
+import logging
+import glob
+
+
+# LOGFILE_BASE = "test_main_from_parse"
+# Define variables used by several tests
+DBDIR = os.path.join("test", "data", "prepare")
+GEN_PATH = os.path.join(DBDIR, "genomes")
+TEST_DIR = os.path.join(DBDIR, 'test_files')
+GENEPATH = os.path.join(DBDIR, "generated_by_func-tests")
+
+
+@pytest.fixture(autouse=True)
+def setup_teardown_module():
+    """
+    Create the output directory before each test, and remove it afterwards
+
+    Before each test:
+    - create directory to put generated files
+
+    After:
+    - remove directory with generated results (log files included)
+    """
+    if not os.path.isdir(GENEPATH):
+        print("setup")
+        os.mkdir(GENEPATH)
+    print("setup")
+
+    yield
+    shutil.rmtree(GENEPATH, ignore_errors=True)
+    print("teardown")
+
+
+def test_main_from_parse():
+    """
+    Run prepare through main_from_parse, giving both a species name and its taxid
+    """
+    args = argparse.Namespace()
+    args.argv = ["prepare", "test_func_prepare"]
+    args.NCBI_species = "Acetobacter orleanensis"
+    args.NCBI_species_taxid = "104099"
+    args.outdir = GENEPATH
+    args.tmp_dir = ""
+    args.parallel = 1
+    args.no_refseq = False
+    args.db_dir = ""
+    args.only_mash = False
+    args.from_info = ""
+    args.l90 = 100
+    args.nbcont = 999
+    args.cutn = 0
+    args.min_dist = 1e-4
+    args.max_dist = 0.06
+    args.verbose = 0
+    args.quiet = False
+    args.levels = ""
+
+    prepare.main_from_parse(args)
+
+    # Check output files
+    summary = os.path.join(GENEPATH, "assembly_summary-Acetobacter_orleanensis.txt")
+    assert os.path.isfile(summary)
+    # Check that the NCBI_genome_download output directory exists
+    ngd_outdir = os.path.join(GENEPATH, "refseq", "bacteria")
+    # And that it contains folders
+    assert os.path.isdir(ngd_outdir)
+    assert len(os.listdir(ngd_outdir)) >= 4
+    # Check logfiles are here
+    log_files = glob.glob(os.path.join(GENEPATH, "*log*"))
+    assert len(log_files) == 3
+    # Check tmp files folder created, but empty as we do not split (cutn = 0)
+    tmp_folder = os.listdir(os.path.join(GENEPATH, "tmp_files"))
+    assert len(tmp_folder) == 0
+    # Check Database_init folder created, with at least 4 ".fna" genomes
+    fna_files = glob.glob(os.path.join(GENEPATH, "Database_init", "*.fna"))
+    assert len(fna_files) >= 4
+
+
+def test_main_not_only_mash_infoexists():
+    """
+    We run without option only_mash, but still provide a lstinfo file
+    -> the existing file is renamed with a ".back" suffix, so that it is not overwritten
+    """
+    NCBI_species = ""
+    NCBI_taxid = "104099"
+    levels = ""
+    outdir = GENEPATH
+    tmp_dir = os.path.join(outdir, "temporary_directory")
+    threads = 1
+    no_refseq = False
+    db_dir = ""
+    only_mash = False
+    info_file = os.path.join(outdir, "LSTINFO-existing.lst")
+    open(info_file, "w").close()  # create empty info file, to check it is renamed
+    l90 = 100
+    nbcont = 999
+    cutn = 5
+    min_dist = 1e-4
+    max_dist = 0.06
+    verbose = 2
+    quiet = False
+    prepare.main("cmd", NCBI_species, NCBI_taxid, levels, outdir, tmp_dir, threads, no_refseq,
+                 db_dir, only_mash, info_file, l90, nbcont, cutn, min_dist, max_dist,
+                 verbose, quiet)
+
+    # Check output files
+    summary = os.path.join(GENEPATH, "assembly_summary-104099.txt")
+    assert os.path.isfile(summary)
+    # Check that the NCBI_genome_download output directory exists
+    ngd_outdir = os.path.join(GENEPATH, "refseq", "bacteria")
+    # And that it contains folders
+    assert os.path.isdir(ngd_outdir)
+    assert len(os.listdir(ngd_outdir)) >= 4
+    # Check logfiles are here
+    log_files = glob.glob(os.path.join(GENEPATH, "*log*"))
+    assert len(log_files) == 3
+    # Check given tmp folder contains the split sequences (cutn = 5)
+    tmp_files = glob.glob(os.path.join(tmp_dir, "*.fna_prepare-split5N.fna"))
+    assert len(tmp_files) >= 4
+    # Check Database_init folder created, with at least 4 ".fna" genomes
+    fna_files = glob.glob(os.path.join(GENEPATH, "Database_init", "*.fna"))
+    assert len(fna_files) >= 4
+    # Check that the existing LSTINFO file was renamed (".back" suffix)
+    # and is still empty
+    assert os.path.isfile(info_file + ".back")
+    assert os.stat(info_file + ".back").st_size == 0
+
+
+def test_main_wrong_taxid(capsys):
+    """
+    We give a taxid for which no genome can be downloaded, and no output directory
+    -> error message and exit. Results are expected in a folder named after the taxid
+    """
+    NCBI_species = ""
+    NCBI_taxid = "123"
+    levels = ""
+    outdir = ""
+    tmp_dir = os.path.join("123", "temporary_directory")
+    threads = 1
+    no_refseq = False
+    info_file = ""
+    db_dir = ""
+    only_mash = False
+    l90 = 100
+    nbcont = 999
+    cutn = 5
+    min_dist = 1e-4
+    max_dist = 0.06
+    verbose = 2
+    quiet = False
+    res_outdir = "123"
+    # The output folder is outside GENEPATH, so the autouse fixture does not remove it:
+    # use try/finally to delete it even when an assertion fails.
+    try:
+        with pytest.raises(SystemExit):
+            prepare.main("cmd", NCBI_species, NCBI_taxid, levels, outdir, tmp_dir, threads,
+                         no_refseq, db_dir, only_mash, info_file, l90, nbcont, cutn, min_dist,
+                         max_dist, verbose, quiet)
+        _, err = capsys.readouterr()
+        assert ("Could not download genomes. Check that you gave valid NCBI taxid and/or "
+                "NCBI species name. If you gave both, check that given taxID and name really "
+                "correspond to the same species.") in err
+        # Check output files: no summary file for the taxid used in this run
+        summary = os.path.join(res_outdir, f"assembly_summary-{NCBI_taxid}.txt")
+        assert not os.path.isfile(summary)
+        ngd_outdir = os.path.join(res_outdir, "refseq", "bacteria")
+        assert not os.path.isdir(ngd_outdir)
+        # Check logfiles are here
+        log_files = glob.glob(os.path.join(res_outdir, "*log*"))
+        assert len(log_files) == 3
+        # Check tmp files folder created, but empty as nothing is downloaded
+        assert len(os.listdir(tmp_dir)) == 0
+        # Check Database_init folder was not created
+        assert not os.path.isdir(os.path.join(res_outdir, "Database_init"))
+    finally:
+        # Remove output directory
+        shutil.rmtree(res_outdir, ignore_errors=True)
+
+
+def test_main_norefseq_wrongdbpath(capsys):
+    """
+    We run with option no_refseq, but given db_dir does not exist.
+    -> error message
+    """
+    NCBI_species = ""
+    NCBI_taxid = "123"
+    levels = ""
+    outdir = GENEPATH
+    tmp_dir = os.path.join(outdir, "temporary_directory")
+    threads = 1
+    no_refseq = True
+    db_dir = "dbdir"
+    only_mash = False
+    l90 = 100
+    nbcont = 999
+    cutn = 5
+    min_dist = 1e-4
+    max_dist = 0.06
+    verbose = 15
+    quiet = False
+    info_file = ""
+    with pytest.raises(SystemExit):
+        prepare.main("cmd", NCBI_species, NCBI_taxid, levels, outdir, tmp_dir, threads, no_refseq,
+                     db_dir, only_mash, info_file, l90, nbcont, cutn, min_dist, max_dist,
+                     verbose, quiet)
+    _, err = capsys.readouterr()
+    assert ("You asked to skip refseq downloads") in err
+    assert ("Database folder dbdir supposed to contain fasta sequences does not exist. Please "
+            "give a valid folder, or leave the default directory (no '-d' option)") in err
+    # Check output files: no summary file for the taxid used in this run
+    summary = os.path.join(GENEPATH, f"assembly_summary-{NCBI_taxid}.txt")
+    assert not os.path.isfile(summary)
+    ngd_outdir = os.path.join(GENEPATH, "refseq", "bacteria")
+    assert not os.path.isdir(ngd_outdir)
+    # Check logfiles are here
+    log_files = glob.glob(os.path.join(GENEPATH, "*log*"))
+    assert len(log_files) == 4  # .log.debug as we put verbose = 15
+    # Check tmp files folder created, but empty as nothing is downloaded
+    assert len(os.listdir(tmp_dir)) == 0
+    # Check Database_init folder was not created
+    assert not os.path.isdir(os.path.join(GENEPATH, "Database_init"))
+
+
+def test_main_norefseq_nodefault_dbdir_nor_refseq(capsys):
+    """
+    We run with option no_refseq and no db_dir. Neither the default Database_init folder
+    nor the refseq download folder exists in outdir -> error message
+    """
+    NCBI_species = ""
+    NCBI_taxid = "123"
+    levels = ""
+    outdir = GENEPATH
+    tmp_dir = ""
+    threads = 1
+    no_refseq = True
+    db_dir = ""
+    only_mash = False
+    l90 = 100
+    nbcont = 999
+    cutn = 5
+    min_dist = 1e-4
+    max_dist = 0.06
+    verbose = 2
+    quiet = False
+    info_file = ""
+    with pytest.raises(SystemExit):
+        prepare.main("cmd", NCBI_species, NCBI_taxid, levels, outdir, tmp_dir, threads, no_refseq,
+                     db_dir, only_mash, info_file, l90, nbcont, cutn, min_dist, max_dist,
+                     verbose, quiet)
+    _, err = capsys.readouterr()
+    assert ("You asked to skip refseq downloads") in err
+    assert ("Database folder test/data/prepare/generated_by_func-tests/Database_init supposed "
+            "to contain fasta sequences does not exist. We will check if the download folder "
+            "(with compressed sequences) exists.") in err
+    assert ("Folder test/data/prepare/generated_by_func-tests/refseq/bacteria "
+            "does not exist. You do not have any genome to analyse. Possible reasons:\n"
+            "- if you want to rerun analysis in the same folder as "
+            "sequences were downloaded (my_outdir/Database_init or "
+            "my_outdir/refseq), make sure you have '-o my_outdir' option\n"
+            "- if you want to rerun analysis and save them in a new "
+            "output folder called 'new_outdir', make sure you have '-o new_outdir' option, "
+            "and you specified where the uncompressed sequences to use are "
+            "('-d sequence_database_path'") in err
+    # Check output files: no summary file for the taxid used in this run
+    summary = os.path.join(GENEPATH, f"assembly_summary-{NCBI_taxid}.txt")
+    assert not os.path.isfile(summary)
+    ngd_outdir = os.path.join(GENEPATH, "refseq", "bacteria")
+    assert not os.path.isdir(ngd_outdir)
+    # Check logfiles are here
+    log_files = glob.glob(os.path.join(GENEPATH, "*log*"))
+    assert len(log_files) == 3
+    # Check tmp files folder created, but empty as nothing is downloaded
+    assert len(os.listdir(os.path.join(GENEPATH, "tmp_files"))) == 0
+    # Check Database_init folder was not created
+    assert not os.path.isdir(os.path.join(GENEPATH, "Database_init"))
+
+
+def test_main_norefseq_nodefault_dbdir_but_refseq(capsys):
+    """
+    We run with option no_refseq and no db_dir. Default Database_init folder does not exist,
+    but the refseq download folder does -> its genomes are uncompressed and analysed
+    """
+    NCBI_species = ""
+    NCBI_taxid = "123"
+    levels = ""
+    # Copy refseq/bacteria and content into outdirectory
+    outdir = GENEPATH
+    tmp_dir = ""
+    threads = 1
+    no_refseq = True
+    orig_dbdir = os.path.join(GEN_PATH, "refseq")
+    refseq_db_dir = os.path.join(GENEPATH, "refseq")
+    shutil.copytree(orig_dbdir, refseq_db_dir)
+    db_dir = ""
+    only_mash = False
+    l90 = 100
+    nbcont = 999
+    cutn = 0
+    min_dist = 1e-4
+    max_dist = 0.06
+    verbose = 2
+    quiet = False
+    info_file = ""
+    prepare.main("cmd", NCBI_species, NCBI_taxid, levels, outdir, tmp_dir, threads, no_refseq,
+                 db_dir, only_mash, info_file, l90, nbcont, cutn, min_dist, max_dist,
+                 verbose, quiet)
+    out, err = capsys.readouterr()
+    assert ("You asked to skip refseq downloads") in err
+    assert ("Database folder test/data/prepare/generated_by_func-tests/"
+            "Database_init supposed "
+            "to contain fasta sequences does not exist. We will check if the download folder "
+            "(with compressed sequences) exists.") in err
+    assert ("Uncompressing genome files") in out
+    assert ("Total number of genomes for 123: 3") in out
+    assert ("Computing pairwise distances between all genomes") in out
+    assert ("Final number of genomes in dataset: 1") in out
+    # Check output files
+    # Check that the NCBI_genome_download output directory exists
+    ngd_outdir = os.path.join(GENEPATH, "refseq", "bacteria")
+    # And that it contains folders
+    assert os.path.isdir(ngd_outdir)
+    assert len(os.listdir(ngd_outdir)) == 3
+    # Check logfiles are here
+    log_files = glob.glob(os.path.join(GENEPATH, "*log*"))
+    assert len(log_files) == 3
+    # Check tmp files folder created, but empty as we do not split (cutn = 0)
+    tmp_folder = os.listdir(os.path.join(GENEPATH, "tmp_files"))
+    assert len(tmp_folder) == 0
+    # Check Database_init folder created, with the 3 ".fna" genomes
+    fna_files = glob.glob(os.path.join(GENEPATH, "Database_init", "*.fna"))
+    assert len(fna_files) == 3
+
+
+def test_main_norefseq_defaultdbdir(capsys):
+    """
+    We run with option no_refseq and no db_dir. The default Database_init folder exists
+    in outdir -> its genomes are analysed, and there is no refseq download folder
+    """
+    NCBI_species = ""
+    NCBI_taxid = "123"
+    levels = ""
+    # Copy genomes into the default Database_init folder of outdirectory
+    outdir = GENEPATH
+    tmp_dir = ""
+    threads = 1
+    no_refseq = True
+    orig_dbdir = os.path.join(GEN_PATH, "genomes_comparison")
+    refseq_db_dir = os.path.join(GENEPATH, "Database_init")
+    shutil.copytree(orig_dbdir, refseq_db_dir)
+    db_dir = ""
+    only_mash = False
+    l90 = 100
+    nbcont = 999
+    cutn = 0
+    min_dist = 1e-4
+    max_dist = 0.06
+    verbose = 2
+    quiet = False
+    info_file = ""
+    prepare.main("cmd", NCBI_species, NCBI_taxid, levels, outdir, tmp_dir, threads, no_refseq,
+                 db_dir, only_mash, info_file, l90, nbcont, cutn, min_dist, max_dist,
+                 verbose, quiet)
+    out, err = capsys.readouterr()
+    assert ("You asked to skip refseq downloads") in err
+    assert ("Total number of genomes for 123: 5") in out
+    assert ("Computing pairwise distances between all genomes") in out
+    assert ("Final number of genomes in dataset: 1") in out
+    # Check output files
+    # Check that the NCBI_genome_download output directory does not exist
+    ngd_outdir = os.path.join(GENEPATH, "refseq", "bacteria")
+    assert not os.path.isdir(ngd_outdir)
+    # Check logfiles are here
+    log_files = glob.glob(os.path.join(GENEPATH, "*log*"))
+    assert len(log_files) == 3
+    # Check tmp files folder created, but empty as we do not split (cutn = 0)
+    tmp_folder = os.listdir(os.path.join(GENEPATH, "tmp_files"))
+    assert len(tmp_folder) == 0
+    # Check Database_init folder still contains the 5 ".fna" genomes
+    fna_files = glob.glob(os.path.join(GENEPATH, "Database_init", "*.fna"))
+    assert len(fna_files) == 5
+
+
+def test_main_norefseq_givendbdir(capsys):
+    """
+    We run with option no_refseq, and give an existing db_dir (no species name nor taxid)
+    -> genomes of this folder are analysed, and there is no refseq download folder
+    """
+    NCBI_species = ""
+    NCBI_taxid = ""
+    levels = ""
+    # Copy genomes into a folder of outdirectory, given as db_dir
+    outdir = GENEPATH
+    tmp_dir = ""
+    threads = 1
+    no_refseq = True
+    orig_dbdir = os.path.join(GEN_PATH, "genomes_comparison")
+    refseq_db_dir = os.path.join(GENEPATH, "genomes_comparison")
+    shutil.copytree(orig_dbdir, refseq_db_dir)
+    db_dir = refseq_db_dir
+    only_mash = False
+    l90 = 100
+    nbcont = 999
+    cutn = 2
+    min_dist = 1e-4
+    max_dist = 0.06
+    verbose = 2
+    quiet = False
+    info_file = ""
+    prepare.main("cmd", NCBI_species, NCBI_taxid, levels, outdir, tmp_dir, threads, no_refseq,
+                 db_dir, only_mash, info_file, l90, nbcont, cutn, min_dist, max_dist,
+                 verbose, quiet)
+    out, err = capsys.readouterr()
+    assert ("You asked to skip refseq downloads") in err
+    assert ("Total number of genomes for NA: 5") in out
+    assert ("Computing pairwise distances between all genomes") in out
+    assert ("Final number of genomes in dataset: 1") in out
+    # Check output files
+    # Check that the NCBI_genome_download output directory does not exist
+    ngd_outdir = os.path.join(GENEPATH, "refseq", "bacteria")
+    assert not os.path.isdir(ngd_outdir)
+    # Check logfiles are here
+    log_files = glob.glob(os.path.join(GENEPATH, "*log*"))
+    assert len(log_files) == 3
+    # Check tmp files folder contains the 5 split sequences (cutn = 2)
+    tmp_files = glob.glob(os.path.join(GENEPATH, "tmp_files", "*.fna_prepare-split2N.fna"))
+    assert len(tmp_files) == 5
+