Skip to content
Snippets Groups Projects
Commit ce82a89d authored by Amandine  PERRIN's avatar Amandine PERRIN
Browse files

Start functional tests for prepare module

parent d088a2f6
No related branches found
No related tags found
No related merge requests found
Pipeline #39685 passed
......@@ -66,7 +66,7 @@ def main_from_parse(arguments):
"""
cmd = "PanACoTA " + ' '.join(arguments.argv)
main(cmd, arguments.NCBI_species, arguments.NCBI_species_taxid, arguments.level,
main(cmd, arguments.NCBI_species, arguments.NCBI_species_taxid, arguments.levels,
arguments.outdir, arguments.tmp_dir, arguments.parallel, arguments.no_refseq,
arguments.db_dir, arguments.only_mash,
arguments.from_info, arguments.l90, arguments.nbcont, arguments.cutn, arguments.min_dist,
......@@ -226,15 +226,10 @@ def main(cmd, NCBI_species, NCBI_taxid, levels, outdir, tmp_dir, threads, no_ref
"output folder called 'new_outdir', make sure you have "
"'-o new_outdir' option, "
"and you specified where the uncompressed sequences to "
"use are ('-d sequence_database_path' -> "
"my_outdir/Database_init). ")
"use are ('-d sequence_database_path'). ")
sys.exit(1)
# add genomes from refseq/bacteria folder to Database_init
nb_gen, _ = dgf.to_database(outdir)
# If no genome found, error -> nothing to analyse
if nb_gen == 0:
logger.error(f"There is no genome in {refseqdir}.")
sys.exit(1)
# No sequence: Do all steps -> download, QC, mash filter
else:
# Download all genomes of the given taxID
......
#!/usr/bin/env python3
# coding: utf-8
"""
Functional tests for the PanACoTA prepare subcommand
"""
from PanACoTA.subcommands import prepare
import test.test_unit.utilities_for_tests as tutil
import pytest
import os
import subprocess
import shutil
import time
import argparse
import logging
import glob
# LOGFILE_BASE = "test_main_from_parse"
# Define variables used by several tests
DBDIR = os.path.join("test", "data", "prepare")
GEN_PATH = os.path.join(DBDIR, "genomes")
TEST_DIR = os.path.join(DBDIR, 'test_files')
GENEPATH = os.path.join(DBDIR, "generated_by_func-tests")
@pytest.fixture(autouse=True)
def setup_teardown_module():
    """
    Create the output directory before each test, and remove it afterwards.

    Before each test:
    - create the directory where the test writes its generated files (GENEPATH)

    After each test:
    - remove that directory, together with everything the test generated in it
    """
    # exist_ok avoids the check-then-create race of isdir() + mkdir(), and
    # makedirs also creates missing parent folders instead of failing.
    os.makedirs(GENEPATH, exist_ok=True)
    print("setup")

    yield
    # ignore_errors: a test may already have removed (part of) the directory
    shutil.rmtree(GENEPATH, ignore_errors=True)
    print("teardown")
def test_main_from_parse():
    """
    Run the prepare subcommand through main_from_parse, from a parsed-arguments object.

    The arguments give a species name and its taxid, write to GENEPATH, and use
    neither no_refseq nor only_mash. We then check the files and folders
    expected in the output directory.
    NOTE(review): presumably needs network access to download the genomes - confirm.
    """
    parsed = argparse.Namespace(
        argv=["prepare", "test_func_prepare"],
        NCBI_species="Acetobacter orleanensis",
        NCBI_species_taxid="104099",
        outdir=GENEPATH,
        tmp_dir="",
        parallel=1,
        no_refseq=False,
        db_dir="",
        only_mash=False,
        from_info="",
        l90=100,
        nbcont=999,
        cutn=0,
        min_dist=1e-4,
        max_dist=0.06,
        verbose=0,
        quiet=False,
        levels="",
    )
    prepare.main_from_parse(parsed)

    # Summary file named after the species must exist
    assert os.path.isfile(
        os.path.join(GENEPATH, "assembly_summary-Acetobacter_orleanensis.txt")
    )

    # Download directory must exist and contain at least 4 entries
    downloaded = os.path.join(GENEPATH, "refseq", "bacteria")
    assert os.path.isdir(downloaded)
    assert len(os.listdir(downloaded)) >= 4

    # Exactly 3 log files in the output directory
    assert len(glob.glob(os.path.join(GENEPATH, "*log*"))) == 3

    # tmp_files folder exists but is empty (cutn = 0: no sequence split)
    assert len(os.listdir(os.path.join(GENEPATH, "tmp_files"))) == 0

    # Database_init folder contains at least 4 ".fna" genomes
    genomes = glob.glob(os.path.join(GENEPATH, "Database_init", "*.fna"))
    assert len(genomes) >= 4
def test_main_not_only_mash_infoexists():
    """
    Run main without only_mash, while giving an already existing (empty) LSTINFO file.

    The existing file is expected to be saved under the same name with a
    ".back" suffix, and to remain empty.
    """
    split_dir = os.path.join(GENEPATH, "temporary_directory")
    existing_info = os.path.join(GENEPATH, "LSTINFO-existing.lst")
    # Create an empty info file, to check that it gets renamed
    with open(existing_info, "w"):
        pass

    prepare.main(
        "cmd",
        "",             # NCBI_species
        "104099",       # NCBI_taxid
        "",             # levels
        GENEPATH,       # outdir
        split_dir,      # tmp_dir
        1,              # threads
        False,          # no_refseq
        "",             # db_dir
        False,          # only_mash
        existing_info,  # info_file
        100,            # l90
        999,            # nbcont
        5,              # cutn
        1e-4,           # min_dist
        0.06,           # max_dist
        2,              # verbose
        False,          # quiet
    )

    # Summary file named after the taxid must exist
    assert os.path.isfile(os.path.join(GENEPATH, "assembly_summary-104099.txt"))

    # Download directory must exist and contain at least 4 entries
    downloaded = os.path.join(GENEPATH, "refseq", "bacteria")
    assert os.path.isdir(downloaded)
    assert len(os.listdir(downloaded)) >= 4

    # Exactly 3 log files in the output directory
    assert len(glob.glob(os.path.join(GENEPATH, "*log*"))) == 3

    # Given tmp directory contains at least 4 split sequences (cutn = 5)
    split_genomes = glob.glob(os.path.join(split_dir, "*.fna_prepare-split5N.fna"))
    assert len(split_genomes) >= 4

    # Database_init folder contains at least 4 ".fna" genomes
    genomes = glob.glob(os.path.join(GENEPATH, "Database_init", "*.fna"))
    assert len(genomes) >= 4

    # The pre-existing LSTINFO file was renamed to ".back", and is still empty
    backup = existing_info + ".back"
    assert os.path.isfile(backup)
    assert os.stat(backup).st_size == 0
def test_main_wrong_taxid(capsys):
    """
    Run main with a taxid ("123") for which the genome download is expected to fail.

    main must exit (SystemExit) with an error message about the download, and
    must not leave a summary file, a download folder nor a Database_init folder.

    No outdir is given: results are expected in a folder named "123"
    (NOTE(review): presumably derived from the taxid by main - confirm).
    This folder is outside GENEPATH, so the test removes it itself.
    """
    NCBI_species = ""
    NCBI_taxid = "123"
    levels = ""
    outdir = ""
    res_outdir = "123"
    tmp_dir = os.path.join(res_outdir, "temporary_directory")
    threads = 1
    no_refseq = False
    info_file = ""
    db_dir = ""
    only_mash = False
    l90 = 100
    nbcont = 999
    cutn = 5
    min_dist = 1e-4
    max_dist = 0.06
    verbose = 2
    quiet = False
    # try/finally: res_outdir is not cleaned by the module fixture, so remove it
    # even when one of the assertions below fails.
    try:
        with pytest.raises(SystemExit):
            prepare.main("cmd", NCBI_species, NCBI_taxid, levels, outdir, tmp_dir, threads,
                         no_refseq, db_dir, only_mash, info_file, l90, nbcont, cutn,
                         min_dist, max_dist, verbose, quiet)
        _, err = capsys.readouterr()
        assert ("Could not download genomes. Check that you gave valid NCBI taxid and/or "
                "NCBI species name. If you gave both, check that given taxID and name really "
                "correspond to the same species.") in err
        # Check that no summary file was created for the taxid used in this run
        summary = os.path.join(res_outdir, f"assembly_summary-{NCBI_taxid}.txt")
        assert not os.path.isfile(summary)
        # Check that the download directory was not created
        ngd_outdir = os.path.join(res_outdir, "refseq", "bacteria")
        assert not os.path.isdir(ngd_outdir)
        # Check logfiles are here
        log_files = glob.glob(os.path.join(res_outdir, "*log*"))
        assert len(log_files) == 3
        # Check tmp files folder created, but empty as nothing is downloaded
        assert len(os.listdir(tmp_dir)) == 0
        # Check Database_init folder was not created
        assert not os.path.isdir(os.path.join(res_outdir, "Database_init"))
    finally:
        # Remove output directory
        shutil.rmtree(res_outdir, ignore_errors=True)
def test_main_norefseq_wrongdbpath(capsys):
    """
    Run main with option no_refseq, giving a db_dir ("dbdir") which does not exist.

    main must exit (SystemExit) with an error message about the missing database
    folder, and must not create a summary file, a download folder nor a
    Database_init folder.
    """
    NCBI_species = ""
    NCBI_taxid = "123"
    levels = ""
    outdir = GENEPATH
    tmp_dir = os.path.join(outdir, "temporary_directory")
    threads = 1
    no_refseq = True
    db_dir = "dbdir"
    only_mash = False
    l90 = 100
    nbcont = 999
    cutn = 5
    min_dist = 1e-4
    max_dist = 0.06
    verbose = 15
    quiet = False
    info_file = ""
    with pytest.raises(SystemExit):
        prepare.main("cmd", NCBI_species, NCBI_taxid, levels, outdir, tmp_dir, threads,
                     no_refseq, db_dir, only_mash, info_file, l90, nbcont, cutn,
                     min_dist, max_dist, verbose, quiet)
    _, err = capsys.readouterr()
    assert ("You asked to skip refseq downloads") in err
    assert ("Database folder dbdir supposed to contain fasta sequences does not exist. Please "
            "give a valid folder, or leave the default directory (no '-d' option)") in err
    # Check that no summary file was created for the taxid used in this run
    summary = os.path.join(GENEPATH, f"assembly_summary-{NCBI_taxid}.txt")
    assert not os.path.isfile(summary)
    # Check that the download directory was not created
    ngd_outdir = os.path.join(GENEPATH, "refseq", "bacteria")
    assert not os.path.isdir(ngd_outdir)
    # Check logfiles are here: 4 expected, with .log.debug as we put verbose = 15
    log_files = glob.glob(os.path.join(GENEPATH, "*log*"))
    assert len(log_files) == 4
    # Check tmp files folder created, but empty as nothing is downloaded
    assert len(os.listdir(tmp_dir)) == 0
    # Check Database_init folder was not created
    assert not os.path.isdir(os.path.join(GENEPATH, "Database_init"))
def test_main_norefseq_nodefault_dbdir_nor_refseq(capsys):
    """
    Run main with option no_refseq and no db_dir, in an output directory which
    contains neither the default Database_init folder nor the refseq/bacteria
    download folder.

    main must exit (SystemExit) with an error message explaining that there is
    no genome to analyse, and must not create a summary file, a download folder
    nor a Database_init folder.
    """
    NCBI_species = ""
    NCBI_taxid = "123"
    levels = ""
    outdir = GENEPATH
    tmp_dir = ""
    threads = 1
    no_refseq = True
    db_dir = ""
    only_mash = False
    l90 = 100
    nbcont = 999
    cutn = 5
    min_dist = 1e-4
    max_dist = 0.06
    verbose = 2
    quiet = False
    info_file = ""
    with pytest.raises(SystemExit):
        prepare.main("cmd", NCBI_species, NCBI_taxid, levels, outdir, tmp_dir, threads,
                     no_refseq, db_dir, only_mash, info_file, l90, nbcont, cutn,
                     min_dist, max_dist, verbose, quiet)
    _, err = capsys.readouterr()
    assert ("You asked to skip refseq downloads") in err
    assert ("Database folder test/data/prepare/generated_by_func-tests/Database_init supposed "
            "to contain fasta sequences does not exist. We will check if the download folder "
            "(with compressed sequences) exists.") in err
    assert ("Folder test/data/prepare/generated_by_func-tests/refseq/bacteria "
            "does not exist. You do not have any genome to analyse. Possible reasons:\n"
            "- if you want to rerun analysis in the same folder as "
            "sequences were downloaded (my_outdir/Database_init or "
            "my_outdir/refseq), make sure you have '-o my_outdir' option\n"
            "- if you want to rerun analysis and save them in a new "
            "output folder called 'new_outdir', make sure you have '-o new_outdir' option, "
            "and you specified where the uncompressed sequences to use are "
            "('-d sequence_database_path'") in err
    # Check that no summary file was created for the taxid used in this run
    summary = os.path.join(GENEPATH, f"assembly_summary-{NCBI_taxid}.txt")
    assert not os.path.isfile(summary)
    # Check that the download directory was not created
    ngd_outdir = os.path.join(GENEPATH, "refseq", "bacteria")
    assert not os.path.isdir(ngd_outdir)
    # Check logfiles are here
    log_files = glob.glob(os.path.join(GENEPATH, "*log*"))
    assert len(log_files) == 3
    # Check default tmp files folder created, but empty as nothing is downloaded
    assert len(os.listdir(os.path.join(GENEPATH, "tmp_files"))) == 0
    # Check Database_init folder was not created
    assert not os.path.isdir(os.path.join(GENEPATH, "Database_init"))
def test_main_norefseq_nodefault_dbdir_but_refseq(capsys):
    """
    Run main with option no_refseq and no db_dir, in an output directory which
    has no Database_init folder, but already contains a refseq folder.

    The run is expected to complete: the messages on stdout/stderr and the
    content of the output directory are checked.
    """
    # Copy the test refseq folder (and its content) into the output directory
    shutil.copytree(os.path.join(GEN_PATH, "refseq"), os.path.join(GENEPATH, "refseq"))

    prepare.main(
        "cmd",
        "",        # NCBI_species
        "123",     # NCBI_taxid
        "",        # levels
        GENEPATH,  # outdir
        "",        # tmp_dir
        1,         # threads
        True,      # no_refseq
        "",        # db_dir
        False,     # only_mash
        "",        # info_file
        100,       # l90
        999,       # nbcont
        0,         # cutn
        1e-4,      # min_dist
        0.06,      # max_dist
        2,         # verbose
        False,     # quiet
    )
    stdout, stderr = capsys.readouterr()

    # Messages expected on stderr
    assert "You asked to skip refseq downloads" in stderr
    missing_db_msg = ("Database folder test/data/prepare/generated_by_func-tests/"
                      "Database_init supposed "
                      "to contain fasta sequences does not exist. We will check if the "
                      "download folder "
                      "(with compressed sequences) exists.")
    assert missing_db_msg in stderr

    # Messages expected on stdout
    assert "Uncompressing genome files" in stdout
    assert "Total number of genomes for 123: 3" in stdout
    assert "Computing pairwise distances between all genomes" in stdout
    assert "Final number of genomes in dataset: 1" in stdout

    # The copied download directory is still there, with its 3 entries
    downloaded = os.path.join(GENEPATH, "refseq", "bacteria")
    assert os.path.isdir(downloaded)
    assert len(os.listdir(downloaded)) == 3

    # Exactly 3 log files in the output directory
    assert len(glob.glob(os.path.join(GENEPATH, "*log*"))) == 3

    # tmp_files folder exists but is empty (cutn = 0: no sequence split)
    assert len(os.listdir(os.path.join(GENEPATH, "tmp_files"))) == 0

    # Database_init folder created, with the 3 ".fna" genomes
    genomes = glob.glob(os.path.join(GENEPATH, "Database_init", "*.fna"))
    assert len(genomes) == 3
def test_main_norefseq_defaultdbdir(capsys):
    """
    Run main with option no_refseq and no db_dir, in an output directory which
    already contains the default Database_init folder, filled with sequences.

    The run is expected to complete: the messages on stdout/stderr and the
    content of the output directory are checked.
    """
    # Fill the default database folder of the output directory with test genomes
    shutil.copytree(os.path.join(GEN_PATH, "genomes_comparison"),
                    os.path.join(GENEPATH, "Database_init"))

    prepare.main(
        "cmd",
        "",        # NCBI_species
        "123",     # NCBI_taxid
        "",        # levels
        GENEPATH,  # outdir
        "",        # tmp_dir
        1,         # threads
        True,      # no_refseq
        "",        # db_dir
        False,     # only_mash
        "",        # info_file
        100,       # l90
        999,       # nbcont
        0,         # cutn
        1e-4,      # min_dist
        0.06,      # max_dist
        2,         # verbose
        False,     # quiet
    )
    stdout, stderr = capsys.readouterr()

    # Expected messages
    assert "You asked to skip refseq downloads" in stderr
    assert "Total number of genomes for 123: 5" in stdout
    assert "Computing pairwise distances between all genomes" in stdout
    assert "Final number of genomes in dataset: 1" in stdout

    # No download directory must have been created
    assert not os.path.isdir(os.path.join(GENEPATH, "refseq", "bacteria"))

    # Exactly 3 log files in the output directory
    assert len(glob.glob(os.path.join(GENEPATH, "*log*"))) == 3

    # tmp_files folder exists but is empty (cutn = 0: no sequence split)
    assert len(os.listdir(os.path.join(GENEPATH, "tmp_files"))) == 0

    # Database_init folder still contains the 5 ".fna" genomes
    genomes = glob.glob(os.path.join(GENEPATH, "Database_init", "*.fna"))
    assert len(genomes) == 5
def test_main_norefseq_givendbdir(capsys):
    """
    Run main with option no_refseq, giving an existing db_dir filled with
    sequences, and neither species name nor taxid.

    The run is expected to complete: the messages on stdout/stderr and the
    content of the output directory are checked.
    """
    # Copy the test genomes into the output directory, and use this copy as db_dir
    given_db = os.path.join(GENEPATH, "genomes_comparison")
    shutil.copytree(os.path.join(GEN_PATH, "genomes_comparison"), given_db)

    prepare.main(
        "cmd",
        "",        # NCBI_species
        "",        # NCBI_taxid
        "",        # levels
        GENEPATH,  # outdir
        "",        # tmp_dir
        1,         # threads
        True,      # no_refseq
        given_db,  # db_dir
        False,     # only_mash
        "",        # info_file
        100,       # l90
        999,       # nbcont
        2,         # cutn
        1e-4,      # min_dist
        0.06,      # max_dist
        2,         # verbose
        False,     # quiet
    )
    stdout, stderr = capsys.readouterr()

    # Expected messages ("NA" is reported as no species nor taxid was given)
    assert "You asked to skip refseq downloads" in stderr
    assert "Total number of genomes for NA: 5" in stdout
    assert "Computing pairwise distances between all genomes" in stdout
    assert "Final number of genomes in dataset: 1" in stdout

    # No download directory must have been created
    assert not os.path.isdir(os.path.join(GENEPATH, "refseq", "bacteria"))

    # Exactly 3 log files in the output directory
    assert len(glob.glob(os.path.join(GENEPATH, "*log*"))) == 3

    # tmp_files folder contains the 5 split sequences (cutn = 2)
    split_genomes = glob.glob(
        os.path.join(GENEPATH, "tmp_files", "*.fna_prepare-split2N.fna")
    )
    assert len(split_genomes) == 5
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment