Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
P
panacota
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Amandine PERRIN
panacota
Commits
ce82a89d
Commit
ce82a89d
authored
4 years ago
by
Amandine PERRIN
Browse files
Options
Downloads
Patches
Plain Diff
Start functional tests for prepare module
parent
d088a2f6
No related branches found
No related tags found
No related merge requests found
Pipeline
#39685
passed
4 years ago
Stage: test
Stage: coverage
Changes
2
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
PanACoTA/subcommands/prepare.py
+2
-7
2 additions, 7 deletions
PanACoTA/subcommands/prepare.py
test/test_functional/test_prepare.py
+442
-0
442 additions, 0 deletions
test/test_functional/test_prepare.py
with
444 additions
and
7 deletions
PanACoTA/subcommands/prepare.py
+
2
−
7
View file @
ce82a89d
...
...
@@ -66,7 +66,7 @@ def main_from_parse(arguments):
"""
cmd
=
"
PanACoTA
"
+
'
'
.
join
(
arguments
.
argv
)
main
(
cmd
,
arguments
.
NCBI_species
,
arguments
.
NCBI_species_taxid
,
arguments
.
level
,
main
(
cmd
,
arguments
.
NCBI_species
,
arguments
.
NCBI_species_taxid
,
arguments
.
level
s
,
arguments
.
outdir
,
arguments
.
tmp_dir
,
arguments
.
parallel
,
arguments
.
no_refseq
,
arguments
.
db_dir
,
arguments
.
only_mash
,
arguments
.
from_info
,
arguments
.
l90
,
arguments
.
nbcont
,
arguments
.
cutn
,
arguments
.
min_dist
,
...
...
@@ -226,15 +226,10 @@ def main(cmd, NCBI_species, NCBI_taxid, levels, outdir, tmp_dir, threads, no_ref
"
output folder called
'
new_outdir
'
, make sure you have
"
"'
-o new_outdir
'
option,
"
"
and you specified where the uncompressed sequences to
"
"
use are (
'
-d sequence_database_path
'
->
"
"
my_outdir/Database_init).
"
)
"
use are (
'
-d sequence_database_path
'
).
"
)
sys
.
exit
(
1
)
# add genomes from refseq/bacteria folder to Database_init
nb_gen
,
_
=
dgf
.
to_database
(
outdir
)
# If no genome found, error -> nothing to analyse
if
nb_gen
==
0
:
logger
.
error
(
f
"
There is no genome in
{
refseqdir
}
.
"
)
sys
.
exit
(
1
)
# No sequence: Do all steps -> download, QC, mash filter
else
:
# Download all genomes of the given taxID
...
...
This diff is collapsed.
Click to expand it.
test/test_functional/test_prepare.py
0 → 100644
+
442
−
0
View file @
ce82a89d
#!/usr/bin/env python3
# coding: utf-8
"""
Functional tests for PanACoTA prepare
"""
from
PanACoTA.subcommands
import
prepare
import
test.test_unit.utilities_for_tests
as
tutil
import
pytest
import
os
import
subprocess
import
shutil
import
time
import
argparse
import
logging
import
glob
# LOGFILE_BASE = "test_main_from_parse"
# Define variables used by several tests
# Root folder of the data used by the 'prepare' functional tests
DBDIR = os.path.join("test", "data", "prepare")
# Sub-folders of DBDIR: input genomes, test files, and the folder in which the
# tests write their results (created/removed around each test)
GEN_PATH, TEST_DIR, GENEPATH = (
    os.path.join(DBDIR, subdir)
    for subdir in ("genomes", "test_files", "generated_by_func-tests")
)
@pytest.fixture(autouse=True)
def setup_teardown_module():
    """
    Create the folder receiving generated files before each test, remove it afterwards.

    Before each test:
    - create directory (GENEPATH) to put generated files

    After each test:
    - remove this directory, with everything the test generated in it
    """
    # makedirs(exist_ok=True) replaces the former "isdir check, then mkdir": no
    # check-then-create race, and missing parent folders are created too.
    os.makedirs(GENEPATH, exist_ok=True)
    print("setup")

    yield
    # Best-effort cleanup: a folder that cannot be removed must not turn into an error
    shutil.rmtree(GENEPATH, ignore_errors=True)
    print("teardown")
def test_main_from_parse():
    """
    Run 'prepare' through main_from_parse, with arguments given as an argparse Namespace
    (species "Acetobacter orleanensis", taxid 104099, no sequence split), and check
    the files and folders generated in GENEPATH.
    """
    parsed = argparse.Namespace(
        argv=["prepare", "test_func_prepare"],
        NCBI_species="Acetobacter orleanensis",
        NCBI_species_taxid="104099",
        levels="",
        outdir=GENEPATH,
        tmp_dir="",
        parallel=1,
        no_refseq=False,
        db_dir="",
        only_mash=False,
        from_info="",
        l90=100,
        nbcont=999,
        cutn=0,
        min_dist=1e-4,
        max_dist=0.06,
        verbose=0,
        quiet=False,
    )
    prepare.main_from_parse(parsed)

    # Assembly summary file must be in the output folder
    summary_file = os.path.join(GENEPATH, "assembly_summary-Acetobacter_orleanensis.txt")
    assert os.path.isfile(summary_file)
    # The NCBI_genome_download output directory exists, and contains at least 4 entries
    refseq_bacteria = os.path.join(GENEPATH, "refseq", "bacteria")
    assert os.path.isdir(refseq_bacteria)
    assert len(os.listdir(refseq_bacteria)) >= 4
    # 3 log files expected
    found_logs = glob.glob(os.path.join(GENEPATH, "*log*"))
    assert len(found_logs) == 3
    # tmp_files folder created, but empty as we do not split (cutn = 0)
    assert not os.listdir(os.path.join(GENEPATH, "tmp_files"))
    # Database_init folder created, with at least 4 ".fna" genomes
    genomes = glob.glob(os.path.join(GENEPATH, "Database_init", "*.fna"))
    assert len(genomes) >= 4
def test_main_not_only_mash_infoexists():
    """
    We run without option only_mash, but still provide a lstinfo file
    -> the existing file is expected to be saved as '<file>.back' when the new
    one is created
    """
    split_dir = os.path.join(GENEPATH, "temporary_directory")
    # Create an empty info file, to check that it is renamed
    existing_info = os.path.join(GENEPATH, "LSTINFO-existing.lst")
    with open(existing_info, "w"):
        pass

    prepare.main(
        "cmd",
        "",             # NCBI_species
        "104099",       # NCBI_taxid
        "",             # levels
        GENEPATH,       # outdir
        split_dir,      # tmp_dir
        1,              # threads
        False,          # no_refseq
        "",             # db_dir
        False,          # only_mash
        existing_info,  # info_file
        100,            # l90
        999,            # nbcont
        5,              # cutn
        1e-4,           # min_dist
        0.06,           # max_dist
        2,              # verbose
        False,          # quiet
    )

    # Assembly summary file must be in the output folder
    assert os.path.isfile(os.path.join(GENEPATH, "assembly_summary-104099.txt"))
    # The NCBI_genome_download output directory exists, and contains at least 4 entries
    refseq_bacteria = os.path.join(GENEPATH, "refseq", "bacteria")
    assert os.path.isdir(refseq_bacteria)
    assert len(os.listdir(refseq_bacteria)) >= 4
    # 3 log files expected
    found_logs = glob.glob(os.path.join(GENEPATH, "*log*"))
    assert len(found_logs) == 3
    # Given tmp folder contains the split sequences (cutn = 5): at least 4 files
    split_files = glob.glob(os.path.join(split_dir, "*.fna_prepare-split5N.fna"))
    assert len(split_files) >= 4
    # Database_init folder created, with at least 4 ".fna" genomes
    genomes = glob.glob(os.path.join(GENEPATH, "Database_init", "*.fna"))
    assert len(genomes) >= 4
    # The already existing LSTINFO file was renamed (.back), and is still empty
    backup = existing_info + ".back"
    assert os.path.isfile(backup)
    assert os.stat(backup).st_size == 0
def test_main_wrong_taxid(capsys):
    """
    We run with a taxid (123) for which genomes cannot be downloaded, and without
    giving any output directory.
    -> exit with an error message. Only log files and an empty tmp folder are expected.

    The test expects results in folder "123" (outdir is left empty). This folder is
    removed at the end of the test, whether the checks passed or not.
    """
    NCBI_species = ""
    NCBI_taxid = "123"
    levels = ""
    outdir = ""
    tmp_dir = os.path.join("123", "temporary_directory")
    threads = 1
    no_refseq = False
    info_file = ""
    db_dir = ""
    only_mash = False
    l90 = 100
    nbcont = 999
    cutn = 5
    min_dist = 1e-4
    max_dist = 0.06
    verbose = 2
    quiet = False
    res_outdir = "123"
    # try/finally: the output directory is created in the current working directory,
    # so it must be removed even when one of the assertions below fails.
    try:
        with pytest.raises(SystemExit):
            prepare.main("cmd", NCBI_species, NCBI_taxid, levels, outdir, tmp_dir, threads,
                         no_refseq, db_dir, only_mash, info_file, l90, nbcont, cutn,
                         min_dist, max_dist, verbose, quiet)
        _, err = capsys.readouterr()
        assert ("Could not download genomes. Check that you gave valid NCBI taxid and/or "
                "NCBI species name. If you gave both, check that given taxID and name really "
                "correspond to the same species.") in err
        # Check that no summary file nor download folder was created
        summary = os.path.join(res_outdir, "assembly_summary-104099.txt")
        assert not os.path.isfile(summary)
        ngd_outdir = os.path.join(res_outdir, "refseq", "bacteria")
        assert not os.path.isdir(ngd_outdir)
        # Check logfiles are here
        log_files = glob.glob(os.path.join(res_outdir, "*log*"))
        assert len(log_files) == 3
        # Check tmp files folder created, but empty as nothing is downloaded
        assert len(os.listdir(tmp_dir)) == 0
        # Check Database_init folder was not created
        assert not os.path.isdir(os.path.join(res_outdir, "Database_init"))
    finally:
        # Remove output directory
        shutil.rmtree(res_outdir, ignore_errors=True)
def test_main_norefseq_wrongdbpath(capsys):
    """
    We run with option no_refseq, but the given db_dir ("dbdir") does not exist.
    -> exit with an error message
    """
    split_dir = os.path.join(GENEPATH, "temporary_directory")
    with pytest.raises(SystemExit):
        prepare.main(
            "cmd",
            "",         # NCBI_species
            "123",      # NCBI_taxid
            "",         # levels
            GENEPATH,   # outdir
            split_dir,  # tmp_dir
            1,          # threads
            True,       # no_refseq
            "dbdir",    # db_dir
            False,      # only_mash
            "",         # info_file
            100,        # l90
            999,        # nbcont
            5,          # cutn
            1e-4,       # min_dist
            0.06,       # max_dist
            15,         # verbose
            False,      # quiet
        )
    _, stderr = capsys.readouterr()
    assert "You asked to skip refseq downloads" in stderr
    assert ("Database folder dbdir supposed to contain fasta sequences does not exist. Please "
            "give a valid folder, or leave the default directory (no '-d' option)") in stderr

    # No summary file, no download folder
    assert not os.path.isfile(os.path.join(GENEPATH, "assembly_summary-104099.txt"))
    assert not os.path.isdir(os.path.join(GENEPATH, "refseq", "bacteria"))
    # 4 log files expected: .log.debug is added as we put verbose = 15
    found_logs = glob.glob(os.path.join(GENEPATH, "*log*"))
    assert len(found_logs) == 4
    # tmp folder created, but empty as nothing is downloaded
    assert not os.listdir(split_dir)
    # Database_init folder was not created
    assert not os.path.isdir(os.path.join(GENEPATH, "Database_init"))
def test_main_norefseq_nodefault_dbdir_nor_refseq(capsys):
    """
    We run with option no_refseq, without giving any db_dir. The default database
    folder (Database_init) does not exist in outdir, and neither does refseq/bacteria.
    -> exit with an error message
    """
    with pytest.raises(SystemExit):
        prepare.main(
            "cmd",
            "",        # NCBI_species
            "123",     # NCBI_taxid
            "",        # levels
            GENEPATH,  # outdir
            "",        # tmp_dir
            1,         # threads
            True,      # no_refseq
            "",        # db_dir
            False,     # only_mash
            "",        # info_file
            100,       # l90
            999,       # nbcont
            5,         # cutn
            1e-4,      # min_dist
            0.06,      # max_dist
            2,         # verbose
            False,     # quiet
        )
    _, stderr = capsys.readouterr()
    assert "You asked to skip refseq downloads" in stderr
    assert ("Database folder test/data/prepare/generated_by_func-tests/Database_init supposed "
            "to contain fasta sequences does not exist. We will check if the download folder "
            "(with compressed sequences) exists.") in stderr
    assert ("Folder test/data/prepare/generated_by_func-tests/refseq/bacteria "
            "does not exist. You do not have any genome to analyse. Possible reasons:\n"
            "- if you want to rerun analysis in the same folder as "
            "sequences were downloaded (my_outdir/Database_init or "
            "my_outdir/refseq), make sure you have '-o my_outdir' option\n"
            "- if you want to rerun analysis and save them in a new "
            "output folder called 'new_outdir', make sure you have '-o new_outdir' option, "
            "and you specified where the uncompressed sequences to use are "
            "('-d sequence_database_path'") in stderr

    # No summary file, no download folder
    assert not os.path.isfile(os.path.join(GENEPATH, "assembly_summary-104099.txt"))
    assert not os.path.isdir(os.path.join(GENEPATH, "refseq", "bacteria"))
    # 3 log files expected
    found_logs = glob.glob(os.path.join(GENEPATH, "*log*"))
    assert len(found_logs) == 3
    # tmp_files folder created, but empty as there is nothing to analyse
    assert not os.listdir(os.path.join(GENEPATH, "tmp_files"))
    # Database_init folder was not created
    assert not os.path.isdir(os.path.join(GENEPATH, "Database_init"))
def test_main_norefseq_nodefault_dbdir_but_refseq(capsys):
    """
    We run with option no_refseq, without giving any db_dir. The default database
    folder (Database_init) does not exist, but outdir contains a 'refseq' folder.
    -> genome files found there are uncompressed, and the analysis is run on them
    """
    # Copy the test 'refseq' folder and its content into the output directory
    shutil.copytree(os.path.join(GEN_PATH, "refseq"), os.path.join(GENEPATH, "refseq"))

    prepare.main(
        "cmd",
        "",        # NCBI_species
        "123",     # NCBI_taxid
        "",        # levels
        GENEPATH,  # outdir
        "",        # tmp_dir
        1,         # threads
        True,      # no_refseq
        "",        # db_dir
        False,     # only_mash
        "",        # info_file
        100,       # l90
        999,       # nbcont
        0,         # cutn
        1e-4,      # min_dist
        0.06,      # max_dist
        2,         # verbose
        False,     # quiet
    )
    stdout, stderr = capsys.readouterr()
    assert "You asked to skip refseq downloads" in stderr
    assert ("Database folder test/data/prepare/generated_by_func-tests/"
            "Database_init supposed "
            "to contain fasta sequences does not exist. We will check if the download folder "
            "(with compressed sequences) exists.") in stderr
    assert "Uncompressing genome files" in stdout
    assert "Total number of genomes for 123: 3" in stdout
    assert "Computing pairwise distances between all genomes" in stdout
    assert "Final number of genomes in dataset: 1" in stdout

    # The refseq/bacteria folder (copied before the run) is still there, with its 3 entries
    refseq_bacteria = os.path.join(GENEPATH, "refseq", "bacteria")
    assert os.path.isdir(refseq_bacteria)
    assert len(os.listdir(refseq_bacteria)) == 3
    # 3 log files expected
    found_logs = glob.glob(os.path.join(GENEPATH, "*log*"))
    assert len(found_logs) == 3
    # tmp_files folder created, but empty as we do not split (cutn = 0)
    assert not os.listdir(os.path.join(GENEPATH, "tmp_files"))
    # Database_init folder created, with the 3 ".fna" genomes
    genomes = glob.glob(os.path.join(GENEPATH, "Database_init", "*.fna"))
    assert len(genomes) == 3
def test_main_norefseq_defaultdbdir(capsys):
    """
    We run with option no_refseq, without giving any db_dir, while the default
    database folder (Database_init) already exists in outdir, with 5 genomes.
    -> the analysis is run on those genomes
    """
    # Copy the genomes to compare into the default database folder of the output directory
    shutil.copytree(os.path.join(GEN_PATH, "genomes_comparison"),
                    os.path.join(GENEPATH, "Database_init"))

    prepare.main(
        "cmd",
        "",        # NCBI_species
        "123",     # NCBI_taxid
        "",        # levels
        GENEPATH,  # outdir
        "",        # tmp_dir
        1,         # threads
        True,      # no_refseq
        "",        # db_dir
        False,     # only_mash
        "",        # info_file
        100,       # l90
        999,       # nbcont
        0,         # cutn
        1e-4,      # min_dist
        0.06,      # max_dist
        2,         # verbose
        False,     # quiet
    )
    stdout, stderr = capsys.readouterr()
    assert "You asked to skip refseq downloads" in stderr
    assert "Total number of genomes for 123: 5" in stdout
    assert "Computing pairwise distances between all genomes" in stdout
    assert "Final number of genomes in dataset: 1" in stdout

    # No NCBI_genome_download output directory, as nothing was downloaded
    assert not os.path.isdir(os.path.join(GENEPATH, "refseq", "bacteria"))
    # 3 log files expected
    found_logs = glob.glob(os.path.join(GENEPATH, "*log*"))
    assert len(found_logs) == 3
    # tmp_files folder created, but empty as we do not split (cutn = 0)
    assert not os.listdir(os.path.join(GENEPATH, "tmp_files"))
    # Database_init folder still contains the 5 ".fna" genomes
    genomes = glob.glob(os.path.join(GENEPATH, "Database_init", "*.fna"))
    assert len(genomes) == 5
def test_main_norefseq_givendbdir(capsys):
    """
    We run with option no_refseq, giving an existing db_dir containing 5 genomes,
    no species name nor taxid, and asking to split sequences (cutn = 2).
    -> the analysis is run on the genomes of db_dir
    """
    # Copy the genomes to compare into the output directory, and use this copy as db_dir
    given_db = os.path.join(GENEPATH, "genomes_comparison")
    shutil.copytree(os.path.join(GEN_PATH, "genomes_comparison"), given_db)

    prepare.main(
        "cmd",
        "",        # NCBI_species
        "",        # NCBI_taxid
        "",        # levels
        GENEPATH,  # outdir
        "",        # tmp_dir
        1,         # threads
        True,      # no_refseq
        given_db,  # db_dir
        False,     # only_mash
        "",        # info_file
        100,       # l90
        999,       # nbcont
        2,         # cutn
        1e-4,      # min_dist
        0.06,      # max_dist
        2,         # verbose
        False,     # quiet
    )
    stdout, stderr = capsys.readouterr()
    assert "You asked to skip refseq downloads" in stderr
    assert "Total number of genomes for NA: 5" in stdout
    assert "Computing pairwise distances between all genomes" in stdout
    assert "Final number of genomes in dataset: 1" in stdout

    # No NCBI_genome_download output directory, as nothing was downloaded
    assert not os.path.isdir(os.path.join(GENEPATH, "refseq", "bacteria"))
    # 3 log files expected
    found_logs = glob.glob(os.path.join(GENEPATH, "*log*"))
    assert len(found_logs) == 3
    # tmp_files folder contains one split file (cutn = 2) per genome
    split_files = glob.glob(os.path.join(GENEPATH, "tmp_files", "*.fna_prepare-split2N.fna"))
    assert len(split_files) == 5
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment