#!/usr/bin/env python3
# coding: utf-8

"""
Functional tests for PanACoTA 'prepare' subcommand
"""

from PanACoTA.subcommands import prepare
import test.test_unit.utilities_for_tests as tutil

import pytest
import os
import subprocess
import shutil
import time
import argparse
import logging
import glob


# Define variables used by several tests
# Root folder of all data used by the 'prepare' tests
DBDIR = os.path.join("test", "data", "prepare")
# Input genomes (contains the "refseq" and "genomes_comparison" folders copied by some tests)
GEN_PATH = os.path.join(DBDIR, "genomes")
# Input files given to the tests (list of strains, lstinfo file)
TEST_DIR = os.path.join(DBDIR, 'test_files')
# Output folder: created before each test and removed after it by the autouse fixture
GENEPATH = os.path.join(DBDIR, "generated_by_func-tests")


@pytest.fixture(autouse=True)
def setup_teardown_module():
    """
    Create the output directory before each test, and remove it afterwards.

    Despite its name, this fixture uses the default (function) scope and is
    'autouse': it is run around every test of this module.

    Before each test:
    - create directory to put generated files (GENEPATH)

    After each test:
    - remove this directory with everything it contains (log files,
      generated results)
    """
    # makedirs + exist_ok: no error if the folder is still there from a
    # previous (interrupted) run, or if a parent folder is missing
    os.makedirs(GENEPATH, exist_ok=True)
    print("setup")

    yield
    shutil.rmtree(GENEPATH, ignore_errors=True)
    print("teardown")


def test_main_from_parse():
    """
    Run 'prepare' through main_from_parse (arguments given as an argparse
    Namespace), with a species name and a species taxid, in refseq section.
    Check that expected files and folders are generated in the output directory.
    """
    args = argparse.Namespace()
    args.argv = ["prepare", "test_func_prepare"]
    args.ncbi_species_name = "Acetobacter orleanensis"
    args.ncbi_species_taxid = "104099"
    args.ncbi_taxid = ""
    args.strains = ""
    args.ncbi_section = "refseq"
    args.outdir = GENEPATH
    args.tmp_dir = ""
    args.parallel = 1
    args.norefseq = False
    args.db_dir = ""
    args.only_mash = False
    args.info_file = ""
    args.l90 = 100
    args.nbcont = 999
    args.cutn = 0
    args.min_dist = 1e-4
    args.max_dist = 0.06
    args.verbose = 0
    args.quiet = False
    args.levels = ""

    prepare.main_from_parse(args)

    # Check output files
    summary = os.path.join(GENEPATH, "assembly_summary-Acetobacter_orleanensis.txt")
    assert os.path.isfile(summary)
    # Check that the NCBI_genome_download output directory exists
    ngd_outdir = os.path.join(GENEPATH, "refseq", "bacteria")
    # And that it contains folders
    assert os.path.isdir(ngd_outdir)
    assert len(os.listdir(ngd_outdir)) >= 4
    # Check logfiles are here
    log_files = glob.glob(os.path.join(GENEPATH, "*log*"))
    assert len(log_files) == 3
    # Check tmp files folder created, but empty as we do not split (cutn = 0)
    tmp_folder = os.listdir(os.path.join(GENEPATH, "tmp_files"))
    assert len(tmp_folder) == 0
    # Check Database_init folder created, with at least 4 ".fna" genomes
    fna_files = glob.glob(os.path.join(GENEPATH, "Database_init", "*.fna"))
    assert len(fna_files) >= 4


def test_main_from_parse_longspeciesname():
    """
    Run 'prepare' through main_from_parse, giving only a long species name
    (containing spaces and dots). Check that the summary file is named after
    this species name, with spaces replaced by '_', and that expected files and
    folders are generated.
    """
    args = argparse.Namespace()
    args.argv = ["prepare", "test_func_prepare"]
    args.ncbi_species_name = "Salmonella enterica subsp. enterica serovar Paratyphi C"
    args.ncbi_species_taxid = ""
    args.ncbi_taxid = ""
    args.strains = ""
    args.ncbi_section = "refseq"
    args.outdir = GENEPATH
    args.tmp_dir = ""
    args.parallel = 1
    args.norefseq = False
    args.db_dir = ""
    args.only_mash = False
    args.info_file = ""
    args.l90 = 100
    args.nbcont = 999
    args.cutn = 0
    args.min_dist = 1e-4
    args.max_dist = 0.06
    args.verbose = 0
    args.quiet = False
    args.levels = ""

    prepare.main_from_parse(args)

    # Check output files
    summary = os.path.join(
        GENEPATH,
        "assembly_summary-Salmonella_enterica_subsp._enterica_serovar_Paratyphi_C.txt")
    assert os.path.isfile(summary)
    # Check that the NCBI_genome_download output directory exists
    ngd_outdir = os.path.join(GENEPATH, "refseq", "bacteria")
    # And that it contains folders
    assert os.path.isdir(ngd_outdir)
    assert len(os.listdir(ngd_outdir)) >= 1
    # Check logfiles are here
    log_files = glob.glob(os.path.join(GENEPATH, "*log*"))
    assert len(log_files) == 3
    # Check tmp files folder created, but empty as we do not split (cutn = 0)
    tmp_folder = os.listdir(os.path.join(GENEPATH, "tmp_files"))
    assert len(tmp_folder) == 0
    # Check Database_init folder created, with at least 1 ".fna" genome
    fna_files = glob.glob(os.path.join(GENEPATH, "Database_init", "*.fna"))
    assert len(fna_files) >= 1


def test_main_only_strainname():
    """
    Only give strain names (no species taxid etc.). Check that they are downloaded,
    and that the summary file has the expected name.
    """
    NCBI_species_name = ""
    NCBI_species_taxid = ""
    NCBI_taxid = ""
    NCBI_section = "refseq"
    NCBI_strains = "AS001254,KPPR1,LMG 1583"
    levels = ""
    outdir = GENEPATH
    tmp_dir = os.path.join(outdir, 'tmp')
    threads = 1
    norefseq = False
    db_dir = ""
    only_mash = False
    info_file = ""
    l90 = 100
    nbcont = 999
    cutn = 5
    min_dist = 1e-4
    max_dist = 0.06
    verbose = 2
    quiet = False
    # main is expected to return the path to the filtered LSTINFO file,
    # named after the strain names joined by '_and_'
    out_info_file = os.path.join(
        outdir, "LSTINFO-AS001254_and_KPPR1_and_LMG_1583-filtered-0.0001_0.06.txt")
    assert prepare.main("cmd", NCBI_species_name, NCBI_species_taxid, NCBI_taxid,
                        NCBI_strains, levels, NCBI_section, outdir, tmp_dir,
                        threads, norefseq, db_dir, only_mash, info_file, l90, nbcont,
                        cutn, min_dist, max_dist, verbose, quiet) == out_info_file

    # Check output files
    summary = os.path.join(GENEPATH, "assembly_summary-AS001254_and_KPPR1_and_LMG_1583.txt")
    assert os.path.isfile(summary)
    # Check that the NCBI_genome_download output directory exists
    ngd_outdir = os.path.join(GENEPATH, "refseq", "bacteria")
    # And that it contains folders
    assert os.path.isdir(ngd_outdir)
    assert len(os.listdir(ngd_outdir)) == 3
    # Check logfiles are here
    log_files = glob.glob(os.path.join(GENEPATH, "*log*"))
    assert len(log_files) == 3
    # Check tmp files folder created, with the 3 strain files (split at each stretch of 5 'N')
    tmp_files = glob.glob(os.path.join(tmp_dir, "*.fna_prepare-split5N.fna"))
    assert len(tmp_files) == 3
    # Check Database_init folder created, with all 3 ".fna" genomes
    fna_files = glob.glob(os.path.join(GENEPATH, "Database_init", "*.fna"))
    assert len(fna_files) == 3


def test_main_only_strainname_file():
    """
    Only give strain names (no species taxid etc.), listed in a file instead of
    directly given as a string. Check that they are downloaded,
    and that the summary file is named after the given file.
    """
    NCBI_species_name = ""
    NCBI_species_taxid = ""
    NCBI_taxid = ""
    NCBI_section = "refseq"
    NCBI_strains = os.path.join(TEST_DIR, "test_list-strains.txt")
    levels = ""
    outdir = GENEPATH
    tmp_dir = os.path.join(outdir, 'tmp')
    threads = 1
    norefseq = False
    db_dir = ""
    only_mash = False
    info_file = ""
    l90 = 100
    nbcont = 999
    cutn = 5
    min_dist = 1e-4
    max_dist = 0.06
    verbose = 2
    quiet = False
    out_info_file = os.path.join(outdir, "LSTINFO-test_list-strains-filtered-0.0001_0.06.txt")
    assert prepare.main("cmd", NCBI_species_name, NCBI_species_taxid, NCBI_taxid,
                        NCBI_strains, levels, NCBI_section, outdir, tmp_dir,
                        threads, norefseq, db_dir, only_mash, info_file, l90, nbcont,
                        cutn, min_dist, max_dist, verbose, quiet) == out_info_file

    # Check output files
    summary = os.path.join(GENEPATH, "assembly_summary-test_list-strains.txt")
    assert os.path.isfile(summary)
    # Check that the NCBI_genome_download output directory exists
    ngd_outdir = os.path.join(GENEPATH, "refseq", "bacteria")
    # And that it contains folders
    assert os.path.isdir(ngd_outdir)
    # Number of genomes found for those strains can change over time:
    # only check that there are at least 3, and compare next counts to this one
    nbgenomes = len(os.listdir(ngd_outdir))
    assert nbgenomes >= 3
    # Check logfiles are here
    log_files = glob.glob(os.path.join(GENEPATH, "*log*"))
    # Printed to help debugging if one of the next asserts fails
    print(os.listdir(ngd_outdir))
    print(log_files)
    # Number of log files does not depend on the number of genomes: with
    # verbose = 2, test_main_only_strainname expects 3 log files
    assert len(log_files) == 3
    # Check tmp files folder created, with 1 split file per downloaded genome
    tmp_files = glob.glob(os.path.join(tmp_dir, "*.fna_prepare-split5N.fna"))
    assert len(tmp_files) == nbgenomes
    # Check Database_init folder created, with 1 ".fna" file per downloaded genome
    fna_files = glob.glob(os.path.join(GENEPATH, "Database_init", "*.fna"))
    assert len(fna_files) == nbgenomes


def test_main_not_only_mash_infoexists():
    """
    We run without option only_mash, but still provide a lstinfo file
    -> will change its name to .back to save it when the new file will be created
    """
    NCBI_species_name = ""
    NCBI_species_taxid = "104099"
    NCBI_taxid = ""
    NCBI_section = "refseq"
    NCBI_strains = ""
    levels = ""
    outdir = GENEPATH
    tmp_dir = os.path.join(outdir, "temporary_directory")
    threads = 1
    norefseq = False
    db_dir = ""
    only_mash = False
    info_file = os.path.join(outdir, "LSTINFO-existing.lst")
    # Create empty info file, to check it is renamed
    with open(info_file, "w"):
        pass
    l90 = 100
    nbcont = 999
    cutn = 5
    min_dist = 1e-4
    max_dist = 0.06
    verbose = 2
    quiet = False
    out_info_file = os.path.join(outdir, "LSTINFO-104099-filtered-0.0001_0.06.txt")
    assert prepare.main("cmd", NCBI_species_name, NCBI_species_taxid, NCBI_taxid,
                        NCBI_strains, levels, NCBI_section, outdir, tmp_dir,
                        threads, norefseq, db_dir, only_mash, info_file, l90, nbcont,
                        cutn, min_dist, max_dist, verbose, quiet) == out_info_file

    # Check output files
    summary = os.path.join(GENEPATH, "assembly_summary-104099.txt")
    assert os.path.isfile(summary)
    # Check that the NCBI_genome_download output directory exists
    ngd_outdir = os.path.join(GENEPATH, "refseq", "bacteria")
    # And that it contains folders
    assert os.path.isdir(ngd_outdir)
    assert len(os.listdir(ngd_outdir)) >= 4
    # Check logfiles are here
    log_files = glob.glob(os.path.join(GENEPATH, "*log*"))
    assert len(log_files) == 3
    # Check given tmp folder created, with at least 4 split genomes (cutn = 5)
    tmp_files = glob.glob(os.path.join(tmp_dir, "*.fna_prepare-split5N.fna"))
    assert len(tmp_files) >= 4
    # Check Database_init folder created, with at least 4 ".fna" genomes
    fna_files = glob.glob(os.path.join(GENEPATH, "Database_init", "*.fna"))
    assert len(fna_files) >= 4
    # Check that the existing LSTINFO file was renamed (.back) and is still empty
    assert os.path.isfile(info_file + ".back")
    assert os.stat(info_file + ".back").st_size == 0


def test_main_wrong_taxid(capsys):
    """
    We give an NCBI taxid for which no strain is found in the genbank section
    -> exits with an error message, and nothing is downloaded
    """
    NCBI_species_name = ""
    NCBI_taxid = "123"
    NCBI_species_taxid = ""
    NCBI_strains = ""
    NCBI_section = "genbank"
    levels = ""
    outdir = GENEPATH
    tmp_dir = os.path.join(GENEPATH, "123", "temporary_directory")
    threads = 1
    norefseq = False
    info_file = ""
    db_dir = ""
    only_mash = False
    l90 = 100
    nbcont = 999
    cutn = 5
    min_dist = 1e-4
    max_dist = 0.06
    verbose = 2
    quiet = False
    with pytest.raises(SystemExit):
        prepare.main("cmd", NCBI_species_name, NCBI_species_taxid, NCBI_taxid,
                     NCBI_strains, levels, NCBI_section,
                     outdir, tmp_dir, threads, norefseq,
                     db_dir, only_mash, info_file, l90, nbcont, cutn, min_dist, max_dist,
                     verbose, quiet)
    _, err = capsys.readouterr()
    assert ("No strain correspond to your request. If you are sure there should have "
            "some, check that you gave valid NCBI taxid and/or "
            "NCBI species name and/or NCBI strain name. If you gave several, check that "
            "given taxIDs and names are compatible.") in err
    # Check output files: no summary file, and no download folder
    summary = os.path.join(outdir, "assembly_summary-123.txt")
    assert not os.path.isfile(summary)
    ngd_outdir = os.path.join(outdir, "genbank", "bacteria")
    assert not os.path.isdir(ngd_outdir)
    # Check logfiles are here
    log_files = glob.glob(os.path.join(outdir, "*log*"))
    assert len(log_files) == 3
    # Check tmp files folder created, but empty as nothing is downloaded
    assert len(os.listdir(tmp_dir)) == 0
    # Check Database_init folder was not created
    assert not os.path.isdir(os.path.join(outdir, "Database_init"))


def test_main_norefseq_wrongdbpath(capsys):
    """
    We run with option norefseq, but given db_dir does not exist.
    -> error message
    """
    NCBI_species_name = ""
    NCBI_species_taxid = ""
    NCBI_taxid = ""
    NCBI_strains = ""
    NCBI_section = "refseq"
    levels = ""
    outdir = GENEPATH
    tmp_dir = os.path.join(outdir, "temporary_directory")
    threads = 1
    norefseq = True
    db_dir = "dbdir"  # this folder does not exist
    only_mash = False
    l90 = 100
    nbcont = 999
    cutn = 5
    min_dist = 1e-4
    max_dist = 0.06
    verbose = 15
    quiet = False
    info_file = ""
    with pytest.raises(SystemExit):
        prepare.main("cmd", NCBI_species_name, NCBI_species_taxid, NCBI_taxid,
                     NCBI_strains, levels, NCBI_section,
                     outdir, tmp_dir, threads, norefseq,
                     db_dir, only_mash, info_file, l90, nbcont, cutn, min_dist, max_dist,
                     verbose, quiet)
    _, err = capsys.readouterr()
    assert ("You asked to skip refseq downloads") in err
    assert ("Database folder dbdir supposed to contain fasta sequences does not exist. Please "
            "give a valid folder, or leave the default directory (no '-d' option)") in err
    # Check output files: no summary file, and no download folder
    summary = os.path.join(GENEPATH, "assembly_summary-123.txt")
    assert not os.path.isfile(summary)
    ngd_outdir = os.path.join(GENEPATH, "refseq", "bacteria")
    assert not os.path.isdir(ngd_outdir)
    # Check logfiles are here
    log_files = glob.glob(os.path.join(GENEPATH, "*log*"))
    assert len(log_files) == 4  # .log.debug as we put verbose = 15
    # Check tmp files folder created, but empty as nothing is downloaded
    assert len(os.listdir(tmp_dir)) == 0
    # Check Database_init folder was not created
    assert not os.path.isdir(os.path.join(GENEPATH, "Database_init"))


def test_main_norefseq_nodefault_dbdir_nor_refseq(capsys):
    """
    We run with option norefseq, without giving any db_dir. The default database
    folder (outdir/Database_init) does not exist, and neither does the download
    folder (outdir/genbank/bacteria).
    -> error message: no genome to analyse
    """
    NCBI_species_name = ""
    NCBI_species_taxid = ""
    NCBI_taxid = ""
    NCBI_strains = ""
    NCBI_section = "genbank"
    levels = ""
    outdir = GENEPATH
    tmp_dir = ""
    threads = 1
    norefseq = True
    db_dir = ""
    only_mash = False
    l90 = 100
    nbcont = 999
    cutn = 5
    min_dist = 1e-4
    max_dist = 0.06
    verbose = 2
    quiet = False
    info_file = ""
    with pytest.raises(SystemExit):
        prepare.main("cmd", NCBI_species_name, NCBI_species_taxid, NCBI_taxid,
                     NCBI_strains, levels,
                     NCBI_section, outdir, tmp_dir, threads, norefseq,
                     db_dir, only_mash, info_file, l90, nbcont, cutn, min_dist, max_dist,
                     verbose, quiet)
    _, err = capsys.readouterr()
    assert ("You asked to skip genbank downloads") in err
    assert ("Database folder test/data/prepare/generated_by_func-tests/Database_init supposed "
            "to contain fasta sequences does not exist. We will check if the download folder "
            "(with compressed sequences) exists.") in err
    assert ("Folder test/data/prepare/generated_by_func-tests/genbank/bacteria "
            "does not exist. You do not have any genome to analyse. Possible reasons:\n") in err
    assert ("- if you want to rerun analysis in the same folder as "
            "sequences were downloaded (my_outdir/Database_init or "
            "my_outdir/genbank), make sure you have '-o my_outdir' option\n") in err
    assert ("- if you want to rerun analysis and save them in a new "
            "output folder called 'new_outdir', make sure you have '-o new_outdir' option, "
            "and you specified where the uncompressed sequences to use are "
            "('-d sequence_database_path'") in err
    # Check output files: no summary file, and no download folder
    summary = os.path.join(GENEPATH, "assembly_summary-123.txt")
    assert not os.path.isfile(summary)
    ngd_outdir = os.path.join(GENEPATH, "genbank", "bacteria")
    assert not os.path.isdir(ngd_outdir)
    # Check logfiles are here
    log_files = glob.glob(os.path.join(GENEPATH, "*log*"))
    assert len(log_files) == 3
    # Check default tmp files folder created, but empty as nothing is downloaded
    assert len(os.listdir(os.path.join(GENEPATH, "tmp_files"))) == 0
    # Check Database_init folder was not created
    assert not os.path.isdir(os.path.join(GENEPATH, "Database_init"))


def test_main_norefseq_nodefault_dbdir_but_refseq(capsys):
    """
    We run with option norefseq, without giving any db_dir. The default database
    folder (outdir/Database_init) does not exist, but the download folder
    (outdir/genbank/bacteria, with compressed sequences) does.
    -> genomes are uncompressed from the download folder, and analysis is run
    """
    NCBI_species_name = ""
    NCBI_species_taxid = "123"
    NCBI_taxid = ""
    NCBI_strains = ""
    NCBI_section = "genbank"
    levels = ""
    outdir = GENEPATH
    tmp_dir = ""
    threads = 1
    norefseq = True
    # Copy test 'refseq' folder and its content into the output directory,
    # as 'genbank' folder (section used in this test)
    orig_dbdir = os.path.join(GEN_PATH, "refseq")
    refseq_db_dir = os.path.join(GENEPATH, "genbank")
    shutil.copytree(orig_dbdir, refseq_db_dir)
    db_dir = ""
    only_mash = False
    l90 = 100
    nbcont = 999
    cutn = 0
    min_dist = 1e-4
    max_dist = 0.06
    verbose = 2
    quiet = False
    info_file = ""
    out_info_file = os.path.join(outdir, "LSTINFO-123-filtered-0.0001_0.06.txt")
    assert prepare.main("cmd", NCBI_species_name, NCBI_species_taxid, NCBI_taxid,
                        NCBI_strains, levels,
                        NCBI_section, outdir, tmp_dir, threads,
                        norefseq, db_dir, only_mash, info_file, l90, nbcont, cutn, min_dist,
                        max_dist, verbose, quiet) == out_info_file
    out, err = capsys.readouterr()
    assert ("You asked to skip genbank downloads") in err
    assert ("Database folder test/data/prepare/generated_by_func-tests/"
            "Database_init supposed "
            "to contain fasta sequences does not exist. We will check if the download folder "
            "(with compressed sequences) exists.") in err
    assert ("Uncompressing genome files") in out
    assert ("Total number of genomes for 123: 3") in out
    assert ("Computing pairwise distances between all genomes") in out
    assert ("Final number of genomes in dataset: 1") in out
    # Check output files
    # Check that the download directory (copied at the beginning of the test) exists
    ngd_outdir = os.path.join(GENEPATH, "genbank", "bacteria")
    # And that it contains folders
    assert os.path.isdir(ngd_outdir)
    assert len(os.listdir(ngd_outdir)) == 3
    # Check logfiles are here
    log_files = glob.glob(os.path.join(GENEPATH, "*log*"))
    assert len(log_files) == 3
    # Check tmp files folder created, but empty as we do not split (cutn = 0)
    tmp_folder = os.listdir(os.path.join(GENEPATH, "tmp_files"))
    assert len(tmp_folder) == 0
    # Check Database_init folder created, with the 3 ".fna" genomes
    fna_files = glob.glob(os.path.join(GENEPATH, "Database_init", "*.fna"))
    assert len(fna_files) == 3


def test_main_norefseq_defaultdbdir(capsys):
    """
    We run with option norefseq, without giving any db_dir, and the default
    database folder (outdir/Database_init) exists and contains genomes.
    -> analysis is run on those genomes, nothing is downloaded
    """
    NCBI_species_name = ""
    NCBI_species_taxid = ""
    NCBI_taxid = ""
    NCBI_strains = ""
    NCBI_section = "refseq"
    levels = ""
    outdir = GENEPATH
    tmp_dir = ""
    threads = 1
    norefseq = True
    # Copy test genomes into the default database folder of the output directory
    orig_dbdir = os.path.join(GEN_PATH, "genomes_comparison")
    refseq_db_dir = os.path.join(GENEPATH, "Database_init")
    shutil.copytree(orig_dbdir, refseq_db_dir)
    db_dir = ""
    only_mash = False
    l90 = 100
    nbcont = 999
    cutn = 0
    min_dist = 1e-4
    max_dist = 0.06
    verbose = 2
    quiet = False
    info_file = ""
    # No species name nor taxid given: output file is named with 'NA'
    out_info_file = os.path.join(outdir, "LSTINFO-NA-filtered-0.0001_0.06.txt")
    assert prepare.main("cmd", NCBI_species_name, NCBI_species_taxid, NCBI_taxid,
                        NCBI_strains, levels,
                        NCBI_section, outdir, tmp_dir, threads,
                        norefseq, db_dir, only_mash, info_file, l90, nbcont, cutn, min_dist,
                        max_dist, verbose, quiet) == out_info_file
    out, err = capsys.readouterr()
    assert ("You asked to skip refseq downloads") in err
    assert ("Total number of genomes for NA: 5") in out
    assert ("Computing pairwise distances between all genomes") in out
    assert ("Final number of genomes in dataset: 1") in out
    # Check output files
    # Check that the NCBI_genome_download output directory does not exist
    ngd_outdir = os.path.join(GENEPATH, "refseq", "bacteria")
    assert not os.path.isdir(ngd_outdir)
    # Check logfiles are here
    log_files = glob.glob(os.path.join(GENEPATH, "*log*"))
    assert len(log_files) == 3
    # Check tmp files folder created, but empty as we do not split (cutn = 0)
    tmp_folder = os.listdir(os.path.join(GENEPATH, "tmp_files"))
    assert len(tmp_folder) == 0
    # Check Database_init folder still contains the 5 ".fna" genomes
    fna_files = glob.glob(os.path.join(GENEPATH, "Database_init", "*.fna"))
    assert len(fna_files) == 5


def test_main_norefseq_givendbdir(capsys):
    """
    We run with option norefseq, and give an existing db_dir containing genomes.
    -> analysis is run on those genomes, nothing is downloaded
    """
    NCBI_species_name = ""
    NCBI_species_taxid = ""
    NCBI_taxid = ""
    NCBI_strains = ""
    NCBI_section = "refseq"
    levels = ""
    outdir = GENEPATH
    tmp_dir = ""
    threads = 1
    norefseq = True
    # Copy test genomes into the output directory, and give this copy as db_dir
    orig_dbdir = os.path.join(GEN_PATH, "genomes_comparison")
    refseq_db_dir = os.path.join(GENEPATH, "genomes_comparison")
    shutil.copytree(orig_dbdir, refseq_db_dir)
    db_dir = refseq_db_dir
    only_mash = False
    l90 = 100
    nbcont = 999
    cutn = 2
    min_dist = 1e-4
    max_dist = 0.06
    verbose = 2
    quiet = False
    info_file = ""
    # No species name nor taxid given: output file is named with 'NA'
    out_info_file = os.path.join(outdir, "LSTINFO-NA-filtered-0.0001_0.06.txt")
    assert prepare.main("cmd", NCBI_species_name, NCBI_species_taxid, NCBI_taxid,
                        NCBI_strains, levels,
                        NCBI_section, outdir, tmp_dir, threads,
                        norefseq, db_dir, only_mash, info_file, l90, nbcont, cutn, min_dist,
                        max_dist, verbose, quiet) == out_info_file
    out, err = capsys.readouterr()
    assert ("You asked to skip refseq downloads") in err
    assert ("Total number of genomes for NA: 5") in out
    assert ("Computing pairwise distances between all genomes") in out
    assert ("Final number of genomes in dataset: 1") in out
    # Check output files
    # Check that the NCBI_genome_download output directory does not exist
    ngd_outdir = os.path.join(GENEPATH, "refseq", "bacteria")
    assert not os.path.isdir(ngd_outdir)
    # Check logfiles are here
    log_files = glob.glob(os.path.join(GENEPATH, "*log*"))
    assert len(log_files) == 3
    # Check tmp files folder created, with the 5 split genomes (cutn = 2)
    tmp_files = glob.glob(os.path.join(GENEPATH, "tmp_files", "*.fna_prepare-split2N.fna"))
    assert len(tmp_files) == 5

def test_only_mash(capsys):
    """
    Running only mash step (giving genomes and corresponding LSTINFO file)
    """
    NCBI_species_name = ""
    NCBI_species_taxid = ""
    NCBI_taxid = ""
    NCBI_strains = ""
    NCBI_section = "refseq"
    levels = ""
    outdir = GENEPATH
    tmp_dir = ""
    threads = 1
    norefseq = False
    db_dir = ""
    only_mash = True
    info_file = os.path.join(TEST_DIR, "test_lstinfo_onlymash.lst")
    l90 = 100
    nbcont = 999
    cutn = 5
    min_dist = 1e-4
    max_dist = 0.06
    verbose = 1
    quiet = False
    # No species name nor taxid given: output file is named with 'NA'
    out_info_file = os.path.join(outdir, "LSTINFO-NA-filtered-0.0001_0.06.txt")
    assert prepare.main("cmd", NCBI_species_name, NCBI_species_taxid, NCBI_taxid,
                        NCBI_strains, levels,
                        NCBI_section, outdir, tmp_dir, threads,
                        norefseq, db_dir, only_mash, info_file, l90, nbcont, cutn, min_dist,
                        max_dist, verbose, quiet) == out_info_file
    out, err = capsys.readouterr()
    assert ("You asked to run only mash steps") in err
    assert ("You want to run only mash steps. Getting information from "
            "test/data/prepare/test_files/test_lstinfo_onlymash.lst") in out
    assert ("Found 5 genomes in total") in out
    assert ("Computing pairwise distances between all genomes") in out
    assert ("Sorting all 5 genomes by quality") in out
    assert ("Final number of genomes in dataset: 1") in out

    # Check output files: tmp files folder created, but empty
    assert len(os.listdir(os.path.join(outdir, "tmp_files"))) == 0
    # Check logfiles are here
    log_files = glob.glob(os.path.join(outdir, "*log*"))
    assert len(log_files) == 3
    # Check content of output lstinfo file
    exp_lst = os.path.join(DBDIR, "exp_files", "exp_lstinfo_run_only-mash.lst")
    assert tutil.compare_order_content(out_info_file, exp_lst)


def test_only_mash_empty_lstinfo(capsys):
    """
    Running only mash step giving an empty lstinfo file -> error, no genome found
    """
    NCBI_species_name = ""
    NCBI_species_taxid = ""
    NCBI_taxid = ""
    NCBI_strains = ""
    NCBI_section = "refseq"
    levels = ""
    outdir = GENEPATH
    tmp_dir = ""
    threads = 1
    norefseq = False
    db_dir = ""
    only_mash = True
    # Create empty lstinfo file
    info_file = os.path.join(GENEPATH, "LSTINFO-empty.lst")
    with open(info_file, "w"):
        pass
    l90 = 100
    nbcont = 999
    cutn = 5
    min_dist = 1e-4
    max_dist = 0.06
    verbose = 1
    quiet = False
    with pytest.raises(SystemExit):
        prepare.main("cmd", NCBI_species_name, NCBI_species_taxid, NCBI_taxid,
                     NCBI_strains, levels,
                     NCBI_section, outdir, tmp_dir, threads, norefseq,
                     db_dir, only_mash, info_file, l90, nbcont, cutn, min_dist, max_dist,
                     verbose, quiet)
    out, err = capsys.readouterr()
    assert ("You asked to run only mash steps") in err
    assert ("You want to run only mash steps. Getting information from "
            "test/data/prepare/generated_by_func-tests/LSTINFO-empty.lst") in out
    assert ("No genome listed in test/data/prepare/generated_by_func-tests/LSTINFO-empty.lst "
            "was found.") in err

    # Check output files: tmp files folder created, but empty
    assert len(os.listdir(os.path.join(outdir, "tmp_files"))) == 0
    # Check logfiles are here
    log_files = glob.glob(os.path.join(outdir, "*log*"))
    assert len(log_files) == 3
    # Check lstinfo file is still here and still empty
    assert os.stat(info_file).st_size == 0


def test_only_mash_no_lstinfo(capsys):
    """
    Running only mash step giving an info file which does not exist -> error missing infofile
    """
    NCBI_species_name = ""
    NCBI_species_taxid = ""
    NCBI_taxid = ""
    NCBI_strains = ""
    NCBI_section = "refseq"
    levels = ""
    outdir = GENEPATH
    tmp_dir = ""
    threads = 1
    norefseq = False
    db_dir = ""
    only_mash = True
    # Name of an info file which is never created
    info_file = "info_file.lst"
    l90 = 100
    nbcont = 999
    cutn = 5
    min_dist = 1e-4
    max_dist = 0.06
    verbose = 1
    quiet = False
    with pytest.raises(SystemExit):
        prepare.main("cmd", NCBI_species_name, NCBI_species_taxid, NCBI_taxid,
                     NCBI_strains, levels,
                     NCBI_section, outdir, tmp_dir, threads, norefseq,
                     db_dir, only_mash, info_file, l90, nbcont, cutn, min_dist, max_dist,
                     verbose, quiet)
    out, err = capsys.readouterr()
    assert ("You asked to run only mash steps") in err
    assert ("Your info file info_file.lst does not exist. Please provide the  "
            "right name/path, or remove the '--mash-only option to rerun "
            "quality control.") in err

    # Check output files: tmp files folder created, but empty
    assert len(os.listdir(os.path.join(outdir, "tmp_files"))) == 0
    # Check logfiles are here
    log_files = glob.glob(os.path.join(outdir, "*log*"))
    assert len(log_files) == 3
    # Check that outdir contains only 4 elements: 3 logs + tmp_files repo
    # (entries with "fuse" in their name are ignored)
    files = [f for f in os.listdir(outdir) if "fuse" not in f]
    assert len(files) == 4