From 046b1e92ce868cb883f190832de4acbff499bb28 Mon Sep 17 00:00:00 2001
From: Amandine PERRIN <amandine.perrin@pasteur.fr>
Date: Mon, 28 Oct 2019 12:26:08 +0100
Subject: [PATCH] test with small file
---
.gitlab-ci.yml | 2 +-
.../test_annotate/test_genome_func.py | 794 ++++++++----------
test/test_unit/test_gene.py | 4 -
3 files changed, 361 insertions(+), 439 deletions(-)
delete mode 100644 test/test_unit/test_gene.py
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 352d8bac..a494c4f8 100755
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -78,7 +78,7 @@ unit-test-ubuntu18.04:
script:
# - py.test test/test_unit/test_prepare -v -x
# - py.test test/test_unit/test_annotate/test_format_func.py::test_contig_name -v -x
- - py.test test/test_unit/test_gene.py -v -x -s
+ - py.test test/test_unit/test_annotate/test_genome_func.py -v -x -s
- mv .coverage .coverage-unit
artifacts:
paths:
diff --git a/test/test_unit/test_annotate/test_genome_func.py b/test/test_unit/test_annotate/test_genome_func.py
index 7b3cff1d..1afbba2c 100755
--- a/test/test_unit/test_annotate/test_genome_func.py
+++ b/test/test_unit/test_annotate/test_genome_func.py
@@ -17,80 +17,6 @@ DBPATH = os.path.join("test", "data", "annotate", "genomes")
BASELINE_DIR = os.path.join("..", "..", "data", "annotate", "exp_files", "baseline")
-# Start tests
-def test_sort_genomes():
- """
- Test the function sorting genomes by L90 and nb contigs.
- genome = name, path, gsize, nbcont, L90]
- """
- genome1 = ["SAEN.1116.", "path/to/genome1", 10000, 11, 2]
- genome2 = ["SAEN.1015.", "path/to/genome2", 10000, 12, 2]
- genome3 = ["SAEN.1015.", "path/to/genome3", 10000, 12, 1]
- genome4 = ["ESCO.0216.", "path/to/genome4", 10000, 12, 1]
-
- genomes = {1: genome1, 2: genome2, 3: genome3, 4: genome4}
- sorted_g = sorted(genomes.items(), key=gfunc.sort_genomes)
- exp = [(4, genome4), (3, genome3), (1, genome1), (2, genome2)]
- assert sorted_g == exp
-
-
-def test_save_contig_5n():
- """
- Test that the given contig is split at each stretch of at least 5 'N', and not at
- stretches of less than 5 'N'.
- Check that the contigs in the output file are named as expected, and that
- the contig sizes are well reported.
- """
- pat = "NNNNN+" # at least 5 'N'
- cur_cont = ("AACCGTGTCTCTCGGAGCNNNNCCGTTCGGCTCNCGGTCNNNNNCCGTTATNNCGGTTCGCNNNCTGGTC"
- "GGCTTATNNNNNNNNNNNNCCTGGTATTCGGCGCTTCNC")
- cur_cont_name = ">ESCO.0216.00001_cont2"
- # one contig saved before running, check that it is not erased
- contig_sizes = {">ESCO.0216.00001_cont1": 1623}
- seq_file = os.path.join("test", "data", "annotate", "test_save_contig5N.faa")
- resf = open(seq_file, "w")
- gfunc.save_contig(pat, cur_cont, cur_cont_name, contig_sizes, resf, -1)
- resf.close()
- exp = {">ESCO.0216.00001_cont1": 1623, ">ESCO.0216.0000_0": 39,
- ">ESCO.0216.0000_1": 33, ">ESCO.0216.0000_2": 20}
- assert contig_sizes == exp
-
- exp_file = os.path.join("test", "data", "annotate", "exp_files", "res_save_contig5N.faa")
- with open(exp_file, "r") as expf, open(seq_file, "r") as seqf:
- for line_exp, line_seq in zip(expf, seqf):
- assert line_exp == line_seq
- os.remove(seq_file)
-
-
-def test_save_contig_atcg():
- """
- Test that the given contig is split at each pattern 'ATCG' (just for test)
- Check that the contigs in the output file are named as expected, and that
- the contig sizes are well reported.
- """
- pat = "ATCG" # split each time those 4 letters are found in the sequence
- cur_cont = ("AAATGGTCTCGATGATCGATCGAGGGATTCGGAATCGGGCTCTGAATTCGATCGGTAGCTCTCGGGA"
- "GCTCTAGGCTCGTACGCCGTGATCGCATCGGTTCGTATCGATCGATCGATCGGGGGG")
- cur_cont_name = ">ESCO.0216.00001_cont2"
- # one contig saved before running, check that it is not erased
- contig_sizes = {">ESCO.0216.00001_cont1": 1623}
- seq_file = os.path.join("test", "data", "annotate", "test_save_contigATCG.faa")
- resf = open(seq_file, "w")
- gfunc.save_contig(pat, cur_cont, cur_cont_name, contig_sizes, resf, -1)
- resf.close()
- exp = {">ESCO.0216.00001_cont1": 1623, ">ESCO.0216.0000_0": 14,
- ">ESCO.0216.0000_1": 11, ">ESCO.0216.0000_2": 13,
- ">ESCO.0216.0000_3": 34, ">ESCO.0216.0000_4": 1,
- ">ESCO.0216.0000_5": 6, ">ESCO.0216.0000_6": 5}
- assert contig_sizes == exp
-
- exp_file = os.path.join("test", "data", "annotate", "exp_files", "res_save_contigATCG.faa")
- with open(exp_file, "r") as expf, open(seq_file, "r") as seqf:
- for line_exp, line_seq in zip(expf, seqf):
- assert line_exp == line_seq
- os.remove(seq_file)
-
-
def test_calc_l90_exact():
"""
Calculate L90 according to the given genome size and contig sizes
@@ -101,363 +27,363 @@ def test_calc_l90_exact():
assert l90 == 2
-def test_calc_l90_more():
- """
- Calculate L90 according to the given genome size and contig sizes
- 3 contigs get exactly more than 90%, but 2 contigs get less -> l90 = 3
- """
- cont_size = {1: 3, 2: 800, 3: 90, 4: 90, 5: 17}
- l90 = gfunc.calc_l90(cont_size)
- assert l90 == 3
-
-
-def test_rename_genomes():
- """
- From a list of genomes ({genome: [name.date, path, gsize, nbcont, L90]}),
- order them by species, and by decreasing quality (L90, nb_cont), and rename them,
- as well as their contigs.
- """
- genomes_dir = os.path.join("test", "data", "annotate", "genomes")
- gs = ["genome1.fasta", "genome2.fasta", "genome3.fasta", "genome4.fasta",
- "genome5.fasta", "genome6.fasta", "genome7.fasta"]
-
- genomes = {gs[0]: ["SAEN.1113", os.path.join(genomes_dir, gs[0]), 51, 4, 2],
- gs[1]: ["SAEN.1114", os.path.join(genomes_dir, gs[1]), 67, 3, 3],
- gs[2]: ["ESCO.0416", os.path.join(genomes_dir, gs[2]), 70, 4, 1],
- gs[3]: ["ESCO.0216", os.path.join(genomes_dir, gs[3]), 114, 5, 2],
- gs[4]: ["SAEN.1115", os.path.join(genomes_dir, gs[4]), 106, 3, 1],
- gs[5]: ["ESCO.0216", os.path.join(genomes_dir, gs[5]), 116, 4, 2],
- gs[6]: ["SAEN.1115", os.path.join(genomes_dir, gs[6]), 137, 3, 2]}
- gfunc.rename_all_genomes(genomes)
- # SAEN genomes 1 and 2 have same characteristics. Their place will be chosen randomly,
- # so take into account both choices
- exp_genomes = {gs[0]: ["SAEN.1113.00003", os.path.join(genomes_dir, gs[0]), 51, 4, 2],
- gs[1]: ["SAEN.1114.00004", os.path.join(genomes_dir, gs[1]), 67, 3, 3],
- gs[2]: ["ESCO.0416.00001", os.path.join(genomes_dir, gs[2]), 70, 4, 1],
- gs[3]: ["ESCO.0216.00003", os.path.join(genomes_dir, gs[3]), 114, 5, 2],
- gs[4]: ["SAEN.1115.00001", os.path.join(genomes_dir, gs[4]), 106, 3, 1],
- gs[5]: ["ESCO.0216.00002", os.path.join(genomes_dir, gs[5]), 116, 4, 2],
- gs[6]: ["SAEN.1115.00002", os.path.join(genomes_dir, gs[6]), 137, 3, 2]}
- assert genomes == exp_genomes
-
-
-def test_analyse1genome_nocut():
- """
- Analyse the given genome: without cutting at stretches of N, calculate
- its genome size, nb contigs and L90, and add it to the genomes dict, as well as
- the path to the genome file.
- """
- gs = ["genome1.fasta", "genome2.fasta", "genome3.fasta"]
- genomes = {gs[0]: ["SAEN.1113"],
- gs[1]: ["SAEN.1114"],
- gs[2]: ["ESCO.0416"]}
- genome = gs[1]
- tmp_path = os.path.join("test", "data", "annotate")
- # Put genome file in tmppath instead of dbpath, as if it was
- # the result of concatenation of several files for the same genome,
- # done in the first step.
- orig_file = os.path.join(DBPATH, genome)
- out_file = os.path.join(tmp_path, genome)
- os.rename(orig_file, out_file)
- cut = False
- pat = "NNNNN+"
- assert gfunc.analyse_genome(genome, DBPATH, tmp_path, cut, pat, genomes)
- outf = os.path.join(tmp_path, gs[1] + "-short-contig.fna")
- exp_genomes = {gs[0]: ["SAEN.1113"],
- gs[1]: ["SAEN.1114", outf, 67, 3, 3],
- gs[2]: ["ESCO.0416"]}
- assert genomes == exp_genomes
- exp_file = os.path.join(tmp_path, "exp_files", "res_test_analyse-genome2.fna")
- with open(exp_file, "r") as expf, open(outf, "r") as of:
- for linee, lineo in zip(expf, of):
- assert linee == lineo
- os.remove(outf)
- os.rename(out_file, orig_file)
-
-
-def test_analyse1genome_nocut_empty():
- """
- Analyse the given genome: without cutting at stretches of N. The genome is an empty
- file, so it is not possible to calculate L90
- """
- gs = ["genome1.fasta", "genome2.fasta", "genome3.fasta", "empty.fasta"]
- open(os.path.join(DBPATH, gs[3]), "w").close()
- genomes = {gs[0]: ["SAEN.1113"],
- gs[1]: ["SAEN.1114"],
- gs[2]: ["ESCO.0416"],
- gs[3]: ["ESCO.0415"]}
- genome = gs[3]
- tmp_path = os.path.join("test", "data", "annotate")
- cut = False
- pat = "NNNNN+"
- assert not gfunc.analyse_genome(genome, DBPATH, tmp_path, cut, pat, genomes)
- exp_genomes = {gs[0]: ["SAEN.1113"],
- gs[1]: ["SAEN.1114"],
- gs[2]: ["ESCO.0416"],
- gs[3]: ["ESCO.0415"]}
- assert genomes == exp_genomes
- os.remove(os.path.join(DBPATH, gs[3]))
- os.remove(os.path.join(tmp_path, gs[3] + "-short-contig.fna"))
-
-
-def test_analyse1genome_cut():
- """
- Analyse the given genome: cut at each stretch of 5 N, put it to a new file,
- and then calculate its genome size, nb contigs and L90. Add this information
- to the genomes dict, as well as the path to the genome file (cut).
- """
- gs = ["genome1.fasta", "genome2.fasta", "genome3.fasta"]
- genomes = {gs[0]: ["SAEN.1113"],
- gs[1]: ["SAEN.1114"],
- gs[2]: ["ESCO.0416"]}
- genome = gs[1]
- tmp_path = os.path.join("test", "data", "annotate")
- cut = True
- pat = "NNNNN+"
- assert gfunc.analyse_genome(genome, DBPATH, tmp_path, cut, pat, genomes)
- out_f = os.path.join(tmp_path, gs[1] + "-split5N.fna")
- exp_f = os.path.join(tmp_path, "exp_files", "res_genome2.fasta-split5N.fna")
- exp_genomes = {gs[0]: ["SAEN.1113"],
- gs[1]: ["SAEN.1114", out_f, 55, 5, 4],
- gs[2]: ["ESCO.0416"]}
- assert genomes == exp_genomes
- with open(out_f, "r") as outf, open(exp_f, "r") as expf:
- for line_exp, line_out in zip(expf, outf):
- assert line_exp == line_out
- os.remove(out_f)
-
-
-def test_analyse1genome_cut_same_names():
- """
- Analyse a genome. Its contig names all have the same first 20 characters. There is no
- stretch of at least 5N, so contigs are not split.
- New contig names should be uniq, and not all ending with _0!
- """
- genome = "genome_long_header.fst"
- genomes = {genome: ["SAEN.1015.0117"]}
- tmp_path = os.path.join("test", "data", "annotate")
- cut = True
- pat = "NNNNN+"
- assert gfunc.analyse_genome(genome, DBPATH, tmp_path, cut, pat, genomes)
- out_f = os.path.join(tmp_path, genome + "-split5N.fna")
- exp_f = os.path.join("test", "data", "annotate", "exp_files",
- "res_genome_short-long_header.fst")
- exp_genomes = {genome: ["SAEN.1015.0117", out_f, 151, 3, 3]}
- assert genomes == exp_genomes
- with open(out_f, "r") as outf, open(exp_f, "r") as expf:
- for line_exp, line_out in zip(expf, outf):
- assert line_exp == line_out
- os.remove(out_f)
-
-
-def test_analyse1genome_cut_empty():
- """
- Analyse the given genome: cut at each stretch of 5 N, but the file is empty.
- Check that it returns False
- """
- gs = ["genome1.fasta", "genome2.fasta", "genome3.fasta", "empty.fasta"]
- open(os.path.join(DBPATH, gs[3]), "w").close()
- genomes = {gs[0]: ["SAEN.1113"],
- gs[1]: ["SAEN.1114"],
- gs[2]: ["ESCO.0416"],
- gs[3]: ["ESCO.0415"]}
- genome = gs[3]
- tmp_path = os.path.join("test", "data", "annotate")
- cut = True
- pat = "NNNNN+"
- assert not gfunc.analyse_genome(genome, DBPATH, tmp_path, cut, pat, genomes)
- exp_genomes = {gs[0]: ["SAEN.1113"],
- gs[1]: ["SAEN.1114"],
- gs[2]: ["ESCO.0416"],
- gs[3]: ["ESCO.0415"]}
- assert genomes == exp_genomes
- out_f = os.path.join(tmp_path, gs[3] + "-split5N.fna")
- with open(out_f, "r") as outf:
- assert outf.readlines() == []
- os.remove(os.path.join(DBPATH, gs[3]))
- os.remove(out_f)
-
-
-def test_analyse_all_genomes_nocut():
- """
- Analyze all given genomes: don't cut at stretches of N, but look at their sequence
- file, to calculate L90, genome size and nb contigs. Add this information, as well as the
- path to the genomic sequence, to the genomes dict.
- """
- gs = ["genome1.fasta", "genome2.fasta", "genome3.fasta"]
- genomes = {gs[0]: ["SAEN.1113"],
- gs[1]: ["SAEN.1114"],
- gs[2]: ["ESCO.0416"]}
- tmp_path = os.path.join("test", "data", "annotate")
- opaths = [os.path.join(tmp_path, gname + "-short-contig.fna") for gname in gs]
- nbn = 0
- # Run analysis
- gfunc.analyse_all_genomes(genomes, DBPATH, tmp_path, nbn)
- # construct expected results
- exp_genomes = {gs[0]: ["SAEN.1113", opaths[0], 51, 4, 2],
- gs[1]: ["SAEN.1114", opaths[1], 67, 3, 3],
- gs[2]: ["ESCO.0416", opaths[2], 70, 4, 1]}
- assert exp_genomes == genomes
- for f in opaths:
- os.remove(f)
-
-
-def test_analyse_all_genomes_nocut_empty():
- """
- Analyze all given genomes: don't cut at stretches of N, but look at their sequence
- file, to calculate L90, genome size and nb contigs. Add this information, as well as the
- path to the genomic sequence, to the genomes dict.
- """
- gs = ["genome1.fasta", "genome2.fasta", "genome3.fasta", "empty.fasta"]
- open(os.path.join(DBPATH, gs[3]), "w").close()
- genomes = {gs[0]: ["SAEN.1113"],
- gs[1]: ["SAEN.1114"],
- gs[2]: ["ESCO.0416"],
- gs[3]: ["ESCO.0123"]}
- tmp_path = os.path.join("test", "data", "annotate")
- opaths = [os.path.join(tmp_path, gname + "-short-contig.fna") for gname in gs]
- nbn = 0
- # Run analysis
- gfunc.analyse_all_genomes(genomes, DBPATH, tmp_path, nbn)
- # construct expected results
- exp_genomes = {gs[0]: ["SAEN.1113", opaths[0], 51, 4, 2],
- gs[1]: ["SAEN.1114", opaths[1], 67, 3, 3],
- gs[2]: ["ESCO.0416", opaths[2], 70, 4, 1]}
- assert exp_genomes == genomes
- os.remove(os.path.join(DBPATH, gs[3]))
- for f in opaths:
- os.remove(f)
-
-
-def test_analyse_all_genomes_cut():
- """
- Analyze all given genomes: cut at each stretch of 5 'N', look at the output sequence
- file, to calculate L90, genome size and nb contigs. Add this information, as well as the
- path to the genomic sequence, to the genomes dict.
- """
- gs = ["genome1.fasta", "genome2.fasta", "genome3.fasta"]
- genomes = {gs[0]: ["SAEN.1113"],
- gs[1]: ["SAEN.1114"],
- gs[2]: ["ESCO.0416"]}
- tmp_path = os.path.join("test", "data", "annotate")
- gpaths = [os.path.join(tmp_path, gname + "-split5N.fna") for gname in gs]
- nbn = 5
- # Run analysis
- gfunc.analyse_all_genomes(genomes, DBPATH, tmp_path, nbn)
- # construct expected results
- exp_genomes = {gs[0]: ["SAEN.1113", gpaths[0], 51, 4, 2],
- gs[1]: ["SAEN.1114", gpaths[1], 55, 5, 4],
- gs[2]: ["ESCO.0416", gpaths[2], 70, 4, 1]}
- out_f = [os.path.join(tmp_path, gname + "-split5N.fna") for gname in gs]
- exp_f = [os.path.join(tmp_path, "exp_files", "res_" + gname + "-split5N.fna") for gname in gs]
- assert exp_genomes == genomes
- for out, exp in zip(out_f, exp_f):
- with open(out, "r") as outf, open(exp, "r") as expf:
- for line_exp, line_out in zip(expf, outf):
- assert line_exp == line_out
- os.remove(out)
-
-
-def test_analyse_all_genomes_cut_empty():
- """
- Analyze all given genomes: cut at each stretch of 5 'N', look at the output sequence
- file, to calculate L90, genome size and nb contigs. Add this information, as well as the
- path to the genomic sequence, to the genomes dict.
- """
- gs = ["genome1.fasta", "genome2.fasta", "genome3.fasta", "empty.fasta"]
- open(os.path.join(DBPATH, gs[3]), "w").close()
- genomes = {gs[0]: ["SAEN.1113"],
- gs[1]: ["SAEN.1114"],
- gs[2]: ["ESCO.0416"],
- gs[3]: ["ESCO.0123"]}
- tmp_path = os.path.join("test", "data", "annotate")
- gpaths = [os.path.join(tmp_path, gname + "-split5N.fna") for gname in gs]
- nbn = 5
- # Run analysis
- gfunc.analyse_all_genomes(genomes, DBPATH, tmp_path, nbn)
- # construct expected results
- exp_genomes = {gs[0]: ["SAEN.1113", gpaths[0], 51, 4, 2],
- gs[1]: ["SAEN.1114", gpaths[1], 55, 5, 4],
- gs[2]: ["ESCO.0416", gpaths[2], 70, 4, 1]}
- out_f = [os.path.join(tmp_path, gname + "-split5N.fna") for gname in gs]
- exp_f = [os.path.join(tmp_path, "exp_files", "res_" + gname + "-split5N.fna") for gname in gs]
- assert exp_genomes == genomes
- for out, exp in zip(out_f[:-1], exp_f[:-1]):
- with open(out, "r") as outf, open(exp, "r") as expf:
- for line_exp, line_out in zip(expf, outf):
- assert line_exp == line_out
- os.remove(out)
- with open(out_f[-1], "r") as outf:
- assert outf.readlines() == []
- os.remove(out_f[-1])
- os.remove(os.path.join(DBPATH, gs[3]))
-
-
-def get_plot_distribs():
- """
- For all genomes, plot the distribution of their L90 values, and their number of contigs.
- Add a vertical line at the given threshold.
- genomes: {genome: [name, path, size, nbcont, l90]}
- output of plot_distributions is L90_vals, nbcont_vals, l90_dist, nbcont_dist
- these outputs will be compared to expected results in tests
- """
- genomes_dir = os.path.join("test", "data", "annotate", "genomes")
- gs = ["genome1.fasta", "genome2.fasta", "genome3.fasta", "genome4.fasta",
- "genome5.fasta", "genome6.fasta", "genome7.fasta"]
- genomes = {gs[0]: ["SAEN.1113", os.path.join(genomes_dir, gs[0]), 51, 2, 2],
- gs[1]: ["SAEN.1114", os.path.join(genomes_dir, gs[1]), 67, 15, 13],
- gs[2]: ["ESCO.0416", os.path.join(genomes_dir, gs[2]), 70, 15, 11],
- gs[3]: ["ESCO.0216", os.path.join(genomes_dir, gs[3]), 114, 17, 11],
- gs[4]: ["SAEN.1115", os.path.join(genomes_dir, gs[4]), 106, 17, 12],
- gs[5]: ["ESCO.0216", os.path.join(genomes_dir, gs[5]), 116, 60, 50],
- gs[6]: ["SAEN.1115", os.path.join(genomes_dir, gs[6]), 137, 20, 12]}
- res_path = os.path.join("test", "data", "annotate")
- listfile_base = "test_plot_dist"
- l90 = 13
- nbconts = 19
- outdist = gfunc.plot_distributions(genomes, res_path, listfile_base, l90, nbconts)
- return outdist
-
-
-@pytest.mark.mpl_image_compare(baseline_dir=BASELINE_DIR, tolerance=6, backend="agg")
-def test_dist_l90():
- """
- For created L90 graph, check that calculated L90 values are as expected,
- and graph is also as expected
- """
- res_path = os.path.join("test", "data", "annotate")
- listfile_base = "test_plot_dist"
- outfile1 = os.path.join(res_path, "QC_L90-" + listfile_base + ".png")
- outfile2 = os.path.join(res_path, "QC_nb-contigs-" + listfile_base + ".png")
- l90, _, dist, _ = get_plot_distribs()
- # Check that png file was created
- assert os.path.isfile(outfile1)
- assert os.path.isfile(outfile2)
- os.remove(outfile1)
- os.remove(outfile2)
- # Check values calculated for l90
- assert set(l90) == {2, 13, 11, 11, 12, 50, 12}
- # Check that output plot is as expected
- return dist
-
-
-@pytest.mark.mpl_image_compare(baseline_dir=BASELINE_DIR, tolerance=6, backend="agg")
-def test_dist_nbcont():
- """
- For created L90 graph, check that calculated L90 values are as expected,
- and graph is also as expected
- """
- res_path = os.path.join("test", "data", "annotate")
- listfile_base = "test_plot_dist"
- outfile1 = os.path.join(res_path, "QC_L90-" + listfile_base + ".png")
- outfile2 = os.path.join(res_path, "QC_nb-contigs-" + listfile_base + ".png")
- _, nbcont, _, dist = get_plot_distribs()
- # Check that png file was created
- assert os.path.isfile(outfile1)
- assert os.path.isfile(outfile2)
- os.remove(outfile1)
- os.remove(outfile2)
- # Check values calculated for l90
- assert set(nbcont) == {2, 15, 15, 17, 17, 60, 20}
- # Check that output plot is as expected
- return dist
+# def test_calc_l90_more():
+# """
+# Calculate L90 according to the given genome size and contig sizes
+# The 3 largest contigs cover more than 90% of the total size, but the 2 largest do not -> l90 = 3
+# """
+# cont_size = {1: 3, 2: 800, 3: 90, 4: 90, 5: 17}
+# l90 = gfunc.calc_l90(cont_size)
+# assert l90 == 3
+
+
+# def test_rename_genomes():
+# """
+# From a list of genomes ({genome: [name.date, path, gsize, nbcont, L90]}),
+# order them by species, and by decreasing quality (L90, nb_cont), and rename them,
+# as well as their contigs.
+# """
+# genomes_dir = os.path.join("test", "data", "annotate", "genomes")
+# gs = ["genome1.fasta", "genome2.fasta", "genome3.fasta", "genome4.fasta",
+# "genome5.fasta", "genome6.fasta", "genome7.fasta"]
+
+# genomes = {gs[0]: ["SAEN.1113", os.path.join(genomes_dir, gs[0]), 51, 4, 2],
+# gs[1]: ["SAEN.1114", os.path.join(genomes_dir, gs[1]), 67, 3, 3],
+# gs[2]: ["ESCO.0416", os.path.join(genomes_dir, gs[2]), 70, 4, 1],
+# gs[3]: ["ESCO.0216", os.path.join(genomes_dir, gs[3]), 114, 5, 2],
+# gs[4]: ["SAEN.1115", os.path.join(genomes_dir, gs[4]), 106, 3, 1],
+# gs[5]: ["ESCO.0216", os.path.join(genomes_dir, gs[5]), 116, 4, 2],
+# gs[6]: ["SAEN.1115", os.path.join(genomes_dir, gs[6]), 137, 3, 2]}
+# gfunc.rename_all_genomes(genomes)
+# # SAEN genomes 1 and 2 have same characteristics. Their place will be chosen randomly,
+# # so take into account both choices
+# exp_genomes = {gs[0]: ["SAEN.1113.00003", os.path.join(genomes_dir, gs[0]), 51, 4, 2],
+# gs[1]: ["SAEN.1114.00004", os.path.join(genomes_dir, gs[1]), 67, 3, 3],
+# gs[2]: ["ESCO.0416.00001", os.path.join(genomes_dir, gs[2]), 70, 4, 1],
+# gs[3]: ["ESCO.0216.00003", os.path.join(genomes_dir, gs[3]), 114, 5, 2],
+# gs[4]: ["SAEN.1115.00001", os.path.join(genomes_dir, gs[4]), 106, 3, 1],
+# gs[5]: ["ESCO.0216.00002", os.path.join(genomes_dir, gs[5]), 116, 4, 2],
+# gs[6]: ["SAEN.1115.00002", os.path.join(genomes_dir, gs[6]), 137, 3, 2]}
+# assert genomes == exp_genomes
+
+
+# def test_analyse1genome_nocut():
+# """
+# Analyse the given genome: without cutting at stretches of N, calculate
+# its genome size, nb contigs and L90, and add it to the genomes dict, as well as
+# the path to the genome file.
+# """
+# gs = ["genome1.fasta", "genome2.fasta", "genome3.fasta"]
+# genomes = {gs[0]: ["SAEN.1113"],
+# gs[1]: ["SAEN.1114"],
+# gs[2]: ["ESCO.0416"]}
+# genome = gs[1]
+# tmp_path = os.path.join("test", "data", "annotate")
+# # Put genome file in tmppath instead of dbpath, as if it was
+# # the result of concatenation of several files for the same genome,
+# # done in the first step.
+# orig_file = os.path.join(DBPATH, genome)
+# out_file = os.path.join(tmp_path, genome)
+# os.rename(orig_file, out_file)
+# cut = False
+# pat = "NNNNN+"
+# assert gfunc.analyse_genome(genome, DBPATH, tmp_path, cut, pat, genomes)
+# outf = os.path.join(tmp_path, gs[1] + "-short-contig.fna")
+# exp_genomes = {gs[0]: ["SAEN.1113"],
+# gs[1]: ["SAEN.1114", outf, 67, 3, 3],
+# gs[2]: ["ESCO.0416"]}
+# assert genomes == exp_genomes
+# exp_file = os.path.join(tmp_path, "exp_files", "res_test_analyse-genome2.fna")
+# with open(exp_file, "r") as expf, open(outf, "r") as of:
+# for linee, lineo in zip(expf, of):
+# assert linee == lineo
+# os.remove(outf)
+# os.rename(out_file, orig_file)
+
+
+# def test_analyse1genome_nocut_empty():
+# """
+# Analyse the given genome: without cutting at stretches of N. The genome is an empty
+# file, so it is not possible to calculate L90
+# """
+# gs = ["genome1.fasta", "genome2.fasta", "genome3.fasta", "empty.fasta"]
+# open(os.path.join(DBPATH, gs[3]), "w").close()
+# genomes = {gs[0]: ["SAEN.1113"],
+# gs[1]: ["SAEN.1114"],
+# gs[2]: ["ESCO.0416"],
+# gs[3]: ["ESCO.0415"]}
+# genome = gs[3]
+# tmp_path = os.path.join("test", "data", "annotate")
+# cut = False
+# pat = "NNNNN+"
+# assert not gfunc.analyse_genome(genome, DBPATH, tmp_path, cut, pat, genomes)
+# exp_genomes = {gs[0]: ["SAEN.1113"],
+# gs[1]: ["SAEN.1114"],
+# gs[2]: ["ESCO.0416"],
+# gs[3]: ["ESCO.0415"]}
+# assert genomes == exp_genomes
+# os.remove(os.path.join(DBPATH, gs[3]))
+# os.remove(os.path.join(tmp_path, gs[3] + "-short-contig.fna"))
+
+
+# def test_analyse1genome_cut():
+# """
+# Analyse the given genome: cut at each stretch of 5 N, put it to a new file,
+# and then calculate its genome size, nb contigs and L90. Add this information
+# to the genomes dict, as well as the path to the genome file (cut).
+# """
+# gs = ["genome1.fasta", "genome2.fasta", "genome3.fasta"]
+# genomes = {gs[0]: ["SAEN.1113"],
+# gs[1]: ["SAEN.1114"],
+# gs[2]: ["ESCO.0416"]}
+# genome = gs[1]
+# tmp_path = os.path.join("test", "data", "annotate")
+# cut = True
+# pat = "NNNNN+"
+# assert gfunc.analyse_genome(genome, DBPATH, tmp_path, cut, pat, genomes)
+# out_f = os.path.join(tmp_path, gs[1] + "-split5N.fna")
+# exp_f = os.path.join(tmp_path, "exp_files", "res_genome2.fasta-split5N.fna")
+# exp_genomes = {gs[0]: ["SAEN.1113"],
+# gs[1]: ["SAEN.1114", out_f, 55, 5, 4],
+# gs[2]: ["ESCO.0416"]}
+# assert genomes == exp_genomes
+# with open(out_f, "r") as outf, open(exp_f, "r") as expf:
+# for line_exp, line_out in zip(expf, outf):
+# assert line_exp == line_out
+# os.remove(out_f)
+
+
+# def test_analyse1genome_cut_same_names():
+# """
+# Analyse a genome. Its contig names all have the same first 20 characters. There is no
+# stretch of at least 5N, so contigs are not split.
+# New contig names should be unique, and not all end with _0!
+# """
+# genome = "genome_long_header.fst"
+# genomes = {genome: ["SAEN.1015.0117"]}
+# tmp_path = os.path.join("test", "data", "annotate")
+# cut = True
+# pat = "NNNNN+"
+# assert gfunc.analyse_genome(genome, DBPATH, tmp_path, cut, pat, genomes)
+# out_f = os.path.join(tmp_path, genome + "-split5N.fna")
+# exp_f = os.path.join("test", "data", "annotate", "exp_files",
+# "res_genome_short-long_header.fst")
+# exp_genomes = {genome: ["SAEN.1015.0117", out_f, 151, 3, 3]}
+# assert genomes == exp_genomes
+# with open(out_f, "r") as outf, open(exp_f, "r") as expf:
+# for line_exp, line_out in zip(expf, outf):
+# assert line_exp == line_out
+# os.remove(out_f)
+
+
+# def test_analyse1genome_cut_empty():
+# """
+# Analyse the given genome: cut at each stretch of 5 N, but the file is empty.
+# Check that it returns False
+# """
+# gs = ["genome1.fasta", "genome2.fasta", "genome3.fasta", "empty.fasta"]
+# open(os.path.join(DBPATH, gs[3]), "w").close()
+# genomes = {gs[0]: ["SAEN.1113"],
+# gs[1]: ["SAEN.1114"],
+# gs[2]: ["ESCO.0416"],
+# gs[3]: ["ESCO.0415"]}
+# genome = gs[3]
+# tmp_path = os.path.join("test", "data", "annotate")
+# cut = True
+# pat = "NNNNN+"
+# assert not gfunc.analyse_genome(genome, DBPATH, tmp_path, cut, pat, genomes)
+# exp_genomes = {gs[0]: ["SAEN.1113"],
+# gs[1]: ["SAEN.1114"],
+# gs[2]: ["ESCO.0416"],
+# gs[3]: ["ESCO.0415"]}
+# assert genomes == exp_genomes
+# out_f = os.path.join(tmp_path, gs[3] + "-split5N.fna")
+# with open(out_f, "r") as outf:
+# assert outf.readlines() == []
+# os.remove(os.path.join(DBPATH, gs[3]))
+# os.remove(out_f)
+
+
+# def test_analyse_all_genomes_nocut():
+# """
+# Analyze all given genomes: don't cut at stretches of N, but look at their sequence
+# file, to calculate L90, genome size and nb contigs. Add this information, as well as the
+# path to the genomic sequence, to the genomes dict.
+# """
+# gs = ["genome1.fasta", "genome2.fasta", "genome3.fasta"]
+# genomes = {gs[0]: ["SAEN.1113"],
+# gs[1]: ["SAEN.1114"],
+# gs[2]: ["ESCO.0416"]}
+# tmp_path = os.path.join("test", "data", "annotate")
+# opaths = [os.path.join(tmp_path, gname + "-short-contig.fna") for gname in gs]
+# nbn = 0
+# # Run analysis
+# gfunc.analyse_all_genomes(genomes, DBPATH, tmp_path, nbn)
+# # construct expected results
+# exp_genomes = {gs[0]: ["SAEN.1113", opaths[0], 51, 4, 2],
+# gs[1]: ["SAEN.1114", opaths[1], 67, 3, 3],
+# gs[2]: ["ESCO.0416", opaths[2], 70, 4, 1]}
+# assert exp_genomes == genomes
+# for f in opaths:
+# os.remove(f)
+
+
+# def test_analyse_all_genomes_nocut_empty():
+# """
+# Analyze all given genomes: don't cut at stretches of N, but look at their sequence
+# file, to calculate L90, genome size and nb contigs. Add this information, as well as the
+# path to the genomic sequence, to the genomes dict.
+# """
+# gs = ["genome1.fasta", "genome2.fasta", "genome3.fasta", "empty.fasta"]
+# open(os.path.join(DBPATH, gs[3]), "w").close()
+# genomes = {gs[0]: ["SAEN.1113"],
+# gs[1]: ["SAEN.1114"],
+# gs[2]: ["ESCO.0416"],
+# gs[3]: ["ESCO.0123"]}
+# tmp_path = os.path.join("test", "data", "annotate")
+# opaths = [os.path.join(tmp_path, gname + "-short-contig.fna") for gname in gs]
+# nbn = 0
+# # Run analysis
+# gfunc.analyse_all_genomes(genomes, DBPATH, tmp_path, nbn)
+# # construct expected results
+# exp_genomes = {gs[0]: ["SAEN.1113", opaths[0], 51, 4, 2],
+# gs[1]: ["SAEN.1114", opaths[1], 67, 3, 3],
+# gs[2]: ["ESCO.0416", opaths[2], 70, 4, 1]}
+# assert exp_genomes == genomes
+# os.remove(os.path.join(DBPATH, gs[3]))
+# for f in opaths:
+# os.remove(f)
+
+
+# def test_analyse_all_genomes_cut():
+# """
+# Analyze all given genomes: cut at each stretch of 5 'N', look at the output sequence
+# file, to calculate L90, genome size and nb contigs. Add this information, as well as the
+# path to the genomic sequence, to the genomes dict.
+# """
+# gs = ["genome1.fasta", "genome2.fasta", "genome3.fasta"]
+# genomes = {gs[0]: ["SAEN.1113"],
+# gs[1]: ["SAEN.1114"],
+# gs[2]: ["ESCO.0416"]}
+# tmp_path = os.path.join("test", "data", "annotate")
+# gpaths = [os.path.join(tmp_path, gname + "-split5N.fna") for gname in gs]
+# nbn = 5
+# # Run analysis
+# gfunc.analyse_all_genomes(genomes, DBPATH, tmp_path, nbn)
+# # construct expected results
+# exp_genomes = {gs[0]: ["SAEN.1113", gpaths[0], 51, 4, 2],
+# gs[1]: ["SAEN.1114", gpaths[1], 55, 5, 4],
+# gs[2]: ["ESCO.0416", gpaths[2], 70, 4, 1]}
+# out_f = [os.path.join(tmp_path, gname + "-split5N.fna") for gname in gs]
+# exp_f = [os.path.join(tmp_path, "exp_files", "res_" + gname + "-split5N.fna") for gname in gs]
+# assert exp_genomes == genomes
+# for out, exp in zip(out_f, exp_f):
+# with open(out, "r") as outf, open(exp, "r") as expf:
+# for line_exp, line_out in zip(expf, outf):
+# assert line_exp == line_out
+# os.remove(out)
+
+
+# def test_analyse_all_genomes_cut_empty():
+# """
+# Analyze all given genomes: cut at each stretch of 5 'N', look at the output sequence
+# file, to calculate L90, genome size and nb contigs. Add this information, as well as the
+# path to the genomic sequence, to the genomes dict.
+# """
+# gs = ["genome1.fasta", "genome2.fasta", "genome3.fasta", "empty.fasta"]
+# open(os.path.join(DBPATH, gs[3]), "w").close()
+# genomes = {gs[0]: ["SAEN.1113"],
+# gs[1]: ["SAEN.1114"],
+# gs[2]: ["ESCO.0416"],
+# gs[3]: ["ESCO.0123"]}
+# tmp_path = os.path.join("test", "data", "annotate")
+# gpaths = [os.path.join(tmp_path, gname + "-split5N.fna") for gname in gs]
+# nbn = 5
+# # Run analysis
+# gfunc.analyse_all_genomes(genomes, DBPATH, tmp_path, nbn)
+# # construct expected results
+# exp_genomes = {gs[0]: ["SAEN.1113", gpaths[0], 51, 4, 2],
+# gs[1]: ["SAEN.1114", gpaths[1], 55, 5, 4],
+# gs[2]: ["ESCO.0416", gpaths[2], 70, 4, 1]}
+# out_f = [os.path.join(tmp_path, gname + "-split5N.fna") for gname in gs]
+# exp_f = [os.path.join(tmp_path, "exp_files", "res_" + gname + "-split5N.fna") for gname in gs]
+# assert exp_genomes == genomes
+# for out, exp in zip(out_f[:-1], exp_f[:-1]):
+# with open(out, "r") as outf, open(exp, "r") as expf:
+# for line_exp, line_out in zip(expf, outf):
+# assert line_exp == line_out
+# os.remove(out)
+# with open(out_f[-1], "r") as outf:
+# assert outf.readlines() == []
+# os.remove(out_f[-1])
+# os.remove(os.path.join(DBPATH, gs[3]))
+
+
+# def get_plot_distribs():
+# """
+# For all genomes, plot the distribution of their L90 values, and their number of contigs.
+# Add a vertical line at the given threshold.
+# genomes: {genome: [name, path, size, nbcont, l90]}
+# output of plot_distributions is L90_vals, nbcont_vals, l90_dist, nbcont_dist
+# these outputs will be compared to expected results in tests
+# """
+# genomes_dir = os.path.join("test", "data", "annotate", "genomes")
+# gs = ["genome1.fasta", "genome2.fasta", "genome3.fasta", "genome4.fasta",
+# "genome5.fasta", "genome6.fasta", "genome7.fasta"]
+# genomes = {gs[0]: ["SAEN.1113", os.path.join(genomes_dir, gs[0]), 51, 2, 2],
+# gs[1]: ["SAEN.1114", os.path.join(genomes_dir, gs[1]), 67, 15, 13],
+# gs[2]: ["ESCO.0416", os.path.join(genomes_dir, gs[2]), 70, 15, 11],
+# gs[3]: ["ESCO.0216", os.path.join(genomes_dir, gs[3]), 114, 17, 11],
+# gs[4]: ["SAEN.1115", os.path.join(genomes_dir, gs[4]), 106, 17, 12],
+# gs[5]: ["ESCO.0216", os.path.join(genomes_dir, gs[5]), 116, 60, 50],
+# gs[6]: ["SAEN.1115", os.path.join(genomes_dir, gs[6]), 137, 20, 12]}
+# res_path = os.path.join("test", "data", "annotate")
+# listfile_base = "test_plot_dist"
+# l90 = 13
+# nbconts = 19
+# outdist = gfunc.plot_distributions(genomes, res_path, listfile_base, l90, nbconts)
+# return outdist
+
+
+# @pytest.mark.mpl_image_compare(baseline_dir=BASELINE_DIR, tolerance=6, backend="agg")
+# def test_dist_l90():
+# """
+# For created L90 graph, check that calculated L90 values are as expected,
+# and graph is also as expected
+# """
+# res_path = os.path.join("test", "data", "annotate")
+# listfile_base = "test_plot_dist"
+# outfile1 = os.path.join(res_path, "QC_L90-" + listfile_base + ".png")
+# outfile2 = os.path.join(res_path, "QC_nb-contigs-" + listfile_base + ".png")
+# l90, _, dist, _ = get_plot_distribs()
+# # Check that png file was created
+# assert os.path.isfile(outfile1)
+# assert os.path.isfile(outfile2)
+# os.remove(outfile1)
+# os.remove(outfile2)
+# # Check values calculated for l90
+# assert set(l90) == {2, 13, 11, 11, 12, 50, 12}
+# # Check that output plot is as expected
+# return dist
+
+
+# @pytest.mark.mpl_image_compare(baseline_dir=BASELINE_DIR, tolerance=6, backend="agg")
+# def test_dist_nbcont():
+# """
+# For the created nb-contigs graph, check that the calculated numbers of contigs are as
+# expected, and that the graph is also as expected
+# """
+# res_path = os.path.join("test", "data", "annotate")
+# listfile_base = "test_plot_dist"
+# outfile1 = os.path.join(res_path, "QC_L90-" + listfile_base + ".png")
+# outfile2 = os.path.join(res_path, "QC_nb-contigs-" + listfile_base + ".png")
+# _, nbcont, _, dist = get_plot_distribs()
+# # Check that png file was created
+# assert os.path.isfile(outfile1)
+# assert os.path.isfile(outfile2)
+# os.remove(outfile1)
+# os.remove(outfile2)
+# # Check values calculated for nbcont
+# assert set(nbcont) == {2, 15, 15, 17, 17, 60, 20}
+# # Check that output plot is as expected
+# return dist
diff --git a/test/test_unit/test_gene.py b/test/test_unit/test_gene.py
deleted file mode 100644
index 5fa7ae74..00000000
--- a/test/test_unit/test_gene.py
+++ /dev/null
@@ -1,4 +0,0 @@
-import PanACoTA.annotate_module.genome_seq_functions as utils
-
-def test_toto():
- print("toto")
\ No newline at end of file
--
GitLab