Adapt test to new 'splitcontig format'

Before, when we split contigs, we added '_num' at the end of the header. Now, it is at the beginning.

Adapt test to new 'splitcontig format'
c5ac2d6b · Amandine PERRIN · fb1b7b69 · c5ac2d6b · c5ac2d6b · c5ac2d6b
Commit c5ac2d6b authored 4 years ago by Amandine PERRIN
--- a/PanACoTA/annotate_module/genome_seq_functions.py
+++ b/PanACoTA/annotate_module/genome_seq_functions.py
@@ -310,16 +310,7 @@ def format_contig(cut, pat, cur_seq, cur_contig_name, contig_sizes, gresf, num,
    if cut:
        # Cut sequence and write header + sequence to res file
        num = split_contig(pat, cur_seq, cur_contig_name, contig_sizes, gresf, num)
-    # PROKKA User does not want to cut, but will annotate with prokka, so we still
-    # have to create a new sequence file
-    elif gresf:
-        new_contig_name = "{}_{}\n".format(cur_contig_name, num)
-        gresf.write(new_contig_name)
-        gresf.write(cur_seq + "\n")
-        contig_sizes[new_contig_name] = len(cur_seq)
-        num += 1
-    # PRODIGAL No cut, and prodigal used -> no new file created, but check
-    # contig unique names
+    # No cut -> no new file created, but check contig unique names
    else:
        if cur_contig_name in contig_sizes.keys():
            logger.error("{} contig name is used for several contigs. Please put "

--- a/test/data/annotate/exp_files/genome2-split5N.fna
+++ b/test/data/annotate/exp_files/genome2-split5N.fna
->contig1 dgfdgd_1
+>1_contig1 dgfdgd
 ACGTTGCTGC
->contig1 dgfdgd_2
+>2_contig1 dgfdgd
 AGCTGTCTAG
->contig2_3
+>3_contig2
 CGACGNNCGAG
->contig2_4
+>4_contig2
 AGGTG
->contig3_5
+>5_contig3
 ANNNNGGCTTGAGGTTGAA
--- a/test/data/annotate/exp_files/res_genome_short-long_header.fst
+++ b/test/data/annotate/exp_files/res_genome_short-long_header.fst
->Long_header_with_same_1_1
+>1_Long_header_with_same_1
 AATTGCGCTAGCGCTAGGCGCTAGCGCGCTAGAGCCGCTAGGCGCCATTACGGCGCTATCCGCACGCGCATGCCACCGTTAG
->Long_header_with_same_2_2
+>2_Long_header_with_same_2
 AACCGTTGGGGGGGGGGCCCCATTAGGCGCGGAATTTTCG
->Long_header_with_same_3_3
+>3_Long_header_with_same_3
 ACGGCTCGCGGAGAGAGAGAGCTCGCGCA
--- a/test/test_unit/test_annotate/test_genome_func.py
+++ b/test/test_unit/test_annotate/test_genome_func.py
@@ -40,14 +40,16 @@ def setup_teardown_module():
    - remove directory with generated results
    """
    # utils.init_logger(LOGFILE_BASE, 0, 'test_postalign', verbose=1)
-    os.mkdir(GENEPATH)
+    if os.path.isdir(GENEPATH):
+        content = os.listdir(GENEPATH)
+        for f in content:
+            assert f.startswith(".fuse")
+    else:
+        os.mkdir(GENEPATH)
    print("setup")

    yield
-    shutil.rmtree(GENEPATH)
-    # for f in LOGFILES:
-    #     if os.path.exists(f):
-    #         os.remove(f)
+    shutil.rmtree(GENEPATH, ignore_errors=True)
    print("teardown")


@@ -195,7 +197,7 @@ def test_format_contig_cut():
    cut = True
    pat = 'NNNNN+'
    cur_seq = "AACTGCTTTTTAAGCGCGCTCCTGCGNNNNNGGTTGTGTGGGCCCAGAGCGAGNCG"
-    cur_contig_name = ">my_contig_name_for_my_sequence"
+    cur_contig_name = ">my_contig_name for_my_sequence"
    contig_sizes = {}
    resfile = os.path.join(GENEPATH, "test_format_cont_cut5N.fna")
    gresf = open(resfile, "w")
@@ -208,8 +210,8 @@ def test_format_contig_cut():
    exp_file = os.path.join(EXP_DIR, "exp_split_contig_cut3N.fna")
    assert os.path.exists(resfile)
    assert tutil.compare_order_content(resfile, exp_file)
-    assert contig_sizes == {">my_contig_name_for_my_sequence_2\n": 26,
-                            ">my_contig_name_for_my_sequence_3\n": 25}
+    assert contig_sizes == {">2_my_contig_name for_my_sequence\n": 26,
+                            ">3_my_contig_name for_my_sequence\n": 25}


 def test_format_contig_nocut():
@@ -220,23 +222,21 @@ def test_format_contig_nocut():
    cut = False
    pat = None
    cur_seq = "AACTGCTTTTTAAGCGCGCTCCTGCGNNNNNGGTTGTGTGGGCCCAGAGCGAGNCG"
-    cur_contig_name = ">my_contig_name_for_my_sequence"
+    cur_contig_name = ">my_contig_name_for_my_sequence\n"
    contig_sizes = {}
    resfile = os.path.join(GENEPATH, "test_format_cont_nocut_prokka.fna")
-    gresf = open(resfile, "w")
+    gresf = None
    num = 2

    assert gfunc.format_contig(cut, pat, cur_seq, cur_contig_name, contig_sizes, gresf,
-                               num, logger=None) == 3
-    gresf.close()
+                               num, logger=None) == 2

    exp_file = os.path.join(EXP_DIR, "exp_split_contig_nocut.fna")
-    assert os.path.exists(resfile)
-    assert tutil.compare_order_content(resfile, exp_file)
-    assert contig_sizes == {">my_contig_name_for_my_sequence_2\n": 56}
+    assert not os.path.exists(resfile)
+    assert contig_sizes == {">my_contig_name_for_my_sequence\n": 56}


-def test_format_contig_nocut_prodigal_notSameName():
+def test_format_contig_nocut_notDuplicateName():
    """
    For a given contig, if we want to annotate it with prodigal, and do not cut,
    then we keep the same file (no need to split at 20 characters)
@@ -262,7 +262,7 @@ def test_format_contig_nocut_prodigal_notSameName():
                            ">mycontig": 155}


-def test_format_contig_nocut_prodigal_SameName(caplog):
+def test_format_contig_nocut_DuplicateName(caplog):
    """
    For a given contig, if we want to annotate it with prodigal, and do not cut, then we keep the same file. However, we must check that contig names are all different.
    Try to add a contig which name is already used, check that it prints the expected error,
@@ -460,7 +460,7 @@ def test_analyse1genome_cut_same_names():
    """
    Analyse a genome. Its contig names all have the same first 20 characters. There is no
    stretch of at least 5N, so contigs are not split.
-    New contig names should be uniq, and not all ending with _0!
+    New contig names should be uniq, and not all starting with '1_'!
    """
    genome = "genome_long_header.fst"
    genomes = {genome: ["SAEN.1015.0117"]}