From c5ac2d6b11b4bb6cf79505e2f21838bf6de87eb6 Mon Sep 17 00:00:00 2001
From: Amandine PERRIN <amandine.perrin@pasteur.fr>
Date: Mon, 12 Oct 2020 12:29:07 +0200
Subject: [PATCH] Adapt test to new 'splitcontig format'

Before, when we split contigs, we added '_num' at the end of the header. Now, it is at the beginning.
---
 .../annotate_module/genome_seq_functions.py   | 11 +-----
 .../annotate/exp_files/genome2-split5N.fna    | 10 +++---
 .../res_genome_short-long_header.fst          |  6 ++--
 .../test_annotate/test_genome_func.py         | 36 +++++++++----------
 4 files changed, 27 insertions(+), 36 deletions(-)

diff --git a/PanACoTA/annotate_module/genome_seq_functions.py b/PanACoTA/annotate_module/genome_seq_functions.py
index 6ec2bb81..dbb8a2b3 100755
--- a/PanACoTA/annotate_module/genome_seq_functions.py
+++ b/PanACoTA/annotate_module/genome_seq_functions.py
@@ -310,16 +310,7 @@ def format_contig(cut, pat, cur_seq, cur_contig_name, contig_sizes, gresf, num,
     if cut:
         # Cut sequence and write header + sequence to res file
         num = split_contig(pat, cur_seq, cur_contig_name, contig_sizes, gresf, num)
-    # PROKKA User does not want to cut, but will annotate with prokka, so we still
-    # have to create a new sequence file
-    elif gresf:
-        new_contig_name = "{}_{}\n".format(cur_contig_name, num)
-        gresf.write(new_contig_name)
-        gresf.write(cur_seq + "\n")
-        contig_sizes[new_contig_name] = len(cur_seq)
-        num += 1
-    # PRODIGAL No cut, and prodigal used -> no new file created, but check
-    # contig unique names
+    # No cut -> no new file created, but check contig unique names
     else:
         if cur_contig_name in contig_sizes.keys():
             logger.error("{} contig name is used for several contigs. Please put "
diff --git a/test/data/annotate/exp_files/genome2-split5N.fna b/test/data/annotate/exp_files/genome2-split5N.fna
index e58ea892..f2053a5e 100644
--- a/test/data/annotate/exp_files/genome2-split5N.fna
+++ b/test/data/annotate/exp_files/genome2-split5N.fna
@@ -1,10 +1,10 @@
->contig1 dgfdgd_1
+>1_contig1 dgfdgd
 ACGTTGCTGC
->contig1 dgfdgd_2
+>2_contig1 dgfdgd
 AGCTGTCTAG
->contig2_3
+>3_contig2
 CGACGNNCGAG
->contig2_4
+>4_contig2
 AGGTG
->contig3_5
+>5_contig3
 ANNNNGGCTTGAGGTTGAA
diff --git a/test/data/annotate/exp_files/res_genome_short-long_header.fst b/test/data/annotate/exp_files/res_genome_short-long_header.fst
index 4a61e377..aaccf955 100755
--- a/test/data/annotate/exp_files/res_genome_short-long_header.fst
+++ b/test/data/annotate/exp_files/res_genome_short-long_header.fst
@@ -1,6 +1,6 @@
->Long_header_with_same_1_1
+>1_Long_header_with_same_1
 AATTGCGCTAGCGCTAGGCGCTAGCGCGCTAGAGCCGCTAGGCGCCATTACGGCGCTATCCGCACGCGCATGCCACCGTTAG
->Long_header_with_same_2_2
+>2_Long_header_with_same_2
 AACCGTTGGGGGGGGGGCCCCATTAGGCGCGGAATTTTCG
->Long_header_with_same_3_3
+>3_Long_header_with_same_3
 ACGGCTCGCGGAGAGAGAGAGCTCGCGCA
diff --git a/test/test_unit/test_annotate/test_genome_func.py b/test/test_unit/test_annotate/test_genome_func.py
index 2e782af1..b1792421 100755
--- a/test/test_unit/test_annotate/test_genome_func.py
+++ b/test/test_unit/test_annotate/test_genome_func.py
@@ -40,14 +40,16 @@ def setup_teardown_module():
     - remove directory with generated results
     """
     # utils.init_logger(LOGFILE_BASE, 0, 'test_postalign', verbose=1)
-    os.mkdir(GENEPATH)
+    if os.path.isdir(GENEPATH):
+        content = os.listdir(GENEPATH)
+        for f in content:
+            assert f.startswith(".fuse")
+    else:
+        os.mkdir(GENEPATH)
     print("setup")
 
     yield
-    shutil.rmtree(GENEPATH)
-    # for f in LOGFILES:
-    #     if os.path.exists(f):
-    #         os.remove(f)
+    shutil.rmtree(GENEPATH, ignore_errors=True)
     print("teardown")
 
 
@@ -195,7 +197,7 @@ def test_format_contig_cut():
     cut = True
     pat = 'NNNNN+'
     cur_seq = "AACTGCTTTTTAAGCGCGCTCCTGCGNNNNNGGTTGTGTGGGCCCAGAGCGAGNCG"
-    cur_contig_name = ">my_contig_name_for_my_sequence"
+    cur_contig_name = ">my_contig_name for_my_sequence"
     contig_sizes = {}
     resfile = os.path.join(GENEPATH, "test_format_cont_cut5N.fna")
     gresf = open(resfile, "w")
@@ -208,8 +210,8 @@ def test_format_contig_cut():
     exp_file = os.path.join(EXP_DIR, "exp_split_contig_cut3N.fna")
     assert os.path.exists(resfile)
     assert tutil.compare_order_content(resfile, exp_file)
-    assert contig_sizes == {">my_contig_name_for_my_sequence_2\n": 26,
-                            ">my_contig_name_for_my_sequence_3\n": 25}
+    assert contig_sizes == {">2_my_contig_name for_my_sequence\n": 26,
+                            ">3_my_contig_name for_my_sequence\n": 25}
 
 
 def test_format_contig_nocut():
@@ -220,23 +222,21 @@ def test_format_contig_nocut():
     cut = False
     pat = None
     cur_seq = "AACTGCTTTTTAAGCGCGCTCCTGCGNNNNNGGTTGTGTGGGCCCAGAGCGAGNCG"
-    cur_contig_name = ">my_contig_name_for_my_sequence"
+    cur_contig_name = ">my_contig_name_for_my_sequence\n"
     contig_sizes = {}
     resfile = os.path.join(GENEPATH, "test_format_cont_nocut_prokka.fna")
-    gresf = open(resfile, "w")
+    gresf = None
     num = 2
 
     assert gfunc.format_contig(cut, pat, cur_seq, cur_contig_name, contig_sizes, gresf,
-                               num, logger=None) == 3
-    gresf.close()
+                               num, logger=None) == 2
 
     exp_file = os.path.join(EXP_DIR, "exp_split_contig_nocut.fna")
-    assert os.path.exists(resfile)
-    assert tutil.compare_order_content(resfile, exp_file)
-    assert contig_sizes == {">my_contig_name_for_my_sequence_2\n": 56}
+    assert not os.path.exists(resfile)
+    assert contig_sizes == {">my_contig_name_for_my_sequence\n": 56}
 
 
-def test_format_contig_nocut_prodigal_notSameName():
+def test_format_contig_nocut_notDuplicateName():
     """
     For a given contig, if we want to annotate it with prodigal, and do not cut,
     then we keep the same file (no need to split at 20 characters)
@@ -262,7 +262,7 @@ def test_format_contig_nocut_prodigal_notSameName():
                             ">mycontig": 155}
 
 
-def test_format_contig_nocut_prodigal_SameName(caplog):
+def test_format_contig_nocut_DuplicateName(caplog):
     """
     For a given contig, if we want to annotate it with prodigal, and do not cut, then we keep the same file. However, we must check that contig names are all different.
     Try to add a contig which name is already used, check that it prints the expected error,
@@ -460,7 +460,7 @@ def test_analyse1genome_cut_same_names():
     """
     Analyse a genome. Its contig names all have the same first 20 characters. There is no
     stretch of at least 5N, so contigs are not split.
-    New contig names should be uniq, and not all ending with _0!
+    New contig names should be uniq, and not all starting with '1_'!
     """
     genome = "genome_long_header.fst"
     genomes = {genome: ["SAEN.1015.0117"]}
-- 
GitLab