From 8cdec6cfd61d5c2712e0935be58935ed637d70a1 Mon Sep 17 00:00:00 2001
From: Amandine PERRIN <amandine.perrin@pasteur.fr>
Date: Mon, 14 Sep 2020 14:40:14 +0200
Subject: [PATCH] Test for generation of new contig names -> corrected error in
 logs

---
 PanACoTA/utils.py            |  7 +++++-
 test/test_unit/test_utils.py | 44 ++++++++++++++++++++++++++++++++++++
 2 files changed, 50 insertions(+), 1 deletion(-)

diff --git a/PanACoTA/utils.py b/PanACoTA/utils.py
index 04969397..4d51eecf 100755
--- a/PanACoTA/utils.py
+++ b/PanACoTA/utils.py
@@ -1150,6 +1150,10 @@ def get_genome_contigs_and_rename(gembase_name, gpath, outfile, logger):
                     prevcont_nohead = prev_cont.split(">")[1]
                     prev_orig_name_nohead = prev_orig_name.split(">")[1]
                     if prev_orig_name_nohead:
+                        if prev_orig_name_nohead in contigs:
+                            logger.error(f"several contigs have the same name "
+                                         f"{prev_orig_name_nohead} in {gpath}.")
+                            return False, False
                         sizes[prevcont_nohead] = cont_size
                         contigs[prev_orig_name_nohead] = prevcont_nohead
                         grf.write(cont)
@@ -1170,7 +1174,8 @@ def get_genome_contigs_and_rename(gembase_name, gpath, outfile, logger):
         prev_orig_name_nohead = prev_orig_name.split(">")[1]
         if prev_orig_name_nohead:
             if prev_orig_name_nohead in contigs:
-                logger.error(f"several contigs have the same name {prev_cont} in {gpath}.")
+                logger.error(f"several contigs have the same name {prev_orig_name_nohead} "
+                             f"in {gpath}.")
                 return False, False
             contigs[prev_orig_name_nohead] = prevcont_nohead
             sizes[prevcont_nohead] = cont_size
diff --git a/test/test_unit/test_utils.py b/test/test_unit/test_utils.py
index 57fffe61..a1a79707 100755
--- a/test/test_unit/test_utils.py
+++ b/test/test_unit/test_utils.py
@@ -918,6 +918,50 @@ def test_rename_contigs():
     assert utilities.compare_order_content(outfile, exp_file)
 
 
+def test_rename_contigs_duplicate(caplog):
+    """
+    The genome contains 2 contigs named "contig2": an error naming the duplicated contig
+    is logged, and the function returns a false value for both contigs and sizes.
+    """
+    logger = logging.getLogger("default")
+    gpath = os.path.join(DATA_DIR, "genomes", "genome-duplicated-header.fasta")
+    gembase_name = "ESCO.0216.00005"
+    outfile = os.path.join(GENEPATH, "genome_dup_error.fna")
+    contigs, sizes = utils.get_genome_contigs_and_rename(gembase_name, gpath, outfile, logger)
+    assert not contigs
+    assert not sizes
+    # Even though the function failed, the output file starts with 3 renamed contig headers
+    with open(outfile, "r") as of:
+        assert of.readline().startswith(">ESCO.0216.00005.0001")
+        of.readline()  # skip sequence
+        assert of.readline().startswith(">ESCO.0216.00005.0002")
+        of.readline()  # skip sequence
+        assert of.readline().startswith(">ESCO.0216.00005.0003")
+    assert f"several contigs have the same name contig2 in {gpath}." in caplog.text
+
+
+def test_rename_contigs_duplicate_last(caplog):
+    """
+    The last contig of the genome has the same name ("contig2") as a previous contig: an
+    error is logged, and the function returns a false value for both contigs and sizes.
+    """
+    logger = logging.getLogger("default")
+    gpath = os.path.join(DATA_DIR, "genomes", "genome-duplicated-header-last.fasta")
+    gembase_name = "ESCO.0216.00005"
+    outfile = os.path.join(GENEPATH, "genome_dup_last_error.fna")
+    contigs, sizes = utils.get_genome_contigs_and_rename(gembase_name, gpath, outfile, logger)
+    assert not contigs
+    assert not sizes
+    # Even though the function failed, the output file starts with 3 renamed contig headers
+    with open(outfile, "r") as of:
+        assert of.readline().startswith(">ESCO.0216.00005.0001")
+        of.readline()  # skip sequence
+        assert of.readline().startswith(">ESCO.0216.00005.0002")
+        of.readline()  # skip sequence
+        assert of.readline().startswith(">ESCO.0216.00005.0003")
+    assert f"several contigs have the same name contig2 in {gpath}." in caplog.text
+
+
 def test_cat_nobar():
     """
     Check that when cat is called on a list of several files, the output file
-- 
GitLab