From 136e1c333a38939213c993987e99f94e0d4b6e37 Mon Sep 17 00:00:00 2001
From: Amandine PERRIN <amandine.perrin@pasteur.fr>
Date: Thu, 16 Sep 2021 19:05:34 +0200
Subject: [PATCH] gene name must start with genome name

---
 PanACoTA/align_module/post_align.py | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/PanACoTA/align_module/post_align.py b/PanACoTA/align_module/post_align.py
index 0762a80d..74b52222 100755
--- a/PanACoTA/align_module/post_align.py
+++ b/PanACoTA/align_module/post_align.py
@@ -326,15 +326,10 @@ def get_genome(header, all_genomes):
     header = header.split(">")[1].split()[0]
 
     for genome in all_genomes:
-        if genome in header:
-            # header should be genome<something>_num
-            # -> header.split(genome) should be empty for the first field
-            # If not empty, means that genome name is included into another genome name, so
-            # we must not return this genome.
-            # For example, genome "8-KG" is in header "98-KG_xxx", but the correct genome for this
-            # header is "98-KG"
-            if not header.split(genome)[0]:
-                return genome
+        if header.startswith(genome):
+            # The header must start with the genome name: nothing may precede it.
+            # Ex: >86KG_12345 is from genome 86KG, not 6KG, even though "6KG" occurs inside it.
+            return genome
     logger.error((f"Protein {header} does not correspond to any genome name "
                   f"given... {all_genomes}"))
     return None
-- 
GitLab