From 136e1c333a38939213c993987e99f94e0d4b6e37 Mon Sep 17 00:00:00 2001 From: Amandine PERRIN <amandine.perrin@pasteur.fr> Date: Thu, 16 Sep 2021 19:05:34 +0200 Subject: [PATCH] gene name starting by genome name --- PanACoTA/align_module/post_align.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/PanACoTA/align_module/post_align.py b/PanACoTA/align_module/post_align.py index 0762a80d..74b52222 100755 --- a/PanACoTA/align_module/post_align.py +++ b/PanACoTA/align_module/post_align.py @@ -326,15 +326,10 @@ def get_genome(header, all_genomes): header = header.split(">")[1].split()[0] for genome in all_genomes: - if genome in header: - # header should be genome<something>_num - # -> header.split(genome) should be empty for the first field - # If not empty, means that genome name is included into another genome name, so - # we must not return this genome. - # For example, genome "8-KG" is in header "98-KG_xxx", but the correct genome for this - # header is "98-KG" - if not header.split(genome)[0]: - return genome + if header.startswith(genome): + # header should start with the genome name. Nothing before it. + # Ex: >86KG_12345 is from genome 86KG. >6KG_12345 is from genome 6KG, not 86KG + return genome logger.error((f"Protein {header} does not correspond to any genome name " f"given... {all_genomes}")) return None -- GitLab