diff --git a/PanACoTA/align_module/post_align.py b/PanACoTA/align_module/post_align.py index 2b32776690318424e287e89e841f79005d4aa478..0762a80db75f53bcce9a3340e3f5c4e12fc83ed4 100755 --- a/PanACoTA/align_module/post_align.py +++ b/PanACoTA/align_module/post_align.py @@ -320,10 +320,21 @@ def get_genome(header, all_genomes): name of genome from which the header is None if no genome found """ + # Name of protein is not always genome-name_num + # Ex: in gembase complete DB: >TOTO.0215.00002.i006_00065 is from genome TOTO.0215.00002 + # So, genome name cannot be deduced directly from header. But it is always included in header header = header.split(">")[1].split()[0] + for genome in all_genomes: if genome in header: - return genome + # header should be genome<something>_num + # -> header.split(genome) should be empty for the first field + # If not empty, means that genome name is included into another genome name, so + # we must not return this genome. + # For example, genome "8-KG" is in header "98-KG_xxx", but the correct genome for this + # header is "98-KG" + if not header.split(genome)[0]: + return genome logger.error((f"Protein {header} does not correspond to any genome name " f"given... {all_genomes}")) return None diff --git a/test/test_unit/test_align/test_postalign.py b/test/test_unit/test_align/test_postalign.py index 6e3aadfcc3ed3b5f994937a9e578a82f7f4bfb8e..5f320f6345fce9b7aba0847372615deca03c8b99 100755 --- a/test/test_unit/test_align/test_postalign.py +++ b/test/test_unit/test_align/test_postalign.py @@ -55,14 +55,22 @@ def test_get_genome(): assert pal.get_genome(header, genomes) == "TOTO.0215.00002" -def test_get_genome_not_start(): +def test_get_genome_included(): """ - Given a header and a list of genomes, check that it returns the expected genome. The genome - name is not at the beginning of the protein name + Given a header and a list of genomes, check that it returns the expected genome """ - header = ">mongenome,TOTO.0215.00002.i006_00065" - genomes = ["TOTO.0315.00001", "ESCO.0215.00002", "ESCO.0215.00001", "TOTO.0215.00002"] - assert pal.get_genome(header, genomes) == "TOTO.0215.00002" + header = ">aTOTO.0215.00002.i006_00065" + genomes = [ "TOTO.0215.00002", "TOTO.0315.00001", "ESCO.0215.00002", "aTOTO.0215.00002"] + assert pal.get_genome(header, genomes) == "aTOTO.0215.00002" + +# def test_get_genome_not_start(): +# """ +# Given a header and a list of genomes, check that it returns the expected genome. The genome +# name is not at the beginning of the protein name +# """ +# header = ">mongenome,TOTO.0215.00002.i006_00065" +# genomes = ["TOTO.0315.00001", "ESCO.0215.00002", "ESCO.0215.00001", "TOTO.0215.00002"] +# assert pal.get_genome(header, genomes) == "TOTO.0215.00002" def test_get_genome_notfound(caplog):