Skip to content
Snippets Groups Projects
Commit 072bbd29 authored by Amandine  PERRIN's avatar Amandine PERRIN
Browse files

Merge branch 'master' into dev

parents 4fc48470 64c45ef9
No related branches found
No related tags found
No related merge requests found
......@@ -320,10 +320,21 @@ def get_genome(header, all_genomes):
name of genome from which the header is
None if no genome found
"""
# The protein name is not always of the form <genome-name>_<num>.
# Ex: in the gembase complete DB, >TOTO.0215.00002.i006_00065 is from genome TOTO.0215.00002
# So the genome name cannot be deduced directly from the header, but it is always included in it.
header = header.split(">")[1].split()[0]
for genome in all_genomes:
if genome in header:
return genome
# The header should be of the form genome<something>_num
# -> the first field of header.split(genome) should be empty.
# If it is not empty, the genome name is contained in another genome name, so
# we must not return this genome.
# For example, genome "8-KG" is contained in header "98-KG_xxx", but the correct genome
# for this header is "98-KG".
if not header.split(genome)[0]:
return genome
logger.error((f"Protein {header} does not correspond to any genome name "
f"given... {all_genomes}"))
return None
......@@ -55,14 +55,22 @@ def test_get_genome():
assert pal.get_genome(header, genomes) == "TOTO.0215.00002"
def test_get_genome_not_start():
def test_get_genome_included():
"""
Given a header and a list of genomes, check that it returns the expected genome. The genome
name is not at the beginning of the protein name
Given a header and a list of genomes, check that it returns the expected genome
"""
header = ">mongenome,TOTO.0215.00002.i006_00065"
genomes = ["TOTO.0315.00001", "ESCO.0215.00002", "ESCO.0215.00001", "TOTO.0215.00002"]
assert pal.get_genome(header, genomes) == "TOTO.0215.00002"
header = ">aTOTO.0215.00002.i006_00065"
genomes = [ "TOTO.0215.00002", "TOTO.0315.00001", "ESCO.0215.00002", "aTOTO.0215.00002"]
assert pal.get_genome(header, genomes) == "aTOTO.0215.00002"
# def test_get_genome_not_start():
# """
# Given a header and a list of genomes, check that it returns the expected genome. The genome
# name is not at the beginning of the protein name
# """
# header = ">mongenome,TOTO.0215.00002.i006_00065"
# genomes = ["TOTO.0315.00001", "ESCO.0215.00002", "ESCO.0215.00001", "TOTO.0215.00002"]
# assert pal.get_genome(header, genomes) == "TOTO.0215.00002"
def test_get_genome_notfound(caplog):
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment