Commit 072bbd29 authored by Amandine PERRIN
Browse files

Merge branch 'master' into dev

parents 4fc48470 64c45ef9
......@@ -320,10 +320,21 @@ def get_genome(header, all_genomes):
name of genome from which the header is
None if no genome found
"""
# The protein name is not always of the form genome-name_num.
# Ex: in the gembase complete DB, >TOTO.0215.00002.i006_00065 is from genome TOTO.0215.00002
# So the genome name cannot be deduced directly from the header, but it is always included in it.
header = header.split(">")[1].split()[0]
for genome in all_genomes:
if genome in header:
return genome
# The header should have the form genome<something>_num
# -> the first field of header.split(genome) should be empty.
# If it is not empty, the genome name is only included in another genome name, so
# we must not return this genome.
# For example, genome "8-KG" is found in header "98-KG_xxx", but the correct genome for
# this header is "98-KG".
if not header.split(genome)[0]:
return genome
logger.error((f"Protein {header} does not correspond to any genome name "
f"given... {all_genomes}"))
return None
......@@ -55,14 +55,22 @@ def test_get_genome():
assert pal.get_genome(header, genomes) == "TOTO.0215.00002"
def test_get_genome_not_start():
def test_get_genome_included():
"""
Given a header and a list of genomes, check that it returns the expected genome. The genome
name is not at the beginning of the protein name
Given a header and a list of genomes, check that it returns the expected genome
"""
header = ">mongenome,TOTO.0215.00002.i006_00065"
genomes = ["TOTO.0315.00001", "ESCO.0215.00002", "ESCO.0215.00001", "TOTO.0215.00002"]
assert pal.get_genome(header, genomes) == "TOTO.0215.00002"
header = ">aTOTO.0215.00002.i006_00065"
genomes = [ "TOTO.0215.00002", "TOTO.0315.00001", "ESCO.0215.00002", "aTOTO.0215.00002"]
assert pal.get_genome(header, genomes) == "aTOTO.0215.00002"
# def test_get_genome_not_start():
# """
# Given a header and a list of genomes, check that it returns the expected genome. The genome
# name is not at the beginning of the protein name
# """
# header = ">mongenome,TOTO.0215.00002.i006_00065"
# genomes = ["TOTO.0315.00001", "ESCO.0215.00002", "ESCO.0215.00001", "TOTO.0215.00002"]
# assert pal.get_genome(header, genomes) == "TOTO.0215.00002"
def test_get_genome_notfound(caplog):
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment