Commit c78cb131 authored by Kenzo-Hugo Hillion
Browse files

parser for eggnog functional categories

parent fe09bec9
Pipeline #19770 failed with stages
in 2 minutes and 18 seconds
......@@ -5,7 +5,7 @@ from django.core.exceptions import ValidationError
from metagenedb.apps.catalog.models import EggNog, EggNogFunctionalCategory
from metagenedb.common.utils.chunks import file_len
from metagenedb.common.utils.parsers import EggNogAnnotationLineParser
from metagenedb.common.utils.parsers import EggNOGAnnotationLineParser
logging.basicConfig(format='[%(asctime)s] %(levelname)s:%(name)s:%(message)s')
logger = logging.getLogger(__name__)
......@@ -15,7 +15,7 @@ class ImportEggNog(object):
def __init__(self, file_path):
self.annotation_file = file_path
self.eggnog_parser = EggNogAnnotationLineParser()
self.eggnog_parser = EggNOGAnnotationLineParser()
self.processed_count = 0
self.created_count = 0
self.updated_count = 0
......
from .eggnog import EggNogAnnotationLineParser # noqa
from .eggnog import EggNOGAnnotationLineParser # noqa
from .igc import IGCLineParser # noqa
from .kegg import KEGGLineParser # noqa
from .ncbi_taxonomy import NCBITaxonomyLineParser # noqa
class FileParser:
    """Base class for parsers that consume a whole text file.

    Subclasses override ``handle_parsing`` to turn the open file into the
    structure they need; ``parse`` takes care of opening and closing the file.
    """

    def __init__(self, file_path):
        """Store the path of the file to parse; the file is not opened here."""
        self.file_path = file_path

    def handle_parsing(self, file_handler):
        """Print every line of ``file_handler`` with trailing whitespace removed.

        This method needs to be overridden to really handle the parsing.

        Args:
            file_handler: iterable yielding text lines (e.g. an open file).
        """
        for line in file_handler:
            print(line.rstrip())

    def parse(self):
        """Open ``self.file_path`` in read mode and run ``handle_parsing`` on it.

        Returns:
            Whatever ``handle_parsing`` returns (``None`` for the default
            implementation), so that subclasses building a result can hand
            it back to the caller instead of having it silently dropped.
        """
        with open(self.file_path, 'r') as file:
            # Propagate the handler's result; the file is still closed on exit.
            return self.handle_parsing(file)
import logging
from .base import FileParser
_LOGGER = logging.getLogger(__name__)
class EggNogAnnotationLineParser(object):
class EggNOGAnnotationLineParser:
@staticmethod
def get_dict(line):
......@@ -19,4 +21,26 @@ class EggNogAnnotationLineParser(object):
}
except Exception:
_LOGGER.error(f"Could not parse: {line.rstrip()}. Are you sure it comes from eggnog annotations.tsv?")
raise
raise Exception("Impossible to parse given line as eggnog from annotation.tsv file")
class EggNOGFunctionalCategoriesParser(FileParser):
    """
    Parse functional categories file from EggNOG

    Lines starting with ``[`` (after stripping) are read as categories; any
    other non-empty line is read as the group heading for the categories
    that follow it.
    """

    def handle_parsing(self, file_handler):
        """Build the list of functional categories found in ``file_handler``.

        Args:
            file_handler: iterable yielding text lines (e.g. an open file).

        Returns:
            A list of dicts with the keys ``category_id`` (the character
            right after the opening bracket), ``name`` (the text following
            the bracketed token, or ``""`` when there is none) and ``group``
            (the last group heading seen, ``"Unknown"`` before any heading).
        """
        functional_categories = []
        current_group = "Unknown"
        for raw_line in file_handler:
            line = raw_line.strip()
            if not line:
                # Blank separator lines carry no information.
                continue
            if not line.startswith('['):
                # It is a group of a category
                current_group = line
                continue
            # It is a category: "[X] Some name"
            elements = line.split(maxsplit=1)
            functional_categories.append({
                'category_id': elements[0][1],
                # A bare "[X]" line has no name part; use an empty name
                # rather than failing with IndexError.
                'name': elements[1] if len(elements) > 1 else "",
                'group': current_group,
            })
        return functional_categories
from unittest import TestCase
from metagenedb.common.utils.parsers import EggNogAnnotationLineParser
from metagenedb.common.utils.parsers.eggnog import (
EggNOGAnnotationLineParser,
EggNOGFunctionalCategoriesParser
)
class TestEggNogAnnotationLineParser(TestCase):
class TestEggNOGAnnotationLineParser(TestCase):
def test_get_dict(self):
ko_line = "1\t28H54\tK\ttranslational termination\n"
......@@ -12,7 +15,7 @@ class TestEggNogAnnotationLineParser(TestCase):
'name': "translational termination",
'functional_categories': ["K"]
}
test_dict = EggNogAnnotationLineParser.get_dict(ko_line)
test_dict = EggNOGAnnotationLineParser.get_dict(ko_line)
self.assertDictEqual(test_dict, expected_dict)
def test_get_dict_no_name(self):
......@@ -22,7 +25,7 @@ class TestEggNogAnnotationLineParser(TestCase):
'name': "",
'functional_categories': ["S"]
}
test_dict = EggNogAnnotationLineParser.get_dict(ko_line)
test_dict = EggNOGAnnotationLineParser.get_dict(ko_line)
self.assertDictEqual(test_dict, expected_dict)
def test_get_dict_long_name(self):
......@@ -32,7 +35,7 @@ class TestEggNogAnnotationLineParser(TestCase):
'name': "Glucose-responsive transcription factor that regulates expression of several glucose transporter (HXT) genes in response to glucose", # noqa
'functional_categories': ["S"]
}
test_dict = EggNogAnnotationLineParser.get_dict(ko_line)
test_dict = EggNOGAnnotationLineParser.get_dict(ko_line)
self.assertDictEqual(test_dict, expected_dict)
def test_get_dict_multi_categories(self):
......@@ -42,10 +45,27 @@ class TestEggNogAnnotationLineParser(TestCase):
'name': "translational termination",
'functional_categories': ["K", "S"]
}
test_dict = EggNogAnnotationLineParser.get_dict(ko_line)
test_dict = EggNOGAnnotationLineParser.get_dict(ko_line)
self.assertDictEqual(test_dict, expected_dict)
def test_get_dict_wrong_format(self):
ko_line = "This is a wrong line format, with; information and tab"
with self.assertRaises(Exception) as context: # noqa
EggNogAnnotationLineParser.get_dict(ko_line)
EggNOGAnnotationLineParser.get_dict(ko_line)
class TestEggNOGFunctionalCategoriesParser(TestCase):
    """Tests for EggNOGFunctionalCategoriesParser.handle_parsing."""

    def test_parse_file(self):
        """Each category is reported with the group heading that precedes it."""
        # A plain list of lines stands in for an open file handle.
        lines = [
            "FIRST GROUP\n",
            " [A] Categorie name A\n",
            " [B] Categorie name B\n",
            "\n",
            "SECOND GROUP\n",
            " [C] Categorie name C\n",
            " [D] Categorie name D\n",
        ]
        expected = [
            {'category_id': 'A', 'group': 'FIRST GROUP', 'name': 'Categorie name A'},
            {'category_id': 'B', 'group': 'FIRST GROUP', 'name': 'Categorie name B'},
            {'category_id': 'C', 'group': 'SECOND GROUP', 'name': 'Categorie name C'},
            {'category_id': 'D', 'group': 'SECOND GROUP', 'name': 'Categorie name D'},
        ]
        # The path is never opened: handle_parsing is called directly.
        categories_parser = EggNOGFunctionalCategoriesParser("test")
        parsed = categories_parser.handle_parsing(lines)
        self.assertListEqual(parsed, expected)
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment