Commit c78cb131 authored by Kenzo-Hugo Hillion
Browse files

parser for eggnog functional categories

parent fe09bec9
Pipeline #19770 failed with stages
in 2 minutes and 18 seconds
...@@ -5,7 +5,7 @@ from django.core.exceptions import ValidationError ...@@ -5,7 +5,7 @@ from django.core.exceptions import ValidationError
from metagenedb.apps.catalog.models import EggNog, EggNogFunctionalCategory from metagenedb.apps.catalog.models import EggNog, EggNogFunctionalCategory
from metagenedb.common.utils.chunks import file_len from metagenedb.common.utils.chunks import file_len
from metagenedb.common.utils.parsers import EggNogAnnotationLineParser from metagenedb.common.utils.parsers import EggNOGAnnotationLineParser
logging.basicConfig(format='[%(asctime)s] %(levelname)s:%(name)s:%(message)s') logging.basicConfig(format='[%(asctime)s] %(levelname)s:%(name)s:%(message)s')
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
...@@ -15,7 +15,7 @@ class ImportEggNog(object): ...@@ -15,7 +15,7 @@ class ImportEggNog(object):
def __init__(self, file_path): def __init__(self, file_path):
self.annotation_file = file_path self.annotation_file = file_path
self.eggnog_parser = EggNogAnnotationLineParser() self.eggnog_parser = EggNOGAnnotationLineParser()
self.processed_count = 0 self.processed_count = 0
self.created_count = 0 self.created_count = 0
self.updated_count = 0 self.updated_count = 0
......
from .eggnog import EggNogAnnotationLineParser # noqa from .eggnog import EggNOGAnnotationLineParser # noqa
from .igc import IGCLineParser # noqa from .igc import IGCLineParser # noqa
from .kegg import KEGGLineParser # noqa from .kegg import KEGGLineParser # noqa
from .ncbi_taxonomy import NCBITaxonomyLineParser # noqa from .ncbi_taxonomy import NCBITaxonomyLineParser # noqa
class FileParser:
    """
    Base class for parsers that work on a whole file.

    Subclasses override ``handle_parsing`` to implement the actual parsing;
    ``parse`` takes care of opening the file and handing it over.
    """

    def __init__(self, file_path):
        """
        :param file_path: path of the file to parse
        """
        self.file_path = file_path

    def handle_parsing(self, file_handler):
        """
        This method needs to be overridden to really handle the parsing.

        The default implementation only prints each line with its trailing
        whitespace removed and returns nothing.

        :param file_handler: iterable yielding the lines to parse
        """
        for line in file_handler:
            print(line.rstrip())

    def parse(self):
        """
        Open ``self.file_path`` for reading and run ``handle_parsing`` on it.

        :return: whatever ``handle_parsing`` returns (``None`` with the
                 default implementation)
        """
        with open(self.file_path, 'r') as file:
            # Hand the result back so that subclasses returning parsed data
            # from handle_parsing are usable through parse().
            return self.handle_parsing(file)
import logging import logging
from .base import FileParser
_LOGGER = logging.getLogger(__name__) _LOGGER = logging.getLogger(__name__)
class EggNogAnnotationLineParser(object): class EggNOGAnnotationLineParser:
@staticmethod @staticmethod
def get_dict(line): def get_dict(line):
...@@ -19,4 +21,26 @@ class EggNogAnnotationLineParser(object): ...@@ -19,4 +21,26 @@ class EggNogAnnotationLineParser(object):
} }
except Exception: except Exception:
_LOGGER.error(f"Could not parse: {line.rstrip()}. Are you sure it comes from eggnog annotations.tsv?") _LOGGER.error(f"Could not parse: {line.rstrip()}. Are you sure it comes from eggnog annotations.tsv?")
raise raise Exception("Impossible to parse given line as eggnog from annotation.tsv file")
class EggNOGFunctionalCategoriesParser(FileParser):
    """
    Parser for the functional categories file from EggNOG
    """

    def handle_parsing(self, file_handler):
        """
        Build the list of functional categories found in the given lines.

        A line starting with ``[`` is read as a category; any other non-empty
        line is read as the group name applied to the categories that follow.
        Categories seen before any group line get the group "Unknown".

        :param file_handler: iterable yielding the lines of the file
        :return: list of dicts with keys 'category_id', 'name' and 'group'
        """
        group_name = "Unknown"
        categories = []
        for raw_line in file_handler:
            stripped = raw_line.strip()
            if not stripped:
                # Blank lines carry no information
                continue
            if not stripped.startswith('['):
                # Anything else that is not a category names a new group
                group_name = stripped
                continue
            # Category line: first token is the bracketed ID, the rest is the name
            tokens = stripped.split(maxsplit=1)
            bracketed_id = tokens[0]
            categories.append({
                'category_id': bracketed_id[1],  # character right after '['
                'name': tokens[1],
                'group': group_name
            })
        return categories
from unittest import TestCase from unittest import TestCase
from metagenedb.common.utils.parsers import EggNogAnnotationLineParser from metagenedb.common.utils.parsers.eggnog import (
EggNOGAnnotationLineParser,
EggNOGFunctionalCategoriesParser
)
class TestEggNogAnnotationLineParser(TestCase): class TestEggNOGAnnotationLineParser(TestCase):
def test_get_dict(self): def test_get_dict(self):
ko_line = "1\t28H54\tK\ttranslational termination\n" ko_line = "1\t28H54\tK\ttranslational termination\n"
...@@ -12,7 +15,7 @@ class TestEggNogAnnotationLineParser(TestCase): ...@@ -12,7 +15,7 @@ class TestEggNogAnnotationLineParser(TestCase):
'name': "translational termination", 'name': "translational termination",
'functional_categories': ["K"] 'functional_categories': ["K"]
} }
test_dict = EggNogAnnotationLineParser.get_dict(ko_line) test_dict = EggNOGAnnotationLineParser.get_dict(ko_line)
self.assertDictEqual(test_dict, expected_dict) self.assertDictEqual(test_dict, expected_dict)
def test_get_dict_no_name(self): def test_get_dict_no_name(self):
...@@ -22,7 +25,7 @@ class TestEggNogAnnotationLineParser(TestCase): ...@@ -22,7 +25,7 @@ class TestEggNogAnnotationLineParser(TestCase):
'name': "", 'name': "",
'functional_categories': ["S"] 'functional_categories': ["S"]
} }
test_dict = EggNogAnnotationLineParser.get_dict(ko_line) test_dict = EggNOGAnnotationLineParser.get_dict(ko_line)
self.assertDictEqual(test_dict, expected_dict) self.assertDictEqual(test_dict, expected_dict)
def test_get_dict_long_name(self): def test_get_dict_long_name(self):
...@@ -32,7 +35,7 @@ class TestEggNogAnnotationLineParser(TestCase): ...@@ -32,7 +35,7 @@ class TestEggNogAnnotationLineParser(TestCase):
'name': "Glucose-responsive transcription factor that regulates expression of several glucose transporter (HXT) genes in response to glucose", # noqa 'name': "Glucose-responsive transcription factor that regulates expression of several glucose transporter (HXT) genes in response to glucose", # noqa
'functional_categories': ["S"] 'functional_categories': ["S"]
} }
test_dict = EggNogAnnotationLineParser.get_dict(ko_line) test_dict = EggNOGAnnotationLineParser.get_dict(ko_line)
self.assertDictEqual(test_dict, expected_dict) self.assertDictEqual(test_dict, expected_dict)
def test_get_dict_multi_categories(self): def test_get_dict_multi_categories(self):
...@@ -42,10 +45,27 @@ class TestEggNogAnnotationLineParser(TestCase): ...@@ -42,10 +45,27 @@ class TestEggNogAnnotationLineParser(TestCase):
'name': "translational termination", 'name': "translational termination",
'functional_categories': ["K", "S"] 'functional_categories': ["K", "S"]
} }
test_dict = EggNogAnnotationLineParser.get_dict(ko_line) test_dict = EggNOGAnnotationLineParser.get_dict(ko_line)
self.assertDictEqual(test_dict, expected_dict) self.assertDictEqual(test_dict, expected_dict)
def test_get_dict_wrong_format(self): def test_get_dict_wrong_format(self):
ko_line = "This is a wrong line format, with; information and tab" ko_line = "This is a wrong line format, with; information and tab"
with self.assertRaises(Exception) as context: # noqa with self.assertRaises(Exception) as context: # noqa
EggNogAnnotationLineParser.get_dict(ko_line) EggNOGAnnotationLineParser.get_dict(ko_line)
class TestEggNOGFunctionalCategoriesParser(TestCase):
    """
    Tests for EggNOGFunctionalCategoriesParser
    """

    def test_parse_file(self):
        """
        Two groups of two categories, separated by a blank line, are parsed
        into four category dicts carrying the right group.
        """
        # The lines are passed straight to handle_parsing, so the path given
        # to the constructor is only a placeholder.
        lines = [
            "FIRST GROUP\n",
            " [A] Categorie name A\n",
            " [B] Categorie name B\n",
            "\n",
            "SECOND GROUP\n",
            " [C] Categorie name C\n",
            " [D] Categorie name D\n",
        ]
        wanted = [
            {'group': 'FIRST GROUP', 'category_id': 'A', 'name': 'Categorie name A'},
            {'group': 'FIRST GROUP', 'category_id': 'B', 'name': 'Categorie name B'},
            {'group': 'SECOND GROUP', 'category_id': 'C', 'name': 'Categorie name C'},
            {'group': 'SECOND GROUP', 'category_id': 'D', 'name': 'Categorie name D'},
        ]
        categories_parser = EggNOGFunctionalCategoriesParser("test")
        parsed = categories_parser.handle_parsing(lines)
        self.assertListEqual(parsed, wanted)
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment