diff --git a/backend/metagenedb/apps/catalog/management/commands/create_update_eggnog.py b/backend/metagenedb/apps/catalog/management/commands/create_update_eggnog.py index 87f51ed079ead9a3fcafee717d6bd4d32ee50d55..c2a918bee7ffde500b889c87607b14a08ad8c010 100644 --- a/backend/metagenedb/apps/catalog/management/commands/create_update_eggnog.py +++ b/backend/metagenedb/apps/catalog/management/commands/create_update_eggnog.py @@ -5,7 +5,7 @@ from django.core.exceptions import ValidationError from metagenedb.apps.catalog.models import EggNog, EggNogFunctionalCategory from metagenedb.common.utils.chunks import file_len -from metagenedb.common.utils.parsers import EggNogAnnotationLineParser +from metagenedb.common.utils.parsers import EggNOGAnnotationLineParser logging.basicConfig(format='[%(asctime)s] %(levelname)s:%(name)s:%(message)s') logger = logging.getLogger(__name__) @@ -15,7 +15,7 @@ class ImportEggNog(object): def __init__(self, file_path): self.annotation_file = file_path - self.eggnog_parser = EggNogAnnotationLineParser() + self.eggnog_parser = EggNOGAnnotationLineParser() self.processed_count = 0 self.created_count = 0 self.updated_count = 0 diff --git a/backend/metagenedb/common/utils/parsers/__init__.py b/backend/metagenedb/common/utils/parsers/__init__.py index 92f59fa48f3eec05d8a2e62b16083c3ccf4cfe3e..d6aa3459cb3ebaac331771005d91ba889b729da3 100644 --- a/backend/metagenedb/common/utils/parsers/__init__.py +++ b/backend/metagenedb/common/utils/parsers/__init__.py @@ -1,4 +1,4 @@ -from .eggnog import EggNogAnnotationLineParser # noqa +from .eggnog import EggNOGAnnotationLineParser # noqa from .igc import IGCLineParser # noqa from .kegg import KEGGLineParser # noqa from .ncbi_taxonomy import NCBITaxonomyLineParser # noqa diff --git a/backend/metagenedb/common/utils/parsers/base.py b/backend/metagenedb/common/utils/parsers/base.py new file mode 100644 index 0000000000000000000000000000000000000000..466ed0a5ee449029d18e1fd739cc00a291f8ae81 --- /dev/null +++ b/backend/metagenedb/common/utils/parsers/base.py @@ -0,0 +1,15 @@ +class FileParser: + + def __init__(self, file_path): + self.file_path = file_path + + def handle_parsing(self, file_handler): + """ + This method need to be overloaded to really handle the parsing + """ + for line in file_handler: + print(line.rstrip()) + + def parse(self): + with open(self.file_path, 'r') as file: + self.handle_parsing(file) diff --git a/backend/metagenedb/common/utils/parsers/eggnog.py b/backend/metagenedb/common/utils/parsers/eggnog.py index fed18047d56feacd132569f22d2c05cb4bb0d3d8..2e6b42acf74e3b3deada4fd25974e362494c27d4 100644 --- a/backend/metagenedb/common/utils/parsers/eggnog.py +++ b/backend/metagenedb/common/utils/parsers/eggnog.py @@ -1,9 +1,11 @@ import logging +from .base import FileParser + _LOGGER = logging.getLogger(__name__) -class EggNogAnnotationLineParser(object): +class EggNOGAnnotationLineParser: @staticmethod def get_dict(line): @@ -19,4 +21,26 @@ class EggNogAnnotationLineParser(object): } except Exception: _LOGGER.error(f"Could not parse: {line.rstrip()}. Are you sure it comes from eggnog annotations.tsv?") - raise + raise Exception("Impossible to parse given line as eggnog from annotation.tsv file") + + +class EggNOGFunctionalCategoriesParser(FileParser): + """ + Parse functional categories file from EggNOG + """ + + def handle_parsing(self, file_handler): + functional_categories = [] + current_group = "Unknown" + for line in file_handler: + line = line.strip() + if line.startswith('['): # It is a category + elements = line.split(maxsplit=1) + functional_categories.append({ + 'category_id': elements[0][1], + 'name': elements[1], + 'group': current_group + }) + elif line: # It is a group of a category + current_group = line + return functional_categories diff --git a/backend/metagenedb/common/utils/parsers/test_eggnog.py b/backend/metagenedb/common/utils/parsers/test_eggnog.py index d7c8e62c183daee5cd8fd58b86a0008d32c57c9a..52bbaecf884fa66edeecc084ac7eb6e8e44395e9 100644 --- a/backend/metagenedb/common/utils/parsers/test_eggnog.py +++ b/backend/metagenedb/common/utils/parsers/test_eggnog.py @@ -1,9 +1,12 @@ from unittest import TestCase -from metagenedb.common.utils.parsers import EggNogAnnotationLineParser +from metagenedb.common.utils.parsers.eggnog import ( + EggNOGAnnotationLineParser, + EggNOGFunctionalCategoriesParser +) -class TestEggNogAnnotationLineParser(TestCase): +class TestEggNOGAnnotationLineParser(TestCase): def test_get_dict(self): ko_line = "1\t28H54\tK\ttranslational termination\n" @@ -12,7 +15,7 @@ class TestEggNogAnnotationLineParser(TestCase): 'name': "translational termination", 'functional_categories': ["K"] } - test_dict = EggNogAnnotationLineParser.get_dict(ko_line) + test_dict = EggNOGAnnotationLineParser.get_dict(ko_line) self.assertDictEqual(test_dict, expected_dict) def test_get_dict_no_name(self): @@ -22,7 +25,7 @@ class TestEggNogAnnotationLineParser(TestCase): 'name': "", 'functional_categories': ["S"] } - test_dict = EggNogAnnotationLineParser.get_dict(ko_line) + test_dict = EggNOGAnnotationLineParser.get_dict(ko_line) self.assertDictEqual(test_dict, expected_dict) def test_get_dict_long_name(self): @@ -32,7 +35,7 @@ class TestEggNogAnnotationLineParser(TestCase): 'name': "Glucose-responsive transcription factor that regulates expression of several glucose transporter (HXT) genes in response to glucose", # noqa 'functional_categories': ["S"] } - test_dict = EggNogAnnotationLineParser.get_dict(ko_line) + test_dict = EggNOGAnnotationLineParser.get_dict(ko_line) self.assertDictEqual(test_dict, expected_dict) def test_get_dict_multi_categories(self): @@ -42,10 +45,27 @@ class TestEggNogAnnotationLineParser(TestCase): 'name': "translational termination", 'functional_categories': ["K", "S"] } - test_dict = EggNogAnnotationLineParser.get_dict(ko_line) + test_dict = EggNOGAnnotationLineParser.get_dict(ko_line) self.assertDictEqual(test_dict, expected_dict) def test_get_dict_wrong_format(self): ko_line = "This is a wrong line format, with; information and tab" with self.assertRaises(Exception) as context: # noqa - EggNogAnnotationLineParser.get_dict(ko_line) + EggNOGAnnotationLineParser.get_dict(ko_line) + + +class TestEggNOGFunctionalCategoriesParser(TestCase): + + def test_parse_file(self): + parser = EggNOGFunctionalCategoriesParser("test") + fake_file_handler = [ + "FIRST GROUP\n", " [A] Categorie name A\n", " [B] Categorie name B\n", "\n", + "SECOND GROUP\n", " [C] Categorie name C\n", " [D] Categorie name D\n", + ] + expected_list = [ + {'category_id': 'A', 'group': 'FIRST GROUP', 'name': 'Categorie name A'}, + {'category_id': 'B', 'group': 'FIRST GROUP', 'name': 'Categorie name B'}, + {'category_id': 'C', 'group': 'SECOND GROUP', 'name': 'Categorie name C'}, + {'category_id': 'D', 'group': 'SECOND GROUP', 'name': 'Categorie name D'} + ] + self.assertListEqual(parser.handle_parsing(fake_file_handler), expected_list)