From c78cb131debd3acba3b3fc89d1979fb0ecf28cd1 Mon Sep 17 00:00:00 2001
From: Kenzo-Hugo Hillion <kenzo-hugo.hillion1@pasteur.fr>
Date: Wed, 11 Dec 2019 15:44:37 +0100
Subject: [PATCH] parser for eggnog functional categories

---
 .../commands/create_update_eggnog.py          |  4 +--
 .../common/utils/parsers/__init__.py          |  2 +-
 .../metagenedb/common/utils/parsers/base.py   | 15 ++++++++
 .../metagenedb/common/utils/parsers/eggnog.py | 28 +++++++++++++--
 .../common/utils/parsers/test_eggnog.py       | 34 +++++++++++++++----
 5 files changed, 71 insertions(+), 12 deletions(-)
 create mode 100644 backend/metagenedb/common/utils/parsers/base.py

diff --git a/backend/metagenedb/apps/catalog/management/commands/create_update_eggnog.py b/backend/metagenedb/apps/catalog/management/commands/create_update_eggnog.py
index 87f51ed..c2a918b 100644
--- a/backend/metagenedb/apps/catalog/management/commands/create_update_eggnog.py
+++ b/backend/metagenedb/apps/catalog/management/commands/create_update_eggnog.py
@@ -5,7 +5,7 @@ from django.core.exceptions import ValidationError
 
 from metagenedb.apps.catalog.models import EggNog, EggNogFunctionalCategory
 from metagenedb.common.utils.chunks import file_len
-from metagenedb.common.utils.parsers import EggNogAnnotationLineParser
+from metagenedb.common.utils.parsers import EggNOGAnnotationLineParser
 
 logging.basicConfig(format='[%(asctime)s] %(levelname)s:%(name)s:%(message)s')
 logger = logging.getLogger(__name__)
@@ -15,7 +15,7 @@ class ImportEggNog(object):
 
     def __init__(self, file_path):
         self.annotation_file = file_path
-        self.eggnog_parser = EggNogAnnotationLineParser()
+        self.eggnog_parser = EggNOGAnnotationLineParser()
         self.processed_count = 0
         self.created_count = 0
         self.updated_count = 0
diff --git a/backend/metagenedb/common/utils/parsers/__init__.py b/backend/metagenedb/common/utils/parsers/__init__.py
index 92f59fa..d6aa345 100644
--- a/backend/metagenedb/common/utils/parsers/__init__.py
+++ b/backend/metagenedb/common/utils/parsers/__init__.py
@@ -1,4 +1,4 @@
-from .eggnog import EggNogAnnotationLineParser  # noqa
+from .eggnog import EggNOGAnnotationLineParser  # noqa
 from .igc import IGCLineParser  # noqa
 from .kegg import KEGGLineParser  # noqa
 from .ncbi_taxonomy import NCBITaxonomyLineParser  # noqa
diff --git a/backend/metagenedb/common/utils/parsers/base.py b/backend/metagenedb/common/utils/parsers/base.py
new file mode 100644
index 0000000..466ed0a
--- /dev/null
+++ b/backend/metagenedb/common/utils/parsers/base.py
@@ -0,0 +1,15 @@
+class FileParser:
+    """Open file_path and hand the opened file object to handle_parsing."""
+
+    def __init__(self, file_path):
+        self.file_path = file_path
+
+    def handle_parsing(self, file_handler):
+        """Print each line; subclasses override this to parse and return data."""
+        for line in file_handler:
+            print(line.rstrip())
+
+    def parse(self):
+        """Parse the file at file_path and return whatever handle_parsing returns."""
+        with open(self.file_path, 'r') as file:
+            return self.handle_parsing(file)
diff --git a/backend/metagenedb/common/utils/parsers/eggnog.py b/backend/metagenedb/common/utils/parsers/eggnog.py
index fed1804..2e6b42a 100644
--- a/backend/metagenedb/common/utils/parsers/eggnog.py
+++ b/backend/metagenedb/common/utils/parsers/eggnog.py
@@ -1,9 +1,11 @@
 import logging
 
+from .base import FileParser
+
 _LOGGER = logging.getLogger(__name__)
 
 
-class EggNogAnnotationLineParser(object):
+class EggNOGAnnotationLineParser:
 
     @staticmethod
     def get_dict(line):
@@ -19,4 +21,26 @@ class EggNogAnnotationLineParser(object):
             }
         except Exception:
             _LOGGER.error(f"Could not parse: {line.rstrip()}. Are you sure it comes from eggnog annotations.tsv?")
-            raise
+            raise Exception("Impossible to parse given line as eggnog from annotation.tsv file")
+
+
+class EggNOGFunctionalCategoriesParser(FileParser):
+    """
+    Turn the EggNOG functional categories file into a list of category dicts
+    """
+
+    def handle_parsing(self, file_handler):
+        # "[X] name" lines are categories; any other non-empty line names a group
+        group_name = "Unknown"
+        categories = []
+        for raw_line in file_handler:
+            text = raw_line.strip()
+            if not text:  # Blank line, nothing to record
+                continue
+            if not text.startswith('['):  # Heading of the categories that follow
+                group_name = text
+                continue
+            parts = text.split(maxsplit=1)
+            entry = {'category_id': parts[0][1], 'name': parts[1], 'group': group_name}
+            categories.append(entry)
+        return categories
diff --git a/backend/metagenedb/common/utils/parsers/test_eggnog.py b/backend/metagenedb/common/utils/parsers/test_eggnog.py
index d7c8e62..52bbaec 100644
--- a/backend/metagenedb/common/utils/parsers/test_eggnog.py
+++ b/backend/metagenedb/common/utils/parsers/test_eggnog.py
@@ -1,9 +1,12 @@
 from unittest import TestCase
 
-from metagenedb.common.utils.parsers import EggNogAnnotationLineParser
+from metagenedb.common.utils.parsers.eggnog import (
+    EggNOGAnnotationLineParser,
+    EggNOGFunctionalCategoriesParser
+)
 
 
-class TestEggNogAnnotationLineParser(TestCase):
+class TestEggNOGAnnotationLineParser(TestCase):
 
     def test_get_dict(self):
         ko_line = "1\t28H54\tK\ttranslational termination\n"
@@ -12,7 +15,7 @@ class TestEggNogAnnotationLineParser(TestCase):
                 'name': "translational termination",
                 'functional_categories': ["K"]
             }
-        test_dict = EggNogAnnotationLineParser.get_dict(ko_line)
+        test_dict = EggNOGAnnotationLineParser.get_dict(ko_line)
         self.assertDictEqual(test_dict, expected_dict)
 
     def test_get_dict_no_name(self):
@@ -22,7 +25,7 @@ class TestEggNogAnnotationLineParser(TestCase):
                 'name': "",
                 'functional_categories': ["S"]
             }
-        test_dict = EggNogAnnotationLineParser.get_dict(ko_line)
+        test_dict = EggNOGAnnotationLineParser.get_dict(ko_line)
         self.assertDictEqual(test_dict, expected_dict)
 
     def test_get_dict_long_name(self):
@@ -32,7 +35,7 @@ class TestEggNogAnnotationLineParser(TestCase):
                 'name': "Glucose-responsive transcription factor that regulates expression of several glucose transporter (HXT) genes in response to glucose",  # noqa
                 'functional_categories': ["S"]
             }
-        test_dict = EggNogAnnotationLineParser.get_dict(ko_line)
+        test_dict = EggNOGAnnotationLineParser.get_dict(ko_line)
         self.assertDictEqual(test_dict, expected_dict)
 
     def test_get_dict_multi_categories(self):
@@ -42,10 +45,27 @@ class TestEggNogAnnotationLineParser(TestCase):
                 'name': "translational termination",
                 'functional_categories': ["K", "S"]
             }
-        test_dict = EggNogAnnotationLineParser.get_dict(ko_line)
+        test_dict = EggNOGAnnotationLineParser.get_dict(ko_line)
         self.assertDictEqual(test_dict, expected_dict)
 
     def test_get_dict_wrong_format(self):
         ko_line = "This is a wrong line format, with; information   and tab"
         with self.assertRaises(Exception) as context:  # noqa
-            EggNogAnnotationLineParser.get_dict(ko_line)
+            EggNOGAnnotationLineParser.get_dict(ko_line)
+
+
+class TestEggNOGFunctionalCategoriesParser(TestCase):
+
+    def test_parse_file(self):
+        lines = [  # A plain list of lines stands in for the opened file
+            "FIRST GROUP\n", " [A] Categorie name A\n", " [B] Categorie name B\n", "\n",
+            "SECOND GROUP\n", " [C] Categorie name C\n", " [D] Categorie name D\n",
+        ]
+        expected = [
+            {'category_id': 'A', 'name': 'Categorie name A', 'group': 'FIRST GROUP'},
+            {'category_id': 'B', 'name': 'Categorie name B', 'group': 'FIRST GROUP'},
+            {'category_id': 'C', 'name': 'Categorie name C', 'group': 'SECOND GROUP'},
+            {'category_id': 'D', 'name': 'Categorie name D', 'group': 'SECOND GROUP'},
+        ]
+        parsed = EggNOGFunctionalCategoriesParser("test").handle_parsing(lines)
+        self.assertListEqual(parsed, expected)
-- 
GitLab