From 09458ad3e0e18bb2d382955c5f4af0d3fae3aaa4 Mon Sep 17 00:00:00 2001
From: Kenzo-Hugo Hillion <kenzo-hugo.hillion1@pasteur.fr>
Date: Mon, 9 Dec 2019 18:11:32 +0100
Subject: [PATCH] Add script to load EggNog into the db

---
 .../metagenedb/apps/catalog/admin/function.py |  2 +-
 .../commands/create_update_eggnog.py          | 83 +++++++++++++++++++
 .../0014_remove_eggnog_long_name.py           | 17 ++++
 .../0015_increase_function_name_max_length.py | 18 ++++
 .../apps/catalog/models/function.py           |  6 +-
 backend/metagenedb/common/utils/chunks.py     |  7 ++
 .../metagenedb/common/utils/parsers/eggnog.py |  2 +-
 .../common/utils/parsers/test_eggnog.py       |  4 +-
 .../metagenedb/common/utils/test_chunks.py    |  9 +-
 9 files changed, 141 insertions(+), 7 deletions(-)
 create mode 100644 backend/metagenedb/apps/catalog/management/commands/create_update_eggnog.py
 create mode 100644 backend/metagenedb/apps/catalog/migrations/0014_remove_eggnog_long_name.py
 create mode 100644 backend/metagenedb/apps/catalog/migrations/0015_increase_function_name_max_length.py

diff --git a/backend/metagenedb/apps/catalog/admin/function.py b/backend/metagenedb/apps/catalog/admin/function.py
index 1cb688e..5c4c2e0 100644
--- a/backend/metagenedb/apps/catalog/admin/function.py
+++ b/backend/metagenedb/apps/catalog/admin/function.py
@@ -20,7 +20,7 @@ class FunctionAdmin(admin.ModelAdmin):
 @admin.register(EggNog)
 class EggNogAdmin(admin.ModelAdmin):
 
-    list_display = ('function_id', 'name', 'long_name')
+    list_display = ('function_id', 'name', 'functional_category')
     search_fields = ('function_id', 'name')
 
 
diff --git a/backend/metagenedb/apps/catalog/management/commands/create_update_eggnog.py b/backend/metagenedb/apps/catalog/management/commands/create_update_eggnog.py
new file mode 100644
index 0000000..cd2a3c9
--- /dev/null
+++ b/backend/metagenedb/apps/catalog/management/commands/create_update_eggnog.py
@@ -0,0 +1,83 @@
+import logging
+
+from django.core.management.base import BaseCommand
+from django.db import IntegrityError
+
+from metagenedb.apps.catalog.models import EggNog, EggNogFunctionalCategory
+from metagenedb.common.utils.chunks import file_len
+from metagenedb.common.utils.parsers import EggNogAnnotationLineParser
+
+logging.basicConfig(format='[%(asctime)s] %(levelname)s:%(name)s:%(message)s')
+logger = logging.getLogger(__name__)
+
+
+class ImportEggNog(object):
+    """Create or update EggNog entries in the database from an EggNog annotation file."""
+
+    def __init__(self, file_path):
+        """Keep the path of the annotation file and start every counter at zero."""
+        self.annotation_file = file_path
+        self.eggnog_parser = EggNogAnnotationLineParser()
+        self.processed_count = self.created_count = self.updated_count = self.skipped_count = 0
+        self.skipped_ids = []
+
+    def _build_functional_category_dict(self):
+        """Cache all EggNogFunctionalCategory objects, keyed by category_id."""
+        self.functional_cat = {cat.category_id: cat for cat in EggNogFunctionalCategory.objects.all()}
+
+    def link_functional_category(self, eggnog_dict):
+        """Swap the category id held by eggnog_dict ('S' if absent) for the cached object, or None."""
+        cat_key = eggnog_dict.get('functional_category', 'S')
+        eggnog_dict['functional_category'] = self.functional_cat.get(cat_key)
+
+    def load_all(self, test=False):
+        """Create or update one EggNog per line of the file; test=True stops after 1000 lines."""
+        self._build_functional_category_dict()
+        self.total_eggnog_nb = file_len(self.annotation_file)
+        with open(self.annotation_file, "r") as file:
+            for line in file:
+                eggnog_dict = self.eggnog_parser.get_dict(line)
+                self.link_functional_category(eggnog_dict)
+                payload = {k: v for k, v in eggnog_dict.items() if v != ""}
+                try:
+                    EggNog(**payload).save()
+                    self.created_count += 1
+                except IntegrityError:
+                    try:
+                        eggnog = EggNog.objects.get(function_id=payload.get('function_id'))
+                        for k, v in payload.items():
+                            setattr(eggnog, k, v)
+                        eggnog.save()
+                        self.updated_count += 1
+                    except (IntegrityError, EggNog.DoesNotExist):
+                        # DoesNotExist: the insert failed although no EggNog row has this function_id.
+                        self.skipped_ids.append(payload.get('function_id'))
+                        self.skipped_count += 1
+                self.processed_count += 1
+                if self.processed_count % 1000 == 0:
+                    logger.info("%s/%s EggNog processed so far...", self.processed_count, self.total_eggnog_nb)
+                    if test:
+                        break
+        logger.info("[DONE] %s/%s EggNog created.", self.created_count, self.total_eggnog_nb)
+        logger.info("[DONE] %s/%s EggNog updated.", self.updated_count, self.total_eggnog_nb)
+        logger.info("[DONE] %s/%s EggNog skipped. List: %s", self.skipped_count, self.total_eggnog_nb,
+                    self.skipped_ids)
+
+
+class Command(BaseCommand):
+    help = 'Create or update all Eggnog entries from annotations.tsv file.'
+
+    def add_arguments(self, parser):
+        parser.add_argument('annotation', help='annotations.tsv file from EggNog')
+        parser.add_argument('--test', action='store_true', help='Run only on first 1000 entries.')
+
+    def set_logger_level(self, verbosity):
+        """Log at DEBUG above verbosity 2, at INFO above 1, else leave the level untouched."""
+        if verbosity > 1:
+            logger.setLevel(logging.DEBUG if verbosity > 2 else logging.INFO)
+
+    def handle(self, *args, **options):
+        """Entry point: set the logger level, then import the given annotation file."""
+        self.set_logger_level(int(options['verbosity']))
+        importer = ImportEggNog(options['annotation'])
+        importer.load_all(test=options['test'])
diff --git a/backend/metagenedb/apps/catalog/migrations/0014_remove_eggnog_long_name.py b/backend/metagenedb/apps/catalog/migrations/0014_remove_eggnog_long_name.py
new file mode 100644
index 0000000..f2b03bf
--- /dev/null
+++ b/backend/metagenedb/apps/catalog/migrations/0014_remove_eggnog_long_name.py
@@ -0,0 +1,17 @@
+# Generated by Django 3.0 on 2019-12-09 17:02
+
+from django.db import migrations
+
+
+class Migration(migrations.Migration):
+    """Remove the ``long_name`` field from the ``eggnog`` model."""
+    dependencies = [
+        ('catalog', '0013_plural_eggnog'),
+    ]
+
+    operations = [
+        migrations.RemoveField(
+            model_name='eggnog',
+            name='long_name',
+        ),
+    ]
diff --git a/backend/metagenedb/apps/catalog/migrations/0015_increase_function_name_max_length.py b/backend/metagenedb/apps/catalog/migrations/0015_increase_function_name_max_length.py
new file mode 100644
index 0000000..5627b6c
--- /dev/null
+++ b/backend/metagenedb/apps/catalog/migrations/0015_increase_function_name_max_length.py
@@ -0,0 +1,18 @@
+# Generated by Django 3.0 on 2019-12-09 17:06
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+    """Alter ``name`` on the ``function`` model to a CharField with max_length=200."""
+    dependencies = [
+        ('catalog', '0014_remove_eggnog_long_name'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='function',
+            name='name',
+            field=models.CharField(max_length=200),
+        ),
+    ]
diff --git a/backend/metagenedb/apps/catalog/models/function.py b/backend/metagenedb/apps/catalog/models/function.py
index 9add193..2723416 100644
--- a/backend/metagenedb/apps/catalog/models/function.py
+++ b/backend/metagenedb/apps/catalog/models/function.py
@@ -12,7 +12,7 @@ class Function(models.Model):
     ]
 
     function_id = models.CharField(max_length=100, db_index=True, unique=True)
-    name = models.CharField(max_length=100)
+    name = models.CharField(max_length=200)
     source = models.CharField(max_length=10, choices=SOURCE_CHOICES, default=UNDEFINED)
 
     def __str__(self):
@@ -38,7 +38,6 @@ class KeggOrthology(Function):
 class EggNog(Function):
     SOURCE = 'eggnog'
 
-    long_name = models.CharField(max_length=500)
     functional_category = models.ForeignKey(
         'EggNogFunctionalCategory', related_name='eggnogs',
         on_delete=models.SET_NULL,
@@ -64,5 +63,8 @@ class EggNogFunctionalCategory(models.Model):
     name = models.CharField(max_length=100)
     group = models.CharField(max_length=100, choices=GROUP_CHOICES)
 
+    def __str__(self):  # human-readable label: "<category_id> (<name>)"
+        return f"{self.category_id} ({self.name})"
+
     class Meta:
         verbose_name_plural = "EggNog Functional categories"
diff --git a/backend/metagenedb/common/utils/chunks.py b/backend/metagenedb/common/utils/chunks.py
index 4650057..086e323 100644
--- a/backend/metagenedb/common/utils/chunks.py
+++ b/backend/metagenedb/common/utils/chunks.py
@@ -2,3 +2,10 @@ def generate_chunks(full_list, chunk_size):
     """Yield successive n-sized chunks from full_list."""
     for i in range(0, len(full_list), chunk_size):
         yield full_list[i:i + chunk_size]
+
+
+def file_len(file_path):
+    """Return the number of lines in ``file_path``; an empty file yields 0."""
+    with open(file_path) as f:
+        # Counting with sum() avoids reading an unbound loop index on empty files.
+        return sum(1 for _ in f)
diff --git a/backend/metagenedb/common/utils/parsers/eggnog.py b/backend/metagenedb/common/utils/parsers/eggnog.py
index 4397493..51fdf35 100644
--- a/backend/metagenedb/common/utils/parsers/eggnog.py
+++ b/backend/metagenedb/common/utils/parsers/eggnog.py
@@ -6,7 +6,7 @@ _LOGGER = logging.getLogger(__name__)
 class EggNogAnnotationLineParser(object):
 
     @staticmethod
-    def ko_list(line):
+    def get_dict(line):
         """
         Parse line from Eggnog annotations.tsv file to return organized dict
         """
diff --git a/backend/metagenedb/common/utils/parsers/test_eggnog.py b/backend/metagenedb/common/utils/parsers/test_eggnog.py
index 0f48ef7..5ffe229 100644
--- a/backend/metagenedb/common/utils/parsers/test_eggnog.py
+++ b/backend/metagenedb/common/utils/parsers/test_eggnog.py
@@ -12,10 +12,10 @@ class TestEggNogAnnotationLineParser(TestCase):
                 'name': "translational termination",
                 'functional_category': "K"
             }
-        test_dict = EggNogAnnotationLineParser.ko_list(ko_line)
+        test_dict = EggNogAnnotationLineParser.get_dict(ko_line)
         self.assertDictEqual(test_dict, expected_dict)
 
     def test_ko_list_wrong_format(self):
         ko_line = "This is a wrong line format, with; information   and tab"
         with self.assertRaises(Exception) as context:  # noqa
-            EggNogAnnotationLineParser.ko_list(ko_line)
+            EggNogAnnotationLineParser.get_dict(ko_line)
diff --git a/backend/metagenedb/common/utils/test_chunks.py b/backend/metagenedb/common/utils/test_chunks.py
index 223b874..b78e3b5 100644
--- a/backend/metagenedb/common/utils/test_chunks.py
+++ b/backend/metagenedb/common/utils/test_chunks.py
@@ -1,6 +1,6 @@
 from unittest import TestCase
 
-from metagenedb.common.utils.chunks import generate_chunks
+from metagenedb.common.utils.chunks import generate_chunks, file_len
 
 
 class TestChunks(TestCase):
@@ -24,3 +24,10 @@ class TestChunks(TestCase):
         chunks = list(generate_chunks(self.full_list, chunk_size))
         self.assertEqual(len(chunks), 1)
         self.assertEqual(len(chunks[-1]), 10)
+
+
+class TestFileLength(TestCase):
+    """file_len on a dev_data sample; NOTE(review): the path is relative to the working directory."""
+    def test_file_length(self):
+        file_path = "./dev_data/IGC_sample.annotation_OF.summary"
+        self.assertEqual(file_len(file_path), 1002)
-- 
GitLab