From bee34ed243b04e148416545c2fa5a195a9ad6400 Mon Sep 17 00:00:00 2001
From: Kenzo-Hugo Hillion <kenzo-hugo.hillion1@pasteur.fr>
Date: Thu, 18 Jul 2019 17:35:09 +0200
Subject: [PATCH] add superkingdom and improve performance

---
 .../metagenedb/apps/catalog/admin/taxonomy.py |  2 +-
 .../migrations/0004_taxonomy_superkingdom.py  | 19 +++++++++++
 .../apps/catalog/models/taxonomy.py           |  7 +++-
 .../apps/catalog/serializers/taxonomy.py      |  7 +++-
 .../populate_db/import_ncbi_taxonomy.py       | 33 ++++++++++++-------
 docker-compose.yaml                           |  1 +
 6 files changed, 55 insertions(+), 14 deletions(-)
 create mode 100644 backend/metagenedb/apps/catalog/migrations/0004_taxonomy_superkingdom.py

diff --git a/backend/metagenedb/apps/catalog/admin/taxonomy.py b/backend/metagenedb/apps/catalog/admin/taxonomy.py
index 79090d3..633f57b 100644
--- a/backend/metagenedb/apps/catalog/admin/taxonomy.py
+++ b/backend/metagenedb/apps/catalog/admin/taxonomy.py
@@ -7,7 +7,7 @@ from metagenedb.apps.catalog.models import Taxonomy
 class TaxonomyAdmin(admin.ModelAdmin):
 
     list_display = (
-        'tax_id', 'name', 'rank',
+        'tax_id', 'name', 'rank', 'superkingdom',
         'kingdom', 'phylum', 'class_rank', 'order', 'family', 'genus', 'species',
     )
     search_fields = ('tax_id', 'name')
diff --git a/backend/metagenedb/apps/catalog/migrations/0004_taxonomy_superkingdom.py b/backend/metagenedb/apps/catalog/migrations/0004_taxonomy_superkingdom.py
new file mode 100644
index 0000000..4e7649b
--- /dev/null
+++ b/backend/metagenedb/apps/catalog/migrations/0004_taxonomy_superkingdom.py
@@ -0,0 +1,27 @@
+# Generated by Django 2.2.1 on 2019-07-18 09:34
+
+from django.db import migrations, models
+import django.db.models.deletion
+
+
+class Migration(migrations.Migration):
+    """Add the ``superkingdom`` foreign key to Taxonomy and record the ``family`` related_name change."""
+
+    dependencies = [
+        ('catalog', '0003_auto_20190717_1551'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='taxonomy',
+            name='superkingdom',
+            field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='superkingdom_children', to='catalog.Taxonomy'),
+        ),
+        # The model now declares related_name='family_children' for `family`; mirror it in the
+        # migration state so `makemigrations` does not report a pending change after this commit.
+        migrations.AlterField(
+            model_name='taxonomy',
+            name='family',
+            field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='family_children', to='catalog.Taxonomy'),
+        ),
+    ]
diff --git a/backend/metagenedb/apps/catalog/models/taxonomy.py b/backend/metagenedb/apps/catalog/models/taxonomy.py
index ccd1135..fa3a096 100644
--- a/backend/metagenedb/apps/catalog/models/taxonomy.py
+++ b/backend/metagenedb/apps/catalog/models/taxonomy.py
@@ -52,6 +52,11 @@ class Taxonomy(models.Model):
         null=True, blank=True,
     )
 
+    superkingdom = models.ForeignKey(
+        'Taxonomy', related_name='superkingdom_children',
+        on_delete=models.SET_NULL,
+        null=True, blank=True,
+    )
     kingdom = models.ForeignKey(
         'Taxonomy', related_name='kingdom_children',
         on_delete=models.SET_NULL,
@@ -74,7 +79,7 @@ class Taxonomy(models.Model):
         null=True, blank=True,
     )
     family = models.ForeignKey(
-        'Taxonomy', related_name='familyphy_children',
+        'Taxonomy', related_name='family_children',
         on_delete=models.SET_NULL,
         null=True, blank=True,
     )
diff --git a/backend/metagenedb/apps/catalog/serializers/taxonomy.py b/backend/metagenedb/apps/catalog/serializers/taxonomy.py
index 87816c3..32c230c 100644
--- a/backend/metagenedb/apps/catalog/serializers/taxonomy.py
+++ b/backend/metagenedb/apps/catalog/serializers/taxonomy.py
@@ -10,6 +10,11 @@ class TaxonomySerializer(serializers.ModelSerializer):
         source='parent',
         required=False,
     )
+    superkingdom = serializers.SlugRelatedField(
+        queryset=Taxonomy.objects.all(),
+        slug_field='tax_id',
+        required=False
+    )
     kingdom = serializers.SlugRelatedField(
         queryset=Taxonomy.objects.all(),
         slug_field='tax_id',
@@ -49,6 +54,6 @@ class TaxonomySerializer(serializers.ModelSerializer):
     class Meta:
         model = Taxonomy
         fields = (
-            'tax_id', 'name', 'rank', 'parent_tax_id',
+            'tax_id', 'name', 'rank', 'parent_tax_id', 'superkingdom',
             'kingdom', 'phylum', 'class_rank', 'order', 'family', 'genus', 'species',
         )
diff --git a/backend/scripts/populate_db/import_ncbi_taxonomy.py b/backend/scripts/populate_db/import_ncbi_taxonomy.py
index 3206c02..cc8b8eb 100755
--- a/backend/scripts/populate_db/import_ncbi_taxonomy.py
+++ b/backend/scripts/populate_db/import_ncbi_taxonomy.py
@@ -18,6 +18,8 @@ from metagenedb.apps.catalog.serializers import TaxonomySerializer  # noqa
 logging.basicConfig(level=logging.INFO)
 _LOGGER = logging.getLogger(__name__)
 
+SELECT_RELATED_PARENT = "parent{}".format("__parent" * 40)  # 41 chained "parent" hops, passed to select_related()
+
 
 def import_names(taxonomy_names_file, select_class="scientific name"):
     """
@@ -62,21 +64,30 @@ def update_taxo_nodes(taxonomy_nodes_file):
                 _LOGGER.warning(f"Invalid data: {serializer.errors}. Link to parent skipped. Data: {serializer.data}")
 
 
-def build_hierarchy():
+def _build_hierarchy(taxo):
+    """Save the parental hierarchy of ``taxo`` via TaxonomySerializer, or log a warning if invalid."""
+    hierarchy = taxo.build_parental_hierarchy()
+    if 'class' in hierarchy:  # the serializer field for this rank is named 'class_rank'
+        hierarchy['class_rank'] = hierarchy.pop('class')
+    serializer = TaxonomySerializer(taxo, hierarchy)
+    if serializer.is_valid():
+        serializer.save()
+    else:
+        _LOGGER.warning(f"Invalid data: {serializer.errors}. Building hierarchy skipped. Data: {serializer.data}")
+
+
+def build_all_hierarchy(chunk_size=8000):
     """
     Uses class method from Taxonomy model to retrieve the parental hierarchy and
-    assign corresponding attribute to each entry.
+    assign corresponding attribute to each entry. ``chunk_size`` is forwarded to ``QuerySet.iterator()``.
     """
     _LOGGER.info(f"Linking taxonomy objects to parental nodes from direct parental nodes...")
-    for taxo in Taxonomy.objects.all():
-        hierarchy = taxo.build_parental_hierarchy()
-        if 'class' in hierarchy.keys():
-            hierarchy['class_rank'] = hierarchy.pop('class')
-        serializer = TaxonomySerializer(taxo, hierarchy)
-        if serializer.is_valid():
-            serializer.save()
-        else:
-            _LOGGER.warning(f"Invalid data: {serializer.errors}. Building hierarchy skipped. Data: {serializer.data}")
+    all_taxo = Taxonomy.objects.select_related(SELECT_RELATED_PARENT).all()
+    total = all_taxo.count()  # queried once here rather than at every progress message
+    for cpt, taxo in enumerate(all_taxo.iterator(chunk_size=chunk_size), start=1):
+        _build_hierarchy(taxo)
+        if cpt % 10000 == 0:
+            _LOGGER.info(f"{cpt}/{total} hierarchies built...")
 
 
 def parse_arguments():
@@ -99,7 +110,7 @@ def run():
     taxonomy_names = import_names(args.names)
     create_taxo_nodes(args.nodes, taxonomy_names)
     update_taxo_nodes(args.nodes)
-    build_hierarchy()
+    build_all_hierarchy()
 
 
 if __name__ == "__main__":
diff --git a/docker-compose.yaml b/docker-compose.yaml
index 40ead5c..c07eb43 100644
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -20,6 +20,7 @@ services:
       - main
 
   db:
+    shm_size: '2gb'
     container_name: db
     image: postgres:latest
     ports:
-- 
GitLab