Commit 531675bf authored by Kenzo-Hugo Hillion
Browse files

add more efficient way of building hierarchy

parent 92b47fd0
Pipeline #21438 passed with stages
in 2 minutes and 45 seconds
......@@ -3,13 +3,12 @@ import logging
from django.core.management.base import BaseCommand
from metagenedb.apps.catalog.models import Taxonomy
from metagenedb.common.utils.profiling import profile
from metagenedb.common.utils.chunks import dict_chunks
# Request a timestamped "[time] LEVEL:logger-name:message" record format on the
# root logger (basicConfig does nothing if the root logger already has handlers).
logging.basicConfig(format='[%(asctime)s] %(levelname)s:%(name)s:%(message)s')
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# NOTE(review): SELECT_RELATED_PARENT is assigned twice in this view. The first
# value is "parent" followed by fifteen "__parent" segments; the second
# assignment rebinds the name to plain "parent", so only that value survives.
# Presumably the first line is the pre-change version of the second -- confirm
# against the repository. Its consumer is not visible in this excerpt.
SELECT_RELATED_PARENT = "parent{}".format("__parent" * 15)
SELECT_RELATED_PARENT = "parent"
class HierarchyBuilder:
......@@ -21,20 +20,48 @@ class HierarchyBuilder:
self.hierarchy_built = 0
self.hierarchy_failed = 0
@profile('/Users/khillion/Sandbox/tax_only_many_parents.prof')
def build_all(self, chunk_size=8000, test=False):
def get_local_taxo(self, chunk_size=10000):
    """Load every entry of ``self.queryset`` into ``self.taxo_dict``.

    ``self.taxo_dict`` maps each entry's ``tax_id`` to a dict with the keys
    ``'name'``, ``'rank'`` and ``'parent'``, where ``'parent'`` holds the
    parent's ``tax_id`` or ``None`` when the entry has no parent.

    Args:
        chunk_size: Value forwarded to ``self.queryset.iterator`` as its
            ``chunk_size`` argument.
    """
    logger.info("Building local db of all taxonomy entries...")
    taxo_dict = {}
    for item in self.queryset.iterator(chunk_size=chunk_size):
        parent = item.parent
        taxo_dict[item.tax_id] = {
            'name': item.name,
            'rank': item.rank,
            # A parentless entry is recorded with None instead of aborting
            # the whole load with an AttributeError on ``None.tax_id``.
            'parent': parent.tax_id if parent is not None else None,
        }
    # Publish the mapping only once it is complete.
    self.taxo_dict = taxo_dict
    logger.info("[DONE] Local db of all taxonomy entries.")
def _build_instance_hierarchy(self, tax_id):
    """Return the lineage of ``tax_id`` as a dict keyed by rank.

    Starting at ``tax_id``, parent links in ``self.taxo_dict`` are followed
    until an entry named ``'root'`` is reached; that entry is not included.
    Each visited entry contributes ``{rank: {'tax_id': ..., 'name': ...}}``.
    Keys appear in child-to-ancestor order, and when several entries of the
    lineage share a rank, the one closest to the root is kept.

    The walk is iterative, so lineages deeper than the interpreter's
    recursion limit are supported. It also stops at an entry whose parent
    is ``None`` or the entry itself, even if that entry is not named
    ``'root'``.

    Args:
        tax_id: Key of the starting entry in ``self.taxo_dict``.

    Returns:
        dict: The rank-keyed lineage; empty when ``tax_id`` is the root.

    Raises:
        KeyError: If ``tax_id`` or one of its ancestors is missing from
            ``self.taxo_dict``.
        ValueError: If the parent links form a cycle.
    """
    hierarchy = {}
    seen = set()
    current_id = tax_id
    while True:
        entry = self.taxo_dict[current_id]
        if entry['name'] == 'root':
            break
        seen.add(current_id)
        # Plain assignment keeps the position of the first occurrence of a
        # rank while letting a more distant ancestor overwrite its value.
        hierarchy[entry['rank']] = {
            'tax_id': current_id,
            'name': entry['name'],
        }
        parent_id = entry['parent']
        if parent_id is None or parent_id == current_id:
            # Top of the tree reached without an entry named 'root'.
            break
        if parent_id in seen:
            raise ValueError(
                "Cycle detected in taxonomy parents at tax_id {!r}".format(parent_id)
            )
        current_id = parent_id
    return hierarchy
def build_hierarchy(self, instances):
    """Attach a freshly computed hierarchy to each given instance.

    Every element's ``hierarchy`` attribute is set to the result of
    ``self._build_instance_hierarchy`` for its ``tax_id``.

    Args:
        instances: Iterable of objects exposing ``tax_id`` and accepting a
            ``hierarchy`` attribute.

    Returns:
        The same ``instances`` object, with its elements updated in place.
    """
    resolve = self._build_instance_hierarchy
    for taxonomy in instances:
        taxonomy.hierarchy = resolve(taxonomy.tax_id)
    return instances
# Build and store the hierarchy of the taxonomy entries, keeping running totals
# in self.processed_tax, self.hierarchy_built and self.hierarchy_failed and
# logging progress and a final summary.
#
# Parameters:
#   chunk_size -- passed to dict_chunks() (and, in the per-item variant, to
#                 queryset.iterator()) to size each batch.
#   test       -- when True, the loop is left early through `break`.
#
# NOTE(review): this view has lost its indentation and diff markers, and two
# variants of the loop are interleaved: a per-item variant (iterating
# self.queryset, calling taxonomy.build_hierarchy(), counters advanced by 1)
# and a per-chunk variant (self.get_local_taxo(), dict_chunks(), bulk_update(),
# counters advanced by len(chunk)). Presumably the per-item statements are the
# removed side of the change -- confirm against the repository before editing.
def build_all(self, chunk_size=10000, test=False):
logger.info("Building all hierarchy for all %s taxonomy items...", self.total_tax)
# Per-item variant: stream the queryset one entry at a time.
for taxonomy in self.queryset.iterator(chunk_size=chunk_size):
# Per-chunk variant: load all entries into self.taxo_dict first, then walk it
# in batches. dict_chunks presumably yields sub-dicts of at most chunk_size
# entries (each chunk is used through .keys() and len()) -- verify in
# metagenedb.common.utils.chunks.
self.get_local_taxo()
for chunk in dict_chunks(self.taxo_dict, chunk_size):
try:
# Per-item variant: the computed hierarchy is bound to a local and not used
# further in the lines visible here.
hierarchy = taxonomy.build_hierarchy() # noqa
self.hierarchy_built += 1
except Exception:
self.hierarchy_failed += 1
self.processed_tax += 1
# Per-item variant: progress is reported every 10000 processed entries.
if self.processed_tax % 10000 == 0:
if test is True:
break
logger.info("%s/%s Taxonomy processed so far...", self.processed_tax, self.total_tax)
# Per-chunk variant: fetch the model rows for this chunk's tax_ids, set their
# hierarchy attribute, and write only the 'hierarchy' field back in bulk.
instances = Taxonomy.objects.filter(tax_id__in=chunk.keys())
instances = self.build_hierarchy(instances)
Taxonomy.objects.bulk_update(
instances,
['hierarchy']
)
self.hierarchy_built += len(chunk)
# Any error counts the whole chunk as failed and is logged as a warning; the
# exception is not re-raised.
except Exception as exception:
self.hierarchy_failed += len(chunk)
logger.warning("An error occured, chunk skipped %s", exception)
self.processed_tax += len(chunk)
logger.info("%s/%s Taxonomy processed so far...", self.processed_tax, self.total_tax)
# In test mode stop early; presumably after the first chunk -- the nesting level
# of this check cannot be determined from this view.
if test is True:
break
# Final summary of built and skipped hierarchies against the expected total.
logger.info("[DONE] %s/%s Hierarchy built.", self.hierarchy_built, self.total_tax)
logger.info("[DONE] %s/%s Hierarchy build skipped.", self.hierarchy_failed, self.total_tax)
......
from rest_framework.test import APITestCase
from metagenedb.apps.catalog.factory import TaxonomyFactory
from metagenedb.apps.catalog.models import Taxonomy
from .build_hierarchy import HierarchyBuilder
class TestBuildHierarchy(APITestCase):
    """Check that ``HierarchyBuilder.build_all`` stores the expected hierarchy."""

    @classmethod
    def setUpTestData(cls):
        """Create a three-level lineage: root -> KINGDOM -> PHYLUM."""
        root = TaxonomyFactory.create(tax_id="1", name="root", rank="no_rank")
        # The root entry is made its own parent and saved again.
        root.parent = root
        root.save()
        cls.root = root
        cls.kingdom = TaxonomyFactory(
            tax_id="2", name="KINGDOM", rank="kingdom", parent=cls.root,
        )
        cls.phylum = TaxonomyFactory(
            tax_id="3", name="PHYLUM", rank="phylum", parent=cls.kingdom,
        )

    def test_build_hierarchy(self):
        """The phylum ends up with itself and its kingdom, but not the root."""
        lineage = (('phylum', self.phylum), ('kingdom', self.kingdom))
        expected = {
            rank: {'tax_id': taxonomy.tax_id, 'name': taxonomy.name}
            for rank, taxonomy in lineage
        }
        # Nothing has been computed before the builder runs.
        self.assertIsNone(self.phylum.hierarchy)

        builder = HierarchyBuilder(Taxonomy.objects.select_related('parent'))
        builder.build_all()

        # Re-read the row to observe what was written to the database.
        refreshed = Taxonomy.objects.get(tax_id=self.phylum.tax_id)
        self.assertIsNotNone(refreshed.hierarchy)
        self.assertDictEqual(refreshed.hierarchy, expected)
......@@ -12,6 +12,13 @@ class SimpleTaxonomySerializer(serializers.ModelSerializer):
fields = ('tax_id', 'name')
class TaxonomyHierarchySerializer(serializers.ModelSerializer):
    """Model serializer exposing a Taxonomy entry's rank, name and parent."""

    class Meta:
        model = Taxonomy
        fields = ('rank', 'name', 'parent')
class TaxonomyListSerializer(BulkListSerializer):
class Meta:
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment