Skip to content
Snippets Groups Projects
Commit 0345bdf9 authored by Kenzo-Hugo Hillion :recycle:
Browse files

Merge branch '58-endpoint-hierarchy' into 'dev'

Add endpoint to generate hierarchy of taxonomy from the backend

Closes #58

See merge request !16
parents da536d6d 86e96103
No related branches found
No related tags found
2 merge requests: !59 Prod, !16 Add endpoint to generate hierarchy of taxonomy from the backend
Pipeline #17565 passed
from rest_framework.response import Response
from metagenedb.api.catalog.filters import TaxonomyFilter from metagenedb.api.catalog.filters import TaxonomyFilter
from metagenedb.apps.catalog.models import Taxonomy from metagenedb.apps.catalog.models import Taxonomy
from metagenedb.apps.catalog.serializers import TaxonomySerializer from metagenedb.apps.catalog.serializers import TaxonomySerializer
...@@ -10,3 +12,9 @@ class TaxonomyViewSet(BulkViewSet): ...@@ -10,3 +12,9 @@ class TaxonomyViewSet(BulkViewSet):
serializer_class = TaxonomySerializer serializer_class = TaxonomySerializer
lookup_field = 'tax_id' lookup_field = 'tax_id'
filterset_class = TaxonomyFilter filterset_class = TaxonomyFilter
def retrieve(self, request, *args, **kwargs):
    """
    Return a single taxonomy entry after building its parental hierarchy.

    The object is looked up with ``get_object()``, then
    ``build_parental_hierarchy()`` is called on it before serialization.
    NOTE(review): in the model version shown in this diff that method sets
    the rank attributes on the instance and saves it, so this read endpoint
    appears to write to the database -- confirm this is intended.
    """
    taxonomy = self.get_object()
    # Called for its effect on the instance; the returned dict is not used.
    taxonomy.build_parental_hierarchy()
    return Response(self.get_serializer(taxonomy).data)
...@@ -100,9 +100,11 @@ class Taxonomy(models.Model): ...@@ -100,9 +100,11 @@ class Taxonomy(models.Model):
def build_parental_hierarchy(self): def build_parental_hierarchy(self):
hierarchy = {} hierarchy = {}
if self.name != 'root' and self.parent is not None: if self.name != 'root' and self.parent is not None:
hierarchy[self.rank] = self.tax_id hierarchy[self.rank] = self
hierarchy = {**hierarchy, **self.parent.build_parental_hierarchy()} hierarchy = {**hierarchy, **self.parent.build_parental_hierarchy()}
hierarchy['tax_id'] = self.tax_id for level, value in hierarchy.items():
setattr(self, level, value)
self.save()
return hierarchy return hierarchy
class Meta: class Meta:
......
from unittest import TestCase from rest_framework.test import APITestCase
from .taxonomy import Taxonomy from metagenedb.apps.catalog.factory import TaxonomyFactory
class TestBuildHierarchy(TestCase): class TestBuildHierarchy(APITestCase):
@classmethod def setUp(self):
def setUpClass(cls):
""" """
Build some test data for different tests Build some test data for different tests
""" """
cls.root = Taxonomy( self.root = TaxonomyFactory.create(
tax_id="1", tax_id="1",
name="root", name="root",
rank="no_rank", rank="no_rank",
) )
cls.kingdom = Taxonomy( self.kingdom = TaxonomyFactory(
tax_id="2", tax_id="2",
name="KINGDOM", name="KINGDOM",
rank="kingdom", rank="kingdom",
parent=cls.root parent=self.root
) )
cls.phylum = Taxonomy( self.phylum = TaxonomyFactory(
tax_id="3", tax_id="3",
name="PHYLUM", name="PHYLUM",
rank="phylum", rank="phylum",
parent=cls.kingdom parent=self.kingdom
) )
def test_build_hierarchy(self): def test_build_hierarchy(self):
expected_dict = { expected_dict = {
'tax_id': '3', 'phylum': self.phylum,
'phylum': '3', 'kingdom': self.kingdom
'kingdom': '2'
} }
self.assertNotEqual(getattr(self.phylum, 'kingdom', None), self.kingdom)
test_dict = self.phylum.build_parental_hierarchy() test_dict = self.phylum.build_parental_hierarchy()
self.assertDictEqual(test_dict, expected_dict) self.assertDictEqual(test_dict, expected_dict)
self.assertEqual(getattr(self.phylum, 'kingdom', None), self.kingdom)
...@@ -5,6 +5,7 @@ import sys ...@@ -5,6 +5,7 @@ import sys
from itertools import islice from itertools import islice
from bioapi import MetageneDBCatalogTaxonomyAPI from bioapi import MetageneDBCatalogTaxonomyAPI
from requests.exceptions import HTTPError
from metagenedb.common.utils.parsers import NCBITaxonomyLineParser from metagenedb.common.utils.parsers import NCBITaxonomyLineParser
...@@ -93,38 +94,28 @@ class ImportNCBITaxonomy(object): ...@@ -93,38 +94,28 @@ class ImportNCBITaxonomy(object):
logger.info("[DONE] %s/%s Taxonomy updated.", self.updated_tax, self.total_tax) logger.info("[DONE] %s/%s Taxonomy updated.", self.updated_tax, self.total_tax)
logger.info("[DONE] %s/%s Taxonomy skipped.", self.skipped_tax, self.total_tax) logger.info("[DONE] %s/%s Taxonomy skipped.", self.skipped_tax, self.total_tax)
def build_all_hierarchy(self, chunk_size=1000):
""" """
_build_hierarchy and build_all_hierarchy need to be moved and executed through a specific endpoint The hierarchy is automatically built when retrieving an taxonomy entry so we get all of them
It will be much faster to build all the hierarchy from the backend server directly. """
logger.info(f"Building hierarchy for all entries in %s...", self.tax_nodes_file)
def _build_hierarchy(taxo): with open(self.tax_nodes_file, "r") as f:
hierarchy = taxo.build_parental_hierarchy() while True:
if 'class' in hierarchy.keys(): next_nodes = list(islice(f, chunk_size))
hierarchy['class_rank'] = hierarchy.pop('class') if not next_nodes:
serializer = TaxonomySerializer(taxo, hierarchy) break
if serializer.is_valid(): nodes = [NCBITaxonomyLineParser.node(i) for i in next_nodes]
serializer.save() for node in nodes:
else: try:
logger.warning(f"Invalid data: {serializer.errors}. Building hierarchy skipped. Data: {serializer.data}") response = self.metagenedb_tax_api.get(node.get('tax_id')) # noqa
self.updated_tax += 1
except HTTPError as http_error:
def build_all_hierarchy(chunk_size=8000): logger.warning(http_error)
''' self.skipped_tax += 1
Uses class method from Taxonomy model to retrieve the parental hierarchy and self.processed_tax += len(nodes)
assign corresponding attribute to each entry. logger.info("%s/%s Taxonomy processed so far...", self.processed_tax, self.total_tax)
''' logger.info("[DONE] %s/%s Hierarchy built.", self.updated_tax, self.total_tax)
logger.info(f"Linking taxonomy objects to parental nodes from direct parental nodes...") logger.info("[DONE] %s/%s Taxonomy skipped.", self.skipped_tax, self.total_tax)
all_taxo = Taxonomy.objects.select_related(SELECT_RELATED_PARENT).all()
cpt = 0
for taxo in all_taxo.iterator(chunk_size=chunk_size):
_build_hierarchy(taxo)
cpt += 1
if cpt % 10000 == 0:
logger.info(f"{cpt}/{all_taxo.count()} hierachies built...")
logger.info(f"[DONE] {cpt}/{all_taxo.count()} hierachies built.")
"""
def parse_arguments(): def parse_arguments():
...@@ -135,6 +126,8 @@ def parse_arguments(): ...@@ -135,6 +126,8 @@ def parse_arguments():
# Common arguments for analysis and annotations # Common arguments for analysis and annotations
parser.add_argument('--nodes', help='nodes.dmp file from ncbi_taxonomy', required=True) parser.add_argument('--nodes', help='nodes.dmp file from ncbi_taxonomy', required=True)
parser.add_argument('--names', help='names.dmp file from ncbi_taxonomy', required=True) parser.add_argument('--names', help='names.dmp file from ncbi_taxonomy', required=True)
parser.add_argument('--skip_creation', action='store_true', help='Skip taxonomy creation.')
parser.add_argument('--skip_hierarchy', action='store_true', help='Skip taxonomy hierarchy built.')
parser.add_argument('--url', help='base URL of the instance.', default='http://localhost/') parser.add_argument('--url', help='base URL of the instance.', default='http://localhost/')
parser.add_argument('-v', '--verbose', action='store_true') parser.add_argument('-v', '--verbose', action='store_true')
...@@ -150,8 +143,11 @@ def run(): ...@@ -150,8 +143,11 @@ def run():
logger.setLevel(logging.INFO) logger.setLevel(logging.INFO)
import_ncbi_tax = ImportNCBITaxonomy(args.url, args.names, args.nodes) import_ncbi_tax = ImportNCBITaxonomy(args.url, args.names, args.nodes)
taxonomy_names = import_ncbi_tax.import_names() taxonomy_names = import_ncbi_tax.import_names()
import_ncbi_tax.create_taxo_nodes(taxonomy_names) if not args.skip_creation:
import_ncbi_tax.update_taxo_nodes() import_ncbi_tax.create_taxo_nodes(taxonomy_names)
import_ncbi_tax.update_taxo_nodes()
if not args.skip_hierarchy:
import_ncbi_tax.build_all_hierarchy()
if __name__ == "__main__": if __name__ == "__main__":
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment