Commit 0345bdf9 authored by Kenzo-Hugo Hillion
Browse files

Merge branch '58-endpoint-hierarchy' into 'dev'

Add endpoint to generate hierarchy of taxonomy from the backend

Closes #58

See merge request !16
parents da536d6d 86e96103
Pipeline #17565 passed with stages
in 2 minutes and 29 seconds
from rest_framework.response import Response
from metagenedb.api.catalog.filters import TaxonomyFilter
from metagenedb.apps.catalog.models import Taxonomy
from metagenedb.apps.catalog.serializers import TaxonomySerializer
......@@ -10,3 +12,9 @@ class TaxonomyViewSet(BulkViewSet):
serializer_class = TaxonomySerializer
lookup_field = 'tax_id'
filterset_class = TaxonomyFilter
def retrieve(self, request, *args, **kwargs):
    """
    Return one taxonomy entry after (re)building its parental hierarchy.

    The entry resolved by ``get_object()`` gets its
    ``build_parental_hierarchy()`` method called before it is serialized, so
    the response is produced from the instance as it is after that call.
    """
    taxonomy = self.get_object()
    # Must run before serialization so the response is built afterwards.
    taxonomy.build_parental_hierarchy()
    return Response(self.get_serializer(taxonomy).data)
......@@ -100,9 +100,11 @@ class Taxonomy(models.Model):
def build_parental_hierarchy(self):
    """
    Build the parental hierarchy of this entry and store it on the instance.

    The hierarchy is collected by recursing through ``parent``: every entry
    that is not named ``'root'`` and has a parent contributes one
    ``{rank: entry}`` item. Each item of the resulting mapping is then set
    as an attribute on this entry, and the entry is saved.

    Returns:
        dict: ranks mapped to the corresponding entries, plus the key
        ``'tax_id'`` mapped to this entry's own ``tax_id``.
    """
    hierarchy = {}
    if self.name != 'root' and self.parent is not None:
        # The rank maps to the entry itself, not to its tax_id.
        hierarchy[self.rank] = self
        # The recursive call also sets attributes on, and saves, every
        # ancestor. On duplicate keys the ancestors' value wins the merge.
        hierarchy = {**hierarchy, **self.parent.build_parental_hierarchy()}
    # Assigned after the merge so the parent's 'tax_id' is overridden.
    hierarchy['tax_id'] = self.tax_id
    # NOTE(review): every rank name is used verbatim as an attribute name;
    # confirm each possible rank matches an attribute of the model.
    for level, value in hierarchy.items():
        setattr(self, level, value)
    self.save()
    return hierarchy
class Meta:
......
from unittest import TestCase
from rest_framework.test import APITestCase
from .taxonomy import Taxonomy
from metagenedb.apps.catalog.factory import TaxonomyFactory
class TestBuildHierarchy(APITestCase):
    """Check the hierarchy returned and assigned by ``build_parental_hierarchy``."""

    def setUp(self):
        """
        Build some test data for different tests
        """
        # Three-level chain: root <- kingdom <- phylum.
        self.root = TaxonomyFactory.create(
            tax_id="1",
            name="root",
            rank="no_rank",
        )
        self.kingdom = TaxonomyFactory(
            tax_id="2",
            name="KINGDOM",
            rank="kingdom",
            parent=self.root
        )
        self.phylum = TaxonomyFactory(
            tax_id="3",
            name="PHYLUM",
            rank="phylum",
            parent=self.kingdom
        )

    def test_build_hierarchy(self):
        """The phylum entry gets itself and its kingdom, but not the root."""
        expected_dict = {
            'tax_id': '3',
            'phylum': self.phylum,
            'kingdom': self.kingdom
        }
        # Before the build the phylum must not yet point to its kingdom.
        self.assertNotEqual(getattr(self.phylum, 'kingdom', None), self.kingdom)
        test_dict = self.phylum.build_parental_hierarchy()
        self.assertDictEqual(test_dict, expected_dict)
        # The build must also have set the ancestor on the instance itself.
        self.assertEqual(getattr(self.phylum, 'kingdom', None), self.kingdom)
......@@ -5,6 +5,7 @@ import sys
from itertools import islice
from bioapi import MetageneDBCatalogTaxonomyAPI
from requests.exceptions import HTTPError
from metagenedb.common.utils.parsers import NCBITaxonomyLineParser
......@@ -93,38 +94,28 @@ class ImportNCBITaxonomy(object):
logger.info("[DONE] %s/%s Taxonomy updated.", self.updated_tax, self.total_tax)
logger.info("[DONE] %s/%s Taxonomy skipped.", self.skipped_tax, self.total_tax)
"""
_build_hierarchy and build_all_hierarchy need to be moved and executed through a specific endpoint
It will be much faster to build all the hierarchy from the backend server directly.
def _build_hierarchy(taxo):
hierarchy = taxo.build_parental_hierarchy()
if 'class' in hierarchy.keys():
hierarchy['class_rank'] = hierarchy.pop('class')
serializer = TaxonomySerializer(taxo, hierarchy)
if serializer.is_valid():
serializer.save()
else:
logger.warning(f"Invalid data: {serializer.errors}. Building hierarchy skipped. Data: {serializer.data}")
def build_all_hierarchy(chunk_size=8000):
'''
Uses class method from Taxonomy model to retrieve the parental hierarchy and
assign corresponding attribute to each entry.
'''
logger.info(f"Linking taxonomy objects to parental nodes from direct parental nodes...")
all_taxo = Taxonomy.objects.select_related(SELECT_RELATED_PARENT).all()
cpt = 0
for taxo in all_taxo.iterator(chunk_size=chunk_size):
_build_hierarchy(taxo)
cpt += 1
if cpt % 10000 == 0:
logger.info(f"{cpt}/{all_taxo.count()} hierachies built...")
logger.info(f"[DONE] {cpt}/{all_taxo.count()} hierachies built.")
"""
def build_all_hierarchy(self, chunk_size=1000):
    """
    Build the hierarchy of every taxonomy entry listed in the nodes file.

    The hierarchy is automatically built when retrieving a taxonomy entry,
    so each tax_id found in ``self.tax_nodes_file`` is requested once
    through ``self.metagenedb_tax_api``. Requests failing with an
    ``HTTPError`` are logged and counted as skipped.

    Args:
        chunk_size: number of lines read from the nodes file per batch;
            progress is logged after each batch.
    """
    # NOTE(review): the counters are only incremented here, never reset;
    # confirm they start from zero when other import steps ran before.
    logger.info("Building hierarchy for all entries in %s...", self.tax_nodes_file)
    with open(self.tax_nodes_file, "r") as f:
        while True:
            # Read the file in batches to avoid loading it whole in memory.
            next_nodes = list(islice(f, chunk_size))
            if not next_nodes:
                break
            nodes = [NCBITaxonomyLineParser.node(i) for i in next_nodes]
            for node in nodes:
                try:
                    # Only the request itself matters; the response is unused.
                    self.metagenedb_tax_api.get(node.get('tax_id'))
                except HTTPError as http_error:
                    logger.warning(http_error)
                    self.skipped_tax += 1
                else:
                    self.updated_tax += 1
            self.processed_tax += len(nodes)
            logger.info("%s/%s Taxonomy processed so far...", self.processed_tax, self.total_tax)
    logger.info("[DONE] %s/%s Hierarchy built.", self.updated_tax, self.total_tax)
    logger.info("[DONE] %s/%s Taxonomy skipped.", self.skipped_tax, self.total_tax)
def parse_arguments():
......@@ -135,6 +126,8 @@ def parse_arguments():
# Common arguments for analysis and annotations
parser.add_argument('--nodes', help='nodes.dmp file from ncbi_taxonomy', required=True)
parser.add_argument('--names', help='names.dmp file from ncbi_taxonomy', required=True)
parser.add_argument('--skip_creation', action='store_true', help='Skip taxonomy creation.')
parser.add_argument('--skip_hierarchy', action='store_true', help='Skip taxonomy hierarchy built.')
parser.add_argument('--url', help='base URL of the instance.', default='http://localhost/')
parser.add_argument('-v', '--verbose', action='store_true')
......@@ -150,8 +143,11 @@ def run():
logger.setLevel(logging.INFO)
import_ncbi_tax = ImportNCBITaxonomy(args.url, args.names, args.nodes)
taxonomy_names = import_ncbi_tax.import_names()
import_ncbi_tax.create_taxo_nodes(taxonomy_names)
import_ncbi_tax.update_taxo_nodes()
if not args.skip_creation:
import_ncbi_tax.create_taxo_nodes(taxonomy_names)
import_ncbi_tax.update_taxo_nodes()
if not args.skip_hierarchy:
import_ncbi_tax.build_all_hierarchy()
if __name__ == "__main__":
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment