diff --git a/backend/metagenedb/api/catalog/views/taxonomy.py b/backend/metagenedb/api/catalog/views/taxonomy.py index 174b92310945a4196105fee23cbc3cb350835219..9b6f92ec0d64c057f3137a074ec2fe3acc5ba203 100644 --- a/backend/metagenedb/api/catalog/views/taxonomy.py +++ b/backend/metagenedb/api/catalog/views/taxonomy.py @@ -1,3 +1,5 @@ +from rest_framework.response import Response + from metagenedb.api.catalog.filters import TaxonomyFilter from metagenedb.apps.catalog.models import Taxonomy from metagenedb.apps.catalog.serializers import TaxonomySerializer @@ -10,3 +12,9 @@ class TaxonomyViewSet(BulkViewSet): serializer_class = TaxonomySerializer lookup_field = 'tax_id' filterset_class = TaxonomyFilter + + def retrieve(self, request, *args, **kwargs): + instance = self.get_object() + instance.build_parental_hierarchy() + serializer = self.get_serializer(instance) + return Response(serializer.data) diff --git a/backend/metagenedb/apps/catalog/models/taxonomy.py b/backend/metagenedb/apps/catalog/models/taxonomy.py index c10a5de240be671a3009b821a6ff198de3e26a82..bc3498db2dfa975726ce66941aa9ba127f23340d 100644 --- a/backend/metagenedb/apps/catalog/models/taxonomy.py +++ b/backend/metagenedb/apps/catalog/models/taxonomy.py @@ -100,9 +100,11 @@ class Taxonomy(models.Model): def build_parental_hierarchy(self): hierarchy = {} if self.name != 'root' and self.parent is not None: - hierarchy[self.rank] = self.tax_id + hierarchy[self.rank] = self hierarchy = {**hierarchy, **self.parent.build_parental_hierarchy()} - hierarchy['tax_id'] = self.tax_id + for level, value in hierarchy.items(): + setattr(self, level, value) + self.save() return hierarchy class Meta: diff --git a/backend/metagenedb/apps/catalog/models/test_taxonomy.py b/backend/metagenedb/apps/catalog/models/test_taxonomy.py index 141485119c0b315d64d856c17bd9e52c88a46f3a..a9e8cad224396a3d208a226ece1082de537592c4 100644 --- a/backend/metagenedb/apps/catalog/models/test_taxonomy.py +++ b/backend/metagenedb/apps/catalog/models/test_taxonomy.py @@ -1,38 +1,38 @@ -from unittest import TestCase +from rest_framework.test import APITestCase -from .taxonomy import Taxonomy +from metagenedb.apps.catalog.factory import TaxonomyFactory -class TestBuildHierarchy(TestCase): +class TestBuildHierarchy(APITestCase): - @classmethod - def setUpClass(cls): + def setUp(self): """ Build some test data for different tests """ - cls.root = Taxonomy( + self.root = TaxonomyFactory.create( tax_id="1", name="root", rank="no_rank", ) - cls.kingdom = Taxonomy( + self.kingdom = TaxonomyFactory( tax_id="2", name="KINGDOM", rank="kingdom", - parent=cls.root + parent=self.root ) - cls.phylum = Taxonomy( + self.phylum = TaxonomyFactory( tax_id="3", name="PHYLUM", rank="phylum", - parent=cls.kingdom + parent=self.kingdom ) def test_build_hierarchy(self): expected_dict = { - 'tax_id': '3', - 'phylum': '3', - 'kingdom': '2' + 'phylum': self.phylum, + 'kingdom': self.kingdom } + self.assertNotEqual(getattr(self.phylum, 'kingdom', None), self.kingdom) test_dict = self.phylum.build_parental_hierarchy() self.assertDictEqual(test_dict, expected_dict) + self.assertEqual(getattr(self.phylum, 'kingdom', None), self.kingdom) diff --git a/backend/scripts/populate_db/import_ncbi_taxonomy.py b/backend/scripts/populate_db/import_ncbi_taxonomy.py index d8db227e67e48ace973e681599904ac13b2ecb55..f4b64a74fd503eb15a0e146d57cdbf35e00a9355 100755 --- a/backend/scripts/populate_db/import_ncbi_taxonomy.py +++ b/backend/scripts/populate_db/import_ncbi_taxonomy.py @@ -5,6 +5,7 @@ import sys from itertools import islice from bioapi import MetageneDBCatalogTaxonomyAPI +from requests.exceptions import HTTPError from metagenedb.common.utils.parsers import NCBITaxonomyLineParser @@ -93,38 +94,28 @@ class ImportNCBITaxonomy(object): logger.info("[DONE] %s/%s Taxonomy updated.", self.updated_tax, self.total_tax) logger.info("[DONE] %s/%s Taxonomy skipped.", self.skipped_tax, self.total_tax) - -""" -_build_hierarchy and build_all_hierarchy need to be moved and executed through a specific endpoint -It will be much faster to build all the hierarchy from the backend server directly. - -def _build_hierarchy(taxo): - hierarchy = taxo.build_parental_hierarchy() - if 'class' in hierarchy.keys(): - hierarchy['class_rank'] = hierarchy.pop('class') - serializer = TaxonomySerializer(taxo, hierarchy) - if serializer.is_valid(): - serializer.save() - else: - logger.warning(f"Invalid data: {serializer.errors}. Building hierarchy skipped. Data: {serializer.data}") - - -def build_all_hierarchy(chunk_size=8000): - ''' - Uses class method from Taxonomy model to retrieve the parental hierarchy and - assign corresponding attribute to each entry. - ''' - logger.info(f"Linking taxonomy objects to parental nodes from direct parental nodes...") - all_taxo = Taxonomy.objects.select_related(SELECT_RELATED_PARENT).all() - cpt = 0 - for taxo in all_taxo.iterator(chunk_size=chunk_size): - _build_hierarchy(taxo) - cpt += 1 - if cpt % 10000 == 0: - logger.info(f"{cpt}/{all_taxo.count()} hierachies built...") - logger.info(f"[DONE] {cpt}/{all_taxo.count()} hierachies built.") - -""" + def build_all_hierarchy(self, chunk_size=1000): + """ + The hierarchy is automatically built when retrieving an taxonomy entry so we get all of them + """ + logger.info(f"Building hierarchy for all entries in %s...", self.tax_nodes_file) + with open(self.tax_nodes_file, "r") as f: + while True: + next_nodes = list(islice(f, chunk_size)) + if not next_nodes: + break + nodes = [NCBITaxonomyLineParser.node(i) for i in next_nodes] + for node in nodes: + try: + response = self.metagenedb_tax_api.get(node.get('tax_id')) # noqa + self.updated_tax += 1 + except HTTPError as http_error: + logger.warning(http_error) + self.skipped_tax += 1 + self.processed_tax += len(nodes) + logger.info("%s/%s Taxonomy processed so far...", self.processed_tax, self.total_tax) + logger.info("[DONE] %s/%s Hierarchy built.", self.updated_tax, self.total_tax) + logger.info("[DONE] %s/%s Taxonomy skipped.", self.skipped_tax, self.total_tax) def parse_arguments(): @@ -135,6 +126,8 @@ def parse_arguments(): # Common arguments for analysis and annotations parser.add_argument('--nodes', help='nodes.dmp file from ncbi_taxonomy', required=True) parser.add_argument('--names', help='names.dmp file from ncbi_taxonomy', required=True) + parser.add_argument('--skip_creation', action='store_true', help='Skip taxonomy creation.') + parser.add_argument('--skip_hierarchy', action='store_true', help='Skip taxonomy hierarchy built.') parser.add_argument('--url', help='base URL of the instance.', default='http://localhost/') parser.add_argument('-v', '--verbose', action='store_true') @@ -150,8 +143,11 @@ def run(): logger.setLevel(logging.INFO) import_ncbi_tax = ImportNCBITaxonomy(args.url, args.names, args.nodes) taxonomy_names = import_ncbi_tax.import_names() - import_ncbi_tax.create_taxo_nodes(taxonomy_names) - import_ncbi_tax.update_taxo_nodes() + if not args.skip_creation: + import_ncbi_tax.create_taxo_nodes(taxonomy_names) + import_ncbi_tax.update_taxo_nodes() + if not args.skip_hierarchy: + import_ncbi_tax.build_all_hierarchy() if __name__ == "__main__":