Commit 0ea79473 authored by Kenzo-Hugo Hillion's avatar Kenzo-Hugo Hillion
Browse files

add step to build hierarchy in script

parent f4593166
......@@ -5,6 +5,7 @@ import sys
from itertools import islice
from bioapi import MetageneDBCatalogTaxonomyAPI
from requests.exceptions import HTTPError
from metagenedb.common.utils.parsers import NCBITaxonomyLineParser
......@@ -93,38 +94,28 @@ class ImportNCBITaxonomy(object):
logger.info("[DONE] %s/%s Taxonomy updated.", self.updated_tax, self.total_tax)
logger.info("[DONE] %s/%s Taxonomy skipped.", self.skipped_tax, self.total_tax)
"""
_build_hierarchy and build_all_hierarchy need to be moved and executed through a specific endpoint
It will be much faster to build all the hierarchy from the backend server directly.
def _build_hierarchy(taxo):
hierarchy = taxo.build_parental_hierarchy()
if 'class' in hierarchy.keys():
hierarchy['class_rank'] = hierarchy.pop('class')
serializer = TaxonomySerializer(taxo, hierarchy)
if serializer.is_valid():
serializer.save()
else:
logger.warning(f"Invalid data: {serializer.errors}. Building hierarchy skipped. Data: {serializer.data}")
def build_all_hierarchy(chunk_size=8000):
'''
Uses class method from Taxonomy model to retrieve the parental hierarchy and
assign corresponding attribute to each entry.
'''
logger.info(f"Linking taxonomy objects to parental nodes from direct parental nodes...")
all_taxo = Taxonomy.objects.select_related(SELECT_RELATED_PARENT).all()
cpt = 0
for taxo in all_taxo.iterator(chunk_size=chunk_size):
_build_hierarchy(taxo)
cpt += 1
if cpt % 10000 == 0:
logger.info(f"{cpt}/{all_taxo.count()} hierachies built...")
logger.info(f"[DONE] {cpt}/{all_taxo.count()} hierachies built.")
"""
def build_all_hierarchy(self, chunk_size=1000):
    """
    Build the hierarchy of every taxonomy entry listed in the nodes file.

    The hierarchy is automatically built when retrieving a taxonomy entry, so
    each entry of ``self.tax_nodes_file`` is simply requested once through the API.
    Entries whose request raises an ``HTTPError`` are logged and counted as skipped.

    :param chunk_size: number of lines read from the nodes file between two
        progress log messages.
    """
    # These counters are also used by the other import steps (the update step
    # logs the same attributes). Start from zero so the totals reported below
    # describe this step only instead of accumulating earlier counts.
    self.processed_tax = 0
    self.updated_tax = 0
    self.skipped_tax = 0
    logger.info("Building hierarchy for all entries in %s...", self.tax_nodes_file)
    with open(self.tax_nodes_file, "r") as f:
        while True:
            # Read the file chunk by chunk so progress can be reported regularly.
            next_nodes = list(islice(f, chunk_size))
            if not next_nodes:
                break
            nodes = [NCBITaxonomyLineParser.node(i) for i in next_nodes]
            for node in nodes:
                try:
                    # The response body is not needed: retrieving the entry is
                    # what makes the backend build its hierarchy.
                    self.metagenedb_tax_api.get(node.get('tax_id'))
                except HTTPError as http_error:
                    logger.warning(http_error)
                    self.skipped_tax += 1
                else:
                    self.updated_tax += 1
            self.processed_tax += len(nodes)
            logger.info("%s/%s Taxonomy processed so far...", self.processed_tax, self.total_tax)
    logger.info("[DONE] %s/%s Hierarchy built.", self.updated_tax, self.total_tax)
    logger.info("[DONE] %s/%s Taxonomy skipped.", self.skipped_tax, self.total_tax)
def parse_arguments():
......@@ -152,6 +143,7 @@ def run():
taxonomy_names = import_ncbi_tax.import_names()
import_ncbi_tax.create_taxo_nodes(taxonomy_names)
import_ncbi_tax.update_taxo_nodes()
import_ncbi_tax.build_all_hierarchy()
if __name__ == "__main__":
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment