Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Metagenomics
metagenedb
Commits
0ea79473
Commit
0ea79473
authored
Nov 04, 2019
by
Kenzo-Hugo Hillion
♻
Browse files
add step to build hierarchy in script
parent
f4593166
Changes
1
Hide whitespace changes
Inline
Side-by-side
backend/scripts/populate_db/import_ncbi_taxonomy.py
View file @
0ea79473
...
...
@@ -5,6 +5,7 @@ import sys
from
itertools
import
islice
from
bioapi
import
MetageneDBCatalogTaxonomyAPI
from
requests.exceptions
import
HTTPError
from
metagenedb.common.utils.parsers
import
NCBITaxonomyLineParser
...
...
@@ -93,38 +94,28 @@ class ImportNCBITaxonomy(object):
logger
.
info
(
"[DONE] %s/%s Taxonomy updated."
,
self
.
updated_tax
,
self
.
total_tax
)
logger
.
info
(
"[DONE] %s/%s Taxonomy skipped."
,
self
.
skipped_tax
,
self
.
total_tax
)
"""
_build_hierarchy and build_all_hierarchy need to be moved and executed through a specific endpoint
It will be much faster to build all the hierarchy from the backend server directly.
def _build_hierarchy(taxo):
hierarchy = taxo.build_parental_hierarchy()
if 'class' in hierarchy.keys():
hierarchy['class_rank'] = hierarchy.pop('class')
serializer = TaxonomySerializer(taxo, hierarchy)
if serializer.is_valid():
serializer.save()
else:
logger.warning(f"Invalid data: {serializer.errors}. Building hierarchy skipped. Data: {serializer.data}")
def build_all_hierarchy(chunk_size=8000):
'''
Uses class method from Taxonomy model to retrieve the parental hierarchy and
assign corresponding attribute to each entry.
'''
logger.info(f"Linking taxonomy objects to parental nodes from direct parental nodes...")
all_taxo = Taxonomy.objects.select_related(SELECT_RELATED_PARENT).all()
cpt = 0
for taxo in all_taxo.iterator(chunk_size=chunk_size):
_build_hierarchy(taxo)
cpt += 1
if cpt % 10000 == 0:
logger.info(f"{cpt}/{all_taxo.count()} hierachies built...")
logger.info(f"[DONE] {cpt}/{all_taxo.count()} hierachies built.")
"""
def build_all_hierarchy(self, chunk_size=1000):
    """
    Build the hierarchy for every taxonomy entry listed in the nodes file.

    The hierarchy is automatically built when retrieving a taxonomy entry,
    so every entry is requested once through the API and the response
    itself is discarded.

    The nodes file is read `chunk_size` lines at a time; progress is logged
    after each chunk and a summary is logged once the file is exhausted.

    NOTE(review): `updated_tax`, `skipped_tax` and `processed_tax` are only
    incremented here, never reset. If earlier import steps use the same
    counters, the totals logged below include them -- confirm the counters
    are reset before this method is called.

    :param chunk_size: number of lines of the nodes file handled per batch.
    """
    logger.info("Building hierarchy for all entries in %s...", self.tax_nodes_file)
    with open(self.tax_nodes_file, "r") as f:
        while True:
            next_nodes = list(islice(f, chunk_size))
            if not next_nodes:
                break
            nodes = [NCBITaxonomyLineParser.node(i) for i in next_nodes]
            for node in nodes:
                try:
                    # Only the request matters, the payload is not used.
                    self.metagenedb_tax_api.get(node.get('tax_id'))
                except HTTPError as http_error:
                    logger.warning(http_error)
                    self.skipped_tax += 1
                else:
                    self.updated_tax += 1
            self.processed_tax += len(nodes)
            logger.info("%s/%s Taxonomy processed so far...", self.processed_tax, self.total_tax)
    logger.info("[DONE] %s/%s Hierarchy built.", self.updated_tax, self.total_tax)
    logger.info("[DONE] %s/%s Taxonomy skipped.", self.skipped_tax, self.total_tax)
def
parse_arguments
():
...
...
@@ -152,6 +143,7 @@ def run():
taxonomy_names
=
import_ncbi_tax
.
import_names
()
import_ncbi_tax
.
create_taxo_nodes
(
taxonomy_names
)
import_ncbi_tax
.
update_taxo_nodes
()
import_ncbi_tax
.
build_all_hierarchy
()
if
__name__
==
"__main__"
:
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment