Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Metagenomics
metagenedb
Commits
531675bf
Commit
531675bf
authored
Jan 06, 2020
by
Kenzo-Hugo Hillion
♻
Browse files
add more efficient way of building hierarchy
parent
92b47fd0
Pipeline
#21438
passed with stages
in 2 minutes and 45 seconds
Changes
3
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
backend/metagenedb/apps/catalog/management/commands/build_hierarchy.py
View file @
531675bf
...
...
@@ -3,13 +3,12 @@ import logging
from
django.core.management.base
import
BaseCommand
from
metagenedb.apps.catalog.models
import
Taxonomy
from
metagenedb.common.utils.profiling
import
profile
from
metagenedb.common.utils.chunks
import
dict_chunks
# Configure logging so that each record shows a timestamp, its level and the
# name of the logger that emitted it.
logging.basicConfig(
    format='[%(asctime)s] %(levelname)s:%(name)s:%(message)s',
)
# Module-level logger shared by everything defined in this command module.
logger = logging.getLogger(__name__)
SELECT_RELATED_PARENT
=
"parent
{}"
.
format
(
"__parent"
*
15
)
SELECT_RELATED_PARENT
=
"parent
"
class
HierarchyBuilder
:
...
...
@@ -21,20 +20,48 @@ class HierarchyBuilder:
self
.
hierarchy_built
=
0
self
.
hierarchy_failed
=
0
@
profile
(
'/Users/khillion/Sandbox/tax_only_many_parents.prof'
)
def
build_all
(
self
,
chunk_size
=
8000
,
test
=
False
):
def get_local_taxo(self):
    """Load all taxonomy entries of the queryset into an in-memory table.

    The table is stored on ``self.taxo_dict``. It maps the ``tax_id`` of each
    entry yielded by ``self.queryset`` to a dict with three keys: ``'name'``,
    ``'rank'`` and ``'parent'`` (the ``tax_id`` of the entry's parent).
    """
    logger.info("Building local db of all taxonomy entries...")
    local_db = {}
    for entry in self.queryset.iterator(chunk_size=10000):
        local_db[entry.tax_id] = {
            'name': entry.name,
            'rank': entry.rank,
            'parent': entry.parent.tax_id,
        }
    # Only publish the table once every entry has been read successfully.
    self.taxo_dict = local_db
    logger.info("[DONE] Local db of all taxonomy entries.")
def _build_instance_hierarchy(self, tax_id):
    """Return the lineage of ``tax_id`` as a dict keyed by rank.

    The lookup uses ``self.taxo_dict`` and walks up the ``'parent'`` links
    recursively until the entry named ``'root'`` is reached; the root itself
    is not part of the result. Each value is a dict with the ``'tax_id'`` and
    ``'name'`` of the entry holding that rank. When two entries of the
    lineage share a rank, the one closer to the root wins.
    """
    node = self.taxo_dict[tax_id]
    if node['name'] == 'root':
        # The root terminates the walk and contributes nothing.
        return {}
    lineage = {
        node['rank']: {'tax_id': tax_id, 'name': node['name']},
    }
    # Ancestors are merged on top of the current entry.
    lineage.update(self._build_instance_hierarchy(node['parent']))
    return lineage
def build_hierarchy(self, instances):
    """Set the ``hierarchy`` attribute on every item of ``instances``.

    Each item receives the result of ``_build_instance_hierarchy`` for its
    own ``tax_id``. Nothing is saved here; the same ``instances`` object that
    was passed in is returned.
    """
    compute_lineage = self._build_instance_hierarchy
    for taxon in instances:
        taxon.hierarchy = compute_lineage(taxon.tax_id)
    return instances
def
build_all
(
self
,
chunk_size
=
10000
,
test
=
False
):
logger
.
info
(
"Building all hierarchy for all %s taxonomy items..."
,
self
.
total_tax
)
for
taxonomy
in
self
.
queryset
.
iterator
(
chunk_size
=
chunk_size
):
self
.
get_local_taxo
()
for
chunk
in
dict_chunks
(
self
.
taxo_dict
,
chunk_size
):
try
:
hierarchy
=
taxonomy
.
build_hierarchy
()
# noqa
self
.
hierarchy_built
+=
1
except
Exception
:
self
.
hierarchy_failed
+=
1
self
.
processed_tax
+=
1
if
self
.
processed_tax
%
10000
==
0
:
if
test
is
True
:
break
logger
.
info
(
"%s/%s Taxonomy processed so far..."
,
self
.
processed_tax
,
self
.
total_tax
)
instances
=
Taxonomy
.
objects
.
filter
(
tax_id__in
=
chunk
.
keys
())
instances
=
self
.
build_hierarchy
(
instances
)
Taxonomy
.
objects
.
bulk_update
(
instances
,
[
'hierarchy'
]
)
self
.
hierarchy_built
+=
len
(
chunk
)
except
Exception
as
exception
:
self
.
hierarchy_failed
+=
len
(
chunk
)
logger
.
warning
(
"An error occured, chunk skipped %s"
,
exception
)
self
.
processed_tax
+=
len
(
chunk
)
logger
.
info
(
"%s/%s Taxonomy processed so far..."
,
self
.
processed_tax
,
self
.
total_tax
)
if
test
is
True
:
break
logger
.
info
(
"[DONE] %s/%s Hierarchy built."
,
self
.
hierarchy_built
,
self
.
total_tax
)
logger
.
info
(
"[DONE] %s/%s Hierarchy build skipped."
,
self
.
hierarchy_failed
,
self
.
total_tax
)
...
...
backend/metagenedb/apps/catalog/management/commands/test_build_hierarchy.py
0 → 100644
View file @
531675bf
from
rest_framework.test
import
APITestCase
from
metagenedb.apps.catalog.factory
import
TaxonomyFactory
from
metagenedb.apps.catalog.models
import
Taxonomy
from
.build_hierarchy
import
HierarchyBuilder
class TestBuildHierarchy(APITestCase):
    """Check that ``HierarchyBuilder.build_all`` fills in ``hierarchy``."""

    @classmethod
    def setUpTestData(cls):
        """
        Create a small taxonomy tree: root <- kingdom <- phylum
        """
        cls.root = TaxonomyFactory.create(
            tax_id="1",
            name="root",
            rank="no_rank",
        )
        # The root entry is made its own parent.
        cls.root.parent = cls.root
        cls.root.save()
        cls.kingdom = TaxonomyFactory(
            tax_id="2",
            name="KINGDOM",
            rank="kingdom",
            parent=cls.root,
        )
        cls.phylum = TaxonomyFactory(
            tax_id="3",
            name="PHYLUM",
            rank="phylum",
            parent=cls.kingdom,
        )

    def test_build_hierarchy(self):
        # The root is expected to be absent from the stored lineage.
        expected = {
            'phylum': {
                'tax_id': self.phylum.tax_id,
                'name': self.phylum.name,
            },
            'kingdom': {
                'tax_id': self.kingdom.tax_id,
                'name': self.kingdom.name,
            },
        }
        self.assertIsNone(self.phylum.hierarchy)
        builder = HierarchyBuilder(Taxonomy.objects.select_related('parent'))
        builder.build_all()
        # Re-read from the database to observe what build_all persisted.
        refreshed = Taxonomy.objects.get(tax_id=self.phylum.tax_id)
        self.assertIsNotNone(refreshed.hierarchy)
        self.assertDictEqual(refreshed.hierarchy, expected)
backend/metagenedb/apps/catalog/serializers/taxonomy.py
View file @
531675bf
...
...
@@ -12,6 +12,13 @@ class SimpleTaxonomySerializer(serializers.ModelSerializer):
fields
=
(
'tax_id'
,
'name'
)
class TaxonomyHierarchySerializer(serializers.ModelSerializer):
    """Model serializer exposing ``rank``, ``name`` and ``parent`` of a ``Taxonomy``."""

    class Meta:
        model = Taxonomy
        fields = (
            'rank',
            'name',
            'parent',
        )
class
TaxonomyListSerializer
(
BulkListSerializer
):
class
Meta
:
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment