From bee34ed243b04e148416545c2fa5a195a9ad6400 Mon Sep 17 00:00:00 2001 From: Kenzo-Hugo Hillion <kenzo-hugo.hillion1@pasteur.fr> Date: Thu, 18 Jul 2019 17:35:09 +0200 Subject: [PATCH] add superkingdom and improve performance --- .../metagenedb/apps/catalog/admin/taxonomy.py | 2 +- .../migrations/0004_taxonomy_superkingdom.py | 19 +++++++++++ .../apps/catalog/models/taxonomy.py | 7 +++- .../apps/catalog/serializers/taxonomy.py | 7 +++- .../populate_db/import_ncbi_taxonomy.py | 33 ++++++++++++------- docker-compose.yaml | 1 + 6 files changed, 55 insertions(+), 14 deletions(-) create mode 100644 backend/metagenedb/apps/catalog/migrations/0004_taxonomy_superkingdom.py diff --git a/backend/metagenedb/apps/catalog/admin/taxonomy.py b/backend/metagenedb/apps/catalog/admin/taxonomy.py index 79090d3..633f57b 100644 --- a/backend/metagenedb/apps/catalog/admin/taxonomy.py +++ b/backend/metagenedb/apps/catalog/admin/taxonomy.py @@ -7,7 +7,7 @@ from metagenedb.apps.catalog.models import Taxonomy class TaxonomyAdmin(admin.ModelAdmin): list_display = ( - 'tax_id', 'name', 'rank', + 'tax_id', 'name', 'rank', 'superkingdom', 'kingdom', 'phylum', 'class_rank', 'order', 'family', 'genus', 'species', ) search_fields = ('tax_id', 'name') diff --git a/backend/metagenedb/apps/catalog/migrations/0004_taxonomy_superkingdom.py b/backend/metagenedb/apps/catalog/migrations/0004_taxonomy_superkingdom.py new file mode 100644 index 0000000..4e7649b --- /dev/null +++ b/backend/metagenedb/apps/catalog/migrations/0004_taxonomy_superkingdom.py @@ -0,0 +1,19 @@ +# Generated by Django 2.2.1 on 2019-07-18 09:34 + +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + + dependencies = [ + ('catalog', '0003_auto_20190717_1551'), + ] + + operations = [ + migrations.AddField( + model_name='taxonomy', + name='superkingdom', + field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, 
related_name='superkingdom_children', to='catalog.Taxonomy'), + ), + ] diff --git a/backend/metagenedb/apps/catalog/models/taxonomy.py b/backend/metagenedb/apps/catalog/models/taxonomy.py index ccd1135..fa3a096 100644 --- a/backend/metagenedb/apps/catalog/models/taxonomy.py +++ b/backend/metagenedb/apps/catalog/models/taxonomy.py @@ -52,6 +52,11 @@ class Taxonomy(models.Model): null=True, blank=True, ) + superkingdom = models.ForeignKey( + 'Taxonomy', related_name='superkingdom_children', + on_delete=models.SET_NULL, + null=True, blank=True, + ) kingdom = models.ForeignKey( 'Taxonomy', related_name='kingdom_children', on_delete=models.SET_NULL, @@ -74,7 +79,7 @@ class Taxonomy(models.Model): null=True, blank=True, ) family = models.ForeignKey( - 'Taxonomy', related_name='familyphy_children', + 'Taxonomy', related_name='family_children', on_delete=models.SET_NULL, null=True, blank=True, ) diff --git a/backend/metagenedb/apps/catalog/serializers/taxonomy.py b/backend/metagenedb/apps/catalog/serializers/taxonomy.py index 87816c3..32c230c 100644 --- a/backend/metagenedb/apps/catalog/serializers/taxonomy.py +++ b/backend/metagenedb/apps/catalog/serializers/taxonomy.py @@ -10,6 +10,11 @@ class TaxonomySerializer(serializers.ModelSerializer): source='parent', required=False, ) + superkingdom = serializers.SlugRelatedField( + queryset=Taxonomy.objects.all(), + slug_field='tax_id', + required=False + ) kingdom = serializers.SlugRelatedField( queryset=Taxonomy.objects.all(), slug_field='tax_id', @@ -49,6 +54,6 @@ class TaxonomySerializer(serializers.ModelSerializer): class Meta: model = Taxonomy fields = ( - 'tax_id', 'name', 'rank', 'parent_tax_id', + 'tax_id', 'name', 'rank', 'parent_tax_id', 'superkingdom', 'kingdom', 'phylum', 'class_rank', 'order', 'family', 'genus', 'species', ) diff --git a/backend/scripts/populate_db/import_ncbi_taxonomy.py b/backend/scripts/populate_db/import_ncbi_taxonomy.py index 3206c02..cc8b8eb 100755 --- 
a/backend/scripts/populate_db/import_ncbi_taxonomy.py +++ b/backend/scripts/populate_db/import_ncbi_taxonomy.py @@ -18,6 +18,8 @@ from metagenedb.apps.catalog.serializers import TaxonomySerializer # noqa logging.basicConfig(level=logging.INFO) _LOGGER = logging.getLogger(__name__) +SELECT_RELATED_PARENT = "parent{}".format("__parent" * 40) + def import_names(taxonomy_names_file, select_class="scientific name"): """ @@ -62,21 +64,30 @@ def update_taxo_nodes(taxonomy_nodes_file): _LOGGER.warning(f"Invalid data: {serializer.errors}. Link to parent skipped. Data: {serializer.data}") -def build_hierarchy(): +def _build_hierarchy(taxo): + hierarchy = taxo.build_parental_hierarchy() + if 'class' in hierarchy.keys(): + hierarchy['class_rank'] = hierarchy.pop('class') + serializer = TaxonomySerializer(taxo, hierarchy) + if serializer.is_valid(): + serializer.save() + else: + _LOGGER.warning(f"Invalid data: {serializer.errors}. Building hierarchy skipped. Data: {serializer.data}") + + +def build_all_hierarchy(chunk_size=8000): """ Uses class method from Taxonomy model to retrieve the parental hierarchy and assign corresponding attribute to each entry. """ _LOGGER.info(f"Linking taxonomy objects to parental nodes from direct parental nodes...") - for taxo in Taxonomy.objects.all(): - hierarchy = taxo.build_parental_hierarchy() - if 'class' in hierarchy.keys(): - hierarchy['class_rank'] = hierarchy.pop('class') - serializer = TaxonomySerializer(taxo, hierarchy) - if serializer.is_valid(): - serializer.save() - else: - _LOGGER.warning(f"Invalid data: {serializer.errors}. Building hierarchy skipped. 
Data: {serializer.data}") + all_taxo = Taxonomy.objects.select_related(SELECT_RELATED_PARENT).all() + cpt = 0 + for taxo in all_taxo.iterator(chunk_size=chunk_size): + _build_hierarchy(taxo) + cpt += 1 + if cpt % 10000 == 0: + _LOGGER.info(f"{cpt}/{all_taxo.count()} hierachies built...") def parse_arguments(): @@ -99,7 +110,7 @@ def run(): taxonomy_names = import_names(args.names) create_taxo_nodes(args.nodes, taxonomy_names) update_taxo_nodes(args.nodes) - build_hierarchy() + build_all_hierarchy() if __name__ == "__main__": diff --git a/docker-compose.yaml b/docker-compose.yaml index 40ead5c..c07eb43 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -20,6 +20,7 @@ services: - main db: + shm_size: '2gb' container_name: db image: postgres:latest ports: -- GitLab