From f00c7a440023d5a75632e0947b558ed660ebbfa2 Mon Sep 17 00:00:00 2001
From: Kenzo-Hugo Hillion <kenzo-hugo.hillion1@pasteur.fr>
Date: Wed, 17 Jul 2019 14:31:55 +0200
Subject: [PATCH] Add Taxonomy model and script to import from local files

---
Reviewer notes (this section is ignored by git am):
- gene.py imports FunctionSerializer from .function instead of from the
  serializers package that is still being initialised at that point.
- The new serializer modules now end with a newline.
- import_ncbi_taxonomy.py: missing names fall back to Taxonomy.NAME_DEFAULT,
  entries skipped at creation no longer abort the linking pass, and --help
  exits with status 0.
- Blob ids on the "index" lines of the amended files were not recomputed.

 .../metagenedb/apps/catalog/admin/__init__.py |   3 +-
 .../metagenedb/apps/catalog/admin/taxonomy.py |  10 +++
 .../apps/catalog/migrations/0002_taxonomy.py  |  27 ++++++
 .../apps/catalog/models/__init__.py           |   3 +-
 .../apps/catalog/models/taxonomy.py           |  60 +++++++++++++
 .../apps/catalog/serializers/__init__.py      |   6 ++
 .../apps/catalog/serializers/function.py      |   8 ++
 .../apps/catalog/serializers/gene.py          |  11 +++
 .../apps/catalog/serializers/taxonomy.py      |  15 ++++
 .../apps/catalog/views/insertion_model.py     |   6 +-
 .../populate_db/import_ncbi_taxonomy.py       | 103 ++++++++++++++++++++++
 11 files changed, 249 insertions(+), 3 deletions(-)
 create mode 100644 backend/metagenedb/apps/catalog/admin/taxonomy.py
 create mode 100644 backend/metagenedb/apps/catalog/migrations/0002_taxonomy.py
 create mode 100644 backend/metagenedb/apps/catalog/models/taxonomy.py
 create mode 100644 backend/metagenedb/apps/catalog/serializers/__init__.py
 create mode 100644 backend/metagenedb/apps/catalog/serializers/function.py
 create mode 100644 backend/metagenedb/apps/catalog/serializers/gene.py
 create mode 100644 backend/metagenedb/apps/catalog/serializers/taxonomy.py
 create mode 100755 backend/scripts/populate_db/import_ncbi_taxonomy.py

diff --git a/backend/metagenedb/apps/catalog/admin/__init__.py b/backend/metagenedb/apps/catalog/admin/__init__.py
index d5ce815..83c364a 100644
--- a/backend/metagenedb/apps/catalog/admin/__init__.py
+++ b/backend/metagenedb/apps/catalog/admin/__init__.py
@@ -1,5 +1,6 @@
 from .gene import GeneAdmin
 from .function import FunctionAdmin, KeggOrthologyAdmin
+from .taxonomy import TaxonomyAdmin
 
 
-__all__ = ['GeneAdmin', 'FunctionAdmin', 'KeggOrthologyAdmin']
+__all__ = ['GeneAdmin', 'FunctionAdmin', 'KeggOrthologyAdmin', 'TaxonomyAdmin']
diff --git a/backend/metagenedb/apps/catalog/admin/taxonomy.py b/backend/metagenedb/apps/catalog/admin/taxonomy.py
new file mode 100644
index 0000000..cb5857e
--- /dev/null
+++ b/backend/metagenedb/apps/catalog/admin/taxonomy.py
@@ -0,0 +1,10 @@
+from django.contrib import admin
+
+from metagenedb.apps.catalog.models import Taxonomy
+
+
+@admin.register(Taxonomy)
+class TaxonomyAdmin(admin.ModelAdmin):
+
+    list_display = ('tax_id', 'name', 'rank', 'parent')
+    search_fields = ('tax_id', 'name')
diff --git a/backend/metagenedb/apps/catalog/migrations/0002_taxonomy.py b/backend/metagenedb/apps/catalog/migrations/0002_taxonomy.py
new file mode 100644
index 0000000..0f3d263
--- /dev/null
+++ b/backend/metagenedb/apps/catalog/migrations/0002_taxonomy.py
@@ -0,0 +1,27 @@
+# Generated by Django 2.2.1 on 2019-07-17 12:20
+
+from django.db import migrations, models
+import django.db.models.deletion
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('catalog', '0001_initial'),
+    ]
+
+    operations = [
+        migrations.CreateModel(
+            name='Taxonomy',
+            fields=[
+                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
+                ('tax_id', models.CharField(db_index=True, max_length=20, unique=True)),
+                ('name', models.CharField(default='No scientific name', max_length=200)),
+                ('rank', models.CharField(choices=[('infraclass', 'Infraclass'), ('class', 'Class'), ('forma', 'Forma'), ('phylum', 'Phylum'), ('species_subgroup', 'Species subgroup'), ('genus', 'Genus'), ('parvorder', 'Parvorder'), ('subcohort', 'Subcohort'), ('subtribe', 'Subtribe'), ('superphylum', 'Superphylum'), ('subgenus', 'Subgenus'), ('superorder', 'Superorder'), ('species', 'Species'), ('subphylum', 'Subphylum'), ('infraorder', 'Infraorder'), ('section', 'Section'), ('tribe', 'Tribe'), ('cohort', 'Cohort'), ('subsection', 'Subsection'), ('series', 'Series'), ('order', 'Order'), ('subclass', 'Subclass'), ('superfamily', 'Superfamily'), ('superclass', 'Superclass'), ('superkingdom', 'Superkingdom'), ('kingdom', 'Kingdom'), ('family', 'Family'), ('suborder', 'Suborder'), ('subkingdom', 'Subkingdom'), ('subspecies', 'Subspecies'), ('no_rank', 'No rank'), ('subfamily', 'Subfamily'), ('varietas', 'Varietas'), ('species_group', 'Species group')], max_length=20)),
+                ('parent', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='children', to='catalog.Taxonomy')),
+            ],
+            options={
+                'verbose_name_plural': 'Taxonomy',
+            },
+        ),
+    ]
diff --git a/backend/metagenedb/apps/catalog/models/__init__.py b/backend/metagenedb/apps/catalog/models/__init__.py
index c1a956a..fe34c79 100644
--- a/backend/metagenedb/apps/catalog/models/__init__.py
+++ b/backend/metagenedb/apps/catalog/models/__init__.py
@@ -1,5 +1,6 @@
 from .function import Function, KeggOrthology
 from .gene import Gene
+from .taxonomy import Taxonomy
 
 
-__all__ = ['Function', 'KeggOrthology', 'Gene']
+__all__ = ['Function', 'KeggOrthology', 'Gene', 'Taxonomy']
diff --git a/backend/metagenedb/apps/catalog/models/taxonomy.py b/backend/metagenedb/apps/catalog/models/taxonomy.py
new file mode 100644
index 0000000..3f6fc43
--- /dev/null
+++ b/backend/metagenedb/apps/catalog/models/taxonomy.py
@@ -0,0 +1,60 @@
+from django.db import models
+
+
+class Taxonomy(models.Model):
+    """
+    Taxonomy is based on NCBI taxonomy: https://www.ncbi.nlm.nih.gov/taxonomy
+    """
+    NAME_DEFAULT = "No scientific name"
+    RANK_CHOICES = [
+        ('infraclass', 'Infraclass'),
+        ('class', 'Class'),
+        ('forma', 'Forma'),
+        ('phylum', 'Phylum'),
+        ('species_subgroup', 'Species subgroup'),
+        ('genus', 'Genus'),
+        ('parvorder', 'Parvorder'),
+        ('subcohort', 'Subcohort'),
+        ('subtribe', 'Subtribe'),
+        ('superphylum', 'Superphylum'),
+        ('subgenus', 'Subgenus'),
+        ('superorder', 'Superorder'),
+        ('species', 'Species'),
+        ('subphylum', 'Subphylum'),
+        ('infraorder', 'Infraorder'),
+        ('section', 'Section'),
+        ('tribe', 'Tribe'),
+        ('cohort', 'Cohort'),
+        ('subsection', 'Subsection'),
+        ('series', 'Series'),
+        ('order', 'Order'),
+        ('subclass', 'Subclass'),
+        ('superfamily', 'Superfamily'),
+        ('superclass', 'Superclass'),
+        ('superkingdom', 'Superkingdom'),
+        ('kingdom', 'Kingdom'),
+        ('family', 'Family'),
+        ('suborder', 'Suborder'),
+        ('subkingdom', 'Subkingdom'),
+        ('subspecies', 'Subspecies'),
+        ('no_rank', 'No rank'),
+        ('subfamily', 'Subfamily'),
+        ('varietas', 'Varietas'),
+        ('species_group', 'Species group'),
+    ]
+
+    tax_id = models.CharField(max_length=20, unique=True, db_index=True)
+    name = models.CharField(max_length=200, default=NAME_DEFAULT)
+    rank = models.CharField(max_length=20, choices=RANK_CHOICES)
+    parent = models.ForeignKey(
+        'Taxonomy',
+        related_name='children',
+        on_delete=models.SET_NULL,
+        null=True, blank=True,
+    )
+
+    def __str__(self):
+        return f"{self.name}"
+
+    class Meta:
+        verbose_name_plural = "Taxonomy"
diff --git a/backend/metagenedb/apps/catalog/serializers/__init__.py b/backend/metagenedb/apps/catalog/serializers/__init__.py
new file mode 100644
index 0000000..295db50
--- /dev/null
+++ b/backend/metagenedb/apps/catalog/serializers/__init__.py
@@ -0,0 +1,6 @@
+from .function import FunctionSerializer
+from .gene import GeneSerializer
+from .taxonomy import TaxonomySerializer
+
+
+__all__ = ['FunctionSerializer', 'GeneSerializer', 'TaxonomySerializer']
diff --git a/backend/metagenedb/apps/catalog/serializers/function.py b/backend/metagenedb/apps/catalog/serializers/function.py
new file mode 100644
index 0000000..87b053b
--- /dev/null
+++ b/backend/metagenedb/apps/catalog/serializers/function.py
@@ -0,0 +1,8 @@
+from rest_framework import serializers
+from metagenedb.apps.catalog.models import Function
+
+
+class FunctionSerializer(serializers.ModelSerializer):
+    class Meta:
+        model = Function
+        fields = ('function_id', 'source', 'name')
diff --git a/backend/metagenedb/apps/catalog/serializers/gene.py b/backend/metagenedb/apps/catalog/serializers/gene.py
new file mode 100644
index 0000000..a522084
--- /dev/null
+++ b/backend/metagenedb/apps/catalog/serializers/gene.py
@@ -0,0 +1,11 @@
+from rest_framework import serializers
+from metagenedb.apps.catalog.models import Gene
+from .function import FunctionSerializer
+
+
+class GeneSerializer(serializers.ModelSerializer):
+    functions = FunctionSerializer(many=True, read_only=True)
+
+    class Meta:
+        model = Gene
+        fields = ('gene_id', 'gene_length', 'functions')
diff --git a/backend/metagenedb/apps/catalog/serializers/taxonomy.py b/backend/metagenedb/apps/catalog/serializers/taxonomy.py
new file mode 100644
index 0000000..8da45ff
--- /dev/null
+++ b/backend/metagenedb/apps/catalog/serializers/taxonomy.py
@@ -0,0 +1,15 @@
+from rest_framework import serializers
+from metagenedb.apps.catalog.models import Taxonomy
+
+
+class TaxonomySerializer(serializers.ModelSerializer):
+    parent_tax_id = serializers.SlugRelatedField(
+        queryset=Taxonomy.objects.all(),
+        slug_field='tax_id',
+        source='parent',
+        required=False
+    )
+
+    class Meta:
+        model = Taxonomy
+        fields = ('tax_id', 'name', 'rank', 'parent_tax_id')
diff --git a/backend/metagenedb/apps/catalog/views/insertion_model.py b/backend/metagenedb/apps/catalog/views/insertion_model.py
index 4522f36..33b63e6 100644
--- a/backend/metagenedb/apps/catalog/views/insertion_model.py
+++ b/backend/metagenedb/apps/catalog/views/insertion_model.py
@@ -9,6 +9,7 @@ class InsertionBase(ABC):
     """
     MANY_TO_MANY_FIELDS = []
     FOREIGN_KEY_FIELDS = []
+    SIMPLE_FIELDS = []  # Fields you want to be able to create with the class
 
     @property
     def model(self):
@@ -22,7 +23,10 @@ class InsertionBase(ABC):
         self.full_dict = model_dict.copy()
         self.foreign_key_dict = extract_dict(model_dict, self.FOREIGN_KEY_FIELDS)
         self.many_to_many_dict = extract_dict(model_dict, self.MANY_TO_MANY_FIELDS)
-        self.simple_dict = model_dict.copy()
+        if self.SIMPLE_FIELDS:
+            self.simple_dict = extract_dict(model_dict, self.SIMPLE_FIELDS)
+        else:
+            self.simple_dict = model_dict.copy()
         self.obj = None
 
     def upsert_to_db(self):
diff --git a/backend/scripts/populate_db/import_ncbi_taxonomy.py b/backend/scripts/populate_db/import_ncbi_taxonomy.py
new file mode 100755
index 0000000..351b87e
--- /dev/null
+++ b/backend/scripts/populate_db/import_ncbi_taxonomy.py
@@ -0,0 +1,103 @@
+#!/usr/bin/env python
+import argparse
+import logging
+import os
+import sys
+
+import django
+
+from metagenedb.utils.parsers import NCBITaxonomyLineParser
+
+# Before model import, we need to call django.setup() to load apps
+os.environ.setdefault("DJANGO_SETTINGS_MODULE", "metagenedb.settings")
+django.setup()
+
+from metagenedb.apps.catalog.models import Taxonomy  # noqa
+from metagenedb.apps.catalog.serializers import TaxonomySerializer  # noqa
+
+logging.basicConfig(level=logging.INFO)
+_LOGGER = logging.getLogger(__name__)
+
+
+def import_names(taxonomy_names_file, select_class="scientific name"):
+    """
+    Build and return a DICT {tax_id: tax_name} for the chosen select_class
+    """
+    _LOGGER.info(f"Importing {select_class} from {taxonomy_names_file}...")
+    taxo_name_dict = {}
+    with open(taxonomy_names_file, "r") as file:
+        for line in file:
+            if select_class in line:
+                name = NCBITaxonomyLineParser.name(line)
+                taxo_name_dict[name.get('tax_id')] = name.get('name_txt')
+    return taxo_name_dict
+
+
+def create_taxo_nodes(taxonomy_nodes_file, taxo_name_dict):
+    """
+    Create one Taxonomy entry per line of the nodes file, without parent link.
+
+    Parents are linked afterwards by update_taxo_nodes, once all entries exist.
+    """
+    _LOGGER.info(f"Create taxonomy objects from {taxonomy_nodes_file}...")
+    FOREIGN_KEY_FIELDS = ['parent_tax_id']
+    with open(taxonomy_nodes_file, "r") as file:
+        for line in file:
+            node = NCBITaxonomyLineParser.node(line)
+            node['name'] = taxo_name_dict.get(node.get('tax_id'), Taxonomy.NAME_DEFAULT)
+            for key in FOREIGN_KEY_FIELDS:
+                # pop() tolerates a line for which the parser returned no parent
+                node.pop(key, None)
+            serializer = TaxonomySerializer(data=node)
+            if serializer.is_valid():
+                serializer.save()
+            else:
+                _LOGGER.warning(f"Invalid data: {serializer.errors}. Insertion skipped. Data: {serializer.data}")
+
+
+def update_taxo_nodes(taxonomy_nodes_file):
+    """
+    Link every Taxonomy entry to its parent using a second pass on the nodes file.
+    """
+    _LOGGER.info(f"Linking taxonomy objects to parental nodes from {taxonomy_nodes_file}...")
+    with open(taxonomy_nodes_file, "r") as file:
+        for line in file:
+            node = NCBITaxonomyLineParser.node(line)
+            try:
+                taxo_obj = Taxonomy.objects.get(tax_id=node.get('tax_id'))
+            except Taxonomy.DoesNotExist:
+                # Entry was skipped during creation (invalid data): nothing to link
+                _LOGGER.warning(f"Taxonomy {node.get('tax_id')} not found. Linking skipped.")
+                continue
+            serializer = TaxonomySerializer(taxo_obj, data=node)
+            if serializer.is_valid():
+                serializer.save()
+            else:
+                _LOGGER.warning(f"Invalid data: {serializer.errors}. Insertion skipped. Data: {serializer.data}")
+
+
+def parse_arguments():
+    """
+    Defines parser.
+    """
+    parser = argparse.ArgumentParser(description='Populate database from a given NCBI taxonomy files.')
+    # Common arguments for analysis and annotations
+    parser.add_argument('--nodes', help='nodes.dmp file from ncbi_taxonomy', required=True)
+    parser.add_argument('--names', help='names.dmp file from ncbi_taxonomy', required=True)
+
+    try:
+        return parser.parse_args()
+    except SystemExit as exit_info:
+        # Keep exit status 0 for --help; any argument error still exits with 1
+        sys.exit(1 if exit_info.code else 0)
+
+
+def run():
+    args = parse_arguments()
+    taxonomy_names = import_names(args.names)
+    create_taxo_nodes(args.nodes, taxonomy_names)
+    update_taxo_nodes(args.nodes)
+
+
+if __name__ == "__main__":
+    run()
-- 
GitLab