Commit f00c7a44 authored by Kenzo-Hugo Hillion's avatar Kenzo-Hugo Hillion
Browse files

Add Taxonomy model and script to import from local files

parent 08993b8c
Pipeline #13218 failed with stage
in 1 minute and 15 seconds
# Re-export the admin classes so they can be imported from the package root.
from .gene import GeneAdmin
from .function import FunctionAdmin, KeggOrthologyAdmin
from .taxonomy import TaxonomyAdmin

__all__ = ['GeneAdmin', 'FunctionAdmin', 'KeggOrthologyAdmin', 'TaxonomyAdmin']
from django.contrib import admin
from metagenedb.apps.catalog.models import Taxonomy
@admin.register(Taxonomy)
class TaxonomyAdmin(admin.ModelAdmin):
    """Admin configuration for ``Taxonomy`` entries."""

    # Columns shown in the admin change list.
    list_display = ('tax_id', 'name', 'rank', 'parent')
    # Fields queried by the admin search box.
    search_fields = ('tax_id', 'name')
    # Edit the self-referencing ``parent`` foreign key through a raw ID input
    # instead of the default <select>, which has to load every Taxonomy row
    # to build its options.
    raw_id_fields = ('parent',)
# Generated by Django 2.2.1 on 2019-07-17 12:20
from django.db import migrations, models
import django.db.models.deletion
class Migration(migrations.Migration):
    """Create the ``Taxonomy`` model in the ``catalog`` app."""
    # Applied on top of the initial migration of the catalog app.
    dependencies = [
        ('catalog', '0001_initial'),
    ]
    operations = [
        migrations.CreateModel(
            name='Taxonomy',
            fields=[
                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                # Unique and indexed identifier, stored as text.
                ('tax_id', models.CharField(db_index=True, max_length=20, unique=True)),
                ('name', models.CharField(default='No scientific name', max_length=200)),
                # Rank is restricted to the (value, label) pairs listed below.
                ('rank', models.CharField(choices=[('infraclass', 'Infraclass'), ('class', 'Class'), ('forma', 'Forma'), ('phylum', 'Phylum'), ('species_subgroup', 'Species subgroup'), ('genus', 'Genus'), ('parvorder', 'Parvorder'), ('subcohort', 'Subcohort'), ('subtribe', 'Subtribe'), ('superphylum', 'Superphylum'), ('subgenus', 'Subgenus'), ('superorder', 'Superorder'), ('species', 'Species'), ('subphylum', 'Subphylum'), ('infraorder', 'Infraorder'), ('section', 'Section'), ('tribe', 'Tribe'), ('cohort', 'Cohort'), ('subsection', 'Subsection'), ('series', 'Series'), ('order', 'Order'), ('subclass', 'Subclass'), ('superfamily', 'Superfamily'), ('superclass', 'Superclass'), ('superkingdom', 'Superkingdom'), ('kingdom', 'Kingdom'), ('family', 'Family'), ('suborder', 'Suborder'), ('subkingdom', 'Subkingdom'), ('subspecies', 'Subspecies'), ('no_rank', 'No rank'), ('subfamily', 'Subfamily'), ('varietas', 'Varietas'), ('species_group', 'Species group')], max_length=20)),
                # Optional self-referencing link; it is set to NULL when the
                # referenced parent row is deleted.
                ('parent', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='children', to='catalog.Taxonomy')),
            ],
            options={
                'verbose_name_plural': 'Taxonomy',
            },
        ),
    ]
# Re-export the catalog models so they can be imported from the package root.
from .function import Function, KeggOrthology
from .gene import Gene
from .taxonomy import Taxonomy

__all__ = ['Function', 'KeggOrthology', 'Gene', 'Taxonomy']
from django.db import models
class Taxonomy(models.Model):
    """
    Entry of the NCBI taxonomy (https://www.ncbi.nlm.nih.gov/taxonomy).

    Each entry may point to another entry through ``parent``; the entries
    pointing to it are available through the ``children`` reverse relation.
    """

    # Value stored in ``name`` when no name is given.
    NAME_DEFAULT = "No scientific name"

    # (stored value, human-readable label) pairs accepted by ``rank``.
    RANK_CHOICES = [
        ('infraclass', 'Infraclass'),
        ('class', 'Class'),
        ('forma', 'Forma'),
        ('phylum', 'Phylum'),
        ('species_subgroup', 'Species subgroup'),
        ('genus', 'Genus'),
        ('parvorder', 'Parvorder'),
        ('subcohort', 'Subcohort'),
        ('subtribe', 'Subtribe'),
        ('superphylum', 'Superphylum'),
        ('subgenus', 'Subgenus'),
        ('superorder', 'Superorder'),
        ('species', 'Species'),
        ('subphylum', 'Subphylum'),
        ('infraorder', 'Infraorder'),
        ('section', 'Section'),
        ('tribe', 'Tribe'),
        ('cohort', 'Cohort'),
        ('subsection', 'Subsection'),
        ('series', 'Series'),
        ('order', 'Order'),
        ('subclass', 'Subclass'),
        ('superfamily', 'Superfamily'),
        ('superclass', 'Superclass'),
        ('superkingdom', 'Superkingdom'),
        ('kingdom', 'Kingdom'),
        ('family', 'Family'),
        ('suborder', 'Suborder'),
        ('subkingdom', 'Subkingdom'),
        ('subspecies', 'Subspecies'),
        ('no_rank', 'No rank'),
        ('subfamily', 'Subfamily'),
        ('varietas', 'Varietas'),
        ('species_group', 'Species group'),
    ]

    tax_id = models.CharField(unique=True, db_index=True, max_length=20)
    name = models.CharField(default=NAME_DEFAULT, max_length=200)
    rank = models.CharField(choices=RANK_CHOICES, max_length=20)
    # Deleting a parent keeps its children and resets their ``parent`` to NULL.
    parent = models.ForeignKey(
        'Taxonomy',
        on_delete=models.SET_NULL,
        related_name='children',
        blank=True,
        null=True,
    )

    class Meta:
        verbose_name_plural = "Taxonomy"

    def __str__(self):
        """Return the name of the entry as its text representation."""
        return str(self.name)
# Keep ``.function`` first: the gene serializer module imports
# FunctionSerializer from the ``metagenedb.apps.catalog.serializers`` package,
# so that name must already be bound when ``.gene`` is loaded.
# NOTE(review): assumes this file is that package's __init__ - confirm.
from .function import FunctionSerializer
from .gene import GeneSerializer
from .taxonomy import TaxonomySerializer
__all__ = ['FunctionSerializer', 'GeneSerializer', 'TaxonomySerializer']
\ No newline at end of file
from rest_framework import serializers
from metagenedb.apps.catalog.models import Function
class FunctionSerializer(serializers.ModelSerializer):
    """Serialize the identifier, source and name of a ``Function``."""

    class Meta:
        model = Function
        fields = (
            'function_id',
            'source',
            'name',
        )
\ No newline at end of file
from rest_framework import serializers
from metagenedb.apps.catalog.models import Gene
from metagenedb.apps.catalog.serializers import FunctionSerializer
class GeneSerializer(serializers.ModelSerializer):
    """Serialize a ``Gene`` with its functions nested as read-only objects."""

    # Nested representation of the related functions; never written through
    # this serializer.
    functions = FunctionSerializer(read_only=True, many=True)

    class Meta:
        model = Gene
        fields = (
            'gene_id',
            'gene_length',
            'functions',
        )
\ No newline at end of file
from rest_framework import serializers
from metagenedb.apps.catalog.models import Taxonomy
class TaxonomySerializer(serializers.ModelSerializer):
    """
    Serialize ``Taxonomy`` entries, exposing the parent through its ``tax_id``.

    ``parent_tax_id`` reads and writes the ``parent`` relation using the
    ``tax_id`` of the parent entry rather than its primary key.
    """

    parent_tax_id = serializers.SlugRelatedField(
        queryset=Taxonomy.objects.all(),
        slug_field='tax_id',
        source='parent',
        required=False,
        # The model relation is nullable; without this an explicit null
        # (which is what this serializer outputs for an entry without parent)
        # would be rejected on input.
        allow_null=True,
    )

    class Meta:
        model = Taxonomy
        fields = ('tax_id', 'name', 'rank', 'parent_tax_id')
...@@ -9,6 +9,7 @@ class InsertionBase(ABC): ...@@ -9,6 +9,7 @@ class InsertionBase(ABC):
""" """
MANY_TO_MANY_FIELDS = [] MANY_TO_MANY_FIELDS = []
FOREIGN_KEY_FIELDS = [] FOREIGN_KEY_FIELDS = []
SIMPLE_FIELDS = [] # Fields you want to be able to create with the class
@property @property
def model(self): def model(self):
...@@ -22,7 +23,10 @@ class InsertionBase(ABC): ...@@ -22,7 +23,10 @@ class InsertionBase(ABC):
self.full_dict = model_dict.copy() self.full_dict = model_dict.copy()
self.foreign_key_dict = extract_dict(model_dict, self.FOREIGN_KEY_FIELDS) self.foreign_key_dict = extract_dict(model_dict, self.FOREIGN_KEY_FIELDS)
self.many_to_many_dict = extract_dict(model_dict, self.MANY_TO_MANY_FIELDS) self.many_to_many_dict = extract_dict(model_dict, self.MANY_TO_MANY_FIELDS)
self.simple_dict = model_dict.copy() if self.SIMPLE_FIELDS:
self.simple_dict = extract_dict(model_dict, self.SIMPLE_FIELDS)
else:
self.simple_dict = model_dict.copy()
self.obj = None self.obj = None
def upsert_to_db(self): def upsert_to_db(self):
......
#!/usr/bin/env python
import argparse
import logging
import os
import sys
import django
from metagenedb.utils.parsers import NCBITaxonomyLineParser
# django.setup() must be called before importing the models so that the apps
# are loaded; the settings module is only set when not already defined in the
# environment.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "metagenedb.settings")
django.setup()
from metagenedb.apps.catalog.models import Taxonomy # noqa
from metagenedb.apps.catalog.serializers import TaxonomySerializer # noqa
# Module-level logger; INFO level so that progress messages are displayed.
logging.basicConfig(level=logging.INFO)
_LOGGER = logging.getLogger(__name__)
def import_names(taxonomy_names_file, select_class="scientific name"):
    """
    Build and return a dict {tax_id: name} for the chosen select_class.

    A line of the file is kept when ``select_class`` occurs anywhere in it.
    When several kept lines share a tax_id, the last one wins.
    """
    _LOGGER.info(f"Importing {select_class} from {taxonomy_names_file}...")
    with open(taxonomy_names_file, "r") as handle:
        # Lazily parse only the lines mentioning the requested class.
        parsed_entries = (
            NCBITaxonomyLineParser.name(raw_line)
            for raw_line in handle
            if select_class in raw_line
        )
        return {
            entry.get('tax_id'): entry.get('name_txt')
            for entry in parsed_entries
        }
def create_taxo_nodes(taxonomy_nodes_file, taxo_name_dict):
    """
    Create one Taxonomy entry per line of the nodes file, without parent link.

    Each parsed node gets its name from ``taxo_name_dict`` (looked up by
    tax_id), falling back to the model placeholder ``Taxonomy.NAME_DEFAULT``
    so that unnamed entries carry the same value whatever created them.
    Nodes rejected by the serializer are logged and skipped.

    :param taxonomy_nodes_file: path of the nodes file to read
    :param taxo_name_dict: dict {tax_id: name}, e.g. built by import_names
    """
    _LOGGER.info(f"Create taxonomy objects from {taxonomy_nodes_file}...")
    # Relations are not set during creation; they are handled in a separate
    # pass (see update_taxo_nodes).
    FOREIGN_KEY_FIELDS = ['parent_tax_id']
    with open(taxonomy_nodes_file, "r") as file:
        for line in file:
            node = NCBITaxonomyLineParser.node(line)
            node['name'] = taxo_name_dict.get(node.get('tax_id'), Taxonomy.NAME_DEFAULT)
            for key in FOREIGN_KEY_FIELDS:
                # pop() tolerates nodes that were parsed without the key.
                node.pop(key, None)
            serializer = TaxonomySerializer(data=node)
            if serializer.is_valid():
                serializer.save()
            else:
                _LOGGER.warning(f"Invalid data: {serializer.errors}. Insertion skipped. Data: {serializer.data}")
def update_taxo_nodes(taxonomy_nodes_file):
    """
    Update existing Taxonomy entries with the full content of the nodes file.

    Every line is parsed again and validated against the entry having the
    same tax_id, this time including the parent reference. Lines whose entry
    does not exist (for instance because its creation was skipped) and lines
    rejected by the serializer are logged and skipped instead of aborting the
    whole run.

    :param taxonomy_nodes_file: path of the nodes file to read
    """
    _LOGGER.info(f"Linking taxonomy objects to parental nodes from {taxonomy_nodes_file}...")
    with open(taxonomy_nodes_file, "r") as file:
        for line in file:
            node = NCBITaxonomyLineParser.node(line)
            tax_id = node.get('tax_id')
            try:
                taxo_obj = Taxonomy.objects.get(tax_id=tax_id)
            except Taxonomy.DoesNotExist:
                # The entry was never created; nothing to link.
                _LOGGER.warning(f"No taxonomy entry found for tax_id {tax_id}. Update skipped.")
                continue
            serializer = TaxonomySerializer(taxo_obj, data=node)
            if serializer.is_valid():
                serializer.save()
            else:
                _LOGGER.warning(f"Invalid data: {serializer.errors}. Insertion skipped. Data: {serializer.data}")
def parse_arguments(args=None):
    """
    Define the command line parser and parse the arguments.

    :param args: optional list of argument strings; when None the arguments
        are read from ``sys.argv`` (argparse default)
    :return: namespace holding the ``nodes`` and ``names`` file paths
    :raises SystemExit: with status 1 on invalid arguments, and with status 0
        when argparse exits successfully (e.g. after ``--help``)
    """
    parser = argparse.ArgumentParser(description='Populate database from a given NCBI taxonomy files.')
    # Common arguments for analysis and annotations
    parser.add_argument('--nodes', help='nodes.dmp file from ncbi_taxonomy', required=True)
    parser.add_argument('--names', help='names.dmp file from ncbi_taxonomy', required=True)
    try:
        return parser.parse_args(args)
    except SystemExit as exit_request:
        # Usage errors keep reporting status 1, but a successful exit
        # (such as --help) must not be turned into a failure.
        sys.exit(1 if exit_request.code else 0)
def run():
    """Parse the command line, then create and link the taxonomy entries."""
    cli_args = parse_arguments()
    names_by_tax_id = import_names(cli_args.names)
    # First pass creates the entries, second pass links them to their parents.
    create_taxo_nodes(cli_args.nodes, names_by_tax_id)
    update_taxo_nodes(cli_args.nodes)


if __name__ == "__main__":
    run()
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment