From d4cb92b0cee8f907d22f6dd1de05fcecf14088bf Mon Sep 17 00:00:00 2001 From: Kenzo-Hugo Hillion <kenzo-hugo.hillion1@pasteur.fr> Date: Mon, 17 Jun 2019 17:59:18 +0200 Subject: [PATCH] add Kegg function to Gene and script to load them from API --- web/metagenedb/apps/catalog/admin.py | 17 +++- .../apps/catalog/migrations/0001_initial.py | 45 ++++++++++ .../apps/catalog/models/function.py | 2 +- web/metagenedb/apps/catalog/models/gene.py | 3 +- web/metagenedb/apps/catalog/serializers.py | 12 ++- web/scripts/import_igc_data.py | 56 ++++++++---- web/scripts/load_kegg_ko.py | 89 +++++++++++++++++++ 7 files changed, 202 insertions(+), 22 deletions(-) create mode 100644 web/metagenedb/apps/catalog/migrations/0001_initial.py mode change 100644 => 100755 web/scripts/import_igc_data.py create mode 100755 web/scripts/load_kegg_ko.py diff --git a/web/metagenedb/apps/catalog/admin.py b/web/metagenedb/apps/catalog/admin.py index 4d729e8..13e59de 100644 --- a/web/metagenedb/apps/catalog/admin.py +++ b/web/metagenedb/apps/catalog/admin.py @@ -1,18 +1,29 @@ from django.contrib import admin -from .models import Gene, Function +from .models import Gene, Function, KeggOrthology @admin.register(Gene) class GeneAdmin(admin.ModelAdmin): - list_display = ('gene_id', 'gene_length') + list_display = ('gene_id', 'gene_length', 'get_functions') search_fields = ('gene_id',) + def get_functions(self, obj): + return ",".join([str(f) for f in obj.functions.all()]) + get_functions.short_description = 'Functions' + + +@admin.register(KeggOrthology) +class KeggOrthologyAdmin(admin.ModelAdmin): + + list_display = ('function_id', 'name', 'long_name', 'ec_number', 'source') + search_fields = ('function_id',) + @admin.register(Function) class FunctionAdmin(admin.ModelAdmin): - list_display = (('function_id', 'source')) + list_display = ('function_id', 'name', 'source') search_fields = ('function_id',) diff --git a/web/metagenedb/apps/catalog/migrations/0001_initial.py 
b/web/metagenedb/apps/catalog/migrations/0001_initial.py new file mode 100644 index 0000000..2a2bc33 --- /dev/null +++ b/web/metagenedb/apps/catalog/migrations/0001_initial.py @@ -0,0 +1,45 @@ +# Generated by Django 2.2.1 on 2019-06-17 14:38 + +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + + initial = True + + dependencies = [ + ] + + operations = [ + migrations.CreateModel( + name='Function', + fields=[ + ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('function_id', models.CharField(db_index=True, max_length=100)), + ('name', models.CharField(max_length=100)), + ('source', models.CharField(choices=[('undef', 'Undefined'), ('kegg', 'KEGG'), ('eggnog', 'EggNOG')], default='undef', max_length=10)), + ], + ), + migrations.CreateModel( + name='KeggOrthology', + fields=[ + ('function_ptr', models.OneToOneField(auto_created=True, on_delete=django.db.models.deletion.CASCADE, parent_link=True, primary_key=True, serialize=False, to='catalog.Function')), + ('ec_number', models.CharField(blank=True, default='', max_length=200)), + ('long_name', models.CharField(max_length=500)), + ], + options={ + 'verbose_name_plural': 'Kegg orthologies', + }, + bases=('catalog.function',), + ), + migrations.CreateModel( + name='Gene', + fields=[ + ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('gene_id', models.CharField(db_index=True, max_length=100, unique=True)), + ('gene_length', models.IntegerField()), + ('functions', models.ManyToManyField(to='catalog.Function')), + ], + ), + ] diff --git a/web/metagenedb/apps/catalog/models/function.py b/web/metagenedb/apps/catalog/models/function.py index b1cb40e..de43a73 100644 --- a/web/metagenedb/apps/catalog/models/function.py +++ b/web/metagenedb/apps/catalog/models/function.py @@ -11,7 +11,7 @@ class Function(models.Model): (EGGNOG, 'EggNOG') ] - function_id = 
models.CharField(max_length=100, unique=True, db_index=True) + function_id = models.CharField(max_length=100, db_index=True) name = models.CharField(max_length=100) source = models.CharField(max_length=10, choices=SOURCE_CHOICES, default=UNDEFINED) diff --git a/web/metagenedb/apps/catalog/models/gene.py b/web/metagenedb/apps/catalog/models/gene.py index c5f46ea..adabb55 100644 --- a/web/metagenedb/apps/catalog/models/gene.py +++ b/web/metagenedb/apps/catalog/models/gene.py @@ -6,7 +6,8 @@ from .function import Function class Gene(models.Model): gene_id = models.CharField(max_length=100, unique=True, db_index=True) gene_length = models.IntegerField() - functions = models.ManyToManyField(Function) + functions = models.ManyToManyField(Function, blank=True) def __str__(self): return self.gene_id + diff --git a/web/metagenedb/apps/catalog/serializers.py b/web/metagenedb/apps/catalog/serializers.py index 9f80681..df3dea8 100644 --- a/web/metagenedb/apps/catalog/serializers.py +++ b/web/metagenedb/apps/catalog/serializers.py @@ -1,9 +1,17 @@ from rest_framework import serializers -from .models import Gene +from .models import Gene, Function + + +class FunctionSerializer(serializers.ModelSerializer): + class Meta: + model = Function + fields = ('function_id', 'source', 'name') class GeneSerializer(serializers.ModelSerializer): + functions = FunctionSerializer(many=True, read_only=True) class Meta: model = Gene - fields = ('gene_id', 'gene_length') + fields = ('gene_id', 'gene_length', 'functions') + diff --git a/web/scripts/import_igc_data.py b/web/scripts/import_igc_data.py old mode 100644 new mode 100755 index b20aa3b..ea87b30 --- a/web/scripts/import_igc_data.py +++ b/web/scripts/import_igc_data.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python import argparse import logging import os @@ -11,13 +12,13 @@ from django.core.exceptions import ValidationError os.environ.setdefault("DJANGO_SETTINGS_MODULE", "metagenedb.settings") django.setup() -from metagenedb.apps.catalog.models 
import Gene +from metagenedb.apps.catalog.models import Gene, Function logging.basicConfig(level=logging.INFO) _LOGGER = logging.getLogger(__name__) -def create_gene(raw_line): +def parse_gene(raw_line): """ IGC annotation columns: 0: Gene ID Unique ID @@ -37,36 +38,61 @@ def create_gene(raw_line): representative gene or a redundant gene belonging to it """ gene_info = raw_line.rstrip().split('\t') - gene = Gene(gene_id=gene_info[1], - gene_length=gene_info[2]) - return gene + return { + 'gene_id': gene_info[1], + 'gene_length': gene_info[2], + 'kegg_ko': gene_info[7] + } -def insert_gene(gene): - gene.full_clean() - gene.save() +def link_to_function(obj_gene, gene_dict): + try: + function = Function.objects.get(function_id=gene_dict.get('kegg_ko')) + obj_gene.functions.add(function) + obj_gene.full_clean() + obj_gene.save() + except Function.DoesNotExist: + _LOGGER.warning(f"{gene_dict.get('kegg_ko')} not found in the database {gene_dict}.") + + +def insert_gene(gene_dict): + MANY_TO_MANY_FIELDS = ['kegg_ko'] + + try: + obj_gene = Gene.objects.get(gene_id=gene_dict.get('gene_id')) + for key, value in gene_dict.items(): + if key not in MANY_TO_MANY_FIELDS: + setattr(obj_gene, key, value) + except Gene.DoesNotExist: + obj_gene = Gene(gene_id=gene_dict.get('gene_id'), + gene_length=gene_dict.get('gene_length')) + obj_gene.full_clean() + obj_gene.save() + # Add link to KEGG + if gene_dict.get('kegg_ko') != 'unknown': + link_to_function(obj_gene, gene_dict) def insert_gene_list(chunk_genes): for i in chunk_genes: try: - gene = create_gene(i) - insert_gene(gene) + gene_dict = parse_gene(i) + insert_gene(gene_dict) except ValidationError as e: - _LOGGER.warning(f"{e.__dict__} for gene_id: {gene.gene_id}. Insertion skipped.") + _LOGGER.warning(f"{e.__dict__} for gene_id: {gene_dict.get('gene_id')}. 
Insertion skipped.") def load_annotation_file_to_db_in_chunks(annotation_file, chunk_size=100000): - loaded_genes = 0 + processed_genes = 0 with open(annotation_file, 'r') as file: while True: chunk_genes = list(islice(file, chunk_size)) if not chunk_genes: break - loaded_genes += len(chunk_genes) + processed_genes += len(chunk_genes) insert_gene_list(chunk_genes) - _LOGGER.info(f"{loaded_genes} genes processed so far...") - _LOGGER.info(f"[DONE] {loaded_genes} genes processed.") + _LOGGER.info(f"{processed_genes} genes processed so far...") + _LOGGER.info(f"[DONE] {processed_genes} genes processed.") def parse_arguments(): diff --git a/web/scripts/load_kegg_ko.py b/web/scripts/load_kegg_ko.py new file mode 100755 index 0000000..10cd104 --- /dev/null +++ b/web/scripts/load_kegg_ko.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python +import argparse +import logging +import os +import requests +import sys + +import django +from django.core.exceptions import ValidationError + +# Before model import, we need to call django.setup() to load apps +os.environ.setdefault("DJANGO_SETTINGS_MODULE", "metagenedb.settings") +django.setup() + +from metagenedb.apps.catalog.models import KeggOrthology + +logging.basicConfig(level=logging.INFO) +_LOGGER = logging.getLogger(__name__) + +KEGG_KO_LIST_API = "http://rest.kegg.jp/list/ko" + + +def parse_arguments(): + """ + Defines parser. 
+    """
+    parser = argparse.ArgumentParser(description=f'Populate KEGG KO database from {KEGG_KO_LIST_API}.')
+    try:
+        return parser.parse_args()
+    except SystemExit:
+        sys.exit(1)
+
+
+def parse_ko(line):
+    """
+    Parse a line of the KEGG KO list ('<id>TAB<name>; <long name> [EC:<numbers>]') into a dict
+    of KeggOrthology fields. The 'ko:' ID prefix, '; <long name>' and '[EC:...]' are optional.
+    """
+    content = line.split('\t')
+    function_id = content[0].split(':')[-1]
+    # partition() keeps any further ';' in the long name and never raises if ';' is missing.
+    name, _, long_name = content[1].partition(';')
+    long_name = long_name.strip()
+    return {
+        'function_id': function_id,
+        'name': name.strip(),
+        'long_name': long_name,
+        'ec_number': long_name.partition('[EC:')[2].rstrip(']'),
+    }
+
+
+def create_kegg_ko(kegg_ko):
+    """Create the KeggOrthology described by the kegg_ko dict, or update the existing one."""
+    try:
+        obj_kegg = KeggOrthology.objects.get(function_id=kegg_ko.get('function_id'))
+        for key, value in kegg_ko.items():
+            setattr(obj_kegg, key, value)
+    except KeggOrthology.DoesNotExist:
+        obj_kegg = KeggOrthology(**kegg_ko)
+    obj_kegg.full_clean()
+    obj_kegg.save()
+
+
+def run():
+    """Download the KEGG KO list and create or update one KeggOrthology per line."""
+    parse_arguments()  # no options yet; still handles -h/--help
+    all_ko = requests.get(KEGG_KO_LIST_API, timeout=60)
+    all_ko.raise_for_status()
+    ko_lines = all_ko.text.splitlines()
+    total_kegg = len(ko_lines)
+    inserted_kegg = skipped_kegg = 0
+    for line in ko_lines:
+        kegg_ko = parse_ko(line)
+        try:
+            create_kegg_ko(kegg_ko)
+        except ValidationError as e:
+            skipped_kegg += 1
+            _LOGGER.warning(f"{e.__dict__} for function_id: {kegg_ko.get('function_id')}. Insertion skipped.")
+            continue
+        inserted_kegg += 1
+        if inserted_kegg % 100 == 0:
+            _LOGGER.info(f"{inserted_kegg}/{total_kegg} KEGG KO inserted so far...")
+    _LOGGER.info(f"[DONE] {inserted_kegg}/{total_kegg} KEGG KO inserted.")
+    _LOGGER.info(f"[DONE] {skipped_kegg}/{total_kegg} KEGG KO skipped.")
+    # TODO: create unknown entry (not implemented yet)
+
+
+if __name__ == "__main__":
+    run()
-- 
GitLab