Commit 09458ad3 authored by Kenzo-Hugo Hillion
Browse files

Add script to load EggNog into the db

parent 7f863a0d
...@@ -20,7 +20,7 @@ class FunctionAdmin(admin.ModelAdmin): ...@@ -20,7 +20,7 @@ class FunctionAdmin(admin.ModelAdmin):
@admin.register(EggNog) @admin.register(EggNog)
class EggNogAdmin(admin.ModelAdmin): class EggNogAdmin(admin.ModelAdmin):
list_display = ('function_id', 'name', 'long_name') list_display = ('function_id', 'name', 'functional_category')
search_fields = ('function_id', 'name') search_fields = ('function_id', 'name')
......
import logging
from django.core.management.base import BaseCommand
from django.db import IntegrityError
from metagenedb.apps.catalog.models import EggNog, EggNogFunctionalCategory
from metagenedb.common.utils.chunks import file_len
from metagenedb.common.utils.parsers import EggNogAnnotationLineParser
logging.basicConfig(format='[%(asctime)s] %(levelname)s:%(name)s:%(message)s')
logger = logging.getLogger(__name__)
class ImportEggNog(object):
    """Load EggNog entries from an annotation file into the database.

    Every line of the file is parsed with ``EggNogAnnotationLineParser``,
    linked to an ``EggNogFunctionalCategory`` and saved as an ``EggNog``
    entry. When creating an entry raises ``IntegrityError`` the existing
    entry with the same ``function_id`` is updated instead; entries that can
    be neither created nor updated are counted and listed as skipped.
    """

    def __init__(self, file_path):
        """Remember the annotation file path and reset all counters.

        Args:
            file_path: path to the EggNog annotation file to import.
        """
        self.annotation_file = file_path
        self.eggnog_parser = EggNogAnnotationLineParser()
        # Both attributes are (re)computed by load_all(); they are declared
        # here so that every instance attribute exists right after __init__.
        self.functional_cat = {}
        self.total_eggnog_nb = 0
        self.processed_count = 0
        self.created_count = 0
        self.updated_count = 0
        self.skipped_count = 0
        self.skipped_ids = []

    def _build_functional_category_dict(self):
        """Cache all functional categories, keyed by their ``category_id``."""
        all_categories = EggNogFunctionalCategory.objects.all()
        self.functional_cat = {cat.category_id: cat for cat in all_categories}

    def link_functional_category(self, eggnog_dict):
        """Replace the category key of ``eggnog_dict`` by the cached category.

        The dict is modified in place. A missing ``'functional_category'``
        key falls back to ``'S'``; a key that is not in the cache results in
        ``None``.
        """
        cat_key = eggnog_dict.get('functional_category', 'S')
        category = self.functional_cat.get(cat_key)
        eggnog_dict.update({'functional_category': category})

    def _create_or_update(self, payload):
        """Create the entry described by ``payload`` or update the existing one.

        Updates ``created_count``, ``updated_count``, ``skipped_count`` and
        ``skipped_ids`` according to the outcome.
        """
        function_id = payload.get('function_id')
        try:
            EggNog(**payload).save()
        except IntegrityError:
            # Creation failed: fall through and try to update instead.
            pass
        else:
            self.created_count += 1
            return
        try:
            eggnog = EggNog.objects.get(function_id=function_id)
            for k, v in payload.items():
                setattr(eggnog, k, v)
            eggnog.save()
        except (IntegrityError, EggNog.DoesNotExist):
            # DoesNotExist: the IntegrityError above was not caused by an
            # entry with this function_id, so there is nothing to update.
            # Record the entry as skipped instead of aborting the import.
            self.skipped_ids.append(function_id)
            self.skipped_count += 1
        else:
            self.updated_count += 1

    def load_all(self, test=False):
        """Import every line of the annotation file.

        Args:
            test: when true, stop once the first 1000 lines are processed.
        """
        self._build_functional_category_dict()
        self.total_eggnog_nb = file_len(self.annotation_file)
        with open(self.annotation_file, "r") as file:
            for line in file:
                eggnog_dict = self.eggnog_parser.get_dict(line)
                self.link_functional_category(eggnog_dict)
                # Drop empty values so they are not passed to the model.
                payload = {k: v for k, v in eggnog_dict.items() if v != ""}
                self._create_or_update(payload)
                self.processed_count += 1
                if self.processed_count % 1000 == 0:
                    logger.info("%s/%s EggNog processed so far...", self.processed_count, self.total_eggnog_nb)
                    if test:
                        break
        logger.info("[DONE] %s/%s EggNog created.", self.created_count, self.total_eggnog_nb)
        logger.info("[DONE] %s/%s EggNog updated.", self.updated_count, self.total_eggnog_nb)
        logger.info("[DONE] %s/%s EggNog skipped. List: %s", self.skipped_count, self.total_eggnog_nb,
                    self.skipped_ids)
class Command(BaseCommand):
    """Management command that runs ``ImportEggNog`` on an annotation file."""

    help = 'Create or update all Eggnog entries from annotations.tsv file.'

    def add_arguments(self, parser):
        """Declare the positional annotation file and the ``--test`` flag."""
        parser.add_argument('annotation', help='annotations.tsv file from EggNog')
        parser.add_argument('--test', action='store_true', help='Run only on first 1000 entries.')

    def set_logger_level(self, verbosity):
        """Set the module logger to DEBUG above 2, INFO above 1, else leave it."""
        if verbosity <= 1:
            return
        logger.setLevel(logging.DEBUG if verbosity > 2 else logging.INFO)

    def handle(self, *args, **options):
        """Configure logging, then import the file given on the command line."""
        verbosity = int(options['verbosity'])
        self.set_logger_level(verbosity)
        importer = ImportEggNog(options['annotation'])
        importer.load_all(test=options['test'])
# Generated by Django 3.0 on 2019-12-09 17:02
from django.db import migrations
class Migration(migrations.Migration):
    """Remove the ``long_name`` field from the ``eggnog`` model."""

    dependencies = [
        ('catalog', '0013_plural_eggnog'),
    ]

    operations = [
        migrations.RemoveField(
            model_name='eggnog',
            name='long_name',
        ),
    ]
# Generated by Django 3.0 on 2019-12-09 17:06
from django.db import migrations, models
class Migration(migrations.Migration):
    """Alter ``function.name`` to a ``CharField`` with ``max_length=200``."""

    dependencies = [
        ('catalog', '0014_remove_eggnog_long_name'),
    ]

    operations = [
        migrations.AlterField(
            model_name='function',
            name='name',
            field=models.CharField(max_length=200),
        ),
    ]
...@@ -12,7 +12,7 @@ class Function(models.Model): ...@@ -12,7 +12,7 @@ class Function(models.Model):
] ]
function_id = models.CharField(max_length=100, db_index=True, unique=True) function_id = models.CharField(max_length=100, db_index=True, unique=True)
name = models.CharField(max_length=100) name = models.CharField(max_length=200)
source = models.CharField(max_length=10, choices=SOURCE_CHOICES, default=UNDEFINED) source = models.CharField(max_length=10, choices=SOURCE_CHOICES, default=UNDEFINED)
def __str__(self): def __str__(self):
...@@ -38,7 +38,6 @@ class KeggOrthology(Function): ...@@ -38,7 +38,6 @@ class KeggOrthology(Function):
class EggNog(Function): class EggNog(Function):
SOURCE = 'eggnog' SOURCE = 'eggnog'
long_name = models.CharField(max_length=500)
functional_category = models.ForeignKey( functional_category = models.ForeignKey(
'EggNogFunctionalCategory', related_name='eggnogs', 'EggNogFunctionalCategory', related_name='eggnogs',
on_delete=models.SET_NULL, on_delete=models.SET_NULL,
...@@ -64,5 +63,8 @@ class EggNogFunctionalCategory(models.Model): ...@@ -64,5 +63,8 @@ class EggNogFunctionalCategory(models.Model):
name = models.CharField(max_length=100) name = models.CharField(max_length=100)
group = models.CharField(max_length=100, choices=GROUP_CHOICES) group = models.CharField(max_length=100, choices=GROUP_CHOICES)
def __str__(self):
return f"{self.category_id} ({self.name})"
class Meta: class Meta:
verbose_name_plural = "EggNog Functional categories" verbose_name_plural = "EggNog Functional categories"
...@@ -2,3 +2,10 @@ def generate_chunks(full_list, chunk_size): ...@@ -2,3 +2,10 @@ def generate_chunks(full_list, chunk_size):
"""Yield successive n-sized chunks from full_list.""" """Yield successive n-sized chunks from full_list."""
for i in range(0, len(full_list), chunk_size): for i in range(0, len(full_list), chunk_size):
yield full_list[i:i + chunk_size] yield full_list[i:i + chunk_size]
def file_len(file_path):
    """Return the number of lines in the text file at ``file_path``.

    A final line without a trailing newline still counts as a line, and an
    empty file yields 0.
    """
    with open(file_path) as f:
        # Counting with sum() avoids relying on a loop variable that would
        # stay unbound when the file has no lines at all.
        return sum(1 for _ in f)
...@@ -6,7 +6,7 @@ _LOGGER = logging.getLogger(__name__) ...@@ -6,7 +6,7 @@ _LOGGER = logging.getLogger(__name__)
class EggNogAnnotationLineParser(object): class EggNogAnnotationLineParser(object):
@staticmethod @staticmethod
def ko_list(line): def get_dict(line):
""" """
Parse line from Eggnog annotations.tsv file to return organized dict Parse line from Eggnog annotations.tsv file to return organized dict
""" """
......
...@@ -12,10 +12,10 @@ class TestEggNogAnnotationLineParser(TestCase): ...@@ -12,10 +12,10 @@ class TestEggNogAnnotationLineParser(TestCase):
'name': "translational termination", 'name': "translational termination",
'functional_category': "K" 'functional_category': "K"
} }
test_dict = EggNogAnnotationLineParser.ko_list(ko_line) test_dict = EggNogAnnotationLineParser.get_dict(ko_line)
self.assertDictEqual(test_dict, expected_dict) self.assertDictEqual(test_dict, expected_dict)
def test_ko_list_wrong_format(self): def test_ko_list_wrong_format(self):
ko_line = "This is a wrong line format, with; information and tab" ko_line = "This is a wrong line format, with; information and tab"
with self.assertRaises(Exception) as context: # noqa with self.assertRaises(Exception) as context: # noqa
EggNogAnnotationLineParser.ko_list(ko_line) EggNogAnnotationLineParser.get_dict(ko_line)
from unittest import TestCase from unittest import TestCase
from metagenedb.common.utils.chunks import generate_chunks from metagenedb.common.utils.chunks import generate_chunks, file_len
class TestChunks(TestCase): class TestChunks(TestCase):
...@@ -24,3 +24,10 @@ class TestChunks(TestCase): ...@@ -24,3 +24,10 @@ class TestChunks(TestCase):
chunks = list(generate_chunks(self.full_list, chunk_size)) chunks = list(generate_chunks(self.full_list, chunk_size))
self.assertEqual(len(chunks), 1) self.assertEqual(len(chunks), 1)
self.assertEqual(len(chunks[-1]), 10) self.assertEqual(len(chunks[-1]), 10)
class TestFileLength(TestCase):
    """Tests for ``file_len``."""

    def test_file_length(self):
        """The sample annotation summary file is counted as 1002 lines."""
        expected_line_count = 1002
        sample_path = "./dev_data/IGC_sample.annotation_OF.summary"
        self.assertEqual(file_len(sample_path), expected_line_count)
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment