diff --git a/backend/dev_data/IGC_sample.annotation_OF.summary b/backend/dev_data/IGC_sample.annotation_OF.summary index 114a3b1df421cc28d39bdf687cda43a25d483100..f055c1088ab2532e30006f40a3d6e77ba41d0a33 100644 --- a/backend/dev_data/IGC_sample.annotation_OF.summary +++ b/backend/dev_data/IGC_sample.annotation_OF.summary @@ -999,3 +999,4 @@ 999 158499257-stool1_revised_C1458534_1_gene127873 11955 Complete USA unknown unknown unknown NOG295308 0.00315706393054459 0.00280373831775701 unknown unknown USA 1000 MH0385_GL0059251 11946 Lack both ends EUR unknown unknown unknown unknown 0.000789265982636148 0.000934579439252336 unknown unknown EUR 1000 MH0385_GL0059251 11946 Lack both ends EUR unknown unknown unknown unknown 0.000789265982636148 0.000934579439252336 unknown unknown EUR +353535 wrong_length the_length Info EUR unknown unknown unknown unknown 0.0000001 0.00000001 0.0000001 unknown unknown EUR diff --git a/backend/metagenedb/apps/catalog/migrations/0003_auto_20190717_1551.py b/backend/metagenedb/apps/catalog/migrations/0003_complete_taxonomy.py similarity index 100% rename from backend/metagenedb/apps/catalog/migrations/0003_auto_20190717_1551.py rename to backend/metagenedb/apps/catalog/migrations/0003_complete_taxonomy.py diff --git a/backend/metagenedb/apps/catalog/migrations/0004_taxonomy_superkingdom.py b/backend/metagenedb/apps/catalog/migrations/0004_taxonomy_superkingdom.py index 4e7649b39d513257f3c009d11abc4436c0c4b36a..46560dffb8425091fbabbe2cf51f1eea90e47c63 100644 --- a/backend/metagenedb/apps/catalog/migrations/0004_taxonomy_superkingdom.py +++ b/backend/metagenedb/apps/catalog/migrations/0004_taxonomy_superkingdom.py @@ -7,7 +7,7 @@ import django.db.models.deletion class Migration(migrations.Migration): dependencies = [ - ('catalog', '0003_auto_20190717_1551'), + ('catalog', '0003_complete_taxonomy'), ] operations = [ diff --git a/backend/metagenedb/apps/catalog/migrations/0005_gene_ordering.py b/backend/metagenedb/apps/catalog/migrations/0005_gene_ordering.py new file mode 100644 index 0000000000000000000000000000000000000000..993eb6a8bbf8aa869921981495f2c99fae022ef4 --- /dev/null +++ b/backend/metagenedb/apps/catalog/migrations/0005_gene_ordering.py @@ -0,0 +1,17 @@ +# Generated by Django 2.2.1 on 2019-08-01 14:16 + +from django.db import migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ('catalog', '0004_taxonomy_superkingdom'), + ] + + operations = [ + migrations.AlterModelOptions( + name='gene', + options={'ordering': ['-gene_id']}, + ), + ] diff --git a/backend/metagenedb/apps/catalog/models/gene.py b/backend/metagenedb/apps/catalog/models/gene.py index c5f46eacd316fc57e91ce12375a8a9ba92a9ad27..ca73c855748f2d6a10055cd5c24b1a9e2a19cd92 100644 --- a/backend/metagenedb/apps/catalog/models/gene.py +++ b/backend/metagenedb/apps/catalog/models/gene.py @@ -10,3 +10,6 @@ class Gene(models.Model): def __str__(self): return self.gene_id + + class Meta: + ordering = ['-gene_id'] diff --git a/backend/metagenedb/apps/catalog/views/gene.py b/backend/metagenedb/apps/catalog/views/gene.py index 4915640212746d8b38c9607c925a4f91b3230379..9effac1d28be5e9d91487ed440ce2bbfb38f466b 100644 --- a/backend/metagenedb/apps/catalog/views/gene.py +++ b/backend/metagenedb/apps/catalog/views/gene.py @@ -9,30 +9,13 @@ from rest_framework.decorators import ( from rest_framework.response import Response from django.core.paginator import Paginator, EmptyPage, PageNotAnInteger -from metagenedb.apps.catalog.models import Function, Gene +from metagenedb.apps.catalog.models import Gene from metagenedb.apps.catalog.serializers import GeneSerializer -from metagenedb.apps.catalog.views.insertion_model import InsertionBase logging.basicConfig(level=logging.INFO) _LOGGER = logging.getLogger(__name__) -class GeneInsertion(InsertionBase): - MANY_TO_MANY_FIELDS = ['kegg_ko'] - model = Gene - obj_id = "gene_id" - - def _link_kegg_ko(self, function_id): - VALUE_TO_SKIP = ['unknown'] - if function_id not in VALUE_TO_SKIP: - try: - function = Function.objects.get(function_id=function_id) - self.obj.functions.add(function) - self.full_clean_and_save() - except Function.DoesNotExist: - _LOGGER.warning(f"{function_id} not found in the database. Full dict: {self.full_dict}.") - - @api_view(['GET']) @authentication_classes(()) @permission_classes(()) diff --git a/backend/metagenedb/apps/catalog/views/insertion_model.py b/backend/metagenedb/apps/catalog/views/insertion_model.py deleted file mode 100644 index 33b63e6a6de8bde4c65ff2680fb4aa5c0cfcf73c..0000000000000000000000000000000000000000 --- a/backend/metagenedb/apps/catalog/views/insertion_model.py +++ /dev/null @@ -1,56 +0,0 @@ -from abc import ABC -from metagenedb.utils.dict_operations import extract_dict - - -class InsertionBase(ABC): - """ - Base for insertion in DB for different models. - This base will be used for POST methods but also direct insertion to DB from scripts. - """ - MANY_TO_MANY_FIELDS = [] - FOREIGN_KEY_FIELDS = [] - SIMPLE_FIELDS = [] # Fields you want to be able to create with the class - - @property - def model(self): - raise NotImplementedError - - @property - def obj_id(self): - raise NotImplementedError - - def __init__(self, model_dict): - self.full_dict = model_dict.copy() - self.foreign_key_dict = extract_dict(model_dict, self.FOREIGN_KEY_FIELDS) - self.many_to_many_dict = extract_dict(model_dict, self.MANY_TO_MANY_FIELDS) - if self.SIMPLE_FIELDS: - self.simple_dict = extract_dict(model_dict, self.SIMPLE_FIELDS) - else: - self.simple_dict = model_dict.copy() - self.obj = None - - def upsert_to_db(self): - try: - self.obj = self.model.objects.get(**{self.obj_id: self.full_dict.get(self.obj_id)}) - for key, value in self.simple_dict.items(): - setattr(self.obj, key, value) - except self.model.DoesNotExist: - self.create_obj() - self.full_clean_and_save() - self.handle_foreign_fields() - self.handle_many_to_many_fields() - - def create_obj(self): - self.obj = self.model(**self.simple_dict) - - def full_clean_and_save(self): - self.obj.full_clean() - self.obj.save() - - def handle_foreign_fields(self): - for key, value in self.foreign_key_dict.items(): - getattr(self, f"_link_{key}")(value) - - def handle_many_to_many_fields(self): - for key, value in self.many_to_many_dict.items(): - getattr(self, f"_link_{key}")(value) diff --git a/backend/scripts/populate_db/import_igc_data.py b/backend/scripts/populate_db/import_igc_data.py index 65a7250eaaf7ad0c955c185559be60cd728271d2..2faae48803959ab5e8bde19f4605d42f25c46594 100755 --- a/backend/scripts/populate_db/import_igc_data.py +++ b/backend/scripts/populate_db/import_igc_data.py @@ -6,14 +6,14 @@ import sys from itertools import islice import django -from django.core.exceptions import ValidationError +from rest_framework.exceptions import ValidationError # Before model import, we need to called django.setup() to Load apps os.environ.setdefault("DJANGO_SETTINGS_MODULE", "metagenedb.settings") django.setup() from metagenedb.apps.catalog.models import Gene, Function # noqa -from metagenedb.apps.catalog.views.gene import GeneInsertion # noqa +from metagenedb.apps.catalog.serializers import GeneSerializer # noqa logging.basicConfig(level=logging.INFO) _LOGGER = logging.getLogger(__name__) @@ -46,13 +46,21 @@ def parse_gene(raw_line): } +def upsert_gene(gene_dict): + try: + gene_obj = Gene.objects.get(gene_id=gene_dict.get('gene_id')) + serializer = GeneSerializer(gene_obj, data=gene_dict) + except Gene.DoesNotExist: + serializer = GeneSerializer(data=gene_dict) + serializer.is_valid(raise_exception=True) + serializer.save() + + def insert_gene_list(chunk_genes): for gene_line in chunk_genes: + gene_dict = parse_gene(gene_line) try: - gene_dict = parse_gene(gene_line) - # insert_gene(gene_dict) - gene_insertion = GeneInsertion(gene_dict) - gene_insertion.upsert_to_db() + upsert_gene(gene_dict) except ValidationError as e: _LOGGER.warning(f"{e.__dict__} for gene_id: {gene_dict.get('gene_id')}. Insertion skipped.") diff --git a/backend/scripts/populate_db/test_import_igc_data.py b/backend/scripts/populate_db/test_import_igc_data.py new file mode 100644 index 0000000000000000000000000000000000000000..f06e88956c949a495d18d440c11d04083b999041 --- /dev/null +++ b/backend/scripts/populate_db/test_import_igc_data.py @@ -0,0 +1,69 @@ +from unittest import TestCase + +from rest_framework.exceptions import ValidationError +from rest_framework.test import APITestCase + +from metagenedb.apps.catalog.models import Gene +from scripts.populate_db.import_igc_data import parse_gene, upsert_gene + + +class TestParseGene(TestCase): + + def test_parse_gene(self): + raw_data = [ + 'gene_id', + 'gene_name', + 'gene_length', + 'gene_completeness_status', + 'cohort_origin', + 'taxo_phylum', + 'taxo_genus', + 'kegg', + 'eggnog', + 'sample_occurence_freq', + 'ind_occurence_freq', + 'kegg_functional_cat', + 'eggnog_functional_cat', + 'cohort_assembled' + ] + raw_line = "\t".join(raw_data) + expected_dict = { + 'gene_id': 'gene_name', # We use the gene name for our gene ID + 'gene_length': 'gene_length', + 'kegg_ko': 'kegg' + } + tested_dict = parse_gene(raw_line) + self.assertDictEqual(tested_dict, expected_dict) + + +class TestUpsertGene(APITestCase): + + def test_insert_valid_gene_no_kegg(self): + valid_gene = { + 'gene_id': 'test_gene01', + 'gene_length': 3556 + } + upsert_gene(valid_gene) + self.assertEqual(Gene.objects.all().count(), 1) + + def test_insert_invalid_gene_length(self): + invalid_gene = { + 'gene_id': 'test_gene01', + 'gene_length': 'wrong_format' + } + with self.assertRaises(ValidationError) as context: # noqa + upsert_gene(invalid_gene) + + def test_update_gene(self): + valid_gene = { + 'gene_id': 'test_gene01', + 'gene_length': 3556 + } + updated_gene = { + 'gene_id': 'test_gene01', + 'gene_length': 356 + } + upsert_gene(valid_gene) + self.assertEqual(Gene.objects.get(gene_id="test_gene01").gene_length, 3556) + upsert_gene(updated_gene) + self.assertEqual(Gene.objects.get(gene_id="test_gene01").gene_length, 356)