diff --git a/backend/metagenedb/apps/catalog/models/gene.py b/backend/metagenedb/apps/catalog/models/gene.py
index 8c33c15f6bf4a7fea66c097dc023954780c395d2..ea549c3eeac3829362a1c478d5ea6fe48e4c053b 100644
--- a/backend/metagenedb/apps/catalog/models/gene.py
+++ b/backend/metagenedb/apps/catalog/models/gene.py
@@ -15,7 +15,7 @@ class Gene(models.Model):
     )
 
     def __str__(self):
-        return self.gene_slug
+        return self.gene_id
 
     class Meta:
         ordering = ['-gene_id']
diff --git a/backend/metagenedb/common/utils/api/__init__.py b/backend/metagenedb/common/utils/api/__init__.py
index fef0d405eae431f9e3e7634e765cddf7c75f1ec8..f06e30ccebe7ed68ce879f63d98b2ca72cc6c12f 100644
--- a/backend/metagenedb/common/utils/api/__init__.py
+++ b/backend/metagenedb/common/utils/api/__init__.py
@@ -1,2 +1,2 @@
-from .togows import TogoWSEntry  # noqa
-from .metagenedb import MetageneDBCatalogGene  # noqa
+from .togows import TogoWSEntryAPI  # noqa
+from .metagenedb import MetageneDBCatalogGeneAPI  # noqa
diff --git a/backend/metagenedb/common/utils/api/metagenedb.py b/backend/metagenedb/common/utils/api/metagenedb.py
index 84ab221f3677bb956a14e04ba02fc344c036adcf..a641e8244ddd3dba072ec4a9fbedbfa8c35e9c15 100644
--- a/backend/metagenedb/common/utils/api/metagenedb.py
+++ b/backend/metagenedb/common/utils/api/metagenedb.py
@@ -1,7 +1,7 @@
 from .baseapi import BaseAPI
 
 
-class MetageneDB(BaseAPI):
+class MetageneDBAPI(BaseAPI):
     BASE_URL = 'http://localhost/'
 
     def __init__(self, base_url=BASE_URL):
@@ -9,5 +9,5 @@ class MetageneDB(BaseAPI):
         super().__init__()
 
 
-class MetageneDBCatalogGene(MetageneDB):
+class MetageneDBCatalogGeneAPI(MetageneDBAPI):
     ROUTE = 'api/catalog/v1/genes/'
diff --git a/backend/metagenedb/common/utils/api/togows.py b/backend/metagenedb/common/utils/api/togows.py
index 7951b78d30d5575c07cbd0bf35ad4c31e1635cee..7a6831eaf1c4321d5343a23a213c66fecd9fa0c5 100644
--- a/backend/metagenedb/common/utils/api/togows.py
+++ b/backend/metagenedb/common/utils/api/togows.py
@@ -3,11 +3,11 @@ from urllib.parse import urljoin
 from .baseapi import BaseAPI
 
 
-class TogoWS(BaseAPI):
+class TogoWSAPI(BaseAPI):
     BASE_URL = 'http://togows.org'
 
 
-class TogoWSEntry(TogoWS):
+class TogoWSEntryAPI(TogoWSAPI):
     TYPE = 'entry'
 
     def __init__(self, database, entry_format='json'):
diff --git a/backend/scripts/populate_db/import_igc_data.py b/backend/scripts/populate_db/import_igc_data.py
index 8298a55460c4a7a6aa2fdf7bad80b2f1de7ef92d..9102f5c5c3f6a0614595c14d91d7d91b45b69320 100755
--- a/backend/scripts/populate_db/import_igc_data.py
+++ b/backend/scripts/populate_db/import_igc_data.py
@@ -4,10 +4,13 @@ import logging
 import os
 import sys
 from itertools import islice
+from requests.exceptions import HTTPError
 
 import django
 from rest_framework.exceptions import ValidationError
+from slugify import slugify
 
+from metagenedb.common.utils.api import MetageneDBCatalogGeneAPI
 from metagenedb.common.utils.parsers import IGCLineParser
 
 # Before model import, we need to called django.setup() to Load apps
@@ -59,37 +62,61 @@ def select_taxonomy(gene_dict, unknown_val='unknown'):
     return gene_dict
 
 
-def upsert_gene(gene_dict):
-    try:
-        gene_obj = Gene.objects.get(gene_id=gene_dict.get('gene_id'))
-        serializer = GeneSerializer(gene_obj, data=gene_dict)
-    except Gene.DoesNotExist:
-        serializer = GeneSerializer(data=gene_dict)
-    serializer.is_valid(raise_exception=True)
-    serializer.save()
-
-
-def insert_gene_list(chunk_genes):
-    for gene_line in chunk_genes:
-        gene_dict = parse_gene(gene_line)
-        gene_dict_with_taxo = select_taxonomy(gene_dict)
-        try:
-            upsert_gene(gene_dict_with_taxo)
-        except ValidationError as e:
-            _LOGGER.warning(f"{e.__dict__} for gene_id: {gene_dict.get('gene_id')}. Insertion skipped.")
+class ImportIGCGenes(object):
+    """Load the genes of an IGC annotation file into MetageneDB through its gene API."""
+    METAGENEDB_GENE_API = MetageneDBCatalogGeneAPI
 
+    def __init__(self, annotation_file, url, skip_tax=False, skip_functions=False):
+        self.annotation_file = annotation_file
+        self.url = url
+        self.metagenedb_gene_api = self.METAGENEDB_GENE_API(base_url=self.url)
+        # Skip some insertion if specified in script options
+        self.skip_tax = skip_tax
+        self.skip_functions = skip_functions
 
-def load_annotation_file_to_db_in_chunks(annotation_file, chunk_size=100000):
-    processed_genes = 0
-    with open(annotation_file, 'r') as file:
-        while True:
-            chunk_genes = list(islice(file, chunk_size))
-            if not chunk_genes:
-                break
-            processed_genes += len(chunk_genes)
-            insert_gene_list(chunk_genes)
-            _LOGGER.info(f"{processed_genes} genes processed so far...")
-    _LOGGER.info(f"[DONE] {processed_genes} genes processed.")
+    def _clean_gene(self, gene_dict):
+        """Slugify the gene ID and drop the fields skipped through the script options."""
+        gene_dict['gene_id'] = slugify(gene_dict['gene_id'])
+        if self.skip_tax:
+            gene_dict.pop('taxonomy', None)  # None default: the key may be absent from the dict
+        if self.skip_functions:
+            gene_dict.pop('functions', None)
+        return gene_dict
+
+    def _upsert_gene(self, gene_dict):
+        """Update the gene through the API when it already exists, create it otherwise."""
+        clean_gene_dict = self._clean_gene(gene_dict)
+        gene_id = clean_gene_dict['gene_id']
+        try:
+            self.metagenedb_gene_api.get(gene_id)  # Try to get obj to check if it exists
+        except HTTPError:
+            self.metagenedb_gene_api.post(clean_gene_dict)
+        else:
+            # Kept out of the try block: a failed update must not fall back to a creation
+            self.metagenedb_gene_api.put(gene_id, clean_gene_dict)
+
+    def _insert_gene_list(self, chunk_genes):
+        """Upsert every gene line of the chunk, logging and skipping the ones that fail."""
+        for gene_line in chunk_genes:
+            gene_dict = parse_gene(gene_line)
+            gene_dict_with_taxo = select_taxonomy(gene_dict)
+            try:
+                self._upsert_gene(gene_dict_with_taxo)
+            except (HTTPError, ValidationError) as e:
+                _LOGGER.warning(f"{e.__dict__} for gene_id: {gene_dict.get('gene_id')}. Insertion skipped.")
+
+    def load_annotation_file_to_db_in_chunks(self, chunk_size=100000):
+        """Read the annotation file by chunks of ``chunk_size`` lines and upsert their genes."""
+        processed_genes = 0
+        with open(self.annotation_file, 'r') as file:
+            while True:
+                chunk_genes = list(islice(file, chunk_size))
+                if not chunk_genes:
+                    break
+                processed_genes += len(chunk_genes)
+                self._insert_gene_list(chunk_genes)
+                _LOGGER.info(f"{processed_genes} genes processed so far...")
+        _LOGGER.info(f"[DONE] {processed_genes} genes processed.")
 
 
 def parse_arguments():
@@ -99,7 +126,9 @@ def parse_arguments():
     parser = argparse.ArgumentParser(description='Populate database from a given IGC annotation file.')
     # Common arguments for analysis and annotations
     parser.add_argument('annotation', help='IGC annotation file')
-    parser.add_argument('--delete_all', action='store_true', help='Empty database before insertion.')
+    parser.add_argument('url', nargs='?', default='http://localhost/', help='base URL of the instance.')
+    parser.add_argument('--skip_taxonomy', action='store_true', help='Skip taxonomy information from genes.')
+    parser.add_argument('--skip_functions', action='store_true', help='Skip functions information from genes.')
 
     try:
         return parser.parse_args()
@@ -109,9 +138,9 @@ def parse_arguments():
 
 def run():
     args = parse_arguments()
-    if args.delete_all:
-        Gene.objects.all().delete()
-    load_annotation_file_to_db_in_chunks(args.annotation)
+    import_igc_genes = ImportIGCGenes(args.annotation, args.url,
+                                      skip_tax=args.skip_taxonomy, skip_functions=args.skip_functions)
+    import_igc_genes.load_annotation_file_to_db_in_chunks()
 
 
 if __name__ == "__main__":
diff --git a/backend/scripts/populate_db/test_import_igc_data.py b/backend/scripts/populate_db/test_import_igc_data.py
index 6f9d1158493949883abb9c56af85a1f6043bbbe7..6a105882007e3bd6f2a1dea1322ff514efd8ecdd 100644
--- a/backend/scripts/populate_db/test_import_igc_data.py
+++ b/backend/scripts/populate_db/test_import_igc_data.py
@@ -1,12 +1,13 @@
+from requests.exceptions import HTTPError
 from unittest import TestCase
 
 import pytest
-from rest_framework.exceptions import ValidationError
+from django.urls import reverse
 from rest_framework.test import APITestCase
 
-from metagenedb.apps.catalog.models import Gene
-from metagenedb.apps.catalog.factory.taxonomy import TaxonomyFactory
-from scripts.populate_db.import_igc_data import parse_gene, upsert_gene, select_taxonomy
+from metagenedb.common.utils.api import MetageneDBCatalogGeneAPI
+from metagenedb.apps.catalog.factory import TaxonomyFactory
+from scripts.populate_db.import_igc_data import parse_gene, select_taxonomy, ImportIGCGenes
 
 
 class TestParseGene(TestCase):
@@ -69,40 +70,75 @@ class TestParseGene(TestCase):
         self.assertDictEqual(tested_dict, expected_dict)
 
 
+class MetageneDBCatalogGeneAPIMock(MetageneDBCatalogGeneAPI):
+    """
+    Just a simple mock to go through the Test client. The idea is to test the upsert behaviour and not
+    the insertion to the db.
+    """
+
+    def __init__(self, client):
+        self.client = client
+        self.reverse_path = 'api:catalog:v1:genes'
+
+    def get_all(self):
+        return self.client.get(reverse(f'{self.reverse_path}-list')).json()
+
+    def get(self, entry_id):
+        response = self.client.get(reverse(f'{self.reverse_path}-detail', kwargs={'gene_id': entry_id}))
+        if response.status_code == 404:
+            raise HTTPError
+        return response.json()
+
+    def post(self, data):
+        response = self.client.post(reverse(f'{self.reverse_path}-list'), data, format='json')
+        if response.status_code == 400:
+            raise HTTPError
+        return response.json()
+
+    def put(self, entry_id, data):
+        return self.client.put(reverse(f'{self.reverse_path}-detail', kwargs={'gene_id': entry_id}),
+                               data, format='json').json()
+
+
 class TestUpsertGene(APITestCase):
 
+    def setUp(self):
+        self.import_igc_genes = ImportIGCGenes('test', 'test')
+        self.api_mock = MetageneDBCatalogGeneAPIMock(self.client)
+        self.import_igc_genes.metagenedb_gene_api = self.api_mock
+
     def test_insert_valid_gene_no_kegg(self):
         valid_gene = {
             'gene_name': 'test_gene.01',
-            'gene_id': 'test_gene01',
+            'gene_id': 'test-gene01',
             'length': 3556
         }
-        upsert_gene(valid_gene)
-        self.assertEqual(Gene.objects.all().count(), 1)
+        self.import_igc_genes._upsert_gene(valid_gene)
+        self.assertEqual(self.api_mock.get_all()['count'], 1)
 
     def test_insert_invalid_length(self):
         invalid_gene = {
-            'gene_id': 'test_gene01',
+            'gene_id': 'test-gene01',
             'length': 'wrong_format'
         }
-        with self.assertRaises(ValidationError) as context:  # noqa
-            upsert_gene(invalid_gene)
+        with self.assertRaises(HTTPError) as context:  # noqa
+            self.import_igc_genes._upsert_gene(invalid_gene)
 
     def test_update_gene(self):
         valid_gene = {
             'gene_name': 'test_gene.01',
-            'gene_id': 'test_gene01',
+            'gene_id': 'test-gene01',
             'length': 3556
         }
         updated_gene = {
             'gene_name': 'test_gene.01',
-            'gene_id': 'test_gene01',
+            'gene_id': 'test-gene01',
             'length': 356
         }
-        upsert_gene(valid_gene)
-        self.assertEqual(Gene.objects.get(gene_id="test_gene01").length, 3556)
-        upsert_gene(updated_gene)
-        self.assertEqual(Gene.objects.get(gene_id="test_gene01").length, 356)
+        self.import_igc_genes._upsert_gene(valid_gene)
+        self.assertEqual(self.api_mock.get('test-gene01')['length'], 3556)
+        self.import_igc_genes._upsert_gene(updated_gene)
+        self.assertEqual(self.api_mock.get('test-gene01')['length'], 356)
 
 
 @pytest.mark.django_db