Skip to content
Snippets Groups Projects
Commit a77ffe7b authored by Kenzo-Hugo Hillion's avatar Kenzo-Hugo Hillion :recycle:
Browse files

Refactor script into classes and start using API

parent 86a4744d
No related branches found
No related tags found
2 merge requests: !59 Prod, !12 Resolve "Use POST in scripts to create elements instead of doing it manually for Taxonomy"
...@@ -15,7 +15,7 @@ class Gene(models.Model): ...@@ -15,7 +15,7 @@ class Gene(models.Model):
) )
def __str__(self): def __str__(self):
return self.gene_slug return self.gene_id
class Meta: class Meta:
ordering = ['-gene_id'] ordering = ['-gene_id']
from .togows import TogoWSEntry # noqa from .togows import TogoWSEntryAPI # noqa
from .metagenedb import MetageneDBCatalogGene # noqa from .metagenedb import MetageneDBCatalogGeneAPI # noqa
from .baseapi import BaseAPI from .baseapi import BaseAPI
class MetageneDB(BaseAPI): class MetageneDBAPI(BaseAPI):
BASE_URL = 'http://localhost/' BASE_URL = 'http://localhost/'
def __init__(self, base_url=BASE_URL): def __init__(self, base_url=BASE_URL):
...@@ -9,5 +9,5 @@ class MetageneDB(BaseAPI): ...@@ -9,5 +9,5 @@ class MetageneDB(BaseAPI):
super().__init__() super().__init__()
class MetageneDBCatalogGene(MetageneDB): class MetageneDBCatalogGeneAPI(MetageneDBAPI):
ROUTE = 'api/catalog/v1/genes/' ROUTE = 'api/catalog/v1/genes/'
...@@ -3,11 +3,11 @@ from urllib.parse import urljoin ...@@ -3,11 +3,11 @@ from urllib.parse import urljoin
from .baseapi import BaseAPI from .baseapi import BaseAPI
class TogoWS(BaseAPI): class TogoWSAPI(BaseAPI):
BASE_URL = 'http://togows.org' BASE_URL = 'http://togows.org'
class TogoWSEntry(TogoWS): class TogoWSEntryAPI(TogoWSAPI):
TYPE = 'entry' TYPE = 'entry'
def __init__(self, database, entry_format='json'): def __init__(self, database, entry_format='json'):
......
...@@ -4,10 +4,13 @@ import logging ...@@ -4,10 +4,13 @@ import logging
import os import os
import sys import sys
from itertools import islice from itertools import islice
from requests.exceptions import HTTPError
import django import django
from rest_framework.exceptions import ValidationError from rest_framework.exceptions import ValidationError
from slugify import slugify
from metagenedb.common.utils.api import MetageneDBCatalogGeneAPI
from metagenedb.common.utils.parsers import IGCLineParser from metagenedb.common.utils.parsers import IGCLineParser
# Before model import, we need to call django.setup() to load apps # Before model import, we need to call django.setup() to load apps
...@@ -59,37 +62,54 @@ def select_taxonomy(gene_dict, unknown_val='unknown'): ...@@ -59,37 +62,54 @@ def select_taxonomy(gene_dict, unknown_val='unknown'):
return gene_dict return gene_dict
class ImportIGCGenes(object):
    """Import genes from an IGC annotation file through the MetageneDB gene API.

    The annotation file is read in chunks of lines; every line is parsed,
    cleaned and then created or updated through ``metagenedb_gene_api``.
    """

    # API client class; kept as a class attribute so it can be swapped out.
    METAGENEDB_GENE_API = MetageneDBCatalogGeneAPI

    def __init__(self, annotation_file, url, skip_tax=False, skip_functions=False):
        """Store the import options and build the gene API client.

        Args:
            annotation_file: path of the IGC annotation file to read.
            url: base URL handed to the API client as ``base_url``.
            skip_tax: when true, the ``taxonomy`` entry is dropped from each gene.
            skip_functions: when true, the ``functions`` entry is dropped from each gene.
        """
        self.annotation_file = annotation_file
        self.url = url
        self.metagenedb_gene_api = self.METAGENEDB_GENE_API(base_url=self.url)
        # Skip some insertion if specified in script options
        self.skip_tax = skip_tax
        self.skip_functions = skip_functions

    def _clean_gene(self, gene_dict):
        """Slugify the gene ID and drop the entries disabled by the skip options.

        The dictionary is modified in place and also returned.
        """
        gene_dict['gene_id'] = slugify(gene_dict['gene_id'])
        # pop() with a default: a gene without the entry must not raise KeyError.
        if self.skip_tax:
            gene_dict.pop('taxonomy', None)
        if self.skip_functions:
            gene_dict.pop('functions', None)
        return gene_dict

    def _upsert_gene(self, gene_dict):
        """Update the gene through the API if it already exists, create it otherwise.

        Raises:
            HTTPError: when the update (PUT) or the creation (POST) fails.
        """
        clean_gene_dict = self._clean_gene(gene_dict)
        gene_id = clean_gene_dict['gene_id']
        try:
            self.metagenedb_gene_api.get(gene_id)  # Try to get obj to check if it exists
        except HTTPError:
            # Lookup failed: the gene is considered missing and gets created.
            self.metagenedb_gene_api.post(clean_gene_dict)
        else:
            # Only the lookup is guarded: a failing PUT must surface as an error
            # instead of silently falling back to a POST of an existing gene.
            self.metagenedb_gene_api.put(gene_id, clean_gene_dict)

    def _insert_gene_list(self, chunk_genes):
        """Parse and upsert every line of a chunk, skipping the genes that fail."""
        for gene_line in chunk_genes:
            gene_dict = parse_gene(gene_line)
            gene_dict_with_taxo = select_taxonomy(gene_dict)
            try:
                self._upsert_gene(gene_dict_with_taxo)
            # The API client reports failures as HTTPError; ValidationError is
            # still caught so previously handled failures remain non-fatal.
            except (ValidationError, HTTPError) as e:
                _LOGGER.warning(f"{e.__dict__} for gene_id: {gene_dict.get('gene_id')}. Insertion skipped.")

    def load_annotation_file_to_db_in_chunks(self, chunk_size=100000):
        """Read the annotation file ``chunk_size`` lines at a time and upsert each chunk.

        Progress is logged after every chunk and once more when the file is exhausted.
        """
        processed_genes = 0
        with open(self.annotation_file, 'r') as file:
            while True:
                chunk_genes = list(islice(file, chunk_size))
                if not chunk_genes:
                    break
                processed_genes += len(chunk_genes)
                self._insert_gene_list(chunk_genes)
                _LOGGER.info(f"{processed_genes} genes processed so far...")
        _LOGGER.info(f"[DONE] {processed_genes} genes processed.")
def parse_arguments(): def parse_arguments():
...@@ -99,7 +119,9 @@ def parse_arguments(): ...@@ -99,7 +119,9 @@ def parse_arguments():
parser = argparse.ArgumentParser(description='Populate database from a given IGC annotation file.') parser = argparse.ArgumentParser(description='Populate database from a given IGC annotation file.')
# Common arguments for analysis and annotations # Common arguments for analysis and annotations
parser.add_argument('annotation', help='IGC annotation file') parser.add_argument('annotation', help='IGC annotation file')
parser.add_argument('--delete_all', action='store_true', help='Empty database before insertion.') parser.add_argument('url', help='base URL of the instance.', default='http://localhost/')
parser.add_argument('--skip_taxonomy', action='store_true', help='Skip taxonomy information from genes.')
parser.add_argument('--skip_functions', action='store_true', help='Skip functions information from genes.')
try: try:
return parser.parse_args() return parser.parse_args()
...@@ -109,9 +131,7 @@ def parse_arguments(): ...@@ -109,9 +131,7 @@ def parse_arguments():
def run():
    """Parse the command-line arguments and import the annotation file through the API."""
    args = parse_arguments()
    # The loader is a method of ImportIGCGenes; the skip flags defined by
    # parse_arguments() are forwarded so they actually take effect.
    import_igc_genes = ImportIGCGenes(
        args.annotation,
        args.url,
        skip_tax=args.skip_taxonomy,
        skip_functions=args.skip_functions,
    )
    import_igc_genes.load_annotation_file_to_db_in_chunks()
if __name__ == "__main__": if __name__ == "__main__":
......
from requests.exceptions import HTTPError
from unittest import TestCase from unittest import TestCase
import pytest import pytest
from rest_framework.exceptions import ValidationError from django.urls import reverse
from rest_framework.test import APITestCase from rest_framework.test import APITestCase
from metagenedb.apps.catalog.models import Gene from metagenedb.common.utils.api import MetageneDBCatalogGeneAPI
from metagenedb.apps.catalog.factory.taxonomy import TaxonomyFactory from metagenedb.apps.catalog.factory import TaxonomyFactory
from scripts.populate_db.import_igc_data import parse_gene, upsert_gene, select_taxonomy from scripts.populate_db.import_igc_data import parse_gene, select_taxonomy, ImportIGCGenes
class TestParseGene(TestCase): class TestParseGene(TestCase):
...@@ -69,40 +70,75 @@ class TestParseGene(TestCase): ...@@ -69,40 +70,75 @@ class TestParseGene(TestCase):
self.assertDictEqual(tested_dict, expected_dict) self.assertDictEqual(tested_dict, expected_dict)
class MetageneDBCatalogGeneAPIMock(MetageneDBCatalogGeneAPI):
    """
    Just a simple mock to go through the Test client. The idea is to test the upsert behaviour and not
    the insertion to the db.
    """

    def __init__(self, client):
        # The parent initializer is not called: only the test client and the
        # route name used with reverse() are stored.
        self.client = client
        self.reverse_path = 'api:catalog:v1:genes'

    def _list_url(self):
        """Return the URL of the gene list route."""
        return reverse(f'{self.reverse_path}-list')

    def _detail_url(self, entry_id):
        """Return the URL of the detail route of one gene."""
        return reverse(f'{self.reverse_path}-detail', kwargs={'gene_id': entry_id})

    def get_all(self):
        """Return the decoded JSON of the gene list route."""
        return self.client.get(self._list_url()).json()

    def get(self, entry_id):
        """Return the decoded JSON of one gene; raise HTTPError on a 404 response."""
        response = self.client.get(self._detail_url(entry_id))
        if response.status_code == 404:
            raise HTTPError
        return response.json()

    def post(self, data):
        """Create a gene and return the decoded JSON; raise HTTPError on a 400 response."""
        response = self.client.post(self._list_url(), data, format='json')
        if response.status_code == 400:
            raise HTTPError
        return response.json()

    def put(self, entry_id, data):
        """Update a gene and return the decoded JSON; raise HTTPError on a 400 or 404 response."""
        response = self.client.put(self._detail_url(entry_id), data, format='json')
        # Consistent with get() and post(): a rejected update must not be
        # returned as if it were the updated entry.
        if response.status_code in (400, 404):
            raise HTTPError
        return response.json()
class TestUpsertGene(APITestCase):
    """Check ImportIGCGenes._upsert_gene against the API served by the test client."""

    def setUp(self):
        # Route the importer's API calls through the test client.
        api_mock = MetageneDBCatalogGeneAPIMock(self.client)
        importer = ImportIGCGenes('test', 'test')
        importer.metagenedb_gene_api = api_mock
        self.import_igc_genes = importer
        self.api_mock = api_mock

    def test_insert_valid_gene_no_kegg(self):
        valid_gene = dict(gene_name='test_gene.01', gene_id='test-gene01', length=3556)
        self.import_igc_genes._upsert_gene(valid_gene)
        all_genes = self.api_mock.get_all()
        self.assertEqual(all_genes['count'], 1)

    def test_insert_invalid_length(self):
        invalid_gene = dict(gene_id='test-gene01', length='wrong_format')
        with self.assertRaises(HTTPError):
            self.import_igc_genes._upsert_gene(invalid_gene)

    def test_update_gene(self):
        valid_gene = dict(gene_name='test_gene.01', gene_id='test-gene01', length=3556)
        updated_gene = dict(gene_name='test_gene.01', gene_id='test-gene01', length=356)
        # First call creates the gene, second call updates its length.
        self.import_igc_genes._upsert_gene(valid_gene)
        stored_gene = self.api_mock.get('test-gene01')
        self.assertEqual(stored_gene['length'], 3556)
        self.import_igc_genes._upsert_gene(updated_gene)
        stored_gene = self.api_mock.get('test-gene01')
        self.assertEqual(stored_gene['length'], 356)
@pytest.mark.django_db @pytest.mark.django_db
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment