Commit a77ffe7b authored by Kenzo-Hugo Hillion's avatar Kenzo-Hugo Hillion
Browse files

Refactor script into classes and start using API

parent 86a4744d
......@@ -15,7 +15,7 @@ class Gene(models.Model):
)
def __str__(self):
    """Return the gene identifier as the human-readable representation."""
    # Only one return can ever run; gene_id is kept because it is the field
    # the model's Meta ordering is based on.
    return self.gene_id
class Meta:
ordering = ['-gene_id']
from .togows import TogoWSEntry # noqa
from .metagenedb import MetageneDBCatalogGene # noqa
from .togows import TogoWSEntryAPI # noqa
from .metagenedb import MetageneDBCatalogGeneAPI # noqa
from .baseapi import BaseAPI
class MetageneDB(BaseAPI):
class MetageneDBAPI(BaseAPI):
BASE_URL = 'http://localhost/'
def __init__(self, base_url=BASE_URL):
......@@ -9,5 +9,5 @@ class MetageneDB(BaseAPI):
super().__init__()
class MetageneDBCatalogGene(MetageneDB):
class MetageneDBCatalogGeneAPI(MetageneDBAPI):
ROUTE = 'api/catalog/v1/genes/'
......@@ -3,11 +3,11 @@ from urllib.parse import urljoin
from .baseapi import BaseAPI
class TogoWS(BaseAPI):
class TogoWSAPI(BaseAPI):
BASE_URL = 'http://togows.org'
class TogoWSEntry(TogoWS):
class TogoWSEntryAPI(TogoWSAPI):
TYPE = 'entry'
def __init__(self, database, entry_format='json'):
......
......@@ -4,10 +4,13 @@ import logging
import os
import sys
from itertools import islice
from requests.exceptions import HTTPError
import django
from rest_framework.exceptions import ValidationError
from slugify import slugify
from metagenedb.common.utils.api import MetageneDBCatalogGeneAPI
from metagenedb.common.utils.parsers import IGCLineParser
# Before importing the models, we need to call django.setup() to load the apps
......@@ -59,37 +62,54 @@ def select_taxonomy(gene_dict, unknown_val='unknown'):
return gene_dict
def upsert_gene(gene_dict):
    """Create the gene described by ``gene_dict`` or update it if it exists.

    The gene is looked up by its ``gene_id``. The serializer validates the
    payload and raises on invalid data before anything is saved.
    """
    try:
        existing_gene = Gene.objects.get(gene_id=gene_dict.get('gene_id'))
    except Gene.DoesNotExist:
        # No gene with this id yet: build a serializer that creates one.
        serializer = GeneSerializer(data=gene_dict)
    else:
        # Gene found: bind the serializer to it so that save() updates it.
        serializer = GeneSerializer(existing_gene, data=gene_dict)
    serializer.is_valid(raise_exception=True)
    serializer.save()
def insert_gene_list(chunk_genes):
    """Parse every annotation line of a chunk and upsert the resulting gene.

    Each line goes through ``parse_gene`` and ``select_taxonomy`` before being
    handed to ``upsert_gene``. A gene that fails validation is logged and
    skipped; the remaining lines are still processed.
    """
    for raw_line in chunk_genes:
        gene_dict = parse_gene(raw_line)
        gene_ready_for_db = select_taxonomy(gene_dict)
        try:
            upsert_gene(gene_ready_for_db)
        except ValidationError as e:
            _LOGGER.warning(f"{e.__dict__} for gene_id: {gene_dict.get('gene_id')}. Insertion skipped.")
class ImportIGCGenes(object):
    """Import genes from an IGC annotation file through the gene API."""

    # API client class that __init__ instantiates against the given URL.
    METAGENEDB_GENE_API = MetageneDBCatalogGeneAPI

    def __init__(self, annotation_file, url, skip_tax=False, skip_functions=False):
        """Store the import settings and build the API client.

        Args:
            annotation_file: path of the annotation file to import.
            url: base URL handed to the API client.
            skip_tax: script option to skip the taxonomy information.
            skip_functions: script option to skip the functions information.
        """
        # Options coming from the script arguments to skip some insertions
        self.skip_functions = skip_functions
        self.skip_tax = skip_tax
        self.url = url
        self.annotation_file = annotation_file
        self.metagenedb_gene_api = self.METAGENEDB_GENE_API(base_url=url)
def load_annotation_file_to_db_in_chunks(annotation_file, chunk_size=100000):
    """Read ``annotation_file`` in chunks of ``chunk_size`` lines and insert them.

    Each chunk is handed to ``insert_gene_list``; the running number of
    processed lines is logged after every chunk and once more at the end.
    """
    processed_genes = 0
    with open(annotation_file, 'r') as annotation_handle:
        # iter() with a sentinel keeps pulling chunks until an empty one comes back.
        next_chunk = lambda: list(islice(annotation_handle, chunk_size))  # noqa: E731
        for chunk_genes in iter(next_chunk, []):
            processed_genes += len(chunk_genes)
            insert_gene_list(chunk_genes)
            _LOGGER.info(f"{processed_genes} genes processed so far...")
    _LOGGER.info(f"[DONE] {processed_genes} genes processed.")
def _clean_gene(self, gene_dict):
    """Normalise a parsed gene before it is sent to the API.

    The ``gene_id`` is slugified and, when the matching skip option is set on
    the instance, the ``taxonomy`` and/or ``functions`` entries are dropped.

    Args:
        gene_dict: gene description; must contain ``gene_id``. It is
            modified in place.

    Returns:
        The same dictionary, cleaned.
    """
    gene_dict['gene_id'] = slugify(gene_dict['gene_id'])
    # pop() is given a default so that a gene without the optional entry
    # does not raise KeyError when the skip option is enabled.
    if self.skip_tax:
        gene_dict.pop('taxonomy', None)
    if self.skip_functions:
        gene_dict.pop('functions', None)
    return gene_dict
def _upsert_gene(self, gene_dict):
    """Create the gene through the API, or update it when it already exists.

    Args:
        gene_dict: gene description; cleaned with ``_clean_gene`` first.

    Raises:
        HTTPError: when the create (POST) or update (PUT) request is rejected.
    """
    clean_gene_dict = self._clean_gene(gene_dict)
    gene_id = clean_gene_dict['gene_id']
    try:
        # Only the existence check is guarded: a failing GET means "not there yet".
        self.metagenedb_gene_api.get(gene_id)
    except HTTPError:
        self.metagenedb_gene_api.post(clean_gene_dict)
    else:
        # Kept out of the try block so that a rejected update surfaces as is
        # instead of triggering a POST for a gene that already exists.
        self.metagenedb_gene_api.put(gene_id, clean_gene_dict)
def _insert_gene_list(self, chunk_genes):
    """Parse every annotation line of a chunk and upsert the gene through the API.

    A gene that is rejected is logged and skipped so that a single bad line
    does not abort the import of the remaining ones.

    Args:
        chunk_genes: iterable of raw annotation lines.
    """
    for gene_line in chunk_genes:
        gene_dict = parse_gene(gene_line)
        gene_dict_with_taxo = select_taxonomy(gene_dict)
        try:
            self._upsert_gene(gene_dict_with_taxo)
        # _upsert_gene goes through the API client, whose failures are
        # HTTPError; ValidationError is still caught for compatibility.
        except (HTTPError, ValidationError) as e:
            _LOGGER.warning(f"{e.__dict__} for gene_id: {gene_dict.get('gene_id')}. Insertion skipped.")
def load_annotation_file_to_db_in_chunks(self, chunk_size=100000):
    """Read ``self.annotation_file`` in chunks of ``chunk_size`` lines and insert them.

    Each chunk is handed to ``_insert_gene_list``; the running number of
    processed lines is logged after every chunk and once more at the end.
    """
    processed_genes = 0
    with open(self.annotation_file, 'r') as annotation_handle:
        chunk_genes = list(islice(annotation_handle, chunk_size))
        # An empty chunk means the end of the file has been reached.
        while chunk_genes:
            processed_genes += len(chunk_genes)
            self._insert_gene_list(chunk_genes)
            _LOGGER.info(f"{processed_genes} genes processed so far...")
            chunk_genes = list(islice(annotation_handle, chunk_size))
    _LOGGER.info(f"[DONE] {processed_genes} genes processed.")
def parse_arguments():
......@@ -99,7 +119,9 @@ def parse_arguments():
parser = argparse.ArgumentParser(description='Populate database from a given IGC annotation file.')
# Common arguments for analysis and annotations
parser.add_argument('annotation', help='IGC annotation file')
parser.add_argument('--delete_all', action='store_true', help='Empty database before insertion.')
parser.add_argument('url', help='base URL of the instance.', default='http://localhost/')
parser.add_argument('--skip_taxonomy', action='store_true', help='Skip taxonomy information from genes.')
parser.add_argument('--skip_functions', action='store_true', help='Skip functions information from genes.')
try:
return parser.parse_args()
......@@ -109,9 +131,7 @@ def parse_arguments():
def run():
    """Parse the command-line options and import the annotation file."""
    args = parse_arguments()
    # Looked up defensively so that a parser without --delete_all still works.
    if getattr(args, 'delete_all', False):
        Gene.objects.all().delete()
    # The import goes through ImportIGCGenes: the URL is a constructor
    # argument of the importer, not the chunk size of the loader.
    import_igc_genes = ImportIGCGenes(
        args.annotation,
        args.url,
        skip_tax=args.skip_taxonomy,
        skip_functions=args.skip_functions,
    )
    import_igc_genes.load_annotation_file_to_db_in_chunks()
if __name__ == "__main__":
......
from requests.exceptions import HTTPError
from unittest import TestCase
import pytest
from rest_framework.exceptions import ValidationError
from django.urls import reverse
from rest_framework.test import APITestCase
from metagenedb.apps.catalog.models import Gene
from metagenedb.apps.catalog.factory.taxonomy import TaxonomyFactory
from scripts.populate_db.import_igc_data import parse_gene, upsert_gene, select_taxonomy
from metagenedb.common.utils.api import MetageneDBCatalogGeneAPI
from metagenedb.apps.catalog.factory import TaxonomyFactory
from scripts.populate_db.import_igc_data import parse_gene, select_taxonomy, ImportIGCGenes
class TestParseGene(TestCase):
......@@ -69,40 +70,75 @@ class TestParseGene(TestCase):
self.assertDictEqual(tested_dict, expected_dict)
class MetageneDBCatalogGeneAPIMock(MetageneDBCatalogGeneAPI):
    """
    Simple mock that sends the API calls through the test client. The point is to
    test the upsert behaviour, not the insertion into the db.
    """

    def __init__(self, client):
        self.client = client
        self.reverse_path = 'api:catalog:v1:genes'

    def get_all(self):
        """Return the JSON payload of the list route."""
        list_url = reverse(f'{self.reverse_path}-list')
        return self.client.get(list_url).json()

    def get(self, entry_id):
        """Return the JSON payload of one gene; raise HTTPError on a 404."""
        detail_url = reverse(f'{self.reverse_path}-detail', kwargs={'gene_id': entry_id})
        response = self.client.get(detail_url)
        if response.status_code == 404:
            raise HTTPError
        return response.json()

    def post(self, data):
        """Post ``data`` to the list route; raise HTTPError on a 400."""
        list_url = reverse(f'{self.reverse_path}-list')
        response = self.client.post(list_url, data, format='json')
        if response.status_code == 400:
            raise HTTPError
        return response.json()

    def put(self, entry_id, data):
        """Put ``data`` on the detail route of ``entry_id`` and return the JSON payload."""
        detail_url = reverse(f'{self.reverse_path}-detail', kwargs={'gene_id': entry_id})
        response = self.client.put(detail_url, data, format='json')
        return response.json()
class TestUpsertGene(APITestCase):
    """Check the upsert behaviour of ImportIGCGenes through the mocked gene API."""

    def setUp(self):
        self.import_igc_genes = ImportIGCGenes('test', 'test')
        # Swap the real client for the mock that goes through the test client.
        self.api_mock = MetageneDBCatalogGeneAPIMock(self.client)
        self.import_igc_genes.metagenedb_gene_api = self.api_mock

    def test_insert_valid_gene_no_kegg(self):
        valid_gene = {
            'gene_name': 'test_gene.01',
            'gene_id': 'test-gene01',
            'length': 3556
        }
        self.import_igc_genes._upsert_gene(valid_gene)
        self.assertEqual(self.api_mock.get_all()['count'], 1)

    def test_insert_invalid_length(self):
        invalid_gene = {
            'gene_id': 'test-gene01',
            'length': 'wrong_format'
        }
        # The gene does not exist yet, so the payload is posted and rejected.
        with self.assertRaises(HTTPError):
            self.import_igc_genes._upsert_gene(invalid_gene)

    def test_update_gene(self):
        valid_gene = {
            'gene_name': 'test_gene.01',
            'gene_id': 'test-gene01',
            'length': 3556
        }
        updated_gene = {
            'gene_name': 'test_gene.01',
            'gene_id': 'test-gene01',
            'length': 356
        }
        self.import_igc_genes._upsert_gene(valid_gene)
        self.assertEqual(self.api_mock.get('test-gene01')['length'], 3556)
        # Same gene_id, new length: the second call must update, not duplicate.
        self.import_igc_genes._upsert_gene(updated_gene)
        self.assertEqual(self.api_mock.get('test-gene01')['length'], 356)
@pytest.mark.django_db
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment