Commit da010ccf authored by Kenzo-Hugo Hillion
Browse files

Merge branch '63-service-for-external-API-request' into 'dev'

Resolve "Create backend service to perform request to external APIs"

Closes #63

See merge request !18
parents 41f9f815 c6c3e257
Pipeline #18555 passed with stages
in 2 minutes and 31 seconds
......@@ -28,6 +28,8 @@ jupyter = "*"
factory-boy = "*"
pytest-factoryboy = "*"
pylint = "*"
mock = "*"
snakeviz = "*"
[packages]
certifi = "*"
......@@ -55,6 +57,7 @@ drf-yasg = "*"
packaging = "*"
python-slugify = "*"
master = {git = "https://github.com/khillion/bioapi.git"}
marshmallow = "*"
[requires]
python_version = "3.7"
This diff is collapsed.
from marshmallow import Schema
class EmptyQueryParams(Schema):
    """
    Query-parameter schema that declares no fields.
    """
from marshmallow import Schema, fields
class FunctionQueryParams(Schema):
    """
    Query-parameter schema with a single boolean `detailed` field.
    """

    # Parsed as a boolean by marshmallow.
    detailed = fields.Boolean()
from marshmallow import Schema, fields
class TaxonomyQueryParams(Schema):
    """
    Query-parameter schema with two string fields: `rank` and `name`.
    """

    # Both values are parsed as plain strings.
    rank = fields.String()
    name = fields.String()
from marshmallow.exceptions import ValidationError
from rest_framework import status
from rest_framework.viewsets import ModelViewSet
from rest_framework.response import Response
from rest_framework.viewsets import ModelViewSet
from metagenedb.api.catalog.qparams_validators.empty import EmptyQueryParams
class BulkViewSet(ModelViewSet):
query_params_parser = EmptyQueryParams
def get_objects(self, instance_ids):
return self.queryset.in_bulk(instance_ids, field_name=self.lookup_field)
def _get_qparams(self, raw_query_params):
return self.query_params_parser().load(raw_query_params)
def _created_payload(self, serializer, request):
if isinstance(request.data, list):
return {
......@@ -79,3 +86,27 @@ class BulkViewSet(ModelViewSet):
self._updated_payload(create_serializer, update_serializer, request),
status=status.HTTP_201_CREATED, headers=headers
)
def list(self, request, *args, **kwargs):
    """
    Return the filtered (and paginated, when pagination applies) list of objects.

    The query params are validated first through `_get_qparams`; when validation fails,
    the normalized validation messages are returned with a 422 status.
    """
    try:
        # Only the validation matters here, the parsed values are not used.
        self._get_qparams(request.query_params)
    except ValidationError as error:
        return Response(error.normalized_messages(), status=status.HTTP_422_UNPROCESSABLE_ENTITY)
    objects = self.filter_queryset(self.get_queryset())
    page = self.paginate_queryset(objects)
    if page is None:
        # No page returned by the paginator: serialize the whole queryset.
        return Response(self.get_serializer(objects, many=True).data)
    page_serializer = self.get_serializer(page, many=True)
    return self.get_paginated_response(page_serializer.data)
def retrieve(self, request, *args, **kwargs):
    """
    Return a single serialized object.

    The query params are validated first through `_get_qparams`; when validation fails,
    the normalized validation messages are returned with a 422 status.
    """
    try:
        # Only the validation matters here, the parsed values are not used.
        self._get_qparams(request.query_params)
    except ValidationError as error:
        return Response(error.normalized_messages(), status=status.HTTP_422_UNPROCESSABLE_ENTITY)
    item_serializer = self.get_serializer(self.get_object())
    return Response(item_serializer.data)
import logging
from marshmallow.exceptions import ValidationError
from rest_framework.response import Response
from rest_framework.status import HTTP_422_UNPROCESSABLE_ENTITY
from metagenedb.api.catalog.filters import FunctionFilter
from metagenedb.api.catalog.qparams_validators.function import FunctionQueryParams
from metagenedb.apps.catalog.models import Function
from metagenedb.apps.catalog.serializers import FunctionSerializer
from metagenedb.common.utils.external_api.togows import GetFunctionExternalInfo
from .bulk_viewset import BulkViewSet
logger = logging.getLogger(__name__)
class FunctionViewSet(BulkViewSet):
    """
    Endpoints for `Function` entries, looked up by `function_id`.

    `retrieve` accepts a `detailed` query param: when true, the payload coming from the DB
    is replaced by the details obtained through `GetFunctionExternalInfo`.
    """
    queryset = Function.objects.all()
    serializer_class = FunctionSerializer
    lookup_field = 'function_id'
    filterset_class = FunctionFilter
    query_params_parser = FunctionQueryParams

    def _get_external_info(self, db_data):
        """
        Get the detailed information of a function from its external source.

        :param db_data: serialized function, read through its 'function_id' and 'source' keys
        :return: the details from the external source, or `db_data` unchanged when
            retrieval is not implemented for this source
        """
        detailed_info_retriever = GetFunctionExternalInfo(db_data['function_id'],
                                                          db_data['source'])
        try:
            detailed_data = detailed_info_retriever.get_details()
        except NotImplementedError as not_implemented_error:
            # Lazy logging arguments: the message is only formatted if the record is emitted.
            logger.warning("Could not find API for the source, returning item from the DB. Error: %s",
                           not_implemented_error)
            detailed_data = db_data
        return detailed_data

    def retrieve(self, request, *args, **kwargs):
        """
        Return one function, with external details when `detailed` is true.

        Invalid query params yield a 422 response containing the normalized
        validation messages.
        """
        try:
            query_params = self._get_qparams(request.query_params)
        except ValidationError as validation_error:
            return Response(validation_error.normalized_messages(), status=HTTP_422_UNPROCESSABLE_ENTITY)
        instance = self.get_object()
        serializer = self.get_serializer(instance)
        returned_data = serializer.data
        # Strict check: only a parsed boolean True triggers the external lookup.
        if query_params.get('detailed', False) is True:
            returned_data = self._get_external_info(returned_data)
        return Response(returned_data)
from marshmallow.exceptions import ValidationError
from rest_framework.response import Response
from rest_framework.status import HTTP_422_UNPROCESSABLE_ENTITY
from metagenedb.api.catalog.filters import TaxonomyFilter
from metagenedb.api.catalog.qparams_validators.taxonomy import TaxonomyQueryParams
from metagenedb.apps.catalog.models import Taxonomy
from metagenedb.apps.catalog.serializers import TaxonomySerializer
......@@ -13,8 +16,13 @@ class TaxonomyViewSet(BulkViewSet):
serializer_class = TaxonomySerializer
lookup_field = 'tax_id'
filterset_class = TaxonomyFilter
query_params_parser = TaxonomyQueryParams
def retrieve(self, request, *args, **kwargs):
try:
query_params = self._get_qparams(request.query_params) # noqa
except ValidationError as validation_error:
return Response(validation_error.normalized_messages(), status=HTTP_422_UNPROCESSABLE_ENTITY)
instance = self.get_object()
hierarchy = instance.parental_hierarchy # noqa
serializer = self.get_serializer(instance)
......
from requests.exceptions import HTTPError
from rest_framework.test import APITestCase
from metagenedb.apps.catalog.factory import FunctionFactory
......@@ -136,3 +137,21 @@ class TestOperationsBulkViewSet(APITestCase):
self.assertEqual(self.function_api.get_all()['count'], 4)
for element in data:
self.assertDictEqual(self.function_api.get(element['function_id']), element)
def test_get_item(self):
    """
    Retrieve one item, then check that unknown query params are rejected.
    """
    created = FunctionFactory.create(name="Test")
    payload = self.function_api.get(created.function_id)
    self.assertEqual(payload['name'], 'Test')
    # Wrong query params: a 422 is expected, which the API mock turns into an HTTPError
    with self.assertRaises(HTTPError):
        self.function_api.get(created.function_id, params={'qparam': 'fake'})
def test_get_items(self):
    """
    List all items, then check that unknown query params are rejected.
    """
    FunctionFactory.create_batch(5)
    listing = self.function_api.get_all()
    self.assertEqual(listing['count'], 5)
    # Wrong query params: a 422 is expected, which the API mock turns into an HTTPError
    with self.assertRaises(HTTPError):
        self.function_api.get_all(params={'qparam': 'fake'})
from rest_framework.test import APITestCase
import mock
from metagenedb.apps.catalog.factory import FunctionFactory
from metagenedb.common.utils.mocks.metagenedb import MetageneDBCatalogFunctionAPIMock
class TestFunctionViewSet(APITestCase):
    """
    Tests of function retrieval, with and without the `detailed` query param.
    """

    def setUp(self):
        self.function_api = MetageneDBCatalogFunctionAPIMock(self.client)
        self.kegg_function = FunctionFactory.create(source='kegg')
        self.eggnog_function = FunctionFactory.create(source='eggnog')

    @staticmethod
    def _db_payload(function):
        """
        Build the payload expected when the function is returned as stored in the DB.
        """
        return {
            'function_id': function.function_id,
            'name': function.name,
            'source': function.source
        }

    def test_retrieve(self):
        for function in (self.kegg_function, self.eggnog_function):
            expected_function = self._db_payload(function)
            self.assertDictEqual(self.function_api.get(function.function_id), expected_function)

    def test_retrieve_detailed_available(self):
        """
        The payload built by the (mocked) external info retriever is returned as is.
        """
        detailed_kegg = dict(
            function_id=self.kegg_function.function_id,
            name=self.kegg_function.name,
            details='some details',
        )
        with mock.patch('metagenedb.api.catalog.views.function.GetFunctionExternalInfo') as retriever_class:
            retriever_class.return_value.get_details.return_value = detailed_kegg
            tested_dict = self.function_api.get(self.kegg_function.function_id, params={'detailed': 'true'})
            self.assertDictEqual(tested_dict, detailed_kegg)

    def test_retrieve_detailed_unavailable(self):
        """
        eggnog is not available so it is a good example and should return the DB value.
        """
        expected_function = self._db_payload(self.eggnog_function)
        tested_dict = self.function_api.get(self.eggnog_function.function_id, params={'detailed': 'true'})
        self.assertDictEqual(tested_dict, expected_function)
from unittest import TestCase
import mock
from django.conf import settings
from metagenedb.common.utils.external_api.togows import GetFunctionExternalInfo
class TestGetFunctionExternalInfo(TestCase):
    """
    Tests of `GetFunctionExternalInfo.get_details` for an unknown source and for kegg.
    """

    def test_get_details_unknown_source(self):
        external_info_retriever = GetFunctionExternalInfo("test_id", "unknown")
        with self.assertRaises(NotImplementedError):
            external_info_retriever.get_details()

    def test_get_details_kegg(self):
        test_url = "http://test.com/"
        test_id = "test_kegg_id"
        expected_dict = {
            'info': 'some_info',
            settings.API_KEY_ADDITIONAL_INFO: {
                'comment': f"Information retrieved from external source: {test_url}",
                'url': f"{test_url}{test_id}"
            }
        }
        # The TogoWS client is mocked so that no request leaves the test
        with mock.patch('metagenedb.common.utils.external_api.togows.TogoWSEntryAPI') as togows_api_class:
            togows_api = togows_api_class.return_value
            togows_api.get.return_value = [{"info": "some_info"}]
            togows_api.url = test_url
            external_info_retriever = GetFunctionExternalInfo(test_id, "kegg")
            self.assertDictEqual(external_info_retriever.get_details(), expected_dict)
import logging
from django.conf import settings
from bioapi.togows import TogoWSEntryAPI
logger = logging.getLogger(__name__)
class GetFunctionExternalInfo:
    """
    Retrieve detailed information about a function from an external source.

    The retrieval method is resolved from the source name: a source `<source>` is handled
    by the `_get_<source>` method when it exists, otherwise `NotImplementedError` is raised.
    """

    def __init__(self, function_id, source):
        """
        :param function_id: identifier of the function, passed to the external API
        :param source: name of the source used to select the retrieval method (e.g. 'kegg')
        """
        self.function_id = function_id
        self.source = source

    def _get_unknown_source(self):
        """
        Fallback used when no retrieval method exists for the source.

        :raises NotImplementedError: always
        """
        raise NotImplementedError("No source of information for %s from %s" % (self.function_id, self.source))

    def _get_kegg(self):
        """
        Get detailed information from KEGG orthology through Togows.
        """
        # Logged here and not in get_details, so the message is only emitted for KEGG lookups
        logger.info("Retrieving information from KEGG through togows")
        kegg_api = TogoWSEntryAPI("kegg-orthology")
        # Only the first element of the response is kept
        response = kegg_api.get(self.function_id)[0]
        # Tell the consumer where the information comes from
        response[settings.API_KEY_ADDITIONAL_INFO] = {
            'comment': f"Information retrieved from external source: {kegg_api.url}",
            'url': f"{kegg_api.url}{self.function_id}"
        }
        return response

    def get_details(self):
        """
        Return the detailed information from the external source of the function.

        :return: the value returned by the retrieval method of the source
        :raises NotImplementedError: when no retrieval method exists for the source
        """
        retrieve_details = getattr(self, f"_get_{self.source}", self._get_unknown_source)
        return retrieve_details()
......@@ -2,7 +2,6 @@ from requests.exceptions import HTTPError
from bioapi import MetageneDBCatalogGeneAPI
from django.urls import reverse
from django.utils.http import urlencode
class MetageneDBAPIMock(MetageneDBCatalogGeneAPI):
......@@ -13,27 +12,27 @@ class MetageneDBAPIMock(MetageneDBCatalogGeneAPI):
KEY_ID = ''
BASE_REVERSE = 'api'
REVERSE_PATH = ''
BAD_REQUESTS = range(400, 452)
def __init__(self, client):
self.client = client
self.reverse_path = ':'.join([self.BASE_REVERSE, self.REVERSE_PATH])
def get_all(self, params=None):
url = reverse(f'{self.reverse_path}-list')
if params is not None:
query_params = urlencode(params)
return self.client.get(f"{url}?{query_params}").json()
return self.client.get(f"{url}").json()
def get(self, entry_id):
response = self.client.get(reverse(f'{self.reverse_path}-detail', kwargs={self.KEY_ID: entry_id}))
if response.status_code == 404:
response = self.client.get(reverse(f'{self.reverse_path}-list'), params)
if response.status_code in self.BAD_REQUESTS:
raise HTTPError
return response.json()
def get(self, entry_id, params=None):
response = self.client.get(reverse(f'{self.reverse_path}-detail', kwargs={self.KEY_ID: entry_id}), params)
if response.status_code in self.BAD_REQUESTS:
raise HTTPError
return response.json()
def post(self, data):
response = self.client.post(reverse(f'{self.reverse_path}-list'), data, format='json')
if response.status_code == 400:
if response.status_code in self.BAD_REQUESTS:
raise HTTPError
return response.json()
......
......@@ -37,7 +37,7 @@ class IGCLineParser(object):
'cohort_origin': gene_info[4],
'taxo_phylum': gene_info[5],
'taxo_genus': gene_info[6],
'kegg_ko': gene_info[7],
'kegg_ko': gene_info[7].split(';'),
'eggnog': gene_info[8],
'sample_occurence_frequency': gene_info[9],
'individual_occurence_frequency': gene_info[10],
......
......@@ -31,7 +31,7 @@ class TestIGCLineParser(TestCase):
'cohort_origin': raw_data[4],
'taxo_phylum': raw_data[5],
'taxo_genus': raw_data[6],
'kegg_ko': raw_data[7],
'kegg_ko': [raw_data[7]],
'eggnog': raw_data[8],
'sample_occurence_frequency': raw_data[9],
'individual_occurence_frequency': raw_data[10],
......@@ -46,3 +46,40 @@ class TestIGCLineParser(TestCase):
raw_line = "This is a wrong line format, with; information and tab"
with self.assertRaises(Exception) as context: # noqa
IGCLineParser.gene(raw_line)
def test_multiple_functions(self):
    """
    A `;`-separated kegg column is parsed into a list with one item per function.
    """
    raw_data = [
        'gene_id',
        'gene_name',
        'length',
        'gene_completeness_status',
        'cohort_origin',
        'taxo_phylum',
        'taxo_genus',
        'kegg;kegg2',
        'eggnog',
        'sample_occurence_freq',
        'ind_occurence_freq',
        'kegg_functional_cat',
        'eggnog_functional_cat',
        'cohort_assembled'
    ]
    # Keys of the parsed gene, in the same order as the columns of the raw line
    parsed_keys = [
        'igc_id',
        'gene_id',
        'length',
        'gene_completeness_status',
        'cohort_origin',
        'taxo_phylum',
        'taxo_genus',
        'kegg_ko',
        'eggnog',
        'sample_occurence_frequency',
        'individual_occurence_frequency',
        'kegg_functional_categories',
        'eggnog_functional_categories',
        'cohort_assembled'
    ]
    expected_dict = dict(zip(parsed_keys, raw_data))
    # Only column whose parsed value differs from the raw one
    expected_dict['kegg_ko'] = ['kegg', 'kegg2']
    test_dict = IGCLineParser.gene("\t".join(raw_data))
    self.assertDictEqual(test_dict, expected_dict)
......@@ -146,3 +146,6 @@ STATIC_ROOT = public_root('static')
STATIC_URL = env.str('STATIC_URL', default='/static/')
SECRET_KEY = env.str('SECRET_KEY', default='')
# -- key for API when additional information is added to the payload
API_KEY_ADDITIONAL_INFO = env.str('API_KEY_ADDITIONAL_INFO', default='metagenedb_additional_info')
......@@ -11,7 +11,7 @@ from slugify import slugify
from metagenedb.common.utils.parsers import IGCLineParser
logging.basicConfig()
logging.basicConfig(format='[%(asctime)s] %(levelname)s:%(name)s:%(message)s')
logger = logging.getLogger()
......@@ -55,6 +55,8 @@ class ImportIGCGenes(object):
"""
phylum = gene_dict.pop(self.PHYLUM_COL)
genus = gene_dict.pop(self.GENUS_COL)
if self.skip_tax:
return gene_dict
resp_dict = {}
if genus != unknown_val:
resp_dict = self.metagenedb_taxonomy_api.get_all(params={'name': genus, 'rank': 'genus'})
......@@ -65,9 +67,7 @@ class ImportIGCGenes(object):
if len(resp_dict['results']) > 1:
logger.warning(f"More than 1 result found for phylum {phylum}. First result is kept.")
if resp_dict.get('count', 0) > 0:
gene_dict.update(
{'taxonomy': resp_dict['results'][0]['tax_id']}
)
gene_dict.update({'taxonomy': resp_dict['results'][0]['tax_id']})
return gene_dict
def _parse_gene(self, raw_line, selected_keys=SELECTED_KEYS):
......@@ -82,28 +82,30 @@ class ImportIGCGenes(object):
def _clean_gene(self, gene_dict):
gene_dict['gene_name'] = gene_dict['gene_id']
gene_dict['gene_id'] = slugify(gene_dict['gene_id'])
gene_dict['functions'] = [gene_dict.pop('kegg_ko')]
if self.skip_tax:
gene_dict.pop('taxonomy')
gene_dict['functions'] = gene_dict.pop('kegg_ko')
gene_dict = self._select_taxonomy(gene_dict)
if self.skip_functions or 'unknown' in gene_dict['functions']:
gene_dict.pop('functions')
return gene_dict
def load_annotation_file_to_db_in_chunks(self, chunk_size=1000):
def load_annotation_file_to_db_in_chunks(self, chunk_size=1000, test=False):
with open(self.annotation_file, 'r') as file:
while True:
chunk_genes = list(islice(file, chunk_size))
if not chunk_genes:
break
genes = [self._clean_gene(self._select_taxonomy(self._parse_gene(i))) for i in chunk_genes]
genes = [self._clean_gene(self._parse_gene(i)) for i in chunk_genes]
try:
response = self.metagenedb_gene_api.put(genes)
self.created_genes += response.get('created').get('count')
self.updated_genes += response.get('updated').get('count')
except HTTPError as http_error:
logging.warning("%s: %s; %s", http_error, http_error.response.json(), genes)
self.skipped_genes += len(genes)
self.processed_genes += len(chunk_genes)
logger.info("%s Genes processed so far...", self.processed_genes)
if test is True:
break
logger.info("[DONE] %s/%s Genes created.", self.created_genes, self.total_genes)
logger.info("[DONE] %s/%s Genes updated.", self.updated_genes, self.total_genes)
logger.info("[DONE] %s/%s Genes skipped.", self.skipped_genes, self.total_genes)
......@@ -117,8 +119,11 @@ def parse_arguments():
# Common arguments for analysis and annotations
parser.add_argument('annotation', help='IGC annotation file')
parser.add_argument('--url', help='base URL of the instance.', default='http://localhost/')
parser.add_argument('--chunk_size', type=int, default=1000,
help='How many genes to handle and create in the same time.')
parser.add_argument('--skip_taxonomy', action='store_true', help='Skip taxonomy information from genes.')
parser.add_argument('--skip_functions', action='store_true', help='Skip functions information from genes.')
parser.add_argument('--test', action='store_true', help='Run only on first chunk.')
parser.add_argument('-v', '--verbose', action='store_true')
try:
......@@ -133,7 +138,7 @@ def run():
logger.setLevel(logging.INFO)
import_igc_genes = ImportIGCGenes(args.annotation, args.url,
skip_tax=args.skip_taxonomy, skip_functions=args.skip_functions)
import_igc_genes.load_annotation_file_to_db_in_chunks()
import_igc_genes.load_annotation_file_to_db_in_chunks(chunk_size=args.chunk_size, test=args.test)
if __name__ == "__main__":
......
......@@ -5,7 +5,6 @@ import sys
from itertools import islice
from bioapi import MetageneDBCatalogTaxonomyAPI
from requests.exceptions import HTTPError
from metagenedb.common.utils.parsers import NCBITaxonomyLineParser
......@@ -94,29 +93,6 @@ class ImportNCBITaxonomy(object):
logger.info("[DONE] %s/%s Taxonomy updated.", self.updated_tax, self.total_tax)
logger.info("[DONE] %s/%s Taxonomy skipped.", self.skipped_tax, self.total_tax)
def build_all_hierarchy(self, chunk_size=1000):
    """
    The hierarchy is automatically built when retrieving a taxonomy entry, so we get all of them.

    :param chunk_size: number of lines of the nodes file read and parsed at once
    """
    logger.info("Building hierarchy for all entries in %s...", self.tax_nodes_file)
    with open(self.tax_nodes_file, "r") as nodes_file:
        # iter() with a sentinel stops on the first empty chunk, i.e. at the end of the file
        for raw_lines in iter(lambda: list(islice(nodes_file, chunk_size)), []):
            nodes = [NCBITaxonomyLineParser.node(raw_line) for raw_line in raw_lines]
            for node in nodes:
                try:
                    # The returned entry is not used, only the request matters
                    self.metagenedb_tax_api.get(node.get('tax_id'))
                except HTTPError as http_error:
                    logger.warning(http_error)
                    self.skipped_tax += 1
                else:
                    self.updated_tax += 1
            self.processed_tax += len(nodes)
            logger.info("%s/%s Taxonomy processed so far...", self.processed_tax, self.total_tax)
    logger.info("[DONE] %s/%s Hierarchy built.", self.updated_tax, self.total_tax)
    logger.info("[DONE] %s/%s Taxonomy skipped.", self.skipped_tax, self.total_tax)
def parse_arguments():
"""
......@@ -127,7 +103,6 @@ def parse_arguments():
parser.add_argument('--nodes', help='nodes.dmp file from ncbi_taxonomy', required=True)
parser.add_argument('--names', help='names.dmp file from ncbi_taxonomy', required=True)
parser.add_argument('--skip_creation', action='store_true', help='Skip taxonomy creation.')
parser.add_argument('--skip_hierarchy', action='store_true', help='Skip taxonomy hierarchy built.')
parser.add_argument('--url', help='base URL of the instance.', default='http://localhost/')
parser.add_argument('-v', '--verbose', action='store_true')
......@@ -146,8 +121,6 @@ def run():
if not args.skip_creation:
import_ncbi_tax.create_taxo_nodes(taxonomy_names)
import_ncbi_tax.update_taxo_nodes()
if not args.skip_hierarchy:
import_ncbi_tax.build_all_hierarchy()
if __name__ == "__main__":
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment