Commit 05fddb8e authored by Kenzo-Hugo Hillion
Browse files

Use API in IGC import script

parent 1f715857
Pipeline #13987 passed with stages
in 1 minute and 40 seconds
......@@ -6,4 +6,4 @@ class TaxonomyFilter(filters.FilterSet):
class Meta:
model = Taxonomy
fields = ['rank']
fields = ['rank', 'name']
from django_filters import rest_framework as filters
from rest_framework.viewsets import ModelViewSet
from metagenedb.api.catalog.filters import TaxonomyFilter
......
# Generated by Django 2.2.4 on 2019-08-27 11:05
from django.db import migrations
class Migration(migrations.Migration):
    """Alter the model options of ``taxonomy``: ordering and plural verbose name."""

    dependencies = [('catalog', '0008_gene_id_slug')]

    operations = [
        migrations.AlterModelOptions(
            name='taxonomy',
            options={
                # A leading '-' asks Django for descending order on tax_id.
                'ordering': ['-tax_id'],
                'verbose_name_plural': 'Taxonomy',
            },
        ),
    ]
......@@ -25,6 +25,13 @@ class GeneSerializer(serializers.ModelSerializer):
model = Gene
fields = ('gene_id', 'gene_name', 'length', 'functions', 'taxonomy')
def _extract_many_to_many(self, validated_data, info):
many_to_many = {}
for field_name, relation_info in info.relations.items():
if relation_info.to_many and (field_name in validated_data):
many_to_many[field_name] = validated_data.pop(field_name)
return many_to_many
def _handle_functions(self, functions, instance):
for function in functions:
try:
......@@ -36,17 +43,10 @@ class GeneSerializer(serializers.ModelSerializer):
_LOGGER.warning(f"{function.get('function_id')} not found for {instance.gene_id}. Function ignored")
def create(self, validated_data):
ModelClass = self.Meta.model
# Remove many-to-many relationships from validated_data.
# They are not valid arguments to the default `.create()` method,
# as they require that the instance has already been saved.
info = model_meta.get_field_info(ModelClass)
many_to_many = {}
for field_name, relation_info in info.relations.items():
if relation_info.to_many and (field_name in validated_data):
many_to_many[field_name] = validated_data.pop(field_name)
# Remove many-to-many relationships from validated_data.
many_to_many = self._extract_many_to_many(validated_data, info)
try:
instance = ModelClass._default_manager.create(**validated_data)
......@@ -70,9 +70,29 @@ class GeneSerializer(serializers.ModelSerializer):
)
raise TypeError(msg)
# Save many-to-many relationships after the instance is created.
print(many_to_many)
# Link existing many-to-many relationships after the instance is created.
if many_to_many:
for field_name, value in many_to_many.items():
getattr(self, f'_handle_{field_name}', None)(value, instance)
return instance
def update(self, instance, validated_data):
    """
    Update `instance` from `validated_data`, save it and return it.

    To-many relations are first removed from `validated_data` (which is
    modified in place). The remaining values are set as plain attributes.
    Each to-many relation is then passed to the serializer method named
    `_handle_<field_name>`; when no such method exists the value is
    assigned through the related manager's `set()` instead of failing
    with a "'NoneType' object is not callable" TypeError.
    """
    ModelClass = self.Meta.model
    info = model_meta.get_field_info(ModelClass)

    # Remove many-to-many relationships from validated_data.
    many_to_many = self._extract_many_to_many(validated_data, info)

    # Only non to-many values are left here, so they can all be set directly.
    for attr, value in validated_data.items():
        setattr(instance, attr, value)

    # Link existing many-to-many relationships.
    for field_name, value in many_to_many.items():
        handler = getattr(self, f'_handle_{field_name}', None)
        if handler is not None:
            handler(value, instance)
        else:
            # No dedicated handler: assign through the related manager.
            getattr(instance, field_name).set(value)

    instance.save()
    return instance
from .togows import TogoWSEntryAPI # noqa
from .metagenedb import MetageneDBCatalogGeneAPI # noqa
from .metagenedb import (MetageneDBCatalogGeneAPI, MetageneDBCatalogTaxonomyAPI, # noqa
MetageneDBCatalogFunctionAPI)
......@@ -33,8 +33,8 @@ class BaseAPI(object):
self.session = self.SESSION()
self.session.headers.update(self.HEADERS)
def get_all(self):
response = self.session.get(self.url)
def get_all(self, params=None):
    """
    GET `self.url` through the session and return the decoded JSON body.

    `params` is forwarded unchanged as the query parameters of the request.
    `raise_for_status()` is called on the response before it is decoded.
    """
    reply = self.session.get(self.url, params=params)
    reply.raise_for_status()
    return reply.json()
......
......@@ -11,3 +11,11 @@ class MetageneDBAPI(BaseAPI):
class MetageneDBCatalogGeneAPI(MetageneDBAPI):
    """MetageneDB API client bound to the catalog genes route."""

    ROUTE = 'api/catalog/v1/genes/'
class MetageneDBCatalogTaxonomyAPI(MetageneDBAPI):
    """MetageneDB API client bound to the catalog taxonomy route."""

    ROUTE = 'api/catalog/v1/taxonomy/'
class MetageneDBCatalogFunctionAPI(MetageneDBAPI):
    """MetageneDB API client bound to the catalog functions route."""

    ROUTE = 'api/catalog/v1/functions/'
from requests.exceptions import HTTPError
from django.urls import reverse
from django.utils.http import urlencode
from metagenedb.common.utils.api import MetageneDBCatalogGeneAPI
class MetageneDBAPIMock(MetageneDBCatalogGeneAPI):
    """
    Simple mock of the MetageneDB API that goes through the test client.

    The goal is to test the upsert behaviour, not the insertion into the
    database. URLs are resolved with `reverse` from the
    `<BASE_REVERSE>:<REVERSE_PATH>` name; subclasses provide `REVERSE_PATH`
    and `KEY_ID`, the URL keyword used for detail routes.
    """
    KEY_ID = ''
    BASE_REVERSE = 'api'
    REVERSE_PATH = ''

    def __init__(self, client):
        self.client = client
        self.reverse_path = f'{self.BASE_REVERSE}:{self.REVERSE_PATH}'

    def _list_url(self):
        """Return the URL of the list route."""
        return reverse(f'{self.reverse_path}-list')

    def _detail_url(self, entry_id):
        """Return the URL of the detail route for `entry_id`."""
        return reverse(f'{self.reverse_path}-detail', kwargs={self.KEY_ID: entry_id})

    def get_all(self, params=None):
        """GET the list route, with `params` url-encoded as the query string when given."""
        url = self._list_url()
        if params is None:
            return self.client.get(f"{url}").json()
        query_string = urlencode(params)
        return self.client.get(f"{url}?{query_string}").json()

    def get(self, entry_id):
        """GET one entry; a 404 status is turned into an HTTPError."""
        response = self.client.get(self._detail_url(entry_id))
        if response.status_code != 404:
            return response.json()
        raise HTTPError

    def post(self, data):
        """POST `data` as JSON to the list route; a 400 status is turned into an HTTPError."""
        response = self.client.post(self._list_url(), data, format='json')
        if response.status_code != 400:
            return response.json()
        raise HTTPError

    def put(self, entry_id, data):
        """PUT `data` as JSON to the detail route and return the decoded response."""
        response = self.client.put(self._detail_url(entry_id), data, format='json')
        return response.json()
class MetageneDBCatalogGeneAPIMock(MetageneDBAPIMock):
    """Mock bound to the catalog genes routes; entries are addressed by `gene_id`."""

    REVERSE_PATH = 'catalog:v1:genes'
    KEY_ID = 'gene_id'
class MetageneDBCatalogTaxonomyAPIMock(MetageneDBAPIMock):
    """Mock bound to the catalog taxonomy routes."""

    REVERSE_PATH = 'catalog:v1:taxonomy'
    # NOTE(review): same KEY_ID as the gene mock -- confirm that the taxonomy
    # detail route really uses 'gene_id' as its URL keyword.
    KEY_ID = 'gene_id'
#!/usr/bin/env python
import argparse
import logging
import os
import sys
from itertools import islice
from requests.exceptions import HTTPError
import django
from slugify import slugify
from metagenedb.common.utils.api import MetageneDBCatalogGeneAPI
from metagenedb.common.utils.api import MetageneDBCatalogGeneAPI, MetageneDBCatalogTaxonomyAPI
from metagenedb.common.utils.parsers import IGCLineParser
# Before importing the models, we need to call django.setup() to load the apps
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "metagenedb.settings")
django.setup()
from metagenedb.apps.catalog.models import Taxonomy # noqa
logging.basicConfig(level=logging.INFO)
_LOGGER = logging.getLogger(__name__)
PHYLUM_COL = 'taxo_phylum'
GENUS_COL = 'taxo_genus'
SELECTED_KEYS = ['gene_id', 'length', 'kegg_ko', PHYLUM_COL, GENUS_COL]
def parse_gene(raw_line, selected_keys=SELECTED_KEYS):
    """
    Parse `raw_line` with IGCLineParser and keep only the entries whose key
    is listed in `selected_keys`.
    """
    parsed_line = IGCLineParser().gene(raw_line)
    return {key: value for key, value in parsed_line.items() if key in selected_keys}
def select_taxonomy(gene_dict, unknown_val='unknown'):
    """
    Select the taxonomy to be assigned to the gene.

    The phylum and genus columns are removed from `gene_dict` (modified in
    place). Genus has priority over phylum. When both equal `unknown_val`,
    or when no Taxonomy entry matches the selected name and rank, no
    'taxonomy' key is added.

    Returns `gene_dict`, with 'taxonomy' set to the `tax_id` of the first
    matching entry when there is one.
    """
    phylum = gene_dict.pop(PHYLUM_COL)
    genus = gene_dict.pop(GENUS_COL)
    if genus != unknown_val:
        rank, name = 'genus', genus
    elif phylum != unknown_val:
        rank, name = 'phylum', phylum
    else:
        return gene_dict

    # One query is enough: two rows tell whether the match is ambiguous.
    matches = list(Taxonomy.objects.filter(name=name, rank=rank)[:2])
    if not matches:
        # Previously an IndexError; the gene is kept without taxonomy instead.
        _LOGGER.warning(f"No result found for {rank} {name}. Taxonomy ignored.")
        return gene_dict
    if len(matches) > 1:
        _LOGGER.warning(f"More than 1 result found for {rank} {name}. First result is kept.")
    gene_dict.update({'taxonomy': matches[0].tax_id})
    return gene_dict
class ImportIGCGenes(object):
METAGENEDB_GENE_API = MetageneDBCatalogGeneAPI
METAGENEDB_TAXONOMY_API = MetageneDBCatalogTaxonomyAPI
PHYLUM_COL = 'taxo_phylum'
GENUS_COL = 'taxo_genus'
SELECTED_KEYS = ['gene_id', 'length', 'kegg_ko', PHYLUM_COL, GENUS_COL]
def __init__(self, annotation_file, url, skip_tax=False, skip_functions=False):
self.annotation_file = annotation_file
self.url = url
self.metagenedb_gene_api = self.METAGENEDB_GENE_API(base_url=self.url)
self.metagenedb_taxonomy_api = self.METAGENEDB_TAXONOMY_API(base_url=self.url)
# Skip some insertion if specified in script options
self.skip_tax = skip_tax
self.skip_functions = skip_functions
def _parse_gene(self, raw_line, selected_keys=SELECTED_KEYS):
    """
    Parse `raw_line` with IGCLineParser and keep only the entries whose key
    is listed in `selected_keys`.
    """
    parsed_line = IGCLineParser().gene(raw_line)
    return {key: value for key, value in parsed_line.items() if key in selected_keys}
def _select_taxonomy(self, taxonomy_dict, unknown_val='unknown'):
"""
Select the taxonomy to be assigned for the gene.
genus has priority on phylum. If both unknow, remove the taxonomy key
"""
phylum = taxonomy_dict.pop(self.PHYLUM_COL)
genus = taxonomy_dict.pop(self.GENUS_COL)
resp_dict = {}
if genus != unknown_val:
resp_dict = self.metagenedb_taxonomy_api.get_all(params={'name': genus, 'rank': 'genus'})
if len(resp_dict['results']) > 1:
_LOGGER.warning(f"More than 1 result found for genus {genus}. First result is kept.")
elif phylum != unknown_val:
resp_dict = self.metagenedb_taxonomy_api.get_all(params={'name': phylum, 'rank': 'phylum'})
if len(resp_dict['results']) > 1:
_LOGGER.warning(f"More than 1 result found for phylum {phylum}. First result is kept.")
if resp_dict:
taxonomy_dict.update(
{'taxonomy': resp_dict['results'][0]['tax_id']}
)
return taxonomy_dict
def _clean_gene(self, gene_dict):
print(gene_dict)
gene_dict['gene_name'] = gene_dict['gene_id']
gene_dict['gene_id'] = slugify(gene_dict['gene_id'])
gene_dict['functions'] = [{'function_id': gene_dict.pop('kegg_ko')}]
......@@ -93,8 +83,8 @@ class ImportIGCGenes(object):
def _insert_gene_list(self, chunk_genes):
for gene_line in chunk_genes:
gene_dict = parse_gene(gene_line)
gene_dict_with_taxo = select_taxonomy(gene_dict)
gene_dict = self._parse_gene(gene_line)
gene_dict_with_taxo = self._select_taxonomy(gene_dict)
try:
self._upsert_gene(gene_dict_with_taxo)
except HTTPError as e:
......
from requests.exceptions import HTTPError
from unittest import TestCase
import pytest
from django.urls import reverse
from rest_framework.test import APITestCase
from metagenedb.common.utils.api import MetageneDBCatalogGeneAPI
from metagenedb.common.utils.mocks.metagenedb import (MetageneDBCatalogGeneAPIMock,
MetageneDBCatalogTaxonomyAPIMock)
from metagenedb.apps.catalog.factory import TaxonomyFactory
from scripts.populate_db.import_igc_data import parse_gene, select_taxonomy, ImportIGCGenes
from scripts.populate_db.import_igc_data import ImportIGCGenes
class TestParseGene(TestCase):
......@@ -30,6 +29,7 @@ class TestParseGene(TestCase):
'cohort_assembled'
]
self.raw_line = "\t".join(raw_data)
self.import_igc_genes = ImportIGCGenes('test', 'test')
def test_parse_gene_default_selected_keys(self):
"""
......@@ -42,7 +42,7 @@ class TestParseGene(TestCase):
'taxo_phylum': 'taxo_phylum',
'taxo_genus': 'taxo_genus',
}
tested_dict = parse_gene(self.raw_line)
tested_dict = self.import_igc_genes._parse_gene(self.raw_line)
self.assertDictEqual(tested_dict, expected_dict)
def test_parse_gene(self):
......@@ -54,7 +54,7 @@ class TestParseGene(TestCase):
'gene_id': 'gene_name',
'length': 'length'
}
tested_dict = parse_gene(self.raw_line, selected_keys=selected_keys)
tested_dict = self.import_igc_genes._parse_gene(self.raw_line, selected_keys=selected_keys)
self.assertDictEqual(tested_dict, expected_dict)
def test_parse_gene_unknown_key(self):
......@@ -66,40 +66,10 @@ class TestParseGene(TestCase):
'gene_id': 'gene_name',
'length': 'length'
}
tested_dict = parse_gene(self.raw_line, selected_keys=selected_keys)
tested_dict = self.import_igc_genes._parse_gene(self.raw_line, selected_keys=selected_keys)
self.assertDictEqual(tested_dict, expected_dict)
class MetageneDBCatalogGeneAPIMock(MetageneDBCatalogGeneAPI):
    """
    Simple mock of the gene API that goes through the test client.

    The goal is to test the upsert behaviour, not the insertion into the
    database.
    """

    def __init__(self, client):
        self.client = client
        self.reverse_path = 'api:catalog:v1:genes'

    def _list_url(self):
        """Return the URL of the gene list route."""
        return reverse(f'{self.reverse_path}-list')

    def _detail_url(self, entry_id):
        """Return the URL of the detail route for the gene `entry_id`."""
        return reverse(f'{self.reverse_path}-detail', kwargs={'gene_id': entry_id})

    def get_all(self):
        """GET the list route and return the decoded response."""
        response = self.client.get(self._list_url())
        return response.json()

    def get(self, entry_id):
        """GET one gene; a 404 status is turned into an HTTPError."""
        response = self.client.get(self._detail_url(entry_id))
        if response.status_code != 404:
            return response.json()
        raise HTTPError

    def post(self, data):
        """POST `data` as JSON to the list route; a 400 status is turned into an HTTPError."""
        response = self.client.post(self._list_url(), data, format='json')
        if response.status_code != 400:
            return response.json()
        raise HTTPError

    def put(self, entry_id, data):
        """PUT `data` as JSON to the detail route and return the decoded response."""
        response = self.client.put(self._detail_url(entry_id), data, format='json')
        return response.json()
class TestUpsertGene(APITestCase):
def setUp(self):
......@@ -191,8 +161,7 @@ class TestCleanGene(TestCase):
self.assertDictEqual(test_gene_dict, expected_gene_dict)
@pytest.mark.django_db
class TestSelectTaxonomy(TestCase):
class TestSelectTaxonomy(APITestCase):
def setUp(self):
self.genus_name = 'Genus'
......@@ -200,6 +169,9 @@ class TestSelectTaxonomy(TestCase):
self.unknown_name = 'unknown'
self.genus = TaxonomyFactory(rank="genus", name=self.genus_name)
self.phylum = TaxonomyFactory(rank="phylum", name=self.phylum_name)
self.import_igc_genes = ImportIGCGenes('test', 'test')
self.api_mock = MetageneDBCatalogTaxonomyAPIMock(self.client)
self.import_igc_genes.metagenedb_taxonomy_api = self.api_mock
def test_genus_only(self):
gene_dict = {
......@@ -213,7 +185,7 @@ class TestSelectTaxonomy(TestCase):
'length': 135,
'taxonomy': str(self.genus.tax_id)
}
tested_dict = select_taxonomy(gene_dict)
tested_dict = self.import_igc_genes._select_taxonomy(gene_dict)
self.assertDictEqual(tested_dict, expected_dict)
def test_phylum_only(self):
......@@ -228,7 +200,7 @@ class TestSelectTaxonomy(TestCase):
'length': 135,
'taxonomy': str(self.phylum.tax_id)
}
tested_dict = select_taxonomy(gene_dict)
tested_dict = self.import_igc_genes._select_taxonomy(gene_dict)
self.assertDictEqual(tested_dict, expected_dict)
def test_genus_phylum(self):
......@@ -243,7 +215,7 @@ class TestSelectTaxonomy(TestCase):
'length': 135,
'taxonomy': str(self.genus.tax_id)
}
tested_dict = select_taxonomy(gene_dict)
tested_dict = self.import_igc_genes._select_taxonomy(gene_dict)
self.assertDictEqual(tested_dict, expected_dict)
def test_both_unknown(self):
......@@ -257,5 +229,5 @@ class TestSelectTaxonomy(TestCase):
'gene_id': 'gene',
'length': 135
}
tested_dict = select_taxonomy(gene_dict)
tested_dict = self.import_igc_genes._select_taxonomy(gene_dict)
self.assertDictEqual(tested_dict, expected_dict)
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment