Commit 3bd58750 authored by Kenzo-Hugo Hillion's avatar Kenzo-Hugo Hillion
Browse files

Merge branch '66-get-all-genus-phylum-local' into 'dev'

Improve the way the mapping between taxonomy names and IDs is done during IGC gene creation

Closes #66

See merge request !19
parents 191075f1 83746eaa
Pipeline #18621 passed with stages
in 3 minutes and 59 seconds
from marshmallow import Schema
class EmptyQueryParams(Schema):
    """Query-parameter schema that declares no fields of its own."""
from marshmallow import Schema, fields
from marshmallow import fields
from metagenedb.common.django_default.qparams_validators import PaginatedQueryParams
class FunctionQueryParams(Schema):
class FunctionQueryParams(PaginatedQueryParams):
detailed = fields.Boolean()
from marshmallow import Schema, fields
from marshmallow import fields
from metagenedb.common.django_default.qparams_validators import PaginatedQueryParams
class TaxonomyQueryParams(Schema):
class TaxonomyQueryParams(PaginatedQueryParams):
rank = fields.String()
name = fields.String()
......@@ -3,11 +3,11 @@ from rest_framework import status
from rest_framework.response import Response
from rest_framework.viewsets import ModelViewSet
from metagenedb.api.catalog.qparams_validators.empty import EmptyQueryParams
from metagenedb.common.django_default.qparams_validators import PaginatedQueryParams
class BulkViewSet(ModelViewSet):
query_params_parser = EmptyQueryParams
query_params_parser = PaginatedQueryParams
def get_objects(self, instance_ids):
return self.queryset.in_bulk(instance_ids, field_name=self.lookup_field)
......
from factory import fuzzy
from factory import DjangoModelFactory, lazy_attribute
from factory import DjangoModelFactory, fuzzy
from faker import Factory
from metagenedb.apps.catalog import models
from .fuzzy_base import FuzzyLowerText
faker = Factory.create()
SELECTED_SOURCE = [i[0] for i in models.Function.SOURCE_CHOICES]
......@@ -14,7 +15,4 @@ class FunctionFactory(DjangoModelFactory):
model = models.Function
source = fuzzy.FuzzyChoice(SELECTED_SOURCE)
@lazy_attribute
def function_id(self):
return str(faker.pyint())
function_id = FuzzyLowerText(prefix='function-', length=15)
import string
from factory import fuzzy
class FuzzyLowerText(fuzzy.FuzzyText):
    """Fuzzy text attribute restricted to lowercase ASCII letters and digits 1-9."""

    # Alphabet handed to FuzzyText: 'a'..'z' followed by '1'..'9' (no '0').
    CHARS = string.ascii_lowercase + string.digits[1:]

    def __init__(self, **kwargs):
        """Forward ``kwargs`` to ``FuzzyText`` with ``chars`` fixed to ``CHARS``."""
        super().__init__(chars=self.CHARS, **kwargs)
from factory import DjangoModelFactory, lazy_attribute
from factory import DjangoModelFactory, fuzzy
from faker import Factory
from metagenedb.apps.catalog import models
from .fuzzy_base import FuzzyLowerText
faker = Factory.create()
SELECTED_SOURCE = [i[0] for i in models.Function.SOURCE_CHOICES]
......@@ -12,10 +14,5 @@ class GeneFactory(DjangoModelFactory):
class Meta:
model = models.Gene
@lazy_attribute
def gene_id(self):
return str(faker.pyint())
@lazy_attribute
def length(self):
return str(faker.pyint())
gene_id = FuzzyLowerText(prefix='gene-', length=15)
length = fuzzy.FuzzyInteger(200, 10000)
from factory import fuzzy
from factory import DjangoModelFactory, lazy_attribute
from factory import DjangoModelFactory, fuzzy
from faker import Factory
from metagenedb.apps.catalog import models
from .fuzzy_base import FuzzyLowerText
faker = Factory.create()
SELECTED_RANK = [i[0] for i in models.Taxonomy.RANK_CHOICES]
......@@ -14,7 +15,5 @@ class TaxonomyFactory(DjangoModelFactory):
model = models.Taxonomy
rank = fuzzy.FuzzyChoice(SELECTED_RANK)
@lazy_attribute
def tax_id(self):
return str(faker.pyint())
tax_id = FuzzyLowerText(prefix='tax-', length=15)
name = fuzzy.FuzzyText(length=20)
from rest_framework.pagination import PageNumberPagination
class DefaultPageNumberPagination(PageNumberPagination):
    """Page-number pagination that lets the client choose the page size."""

    # Name of the query-string parameter carrying the requested page size.
    page_size_query_param = 'page_size'

    # Largest page size a client may request through that parameter.
    max_page_size = 5000
from marshmallow import Schema, fields
class PaginatedQueryParams(Schema):
    """Schema declaring the ``page`` and ``page_size`` integer query parameters."""

    # Both fields are declared without arguments.
    page = fields.Integer()
    page_size = fields.Integer()
......@@ -105,7 +105,7 @@ REST_FRAMEWORK = {
'rest_framework_jwt.authentication.JSONWebTokenAuthentication',
'rest_framework.authentication.SessionAuthentication',
),
'DEFAULT_PAGINATION_CLASS': 'rest_framework.pagination.PageNumberPagination',
'DEFAULT_PAGINATION_CLASS': 'metagenedb.common.django_default.pagination.DefaultPageNumberPagination',
'PAGE_SIZE': 100,
'DEFAULT_FILTER_BACKENDS': (
'django_filters.rest_framework.DjangoFilterBackend',
......
......@@ -34,6 +34,29 @@ class ImportIGCGenes(object):
self.skip_tax = skip_tax
self.skip_functions = skip_functions
def _build_taxo_mapping(self, rank, page_size=1000):
    """Build a ``{name: tax_id}`` dict for every taxonomy entry of ``rank``.

    Pages are requested from ``self.metagenedb_taxonomy_api`` starting at
    page 1 and fetching continues until a response's ``'next'`` value is
    ``None``.

    Args:
        rank: Value sent as the ``rank`` query parameter.
        page_size: Value sent as the ``page_size`` query parameter.

    Returns:
        dict mapping each result's ``'name'`` to its ``'tax_id'``; when a
        name occurs more than once, the entry seen last wins.
    """
    logger.info("Building local mapping for %s level...", rank)
    name_to_tax_id = {}
    page_number = 1
    # The first page is always fetched; later pages only while one is advertised.
    while True:
        response = self.metagenedb_taxonomy_api.get_all(params={
            'page': page_number,
            'page_size': page_size,
            'rank': rank,
        })
        following = response['next']
        for entry in response['results']:
            name_to_tax_id[entry['name']] = entry['tax_id']
        if following is None:
            break
        page_number += 1
    return name_to_tax_id
def build_mapping(self, page_size=1000):
    """Fill ``self.phylum_mapping`` and ``self.genus_mapping``.

    Each mapping is produced by ``_build_taxo_mapping`` for its rank,
    phylum first and genus second.

    Args:
        page_size: Page size forwarded to ``_build_taxo_mapping``.
    """
    phylum_lookup = self._build_taxo_mapping("phylum", page_size=page_size)
    self.phylum_mapping = phylum_lookup
    genus_lookup = self._build_taxo_mapping("genus", page_size=page_size)
    self.genus_mapping = genus_lookup
def _reset_counters(self):
self.processed_genes = 0
self.created_genes = 0
......@@ -57,17 +80,17 @@ class ImportIGCGenes(object):
genus = gene_dict.pop(self.GENUS_COL)
if self.skip_tax:
return gene_dict
resp_dict = {}
taxonomy_id = None
if genus != unknown_val:
resp_dict = self.metagenedb_taxonomy_api.get_all(params={'name': genus, 'rank': 'genus'})
if len(resp_dict['results']) > 1:
logger.warning(f"More than 1 result found for genus {genus}. First result is kept.")
taxonomy_id = self.genus_mapping.get(genus, None)
if taxonomy_id is None:
logger.warning("No tax_id found for genus %s" % genus)
elif phylum != unknown_val:
resp_dict = self.metagenedb_taxonomy_api.get_all(params={'name': phylum, 'rank': 'phylum'})
if len(resp_dict['results']) > 1:
logger.warning(f"More than 1 result found for phylum {phylum}. First result is kept.")
if resp_dict.get('count', 0) > 0:
gene_dict.update({'taxonomy': resp_dict['results'][0]['tax_id']})
taxonomy_id = self.phylum_mapping.get(phylum, None)
if taxonomy_id is None:
# Report the phylum that was actually looked up; this branch never uses the genus.
logger.warning("No tax_id found for phylum %s", phylum)
if taxonomy_id is not None:
gene_dict.update({'taxonomy': taxonomy_id})
return gene_dict
def _parse_gene(self, raw_line, selected_keys=SELECTED_KEYS):
......@@ -89,6 +112,9 @@ class ImportIGCGenes(object):
return gene_dict
def load_annotation_file_to_db_in_chunks(self, chunk_size=1000, test=False):
# Build mapping for different phylum and genus
if not self.skip_tax:
self.build_mapping()
with open(self.annotation_file, 'r') as file:
while True:
chunk_genes = list(islice(file, chunk_size))
......
......@@ -114,17 +114,21 @@ class TestCleanGene(TestCase):
self.assertDictEqual(test_gene_dict, expected_gene_dict)
class TestSelectTaxonomy(APITestCase):
class TestSelectTaxonomy(TestCase):
def setUp(self):
self.genus_name = 'Genus'
self.phylum_name = 'Phylum'
self.unknown_name = 'unknown'
self.genus = TaxonomyFactory(rank="genus", name=self.genus_name)
self.phylum = TaxonomyFactory(rank="phylum", name=self.phylum_name)
self.genus_id = 'genus_1'
self.genus_name = 'Genus1'
self.phylum_id = 'phylum_1'
self.phylum_name = 'Phylum1'
self.import_igc_genes = ImportIGCGenes('test', 'test')
self.api_mock = MetageneDBCatalogTaxonomyAPIMock(self.client)
self.import_igc_genes.metagenedb_taxonomy_api = self.api_mock
self.import_igc_genes.phylum_mapping = {
self.phylum_name: self.phylum_id
}
self.import_igc_genes.genus_mapping = {
self.genus_name: self.genus_id
}
def test_genus_only(self):
gene_dict = {
......@@ -136,7 +140,21 @@ class TestSelectTaxonomy(APITestCase):
expected_dict = {
'gene_id': 'gene',
'length': 135,
'taxonomy': str(self.genus.tax_id)
'taxonomy': self.genus_id
}
tested_dict = self.import_igc_genes._select_taxonomy(gene_dict)
self.assertDictEqual(tested_dict, expected_dict)
def test_genus_not_in_mapping(self):
gene_dict = {
'gene_id': 'gene',
'length': 135,
'taxo_phylum': self.unknown_name,
'taxo_genus': "Genus2"
}
expected_dict = {
'gene_id': 'gene',
'length': 135
}
tested_dict = self.import_igc_genes._select_taxonomy(gene_dict)
self.assertDictEqual(tested_dict, expected_dict)
......@@ -151,7 +169,21 @@ class TestSelectTaxonomy(APITestCase):
expected_dict = {
'gene_id': 'gene',
'length': 135,
'taxonomy': str(self.phylum.tax_id)
'taxonomy': self.phylum_id
}
tested_dict = self.import_igc_genes._select_taxonomy(gene_dict)
self.assertDictEqual(tested_dict, expected_dict)
def test_phylum_not_in_mapping(self):
gene_dict = {
'gene_id': 'gene',
'length': 135,
'taxo_phylum': "Phylum2",
'taxo_genus': self.unknown_name
}
expected_dict = {
'gene_id': 'gene',
'length': 135
}
tested_dict = self.import_igc_genes._select_taxonomy(gene_dict)
self.assertDictEqual(tested_dict, expected_dict)
......@@ -166,7 +198,7 @@ class TestSelectTaxonomy(APITestCase):
expected_dict = {
'gene_id': 'gene',
'length': 135,
'taxonomy': str(self.genus.tax_id)
'taxonomy': self.genus_id
}
tested_dict = self.import_igc_genes._select_taxonomy(gene_dict)
self.assertDictEqual(tested_dict, expected_dict)
......@@ -184,3 +216,27 @@ class TestSelectTaxonomy(APITestCase):
}
tested_dict = self.import_igc_genes._select_taxonomy(gene_dict)
self.assertDictEqual(tested_dict, expected_dict)
class TestBuildTaxoMapping(APITestCase):
    """Check that ``build_mapping`` collects every genus and phylum entry."""

    @classmethod
    def setUpTestData(cls):
        # 200 genus rows exceed the page_size of 100 used in the test below,
        # so more than one page has to be fetched for that rank.
        cls.genus_items = TaxonomyFactory.create_batch(200, rank='genus')
        cls.phylum_items = TaxonomyFactory.create_batch(20, rank='phylum')

    def setUp(self):
        # Route the importer's taxonomy calls through the API mock built on
        # the test client.
        self.import_igc_genes = ImportIGCGenes('test', 'test')
        self.api_mock = MetageneDBCatalogTaxonomyAPIMock(self.client)
        self.import_igc_genes.metagenedb_taxonomy_api = self.api_mock

    @staticmethod
    def _name_to_tax_id(items):
        """Return the ``{name: tax_id}`` dict expected for ``items``."""
        return {item.name: item.tax_id for item in items}

    def test_build_mapping(self):
        importer = self.import_igc_genes
        expected_genus_dict = self._name_to_tax_id(self.genus_items)
        expected_phylum_dict = self._name_to_tax_id(self.phylum_items)
        importer.build_mapping(page_size=100)
        self.assertDictEqual(importer.genus_mapping, expected_genus_dict)
        self.assertDictEqual(importer.phylum_mapping, expected_phylum_dict)
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment