Commit 10c4b4ad authored by Kenzo-Hugo Hillion
Browse files

Check if function exists in the db or remove it from payload

parent 3bd58750
Pipeline #18637 passed with stages
in 2 minutes and 19 seconds
from collections import OrderedDict
from rest_framework import serializers
from rest_framework.relations import SlugRelatedField
class AsymetricSlugRelatedField(serializers.SlugRelatedField):
class AsymetricSlugRelatedField(SlugRelatedField):
def to_representation(self, value):
return self.serializer_class(value).data
......
......@@ -5,7 +5,7 @@ import os
import sys
from itertools import islice
from bioapi import MetageneDBCatalogGeneAPI, MetageneDBCatalogTaxonomyAPI
from bioapi import MetageneDBCatalogFunctionAPI, MetageneDBCatalogGeneAPI, MetageneDBCatalogTaxonomyAPI
from requests.exceptions import HTTPError
from slugify import slugify
......@@ -18,6 +18,7 @@ logger = logging.getLogger()
class ImportIGCGenes(object):
METAGENEDB_GENE_API = MetageneDBCatalogGeneAPI
METAGENEDB_TAXONOMY_API = MetageneDBCatalogTaxonomyAPI
METAGENEDB_FUNCTION_API = MetageneDBCatalogFunctionAPI
PHYLUM_COL = 'taxo_phylum'
GENUS_COL = 'taxo_genus'
......@@ -28,6 +29,7 @@ class ImportIGCGenes(object):
self.url = url
self.metagenedb_gene_api = self.METAGENEDB_GENE_API(base_url=self.url)
self.metagenedb_taxonomy_api = self.METAGENEDB_TAXONOMY_API(base_url=self.url)
self.metagenedb_function_api = self.METAGENEDB_FUNCTION_API(base_url=self.url)
self.total_genes = self._get_number_genes()
self._reset_counters()
# Skip some insertion if specified in script options
......@@ -53,6 +55,24 @@ class ImportIGCGenes(object):
counter += 1
return mapping
def build_function_catalog(self, page_size=1000):
    """Collect the IDs of the functions returned by the metagenedb function API.

    Pages through ``self.metagenedb_function_api.get_all`` starting at
    page 1 and keeps requesting the following page for as long as the
    current page reports a ``next`` value that is not ``None``. The
    ``function_id`` of every item in each page's ``results`` is gathered
    into a set, which is stored in ``self.metagenedb_functions``.

    Args:
        page_size: Value sent as the ``page_size`` request parameter.
    """
    logger.info("Building local function catalog...")
    functions = set()
    page = 1
    while True:
        current_page = self.metagenedb_function_api.get_all(
            params={'page': page, 'page_size': page_size}
        )
        next_page = current_page['next']
        # Grow the set in place: rebuilding it with union() on every page
        # re-copies everything gathered so far.
        functions.update(item['function_id'] for item in current_page['results'])
        if next_page is None:
            break
        page += 1
    self.metagenedb_functions = functions
def build_mapping(self, page_size=1000):
self.phylum_mapping = self._build_taxo_mapping("phylum", page_size=page_size)
self.genus_mapping = self._build_taxo_mapping("genus", page_size=page_size)
......@@ -102,6 +122,15 @@ class ImportIGCGenes(object):
selected_dict = {k: v for k, v in all_dict.items() if k in selected_keys}
return selected_dict
def _clean_functions(self, functions):
    """Keep only the function IDs present in ``self.metagenedb_functions``.

    IDs missing from the catalog are dropped; a warning is logged for
    each dropped ID except the literal ``'unknown'``.

    Args:
        functions: Iterable of function IDs to filter.

    Returns:
        A list of the retained IDs, in their original order.
    """
    kept = []
    for function_id in functions:
        if function_id not in self.metagenedb_functions:
            # 'unknown' is dropped silently; anything else is reported.
            if function_id != 'unknown':
                logger.warning("Function %s not found in metagenedb", function_id)
            continue
        kept.append(function_id)
    return kept
def _clean_gene(self, gene_dict):
gene_dict['gene_name'] = gene_dict['gene_id']
gene_dict['gene_id'] = slugify(gene_dict['gene_id'])
......@@ -109,12 +138,15 @@ class ImportIGCGenes(object):
gene_dict = self._select_taxonomy(gene_dict)
if self.skip_functions or 'unknown' in gene_dict['functions']:
gene_dict.pop('functions')
else:
gene_dict['functions'] = self._clean_functions(gene_dict['functions'])
return gene_dict
def load_annotation_file_to_db_in_chunks(self, chunk_size=1000, test=False):
# Build mapping for different phylum and genus
if not self.skip_tax:
self.build_mapping()
if not self.skip_functions:
self.build_function_catalog()
with open(self.annotation_file, 'r') as file:
while True:
chunk_genes = list(islice(file, chunk_size))
......
......@@ -2,8 +2,8 @@ from unittest import TestCase
from rest_framework.test import APITestCase
from metagenedb.common.utils.mocks.metagenedb import MetageneDBCatalogTaxonomyAPIMock
from metagenedb.apps.catalog.factory import TaxonomyFactory
from metagenedb.common.utils.mocks.metagenedb import MetageneDBCatalogTaxonomyAPIMock, MetageneDBCatalogFunctionAPIMock
from metagenedb.apps.catalog.factory import TaxonomyFactory, FunctionFactory
from scripts.populate_db.import_igc_data import ImportIGCGenes
......@@ -73,6 +73,7 @@ class TestCleanGene(TestCase):
def setUp(self):
self.import_igc_genes = ImportIGCGenes('test', 'test')
self.import_igc_genes._select_taxonomy = lambda x: x # Mock to return same dict
self.import_igc_genes._clean_functions = lambda x: x
self.gene_dict = {
'gene_id': 'gene.01',
'length': 135,
......@@ -240,3 +241,22 @@ class TestBuildTaxoMapping(APITestCase):
self.import_igc_genes.build_mapping(page_size=100)
self.assertDictEqual(self.import_igc_genes.genus_mapping, expected_genus_dict)
self.assertDictEqual(self.import_igc_genes.phylum_mapping, expected_phylum_dict)
class TestBuildBuildFunctionCatalog(APITestCase):
    """Check the function catalog that ``ImportIGCGenes`` builds through the API mock."""

    @classmethod
    def setUpTestData(cls):
        # Batch of 100 factory-made functions shared by the tests of this class.
        cls.functions = FunctionFactory.create_batch(100)

    def setUp(self):
        importer = ImportIGCGenes('test', 'test')
        # Swap the importer's function API for the mock built on the test client.
        self.api_mock = MetageneDBCatalogFunctionAPIMock(self.client)
        importer.metagenedb_function_api = self.api_mock
        self.import_igc_genes = importer

    def test_build_catalog(self):
        """The built catalog holds exactly the IDs of the created functions."""
        self.import_igc_genes.build_function_catalog(page_size=100)
        expected_catalog = {function.function_id for function in self.functions}
        self.assertSetEqual(self.import_igc_genes.metagenedb_functions, expected_catalog)
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment