Commit db4cb890 authored by Kenzo-Hugo Hillion
Browse files

Deal with EggNOG in creation of genes

parent 793e61eb
Pipeline #19797 passed with stages in 2 minutes and 27 seconds
......@@ -29,7 +29,7 @@ class EggNogSerializer(serializers.ModelSerializer):
class Meta:
model = EggNog
list_serializer_class = EggNogListSerializer
fields = ('function_id', 'name', 'functional_category')
fields = ('function_id', 'name', 'functional_categories')
class KeggOrthologyListSerializer(BulkListSerializer):
......
......@@ -38,7 +38,7 @@ class IGCLineParser(object):
'taxo_phylum': gene_info[5],
'taxo_genus': gene_info[6],
'kegg_ko': gene_info[7].split(';'),
'eggnog': gene_info[8],
'eggnog': gene_info[8].split(';'),
'sample_occurence_frequency': gene_info[9],
'individual_occurence_frequency': gene_info[10],
'kegg_functional_categories': gene_info[11],
......
......@@ -32,7 +32,7 @@ class TestIGCLineParser(TestCase):
'taxo_phylum': raw_data[5],
'taxo_genus': raw_data[6],
'kegg_ko': [raw_data[7]],
'eggnog': raw_data[8],
'eggnog': [raw_data[8]],
'sample_occurence_frequency': raw_data[9],
'individual_occurence_frequency': raw_data[10],
'kegg_functional_categories': raw_data[11],
......@@ -57,7 +57,7 @@ class TestIGCLineParser(TestCase):
'taxo_phylum',
'taxo_genus',
'kegg;kegg2',
'eggnog',
'eggnog1;eggnog2',
'sample_occurence_freq',
'ind_occurence_freq',
'kegg_functional_cat',
......@@ -74,7 +74,7 @@ class TestIGCLineParser(TestCase):
'taxo_phylum': raw_data[5],
'taxo_genus': raw_data[6],
'kegg_ko': ['kegg', 'kegg2'],
'eggnog': raw_data[8],
'eggnog': ['eggnog1', 'eggnog2'],
'sample_occurence_frequency': raw_data[9],
'individual_occurence_frequency': raw_data[10],
'kegg_functional_categories': raw_data[11],
......
......@@ -30,20 +30,31 @@ class ImportIGCGenes(object):
PHYLUM_COL = 'taxo_phylum'
GENUS_COL = 'taxo_genus'
SELECTED_KEYS = ['gene_id', 'length', 'kegg_ko', PHYLUM_COL, GENUS_COL]
SELECTED_KEYS = ['gene_id', 'length', 'kegg_ko', 'eggnog', PHYLUM_COL, GENUS_COL]
def __init__(self, annotation_file, url, jwt_token, skip_tax=False, skip_functions=False):
self.annotation_file = annotation_file
self.url = url
self.metagenedb_gene_api = self.METAGENEDB_GENE_API(base_url=self.url, jwt_token=jwt_token)
self.metagenedb_taxonomy_api = self.METAGENEDB_TAXONOMY_API(base_url=self.url, jwt_token=jwt_token)
self.metagenedb_function_api = self.METAGENEDB_FUNCTION_API(base_url=self.url, jwt_token=jwt_token)
self._open_api_endpoints(jwt_token)
self.total_genes = self._get_number_genes()
self._reset_counters()
# Skip some insertion if specified in script options
self.skip_tax = skip_tax
self.skip_functions = skip_functions
def _reset_counters(self):
self.processed_genes = 0
self.created_genes = 0
self.updated_genes = 0
self.skipped_genes = 0
def _open_api_endpoints(self, jwt_token):
self.metagenedb_gene_api = self.METAGENEDB_GENE_API(base_url=self.url, jwt_token=jwt_token)
self.metagenedb_taxonomy_api = self.METAGENEDB_TAXONOMY_API(base_url=self.url, jwt_token=jwt_token)
self.metagenedb_function_api = self.METAGENEDB_FUNCTION_API(base_url=self.url, jwt_token=jwt_token)
self.metagenedb_kegg_api = self.METAGENEDB_KEGG_API(base_url=self.url, jwt_token=jwt_token)
self.metagenedb_eggnog_api = self.METAGENEDB_EGGNOG_API(base_url=self.url, jwt_token=jwt_token)
def _build_taxo_mapping(self, rank, page_size=1000):
logger.info("Building local mapping for %s level...", rank)
counter = 1
......@@ -63,8 +74,8 @@ class ImportIGCGenes(object):
counter += 1
return mapping
def build_function_catalog(self, page_size=1000):
logger.info("Building local function catalog...")
def _retrieve_function_catalog(self, api, page_size=1000):
logger.info("Building local catalog from %s...", api.ROUTE)
counter = 1
next_page = None
functions = set()
......@@ -73,24 +84,22 @@ class ImportIGCGenes(object):
'page': counter,
'page_size': page_size,
}
current_page = self.metagenedb_function_api.get_all(params=params)
current_page = api.get_all(params=params)
next_page = current_page['next']
functions = functions.union(set(
[item['function_id'] for item in current_page['results']]
))
counter += 1
self.metagenedb_functions = functions
return functions
def build_function_mappings(self, page_size=1000):
self.metagenedb_keggs = self._retrieve_function_catalog(self.metagenedb_kegg_api, page_size=page_size)
self.metagenedb_eggnogs = self._retrieve_function_catalog(self.metagenedb_eggnog_api, page_size=page_size)
def build_mapping(self, page_size=1000):
self.phylum_mapping = self._build_taxo_mapping("phylum", page_size=page_size)
self.genus_mapping = self._build_taxo_mapping("genus", page_size=page_size)
def _reset_counters(self):
self.processed_genes = 0
self.created_genes = 0
self.updated_genes = 0
self.skipped_genes = 0
def _get_number_genes(self):
if not os.path.isfile(self.annotation_file):
return 0
......@@ -133,18 +142,21 @@ class ImportIGCGenes(object):
def _clean_functions(self, functions):
clean_functions = []
for function in functions:
if function in self.metagenedb_functions:
clean_functions.append(function)
elif function != 'unknown':
logger.warning("Function %s not found in metagenedb", function)
if function['function_id'] in getattr(self, f"metagenedb_{function['source']}s"):
clean_functions.append(function['function_id'])
else:
logger.warning("Function %s not found from %s in metagenedb",
function['function_id'], function['source'])
return clean_functions
def _clean_gene(self, gene_dict):
gene_dict['gene_name'] = gene_dict['gene_id']
gene_dict['gene_id'] = slugify(gene_dict['gene_id'])
gene_dict['functions'] = gene_dict.pop('kegg_ko')
gene_dict['functions'] = [
{'source': 'kegg', 'function_id': v} for v in gene_dict.pop('kegg_ko') if v != 'unknown'] + \
[{'source': 'eggnog', 'function_id': v} for v in gene_dict.pop('eggnog') if v != 'unknown']
gene_dict = self._select_taxonomy(gene_dict)
if self.skip_functions or 'unknown' in gene_dict['functions']:
if self.skip_functions or not gene_dict['functions']:
gene_dict.pop('functions')
else:
gene_dict['functions'] = self._clean_functions(gene_dict['functions'])
......@@ -154,7 +166,7 @@ class ImportIGCGenes(object):
if not self.skip_tax:
self.build_mapping()
if not self.skip_functions:
self.build_function_catalog()
self.build_function_mappings()
with open(self.annotation_file, 'r') as file:
while True:
chunk_genes = list(islice(file, chunk_size))
......
......@@ -2,8 +2,16 @@ from unittest import TestCase
from rest_framework.test import APITestCase
from metagenedb.common.utils.mocks.metagenedb import MetageneDBCatalogTaxonomyAPIMock, MetageneDBCatalogFunctionAPIMock
from metagenedb.apps.catalog.factory import TaxonomyFactory, FunctionFactory
from metagenedb.common.utils.mocks.metagenedb import (
MetageneDBCatalogTaxonomyAPIMock,
MetageneDBCatalogEggNogAPIMock,
MetageneDBCatalogKeggOrthologyAPIMock
)
from metagenedb.apps.catalog.factory import (
TaxonomyFactory,
KeggOrthologyFactory,
EggNogFactory
)
from scripts.populate_db.import_igc_data import ImportIGCGenes
......@@ -37,6 +45,7 @@ class TestParseGene(TestCase):
'gene_id': 'gene_name',
'length': 'length',
'kegg_ko': ['kegg'],
'eggnog': ['eggnog'],
'taxo_phylum': 'taxo_phylum',
'taxo_genus': 'taxo_genus',
}
......@@ -77,7 +86,8 @@ class TestCleanGene(TestCase):
self.gene_dict = {
'gene_id': 'gene.01',
'length': 135,
'kegg_ko': ['K00001']
'kegg_ko': ['K00001'],
'eggnog': ['COG1']
}
def test_clean_gene(self):
......@@ -85,7 +95,10 @@ class TestCleanGene(TestCase):
'gene_id': 'gene-01',
'gene_name': 'gene.01',
'length': 135,
'functions': ['K00001']
'functions': [
{'source': 'kegg', 'function_id': 'K00001'},
{'source': 'eggnog', 'function_id': 'COG1'}
]
}
test_gene_dict = self.import_igc_genes._clean_gene(self.gene_dict)
self.assertDictEqual(test_gene_dict, expected_gene_dict)
......@@ -104,16 +117,57 @@ class TestCleanGene(TestCase):
gene_dict = {
'gene_id': 'gene.01',
'length': 135,
'kegg_ko': 'unknown'
'kegg_ko': ['unknown'],
'eggnog': ['COG1']
}
expected_gene_dict = {
'gene_id': 'gene-01',
'gene_name': 'gene.01',
'functions': [{'function_id': 'COG1', 'source': 'eggnog'}],
'length': 135
}
test_gene_dict = self.import_igc_genes._clean_gene(gene_dict)
self.assertDictEqual(test_gene_dict, expected_gene_dict)
def test_unknow_kegg_and_eggnog(self):
gene_dict = {
'gene_id': 'gene.01',
'length': 135,
'kegg_ko': ['unknown'],
'eggnog': ['unknown']
}
expected_gene_dict = {
'gene_id': 'gene-01',
'gene_name': 'gene.01',
'length': 135
}
test_gene_dict = self.import_igc_genes._clean_gene(gene_dict)
self.assertDictEqual(test_gene_dict, expected_gene_dict)
class TestCleanFunctions(TestCase):
def setUp(self):
self.import_igc_genes = ImportIGCGenes('test', 'test_url', 'test_token')
self.import_igc_genes.metagenedb_eggnogs = set(['COG1', 'COG2'])
self.import_igc_genes.metagenedb_keggs = set(['K00001', 'K00002'])
def test_clean_functions(self):
functions = [
{'function_id': 'K00001', 'source': 'kegg'},
{'function_id': 'COG1', 'source': 'eggnog'}
]
expected_list = ['K00001', 'COG1']
self.assertListEqual(self.import_igc_genes._clean_functions(functions), expected_list)
def test_clean_functions_unknown_kegg(self):
functions = [
{'function_id': 'K00301', 'source': 'kegg'},
{'function_id': 'COG1', 'source': 'eggnog'}
]
expected_list = ['COG1']
self.assertListEqual(self.import_igc_genes._clean_functions(functions), expected_list)
class TestSelectTaxonomy(TestCase):
......@@ -247,16 +301,23 @@ class TestBuildBuildFunctionCatalog(APITestCase):
@classmethod
def setUpTestData(cls):
cls.functions = FunctionFactory.create_batch(100)
cls.keggs = KeggOrthologyFactory.create_batch(100)
cls.eggnogs = EggNogFactory.create_batch(100)
def setUp(self):
self.import_igc_genes = ImportIGCGenes('test', 'test_url', 'test_token')
self.api_mock = MetageneDBCatalogFunctionAPIMock(self.client)
self.import_igc_genes.metagenedb_function_api = self.api_mock
self.kegg_api_mock = MetageneDBCatalogKeggOrthologyAPIMock(self.client)
self.eggnog_api_mock = MetageneDBCatalogEggNogAPIMock(self.client)
self.import_igc_genes.metagenedb_kegg_api = self.kegg_api_mock
self.import_igc_genes.metagenedb_eggnog_api = self.eggnog_api_mock
def test_build_catalog(self):
expected_catalog = set(
[function.function_id for function in self.functions]
expected_kegg_catalog = set(
[function.function_id for function in self.keggs]
)
expected_eggnog_catalog = set(
[function.function_id for function in self.eggnogs]
)
self.import_igc_genes.build_function_catalog(page_size=100)
self.assertSetEqual(self.import_igc_genes.metagenedb_functions, expected_catalog)
self.import_igc_genes.build_function_mappings(page_size=100)
self.assertSetEqual(self.import_igc_genes.metagenedb_keggs, expected_kegg_catalog)
self.assertSetEqual(self.import_igc_genes.metagenedb_eggnogs, expected_eggnog_catalog)
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment