Commit db4cb890 authored by Kenzo-Hugo Hillion
Browse files

Deal with EggNOG in creation of genes

parent 793e61eb
Pipeline #19797 passed with stages in 2 minutes and 27 seconds
...@@ -29,7 +29,7 @@ class EggNogSerializer(serializers.ModelSerializer): ...@@ -29,7 +29,7 @@ class EggNogSerializer(serializers.ModelSerializer):
class Meta: class Meta:
model = EggNog model = EggNog
list_serializer_class = EggNogListSerializer list_serializer_class = EggNogListSerializer
fields = ('function_id', 'name', 'functional_category') fields = ('function_id', 'name', 'functional_categories')
class KeggOrthologyListSerializer(BulkListSerializer): class KeggOrthologyListSerializer(BulkListSerializer):
......
...@@ -38,7 +38,7 @@ class IGCLineParser(object): ...@@ -38,7 +38,7 @@ class IGCLineParser(object):
'taxo_phylum': gene_info[5], 'taxo_phylum': gene_info[5],
'taxo_genus': gene_info[6], 'taxo_genus': gene_info[6],
'kegg_ko': gene_info[7].split(';'), 'kegg_ko': gene_info[7].split(';'),
'eggnog': gene_info[8], 'eggnog': gene_info[8].split(';'),
'sample_occurence_frequency': gene_info[9], 'sample_occurence_frequency': gene_info[9],
'individual_occurence_frequency': gene_info[10], 'individual_occurence_frequency': gene_info[10],
'kegg_functional_categories': gene_info[11], 'kegg_functional_categories': gene_info[11],
......
...@@ -32,7 +32,7 @@ class TestIGCLineParser(TestCase): ...@@ -32,7 +32,7 @@ class TestIGCLineParser(TestCase):
'taxo_phylum': raw_data[5], 'taxo_phylum': raw_data[5],
'taxo_genus': raw_data[6], 'taxo_genus': raw_data[6],
'kegg_ko': [raw_data[7]], 'kegg_ko': [raw_data[7]],
'eggnog': raw_data[8], 'eggnog': [raw_data[8]],
'sample_occurence_frequency': raw_data[9], 'sample_occurence_frequency': raw_data[9],
'individual_occurence_frequency': raw_data[10], 'individual_occurence_frequency': raw_data[10],
'kegg_functional_categories': raw_data[11], 'kegg_functional_categories': raw_data[11],
...@@ -57,7 +57,7 @@ class TestIGCLineParser(TestCase): ...@@ -57,7 +57,7 @@ class TestIGCLineParser(TestCase):
'taxo_phylum', 'taxo_phylum',
'taxo_genus', 'taxo_genus',
'kegg;kegg2', 'kegg;kegg2',
'eggnog', 'eggnog1;eggnog2',
'sample_occurence_freq', 'sample_occurence_freq',
'ind_occurence_freq', 'ind_occurence_freq',
'kegg_functional_cat', 'kegg_functional_cat',
...@@ -74,7 +74,7 @@ class TestIGCLineParser(TestCase): ...@@ -74,7 +74,7 @@ class TestIGCLineParser(TestCase):
'taxo_phylum': raw_data[5], 'taxo_phylum': raw_data[5],
'taxo_genus': raw_data[6], 'taxo_genus': raw_data[6],
'kegg_ko': ['kegg', 'kegg2'], 'kegg_ko': ['kegg', 'kegg2'],
'eggnog': raw_data[8], 'eggnog': ['eggnog1', 'eggnog2'],
'sample_occurence_frequency': raw_data[9], 'sample_occurence_frequency': raw_data[9],
'individual_occurence_frequency': raw_data[10], 'individual_occurence_frequency': raw_data[10],
'kegg_functional_categories': raw_data[11], 'kegg_functional_categories': raw_data[11],
......
...@@ -30,20 +30,31 @@ class ImportIGCGenes(object): ...@@ -30,20 +30,31 @@ class ImportIGCGenes(object):
PHYLUM_COL = 'taxo_phylum' PHYLUM_COL = 'taxo_phylum'
GENUS_COL = 'taxo_genus' GENUS_COL = 'taxo_genus'
SELECTED_KEYS = ['gene_id', 'length', 'kegg_ko', PHYLUM_COL, GENUS_COL] SELECTED_KEYS = ['gene_id', 'length', 'kegg_ko', 'eggnog', PHYLUM_COL, GENUS_COL]
def __init__(self, annotation_file, url, jwt_token, skip_tax=False, skip_functions=False): def __init__(self, annotation_file, url, jwt_token, skip_tax=False, skip_functions=False):
self.annotation_file = annotation_file self.annotation_file = annotation_file
self.url = url self.url = url
self.metagenedb_gene_api = self.METAGENEDB_GENE_API(base_url=self.url, jwt_token=jwt_token) self._open_api_endpoints(jwt_token)
self.metagenedb_taxonomy_api = self.METAGENEDB_TAXONOMY_API(base_url=self.url, jwt_token=jwt_token)
self.metagenedb_function_api = self.METAGENEDB_FUNCTION_API(base_url=self.url, jwt_token=jwt_token)
self.total_genes = self._get_number_genes() self.total_genes = self._get_number_genes()
self._reset_counters() self._reset_counters()
# Skip some insertion if specified in script options # Skip some insertion if specified in script options
self.skip_tax = skip_tax self.skip_tax = skip_tax
self.skip_functions = skip_functions self.skip_functions = skip_functions
def _reset_counters(self):
self.processed_genes = 0
self.created_genes = 0
self.updated_genes = 0
self.skipped_genes = 0
def _open_api_endpoints(self, jwt_token):
self.metagenedb_gene_api = self.METAGENEDB_GENE_API(base_url=self.url, jwt_token=jwt_token)
self.metagenedb_taxonomy_api = self.METAGENEDB_TAXONOMY_API(base_url=self.url, jwt_token=jwt_token)
self.metagenedb_function_api = self.METAGENEDB_FUNCTION_API(base_url=self.url, jwt_token=jwt_token)
self.metagenedb_kegg_api = self.METAGENEDB_KEGG_API(base_url=self.url, jwt_token=jwt_token)
self.metagenedb_eggnog_api = self.METAGENEDB_EGGNOG_API(base_url=self.url, jwt_token=jwt_token)
def _build_taxo_mapping(self, rank, page_size=1000): def _build_taxo_mapping(self, rank, page_size=1000):
logger.info("Building local mapping for %s level...", rank) logger.info("Building local mapping for %s level...", rank)
counter = 1 counter = 1
...@@ -63,8 +74,8 @@ class ImportIGCGenes(object): ...@@ -63,8 +74,8 @@ class ImportIGCGenes(object):
counter += 1 counter += 1
return mapping return mapping
def build_function_catalog(self, page_size=1000): def _retrieve_function_catalog(self, api, page_size=1000):
logger.info("Building local function catalog...") logger.info("Building local catalog from %s...", api.ROUTE)
counter = 1 counter = 1
next_page = None next_page = None
functions = set() functions = set()
...@@ -73,24 +84,22 @@ class ImportIGCGenes(object): ...@@ -73,24 +84,22 @@ class ImportIGCGenes(object):
'page': counter, 'page': counter,
'page_size': page_size, 'page_size': page_size,
} }
current_page = self.metagenedb_function_api.get_all(params=params) current_page = api.get_all(params=params)
next_page = current_page['next'] next_page = current_page['next']
functions = functions.union(set( functions = functions.union(set(
[item['function_id'] for item in current_page['results']] [item['function_id'] for item in current_page['results']]
)) ))
counter += 1 counter += 1
self.metagenedb_functions = functions return functions
def build_function_mappings(self, page_size=1000):
self.metagenedb_keggs = self._retrieve_function_catalog(self.metagenedb_kegg_api, page_size=page_size)
self.metagenedb_eggnogs = self._retrieve_function_catalog(self.metagenedb_eggnog_api, page_size=page_size)
def build_mapping(self, page_size=1000): def build_mapping(self, page_size=1000):
self.phylum_mapping = self._build_taxo_mapping("phylum", page_size=page_size) self.phylum_mapping = self._build_taxo_mapping("phylum", page_size=page_size)
self.genus_mapping = self._build_taxo_mapping("genus", page_size=page_size) self.genus_mapping = self._build_taxo_mapping("genus", page_size=page_size)
def _reset_counters(self):
self.processed_genes = 0
self.created_genes = 0
self.updated_genes = 0
self.skipped_genes = 0
def _get_number_genes(self): def _get_number_genes(self):
if not os.path.isfile(self.annotation_file): if not os.path.isfile(self.annotation_file):
return 0 return 0
...@@ -133,18 +142,21 @@ class ImportIGCGenes(object): ...@@ -133,18 +142,21 @@ class ImportIGCGenes(object):
def _clean_functions(self, functions): def _clean_functions(self, functions):
clean_functions = [] clean_functions = []
for function in functions: for function in functions:
if function in self.metagenedb_functions: if function['function_id'] in getattr(self, f"metagenedb_{function['source']}s"):
clean_functions.append(function) clean_functions.append(function['function_id'])
elif function != 'unknown': else:
logger.warning("Function %s not found in metagenedb", function) logger.warning("Function %s not found from %s in metagenedb",
function['function_id'], function['source'])
return clean_functions return clean_functions
def _clean_gene(self, gene_dict): def _clean_gene(self, gene_dict):
gene_dict['gene_name'] = gene_dict['gene_id'] gene_dict['gene_name'] = gene_dict['gene_id']
gene_dict['gene_id'] = slugify(gene_dict['gene_id']) gene_dict['gene_id'] = slugify(gene_dict['gene_id'])
gene_dict['functions'] = gene_dict.pop('kegg_ko') gene_dict['functions'] = [
{'source': 'kegg', 'function_id': v} for v in gene_dict.pop('kegg_ko') if v != 'unknown'] + \
[{'source': 'eggnog', 'function_id': v} for v in gene_dict.pop('eggnog') if v != 'unknown']
gene_dict = self._select_taxonomy(gene_dict) gene_dict = self._select_taxonomy(gene_dict)
if self.skip_functions or 'unknown' in gene_dict['functions']: if self.skip_functions or not gene_dict['functions']:
gene_dict.pop('functions') gene_dict.pop('functions')
else: else:
gene_dict['functions'] = self._clean_functions(gene_dict['functions']) gene_dict['functions'] = self._clean_functions(gene_dict['functions'])
...@@ -154,7 +166,7 @@ class ImportIGCGenes(object): ...@@ -154,7 +166,7 @@ class ImportIGCGenes(object):
if not self.skip_tax: if not self.skip_tax:
self.build_mapping() self.build_mapping()
if not self.skip_functions: if not self.skip_functions:
self.build_function_catalog() self.build_function_mappings()
with open(self.annotation_file, 'r') as file: with open(self.annotation_file, 'r') as file:
while True: while True:
chunk_genes = list(islice(file, chunk_size)) chunk_genes = list(islice(file, chunk_size))
......
...@@ -2,8 +2,16 @@ from unittest import TestCase ...@@ -2,8 +2,16 @@ from unittest import TestCase
from rest_framework.test import APITestCase from rest_framework.test import APITestCase
from metagenedb.common.utils.mocks.metagenedb import MetageneDBCatalogTaxonomyAPIMock, MetageneDBCatalogFunctionAPIMock from metagenedb.common.utils.mocks.metagenedb import (
from metagenedb.apps.catalog.factory import TaxonomyFactory, FunctionFactory MetageneDBCatalogTaxonomyAPIMock,
MetageneDBCatalogEggNogAPIMock,
MetageneDBCatalogKeggOrthologyAPIMock
)
from metagenedb.apps.catalog.factory import (
TaxonomyFactory,
KeggOrthologyFactory,
EggNogFactory
)
from scripts.populate_db.import_igc_data import ImportIGCGenes from scripts.populate_db.import_igc_data import ImportIGCGenes
...@@ -37,6 +45,7 @@ class TestParseGene(TestCase): ...@@ -37,6 +45,7 @@ class TestParseGene(TestCase):
'gene_id': 'gene_name', 'gene_id': 'gene_name',
'length': 'length', 'length': 'length',
'kegg_ko': ['kegg'], 'kegg_ko': ['kegg'],
'eggnog': ['eggnog'],
'taxo_phylum': 'taxo_phylum', 'taxo_phylum': 'taxo_phylum',
'taxo_genus': 'taxo_genus', 'taxo_genus': 'taxo_genus',
} }
...@@ -77,7 +86,8 @@ class TestCleanGene(TestCase): ...@@ -77,7 +86,8 @@ class TestCleanGene(TestCase):
self.gene_dict = { self.gene_dict = {
'gene_id': 'gene.01', 'gene_id': 'gene.01',
'length': 135, 'length': 135,
'kegg_ko': ['K00001'] 'kegg_ko': ['K00001'],
'eggnog': ['COG1']
} }
def test_clean_gene(self): def test_clean_gene(self):
...@@ -85,7 +95,10 @@ class TestCleanGene(TestCase): ...@@ -85,7 +95,10 @@ class TestCleanGene(TestCase):
'gene_id': 'gene-01', 'gene_id': 'gene-01',
'gene_name': 'gene.01', 'gene_name': 'gene.01',
'length': 135, 'length': 135,
'functions': ['K00001'] 'functions': [
{'source': 'kegg', 'function_id': 'K00001'},
{'source': 'eggnog', 'function_id': 'COG1'}
]
} }
test_gene_dict = self.import_igc_genes._clean_gene(self.gene_dict) test_gene_dict = self.import_igc_genes._clean_gene(self.gene_dict)
self.assertDictEqual(test_gene_dict, expected_gene_dict) self.assertDictEqual(test_gene_dict, expected_gene_dict)
...@@ -104,16 +117,57 @@ class TestCleanGene(TestCase): ...@@ -104,16 +117,57 @@ class TestCleanGene(TestCase):
gene_dict = { gene_dict = {
'gene_id': 'gene.01', 'gene_id': 'gene.01',
'length': 135, 'length': 135,
'kegg_ko': 'unknown' 'kegg_ko': ['unknown'],
'eggnog': ['COG1']
} }
expected_gene_dict = { expected_gene_dict = {
'gene_id': 'gene-01', 'gene_id': 'gene-01',
'gene_name': 'gene.01', 'gene_name': 'gene.01',
'functions': [{'function_id': 'COG1', 'source': 'eggnog'}],
'length': 135 'length': 135
} }
test_gene_dict = self.import_igc_genes._clean_gene(gene_dict) test_gene_dict = self.import_igc_genes._clean_gene(gene_dict)
self.assertDictEqual(test_gene_dict, expected_gene_dict) self.assertDictEqual(test_gene_dict, expected_gene_dict)
def test_unknow_kegg_and_eggnog(self):
gene_dict = {
'gene_id': 'gene.01',
'length': 135,
'kegg_ko': ['unknown'],
'eggnog': ['unknown']
}
expected_gene_dict = {
'gene_id': 'gene-01',
'gene_name': 'gene.01',
'length': 135
}
test_gene_dict = self.import_igc_genes._clean_gene(gene_dict)
self.assertDictEqual(test_gene_dict, expected_gene_dict)
class TestCleanFunctions(TestCase):
def setUp(self):
self.import_igc_genes = ImportIGCGenes('test', 'test_url', 'test_token')
self.import_igc_genes.metagenedb_eggnogs = set(['COG1', 'COG2'])
self.import_igc_genes.metagenedb_keggs = set(['K00001', 'K00002'])
def test_clean_functions(self):
functions = [
{'function_id': 'K00001', 'source': 'kegg'},
{'function_id': 'COG1', 'source': 'eggnog'}
]
expected_list = ['K00001', 'COG1']
self.assertListEqual(self.import_igc_genes._clean_functions(functions), expected_list)
def test_clean_functions_unknown_kegg(self):
functions = [
{'function_id': 'K00301', 'source': 'kegg'},
{'function_id': 'COG1', 'source': 'eggnog'}
]
expected_list = ['COG1']
self.assertListEqual(self.import_igc_genes._clean_functions(functions), expected_list)
class TestSelectTaxonomy(TestCase): class TestSelectTaxonomy(TestCase):
...@@ -247,16 +301,23 @@ class TestBuildBuildFunctionCatalog(APITestCase): ...@@ -247,16 +301,23 @@ class TestBuildBuildFunctionCatalog(APITestCase):
@classmethod @classmethod
def setUpTestData(cls): def setUpTestData(cls):
cls.functions = FunctionFactory.create_batch(100) cls.keggs = KeggOrthologyFactory.create_batch(100)
cls.eggnogs = EggNogFactory.create_batch(100)
def setUp(self): def setUp(self):
self.import_igc_genes = ImportIGCGenes('test', 'test_url', 'test_token') self.import_igc_genes = ImportIGCGenes('test', 'test_url', 'test_token')
self.api_mock = MetageneDBCatalogFunctionAPIMock(self.client) self.kegg_api_mock = MetageneDBCatalogKeggOrthologyAPIMock(self.client)
self.import_igc_genes.metagenedb_function_api = self.api_mock self.eggnog_api_mock = MetageneDBCatalogEggNogAPIMock(self.client)
self.import_igc_genes.metagenedb_kegg_api = self.kegg_api_mock
self.import_igc_genes.metagenedb_eggnog_api = self.eggnog_api_mock
def test_build_catalog(self): def test_build_catalog(self):
expected_catalog = set( expected_kegg_catalog = set(
[function.function_id for function in self.functions] [function.function_id for function in self.keggs]
)
expected_eggnog_catalog = set(
[function.function_id for function in self.eggnogs]
) )
self.import_igc_genes.build_function_catalog(page_size=100) self.import_igc_genes.build_function_mappings(page_size=100)
self.assertSetEqual(self.import_igc_genes.metagenedb_functions, expected_catalog) self.assertSetEqual(self.import_igc_genes.metagenedb_keggs, expected_kegg_catalog)
self.assertSetEqual(self.import_igc_genes.metagenedb_eggnogs, expected_eggnog_catalog)
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment