Commit 3c3073ca authored by Kenzo-Hugo Hillion
Browse files

update script to import IGC genes using API

parent 9b1503a2
Pipeline #14503 passed with stages
in 2 minutes and 55 seconds
#!/usr/bin/env python
import argparse
import logging
import os
import sys
from itertools import islice
from requests.exceptions import HTTPError
from bioapi import MetageneDBCatalogGeneAPI, MetageneDBCatalogTaxonomyAPI
from slugify import slugify
from metagenedb.common.utils.parsers import IGCLineParser
_LOGGER = logging.getLogger(__name__)
logging.basicConfig()
logger = logging.getLogger()
class ImportIGCGenes(object):
......@@ -26,42 +27,58 @@ class ImportIGCGenes(object):
self.url = url
self.metagenedb_gene_api = self.METAGENEDB_GENE_API(base_url=self.url)
self.metagenedb_taxonomy_api = self.METAGENEDB_TAXONOMY_API(base_url=self.url)
self.processed_genes = 0
self.skipped_genes = 0
self.total_genes = self._get_number_genes()
self._reset_counters()
# Skip some insertion if specified in script options
self.skip_tax = skip_tax
self.skip_functions = skip_functions
def _parse_gene(self, raw_line, selected_keys=SELECTED_KEYS):
    """
    Parse one raw annotation line with IGCLineParser and keep selected fields.

    :param raw_line: a single raw line describing a gene
    :param selected_keys: keys to retain from the parsed gene dictionary
    :return: dict holding only the parsed entries whose key is in
        ``selected_keys``
    """
    parsed = IGCLineParser().gene(raw_line)
    kept = {}
    for key, value in parsed.items():
        if key in selected_keys:
            kept[key] = value
    return kept
def _reset_counters(self):
self.processed_genes = 0
self.created_genes = 0
self.updated_genes = 0
self.skipped_genes = 0
def _select_taxonomy(self, taxonomy_dict, unknown_val='unknown'):
def _get_number_genes(self):
if not os.path.isfile(self.annotation_file):
return 0
with open(self.annotation_file) as f:
for i, l in enumerate(f):
pass
return i + 1
def _select_taxonomy(self, gene_dict, unknown_val='unknown'):
"""
Select the taxonomy to be assigned for the gene.
genus has priority over phylum. If both are unknown, remove the taxonomy key
"""
phylum = taxonomy_dict.pop(self.PHYLUM_COL)
genus = taxonomy_dict.pop(self.GENUS_COL)
phylum = gene_dict.pop(self.PHYLUM_COL)
genus = gene_dict.pop(self.GENUS_COL)
resp_dict = {}
if genus != unknown_val:
resp_dict = self.metagenedb_taxonomy_api.get_all(params={'name': genus, 'rank': 'genus'})
if len(resp_dict['results']) > 1:
_LOGGER.warning(f"More than 1 result found for genus {genus}. First result is kept.")
logger.warning(f"More than 1 result found for genus {genus}. First result is kept.")
elif phylum != unknown_val:
resp_dict = self.metagenedb_taxonomy_api.get_all(params={'name': phylum, 'rank': 'phylum'})
if len(resp_dict['results']) > 1:
_LOGGER.warning(f"More than 1 result found for phylum {phylum}. First result is kept.")
if resp_dict:
taxonomy_dict.update(
logger.warning(f"More than 1 result found for phylum {phylum}. First result is kept.")
if resp_dict.get('count', 0) > 0:
gene_dict.update(
{'taxonomy': resp_dict['results'][0]['tax_id']}
)
return taxonomy_dict
else:
gene_dict.update({'taxonomy': None})
return gene_dict
def _parse_gene(self, raw_line, selected_keys=SELECTED_KEYS):
    """
    Parse one raw annotation line with IGCLineParser and keep selected fields.

    :param raw_line: a single raw line describing a gene
    :param selected_keys: keys to retain from the parsed gene dictionary
    :return: dict holding only the parsed entries whose key is in
        ``selected_keys``
    """
    parsed = IGCLineParser().gene(raw_line)
    kept = {}
    for key, value in parsed.items():
        if key in selected_keys:
            kept[key] = value
    return kept
def _clean_gene(self, gene_dict):
gene_dict['gene_name'] = gene_dict['gene_id']
......@@ -73,36 +90,22 @@ class ImportIGCGenes(object):
gene_dict.pop('functions')
return gene_dict
def _upsert_gene(self, gene_dict):
clean_gene_dict = self._clean_gene(gene_dict)
try:
gene_id = clean_gene_dict['gene_id']
self.metagenedb_gene_api.get(gene_id) # Try to get obj to check if it exists
self.metagenedb_gene_api.put(clean_gene_dict, entry_id=gene_id)
except HTTPError:
self.metagenedb_gene_api.post(clean_gene_dict)
def _insert_gene_list(self, chunk_genes):
for gene_line in chunk_genes:
gene_dict = self._parse_gene(gene_line)
gene_dict_with_taxo = self._select_taxonomy(gene_dict)
try:
self._upsert_gene(gene_dict_with_taxo)
except HTTPError as e:
self.skipped_genes += 1
_LOGGER.warning(f"{e.response.json()} for gene_id: {gene_dict.get('gene_id')}. Insertion skipped.")
def load_annotation_file_to_db_in_chunks(self, chunk_size=100000):
def load_annotation_file_to_db_in_chunks(self, chunk_size=1000):
with open(self.annotation_file, 'r') as file:
while True:
chunk_genes = list(islice(file, chunk_size))
if not chunk_genes:
break
genes = [self._clean_gene(self._select_taxonomy(self._parse_gene(i))) for i in chunk_genes]
response = self.metagenedb_gene_api.put(genes)
self.created_genes += response.get('created').get('count')
self.updated_genes += response.get('updated').get('count')
self.processed_genes += len(chunk_genes)
self._insert_gene_list(chunk_genes)
_LOGGER.info(f"{self.processed_genes} genes inserted/updated so far...")
_LOGGER.info(f"[DONE] {self.processed_genes} genes inserted/updated.")
_LOGGER.info(f"[DONE] {self.skipped_genes} genes skipped.")
logger.info("%s Genes processed so far...", self.processed_genes)
break
logger.info("[DONE] %s/%s Genes created.", self.created_genes, self.total_genes)
logger.info("[DONE] %s/%s Genes updated.", self.updated_genes, self.total_genes)
logger.info("[DONE] %s/%s Genes skipped.", self.skipped_genes, self.total_genes)
def parse_arguments():
......@@ -115,6 +118,7 @@ def parse_arguments():
parser.add_argument('--url', help='base URL of the instance.', default='http://localhost/')
parser.add_argument('--skip_taxonomy', action='store_true', help='Skip taxonomy information from genes.')
parser.add_argument('--skip_functions', action='store_true', help='Skip functions information from genes.')
parser.add_argument('-v', '--verbose', action='store_true')
try:
return parser.parse_args()
......@@ -124,6 +128,8 @@ def parse_arguments():
def run():
args = parse_arguments()
if args.verbose:
logger.setLevel(logging.INFO)
import_igc_genes = ImportIGCGenes(args.annotation, args.url,
skip_tax=args.skip_taxonomy, skip_functions=args.skip_functions)
import_igc_genes.load_annotation_file_to_db_in_chunks()
......
from requests.exceptions import HTTPError
from unittest import TestCase
from rest_framework.test import APITestCase
from metagenedb.common.utils.mocks.metagenedb import (MetageneDBCatalogGeneAPIMock,
MetageneDBCatalogTaxonomyAPIMock)
from metagenedb.common.utils.mocks.metagenedb import MetageneDBCatalogTaxonomyAPIMock
from metagenedb.apps.catalog.factory import TaxonomyFactory
from scripts.populate_db.import_igc_data import ImportIGCGenes
......@@ -70,52 +68,6 @@ class TestParseGene(TestCase):
self.assertDictEqual(tested_dict, expected_dict)
class TestUpsertGene(APITestCase):
    """Exercise ImportIGCGenes._upsert_gene against the mocked gene API."""

    @staticmethod
    def _build_gene(length):
        """Return the reference gene payload carrying the given ``length``."""
        return {
            'gene_name': 'test_gene.01',
            'gene_id': 'test-gene-01',
            'length': length,
            'kegg_ko': 'K00001'
        }

    def setUp(self):
        # Swap the importer's gene API for a mock driven by the test client.
        self.import_igc_genes = ImportIGCGenes('test', 'test', skip_functions=True)
        self.api_mock = MetageneDBCatalogGeneAPIMock(self.client)
        self.import_igc_genes.metagenedb_gene_api = self.api_mock

    def test_insert_valid_gene_no_kegg(self):
        self.import_igc_genes._upsert_gene(self._build_gene(3556))
        listing = self.api_mock.get_all()
        self.assertEqual(listing['count'], 1)

    def test_insert_invalid_length(self):
        # A non-numeric length is expected to surface as an HTTPError.
        with self.assertRaises(HTTPError):
            self.import_igc_genes._upsert_gene(self._build_gene('wrong_format'))

    def test_update_gene(self):
        # Upserting the same gene_id twice must overwrite the stored length.
        for length in (3556, 356):
            self.import_igc_genes._upsert_gene(self._build_gene(length))
            self.assertEqual(self.api_mock.get('test-gene-01')['length'], length)
class TestCleanGene(TestCase):
def setUp(self):
......@@ -227,7 +179,8 @@ class TestSelectTaxonomy(APITestCase):
}
expected_dict = {
'gene_id': 'gene',
'length': 135
'length': 135,
'taxonomy': None
}
tested_dict = self.import_igc_genes._select_taxonomy(gene_dict)
self.assertDictEqual(tested_dict, expected_dict)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment