Commit 9ff69a05 authored by Kenzo-Hugo Hillion
Browse files

update import scripts

parent 05fddb8e
......@@ -27,6 +27,8 @@ class ImportIGCGenes(object):
self.url = url
self.metagenedb_gene_api = self.METAGENEDB_GENE_API(base_url=self.url)
self.metagenedb_taxonomy_api = self.METAGENEDB_TAXONOMY_API(base_url=self.url)
self.processed_genes = 0
self.skipped_genes = 0
# Skip some insertions if specified in script options
self.skip_tax = skip_tax
self.skip_functions = skip_functions
......@@ -88,19 +90,20 @@ class ImportIGCGenes(object):
try:
self._upsert_gene(gene_dict_with_taxo)
except HTTPError as e:
self.skipped_genes += 1
_LOGGER.warning(f"{e.response.json()} for gene_id: {gene_dict.get('gene_id')}. Insertion skipped.")
def load_annotation_file_to_db_in_chunks(self, chunk_size=100000):
processed_genes = 0
with open(self.annotation_file, 'r') as file:
while True:
chunk_genes = list(islice(file, chunk_size))
if not chunk_genes:
break
processed_genes += len(chunk_genes)
self.processed_genes += len(chunk_genes)
self._insert_gene_list(chunk_genes)
_LOGGER.info(f"{processed_genes} genes processed so far...")
_LOGGER.info(f"[DONE] {processed_genes} genes processed.")
_LOGGER.info(f"{self.processed_genes} genes inserted/updated so far...")
_LOGGER.info(f"[DONE] {self.processed_genes} genes inserted/updated.")
_LOGGER.info(f"[DONE] {self.skipped_genes} genes skipped.")
def parse_arguments():
......
......@@ -4,10 +4,12 @@ import logging
import os
import requests
import sys
from requests.exceptions import HTTPError
import django
from django.core.exceptions import ValidationError
from metagenedb.common.utils.api import MetageneDBCatalogFunctionAPI
from metagenedb.common.utils.parsers import KEGGLineParser
# Before importing models, we need to call django.setup() to load apps
......@@ -27,44 +29,51 @@ def parse_arguments():
Defines parser.
"""
parser = argparse.ArgumentParser(description=f'Populate KEGG KO database from {KEGG_KO_LIST_API}.')
parser.add_argument('--url', help='base URL of the instance.', default='http://localhost/')
try:
return parser.parse_args()
except SystemExit:
sys.exit(1)
def create_kegg_ko(kegg_ko):
try:
obj_kegg = KeggOrthology.objects.get(function_id=kegg_ko.get('function_id'))
for key, value in kegg_ko.items():
setattr(obj_kegg, key, value)
except KeggOrthology.DoesNotExist:
obj_kegg = KeggOrthology(**kegg_ko)
obj_kegg.full_clean()
obj_kegg.save()
class ImportKEGGKO(object):
METAGENEDB_FUNCTION_API = MetageneDBCatalogFunctionAPI
def __init__(self, url, kegg_ko_list_api=KEGG_KO_LIST_API):
self.kegg_ko_list_api = kegg_ko_list_api
self.metagenedb_function_api = self.METAGENEDB_FUNCTION_API(base_url=url)
self.inserted_kegg = 0
self.skipped_kegg = 0
def run():
args = parse_arguments() # noqa
all_ko = requests.get("http://rest.kegg.jp/list/ko")
all_ko.raise_for_status()
inserted_kegg = 0
skipped_kegg = 0
total_kegg = len(all_ko.text.splitlines())
for line in all_ko.text.splitlines():
kegg_ko = KEGGLineParser.ko_list(line)
def _upsert_kegg_ko(self, kegg_ko):
try:
create_kegg_ko(kegg_ko)
inserted_kegg += 1
except ValidationError as e:
skipped_kegg += 1
_LOGGER.warning(f"{e.__dict__} for function_id: {kegg_ko.get('function_id')}. Insertion skipped.")
if inserted_kegg > 0 and inserted_kegg % 100 == 0:
_LOGGER.info(f"{inserted_kegg}/{total_kegg} KEGG KO inserted so far...")
_LOGGER.info(f"[DONE] {inserted_kegg}/{total_kegg} KEGG KO inserted.")
_LOGGER.info(f"[DONE] {skipped_kegg}/{total_kegg} KEGG KO skipped.")
# Create unknown entry
self.metagenedb_function_api.get(kegg_ko.get('function_id')) # Try to get obj to check if it exists
self.metagenedb_function_api.put(kegg_ko.get('function_id'), kegg_ko)
except HTTPError:
self.metagenedb_function_api.post(kegg_ko)
def load_all_kegg_ko(self):
all_ko = requests.get(self.kegg_ko_list_api)
all_ko.raise_for_status()
self.total_kegg_nb = len(all_ko.text.splitlines())
for line in all_ko.text.splitlines():
kegg_ko = KEGGLineParser.ko_list(line)
try:
self._upsert_kegg_ko(kegg_ko)
self.inserted_kegg += 1
except ValidationError as e:
self.skipped_kegg += 1
_LOGGER.warning(f"{e.__dict__} for function_id: {kegg_ko.get('function_id')}. Insertion skipped.")
if self.inserted_kegg > 0 and self.inserted_kegg % 100 == 0:
_LOGGER.info(f"{self.inserted_kegg}/{self.total_kegg_nb} KEGG KO inserted so far...")
_LOGGER.info(f"[DONE] {self.inserted_kegg}/{self.total_kegg_nb} KEGG KO inserted.")
_LOGGER.info(f"[DONE] {self.skipped_kegg}/{self.total_kegg_nb} KEGG KO skipped.")
def run():
    """Script entry point: import the KEGG KO list into the target instance.

    Parses the command-line options, builds an ``ImportKEGGKO`` importer
    from the ``--url`` option and triggers the full load.
    """
    cli_options = parse_arguments()
    importer = ImportKEGGKO(cli_options.url)
    importer.load_all_kegg_ko()
if __name__ == "__main__":
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment