diff --git a/backend/scripts/populate_db/import_igc_data.py b/backend/scripts/populate_db/import_igc_data.py index 67e9202e3a86473f6ab78469df838be6a70d4370..5902f6593cf93562712b3518808b8aff18adfe91 100755 --- a/backend/scripts/populate_db/import_igc_data.py +++ b/backend/scripts/populate_db/import_igc_data.py @@ -27,6 +27,8 @@ class ImportIGCGenes(object): self.url = url self.metagenedb_gene_api = self.METAGENEDB_GENE_API(base_url=self.url) self.metagenedb_taxonomy_api = self.METAGENEDB_TAXONOMY_API(base_url=self.url) + self.processed_genes = 0 + self.skipped_genes = 0 # Skip some insertion if specified in script options self.skip_tax = skip_tax self.skip_functions = skip_functions @@ -88,19 +90,20 @@ class ImportIGCGenes(object): try: self._upsert_gene(gene_dict_with_taxo) except HTTPError as e: + self.skipped_genes += 1 _LOGGER.warning(f"{e.response.json()} for gene_id: {gene_dict.get('gene_id')}. Insertion skipped.") def load_annotation_file_to_db_in_chunks(self, chunk_size=100000): - processed_genes = 0 with open(self.annotation_file, 'r') as file: while True: chunk_genes = list(islice(file, chunk_size)) if not chunk_genes: break - processed_genes += len(chunk_genes) + self.processed_genes += len(chunk_genes) self._insert_gene_list(chunk_genes) - _LOGGER.info(f"{processed_genes} genes processed so far...") - _LOGGER.info(f"[DONE] {processed_genes} genes processed.") + _LOGGER.info(f"{self.processed_genes} genes inserted/updated so far...") + _LOGGER.info(f"[DONE] {self.processed_genes} genes inserted/updated.") + _LOGGER.info(f"[DONE] {self.skipped_genes} genes skipped.") def parse_arguments(): diff --git a/backend/scripts/populate_db/load_kegg_ko.py b/backend/scripts/populate_db/load_kegg_ko.py index 7ec4a74ef07a485a2c6ab9ea3313c6751eac9970..eae93c0cca9f650e45af43dc7fc400ed02c3083e 100755 --- a/backend/scripts/populate_db/load_kegg_ko.py +++ b/backend/scripts/populate_db/load_kegg_ko.py @@ -4,10 +4,12 @@ import logging import os import requests import sys +from requests.exceptions import HTTPError import django from django.core.exceptions import ValidationError +from metagenedb.common.utils.api import MetageneDBCatalogFunctionAPI from metagenedb.common.utils.parsers import KEGGLineParser # Before model import, we need to called django.setup() to Load apps @@ -27,44 +29,51 @@ def parse_arguments(): Defines parser. """ parser = argparse.ArgumentParser(description=f'Populate KEGG KO database from {KEGG_KO_LIST_API}.') + parser.add_argument('--url', help='base URL of the instance.', default='http://localhost/') try: return parser.parse_args() except SystemExit: sys.exit(1) -def create_kegg_ko(kegg_ko): - try: - obj_kegg = KeggOrthology.objects.get(function_id=kegg_ko.get('function_id')) - for key, value in kegg_ko.items(): - setattr(obj_kegg, key, value) - except KeggOrthology.DoesNotExist: - obj_kegg = KeggOrthology(**kegg_ko) - obj_kegg.full_clean() - obj_kegg.save() +class ImportKEGGKO(object): + METAGENEDB_FUNCTION_API = MetageneDBCatalogFunctionAPI + def __init__(self, url, kegg_ko_list_api=KEGG_KO_LIST_API): + self.kegg_ko_list_api = kegg_ko_list_api + self.metagenedb_function_api = self.METAGENEDB_FUNCTION_API(base_url=url) + self.inserted_kegg = 0 + self.skipped_kegg = 0 -def run(): - args = parse_arguments() # noqa - all_ko = requests.get("http://rest.kegg.jp/list/ko") - all_ko.raise_for_status() - inserted_kegg = 0 - skipped_kegg = 0 - total_kegg = len(all_ko.text.splitlines()) - for line in all_ko.text.splitlines(): - kegg_ko = KEGGLineParser.ko_list(line) + def _upsert_kegg_ko(self, kegg_ko): try: - create_kegg_ko(kegg_ko) - inserted_kegg += 1 - except ValidationError as e: - skipped_kegg += 1 - _LOGGER.warning(f"{e.__dict__} for function_id: {kegg_ko.get('function_id')}. Insertion skipped.") - if inserted_kegg > 0 and inserted_kegg % 100 == 0: - _LOGGER.info(f"{inserted_kegg}/{total_kegg} KEGG KO inserted so far...") - _LOGGER.info(f"[DONE] {inserted_kegg}/{total_kegg} KEGG KO inserted.") - _LOGGER.info(f"[DONE] {skipped_kegg}/{total_kegg} KEGG KO skipped.") - - # Create unknown entry + self.metagenedb_function_api.get(kegg_ko.get('function_id')) # Try to get obj to check if it exists + self.metagenedb_function_api.put(kegg_ko.get('function_id'), kegg_ko) + except HTTPError: + self.metagenedb_function_api.post(kegg_ko) + + def load_all_kegg_ko(self): + all_ko = requests.get(self.kegg_ko_list_api) + all_ko.raise_for_status() + self.total_kegg_nb = len(all_ko.text.splitlines()) + for line in all_ko.text.splitlines(): + kegg_ko = KEGGLineParser.ko_list(line) + try: + self._upsert_kegg_ko(kegg_ko) + self.inserted_kegg += 1 + except ValidationError as e: + self.skipped_kegg += 1 + _LOGGER.warning(f"{e.__dict__} for function_id: {kegg_ko.get('function_id')}. Insertion skipped.") + if self.inserted_kegg > 0 and self.inserted_kegg % 100 == 0: + _LOGGER.info(f"{self.inserted_kegg}/{self.total_kegg_nb} KEGG KO inserted so far...") + _LOGGER.info(f"[DONE] {self.inserted_kegg}/{self.total_kegg_nb} KEGG KO inserted.") + _LOGGER.info(f"[DONE] {self.skipped_kegg}/{self.total_kegg_nb} KEGG KO skipped.") + + +def run(): + args = parse_arguments() + import_kegg_ko = ImportKEGGKO(args.url) + import_kegg_ko.load_all_kegg_ko() if __name__ == "__main__":