class KEGGLineParser(object):
    """Parsers for individual lines of KEGG REST API listings."""

    @staticmethod
    def ko_list(line):
        """
        Parse line from kegg KO list (http://rest.kegg.jp/list/ko) to return organized dict

        Expected layout (fields separated by a tab):

            ko:<function_id> <TAB> <name>; <long name> [EC:<ec numbers>]

        :param line: one line of the KO list, with or without its line terminator
        :return: dict with keys 'function_id', 'name', 'long_name' and 'ec_number'
                 ('ec_number' is '' when the line carries no [EC:...] annotation)
        :raises Exception: re-raises whatever went wrong while parsing (typically
                 IndexError for a line without tab or without 'ko:' prefix), after
                 logging the offending line
        """
        try:
            # Remove only the line terminator: a trailing newline would otherwise
            # prevent rstrip(']') below from removing the closing bracket.
            elements = line.rstrip('\r\n').split('\t')
            function_id = elements[0].split(':')[1]
            # Split on the FIRST ';' only so a long name that itself contains ';'
            # is kept whole instead of being truncated.
            name, separator, long_name = elements[1].partition(';')
            if not separator:
                # Handles one specific known case with no name: K23479.
                # partition() already left name == elements[1] and long_name == ''.
                _LOGGER.warning(f"Parsing issue with {function_id}, corresponding line: {line}")
            if '[EC:' in long_name:
                ec_number = long_name.split('[EC:')[1].rstrip(']')
            else:
                ec_number = ''
            return {
                'function_id': function_id,
                'name': name,
                'long_name': long_name.lstrip(),
                'ec_number': ec_number
            }
        except Exception:
            # `except Exception` (not a bare except) so KeyboardInterrupt/SystemExit
            # are not reported as parsing failures.
            _LOGGER.error(f"Could not parse: {line.rstrip()}. Are you sure it comes from KEGG KO list?")
            raise
class NCBITaxonomyLineParser(object):
    """Parsers for individual lines of the NCBI taxonomy dump files (nodes.dmp, names.dmp)."""

    # Column names of nodes.dmp, in file order.
    _NODE_FIELDS = (
        "tax_id",
        "parent_tax_id",
        "rank",
        "embl_code",
        "division_id",
        "inherited_div_flag",
        "genetic_code_id",
        "inherited_GC_flag",
        "mitochondrial_genetic_code_id",
        "inherited_MGC_flag",
        "GenBank_hidden_flag",
        "hidden_subtree_root_flag",
        "comments",
    )

    # Column names of names.dmp, in file order.
    _NAME_FIELDS = (
        "tax_id",
        "name_txt",
        "unique_name",
        "name_class",
    )

    @staticmethod
    def _parse(line, fields, source_file):
        """
        Split a '|'-separated dump line and map its leading columns onto `fields`.

        :param line: raw line from the dump file
        :param fields: ordered column names expected at the start of the line
        :param source_file: name of the dump file, only used in the error message
        :return: dict mapping each field name to its whitespace-stripped value
        :raises IndexError: if the line holds fewer columns than `fields`
        """
        elements = line.rstrip().split('|')
        try:
            # Index explicitly (rather than zip) so a line that is too short
            # raises instead of silently yielding a partial dict.
            return {field: elements[index].strip() for index, field in enumerate(fields)}
        except Exception:
            # `except Exception` (not a bare except) so KeyboardInterrupt/SystemExit
            # are not reported as parsing failures.
            _LOGGER.error(f"Could not parse: {line.rstrip()}. Are you sure it comes from {source_file} file?")
            raise

    @staticmethod
    def node(line):
        """
        parse line from ncbi nodes.dmp file

        From documentation:

        nodes.dmp file consists of taxonomy nodes.
        The description for each node includes the following fields:

            tax_id                                 -- node id in GenBank taxonomy database
            parent tax_id                          -- parent node id in GenBank taxonomy database
            rank                                   -- rank of this node (superkingdom, kingdom, ...)
            embl code                              -- locus-name prefix; not unique
            division id                            -- see division.dmp file
            inherited div flag  (1 or 0)           -- 1 if node inherits division from parent
            genetic code id                        -- see gencode.dmp file
            inherited GC  flag  (1 or 0)           -- 1 if node inherits genetic code from parent
            mitochondrial genetic code id          -- see gencode.dmp file
            inherited MGC flag  (1 or 0)           -- 1 if node inherits mitochondrial gencode from parent
            GenBank hidden flag (1 or 0)           -- 1 if name is suppressed in GenBank entry lineage
            hidden subtree root flag (1 or 0)      -- 1 if this subtree has no sequence data yet
            comments                               -- free-text comments and citations

        :param line: one line of nodes.dmp
        :return: dict with one whitespace-stripped string value per field listed above
        :raises IndexError: if the line does not contain the 13 expected columns
        """
        return NCBITaxonomyLineParser._parse(line, NCBITaxonomyLineParser._NODE_FIELDS, "nodes.dmp")

    @staticmethod
    def name(line):
        """
        parse line from ncbi names.dmp file

        From documentation:

        Taxonomy names file (names.dmp):
            tax_id                                 -- the id of node associated with this name
            name_txt                               -- name itself
            unique name                            -- the unique variant of this name if name not unique
            name class                             -- (synonym, common name, ...)

        :param line: one line of names.dmp
        :return: dict with keys 'tax_id', 'name_txt', 'unique_name' and 'name_class'
        :raises IndexError: if the line does not contain the 4 expected columns
        """
        # The error message names names.dmp (it previously pointed at nodes.dmp).
        return NCBITaxonomyLineParser._parse(line, NCBITaxonomyLineParser._NAME_FIELDS, "names.dmp")
11 | 1 | 0 | 1 | 0 | 0 | |\n" expected_dict = { "tax_id": "6", @@ -22,29 +41,26 @@ class TestNCBITaxonomyNodeParser(TestCase): "hidden_subtree_root_flag": "0", "comments": "" } - test_dict = parse_ncbi_taxonomy_node(node_line) + test_dict = NCBITaxonomyLineParser.node(node_line) self.assertDictEqual(test_dict, expected_dict) - def test_parse_wrong_line_format(self): - node_line = "This is a wrong line format." + def test_node_wrong_format(self): + node_line = "This is a wrong line format, with; information and tab" with self.assertRaises(Exception) as context: # noqa - parse_ncbi_taxonomy_node(node_line) - - -class TestNCBITaxonomyNameParser(TestCase): + NCBITaxonomyLineParser.node(node_line) - def test_parse_ncbi_taxonomy_name(self): - node_line = "2 | Bacteria | Bacteria <prokaryotes> | scientific name |\n" + def test_name(self): + name_line = "2 | Bacteria | Bacteria <prokaryotes> | scientific name |\n" expected_dict = { "tax_id": "2", "name_txt": "Bacteria", "unique_name": "Bacteria <prokaryotes>", "name_class": "scientific name", } - test_dict = parse_ncbi_taxonomy_name(node_line) + test_dict = NCBITaxonomyLineParser.name(name_line) self.assertDictEqual(test_dict, expected_dict) - def test_parse_wrong_line_format(self): - node_line = "This is a wrong line format." 
+ def test_name_wrong_format(self): + name_line = "This is a wrong line format, with; information and tab" with self.assertRaises(Exception) as context: # noqa - parse_ncbi_taxonomy_name(node_line) + NCBITaxonomyLineParser.name(name_line) diff --git a/backend/scripts/import_igc_data.py b/backend/scripts/populate_db/import_igc_data.py similarity index 100% rename from backend/scripts/import_igc_data.py rename to backend/scripts/populate_db/import_igc_data.py diff --git a/backend/scripts/load_kegg_ko.py b/backend/scripts/populate_db/load_kegg_ko.py similarity index 73% rename from backend/scripts/load_kegg_ko.py rename to backend/scripts/populate_db/load_kegg_ko.py index 7222c285d48c3c95b4329bb8de0f05615305f855..8a289a6e0051b8e6197b6d5403b25c258e26c766 100755 --- a/backend/scripts/load_kegg_ko.py +++ b/backend/scripts/populate_db/load_kegg_ko.py @@ -8,6 +8,8 @@ import sys import django from django.core.exceptions import ValidationError +from metagenedb.utils.parsers import KEGGLineParser + # Before model import, we need to called django.setup() to Load apps os.environ.setdefault("DJANGO_SETTINGS_MODULE", "metagenedb.settings") django.setup() @@ -31,29 +33,6 @@ def parse_arguments(): sys.exit(1) -def parse_ko(line): - """ - Parse line from kegg KO list to return organized dict - """ - content = line.split('\t') - function_id = content[0].split(':')[1] - if ';' in content[1]: - names = content[1].split(';') - else: - _LOGGER.warning(f"Parsing issue with {function_id}, corresponding line: {line}") - names = [content[1], ''] # Ugly fix to handle one specific case with no name: K23479 - if '[EC:' in names[1]: - ec_number = names[1].split('[EC:')[1].rstrip(']') - else: - ec_number = '' - return { - 'function_id': function_id, - 'name': names[0], - 'long_name': names[1].lstrip(), - 'ec_number': ec_number - } - - def create_kegg_ko(kegg_ko): try: obj_kegg = KeggOrthology.objects.get(function_id=kegg_ko.get('function_id')) @@ -73,7 +52,7 @@ def run(): skipped_kegg = 0 
total_kegg = len(all_ko.text.splitlines()) for line in all_ko.text.splitlines(): - kegg_ko = parse_ko(line) + kegg_ko = KEGGLineParser.ko_list(line) try: create_kegg_ko(kegg_ko) inserted_kegg += 1