# Reconstructed from a flattened diff that added three files:
#   backend/metagenedb/utils/__init__.py      (empty)
#   backend/metagenedb/utils/parsers.py       (parser below)
#   backend/metagenedb/utils/test_parsers.py  (test case below)
# The tests originally imported the parser from metagenedb.utils.parsers;
# here both live in the same module, so that import is not needed.
import logging
from unittest import TestCase

logging.basicConfig(level=logging.INFO)
_LOGGER = logging.getLogger(__name__)

# Keys of the parsed record, listed in the order the '|'-separated columns
# appear in a nodes.dmp line. The position in this tuple is the column index.
_NCBI_NODE_FIELDS = (
    "tax_id",
    "parent_tax_id",
    "rank",
    "embl_code",
    "division_id",
    "inherited_div_flag",
    "genetic_code_id",
    "inherited_GC_flag",
    "mitochondrial_genetic_code_id",
    "inherited_MGC_flag",
    "GenBank_hidden_flag",
    "hidden_subtree_root_flag",
    "comments",
)


def parse_ncbi_taxonomy_node(line):
    """
    Parse one line from the NCBI nodes.dmp file into a dict.

    From documentation:

    nodes.dmp file consists of taxonomy nodes.
    The description for each node includes the following fields:

        tax_id                          -- node id in GenBank taxonomy database
        parent tax_id                   -- parent node id in GenBank taxonomy database
        rank                            -- rank of this node (superkingdom, kingdom, ...)
        embl code                       -- locus-name prefix; not unique
        division id                     -- see division.dmp file
        inherited div flag (1 or 0)     -- 1 if node inherits division from parent
        genetic code id                 -- see gencode.dmp file
        inherited GC flag (1 or 0)      -- 1 if node inherits genetic code from parent
        mitochondrial genetic code id   -- see gencode.dmp file
        inherited MGC flag (1 or 0)     -- 1 if node inherits mitochondrial gencode from parent
        GenBank hidden flag (1 or 0)    -- 1 if name is suppressed in GenBank entry lineage
        hidden subtree root flag (1 or 0) -- 1 if this subtree has no sequence data yet
        comments                        -- free-text comments and citations

    Args:
        line: a single text line; columns are separated by '|'.

    Returns:
        dict mapping each field name (see ``_NCBI_NODE_FIELDS``) to the
        corresponding column, as a string stripped of surrounding whitespace.
        No type conversion is performed; columns beyond the 13th are ignored.

    Raises:
        IndexError: if the line has fewer than 13 '|'-separated columns.
            The offending line is logged at ERROR level before re-raising.
    """
    elements = line.rstrip().split('|')
    try:
        return {
            name: elements[index].strip()
            for index, name in enumerate(_NCBI_NODE_FIELDS)
        }
    except IndexError:
        # A short record is the only failure possible here: every element
        # produced by str.split is a str, so .strip() cannot fail.
        _LOGGER.error(
            "Could not parse: %s. Are you sure it comes from nodes.dmp file?",
            line.rstrip(),
        )
        # Bare raise keeps the original exception and traceback intact.
        raise


class TestNCBITaxonomyNodeParser(TestCase):
    """Unit tests for parse_ncbi_taxonomy_node."""

    def test_parse_ncbi_taxonomy_node(self):
        node_line = "6 | 335928 | genus | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | |\n"
        expected_dict = {
            "tax_id": "6",
            "parent_tax_id": "335928",
            "rank": "genus",
            "embl_code": "",
            "division_id": "0",
            "inherited_div_flag": "1",
            "genetic_code_id": "11",
            "inherited_GC_flag": "1",
            "mitochondrial_genetic_code_id": "0",
            "inherited_MGC_flag": "1",
            "GenBank_hidden_flag": "0",
            "hidden_subtree_root_flag": "0",
            "comments": "",
        }
        test_dict = parse_ncbi_taxonomy_node(node_line)
        self.assertDictEqual(test_dict, expected_dict)

    def test_parse_wrong_line_format(self):
        node_line = "This is a wrong line format."
        # A line without enough '|' separators must fail with IndexError.
        with self.assertRaises(IndexError):
            parse_ncbi_taxonomy_node(node_line)