From 82af11b1c0484bdedcb1bbe536a106b52e559f18 Mon Sep 17 00:00:00 2001
From: Kenzo-Hugo Hillion <kenzo-hugo.hillion1@pasteur.fr>
Date: Mon, 15 Jul 2019 14:27:19 +0200
Subject: [PATCH] add line parser for ncbi nodes.dmp file

---
 backend/metagenedb/utils/__init__.py     |  0
 backend/metagenedb/utils/parsers.py      | 62 ++++++++++++++++++++++++
 backend/metagenedb/utils/test_parsers.py | 31 ++++++++++++
 3 files changed, 93 insertions(+)
 create mode 100644 backend/metagenedb/utils/__init__.py
 create mode 100644 backend/metagenedb/utils/parsers.py
 create mode 100644 backend/metagenedb/utils/test_parsers.py

diff --git a/backend/metagenedb/utils/__init__.py b/backend/metagenedb/utils/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/backend/metagenedb/utils/parsers.py b/backend/metagenedb/utils/parsers.py
new file mode 100644
index 0000000..a40e819
--- /dev/null
+++ b/backend/metagenedb/utils/parsers.py
@@ -0,0 +1,62 @@
+import logging
+
+logging.basicConfig(level=logging.INFO)
+_LOGGER = logging.getLogger(__name__)
+
+# Column names of a nodes.dmp record, in the order they appear on a line.
+_NCBI_NODE_FIELDS = (
+    "tax_id",
+    "parent_tax_id",
+    "rank",
+    "embl_code",
+    "division_id",
+    "inherited_div_flag",
+    "genetic_code_id",
+    "inherited_GC_flag",
+    "mitochondrial_genetic_code_id",
+    "inherited_MGC_flag",
+    "GenBank_hidden_flag",
+    "hidden_subtree_root_flag",
+    "comments",
+)
+
+
+def parse_ncbi_taxonomy_node(line):
+    """
+    Parse one line from the NCBI nodes.dmp file.
+
+    From documentation:
+
+    nodes.dmp file consists of taxonomy nodes.
+    The description for each node includes the following fields:
+
+    tax_id -- node id in GenBank taxonomy database
+    parent tax_id -- parent node id in GenBank taxonomy database
+    rank -- rank of this node (superkingdom, kingdom, ...)
+    embl code -- locus-name prefix; not unique
+    division id -- see division.dmp file
+    inherited div flag (1 or 0) -- 1 if node inherits division from parent
+    genetic code id -- see gencode.dmp file
+    inherited GC flag (1 or 0) -- 1 if node inherits genetic code from parent
+    mitochondrial genetic code id -- see gencode.dmp file
+    inherited MGC flag (1 or 0) -- 1 if node inherits mitochondrial gencode from parent
+    GenBank hidden flag (1 or 0) -- 1 if name is suppressed in GenBank entry lineage
+    hidden subtree root flag (1 or 0) -- 1 if this subtree has no sequence data yet
+    comments -- free-text comments and citations
+
+    :param line: one raw '|'-separated record, with or without trailing newline
+    :return: dict mapping each field name above to its stripped string value
+    :raises IndexError: if the line holds fewer than 13 '|'-separated fields
+    """
+    elements = [element.strip() for element in line.rstrip().split('|')]
+    if len(elements) < len(_NCBI_NODE_FIELDS):
+        _LOGGER.error(
+            "Could not parse: %s. Are you sure it comes from nodes.dmp file?",
+            line.rstrip(),
+        )
+        raise IndexError(
+            f"expected at least {len(_NCBI_NODE_FIELDS)} fields, got {len(elements)}"
+        )
+    # zip stops at the last field name, so the empty element left by the
+    # trailing '|' (and any extra columns) is ignored.
+    return dict(zip(_NCBI_NODE_FIELDS, elements))
diff --git a/backend/metagenedb/utils/test_parsers.py b/backend/metagenedb/utils/test_parsers.py
new file mode 100644
index 0000000..c895864
--- /dev/null
+++ b/backend/metagenedb/utils/test_parsers.py
@@ -0,0 +1,31 @@
+from unittest import TestCase
+
+from metagenedb.utils.parsers import parse_ncbi_taxonomy_node
+
+
+class TestNCBITaxonomyNodeParser(TestCase):
+
+    def test_parse_ncbi_taxonomy_node(self):
+        node_line = "6 | 335928 | genus | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | |\n"
+        expected_dict = {
+            "tax_id": "6",
+            "parent_tax_id": "335928",
+            "rank": "genus",
+            "embl_code": "",
+            "division_id": "0",
+            "inherited_div_flag": "1",
+            "genetic_code_id": "11",
+            "inherited_GC_flag": "1",
+            "mitochondrial_genetic_code_id": "0",
+            "inherited_MGC_flag": "1",
+            "GenBank_hidden_flag": "0",
+            "hidden_subtree_root_flag": "0",
+            "comments": ""
+        }
+        test_dict = parse_ncbi_taxonomy_node(node_line)
+        self.assertDictEqual(test_dict, expected_dict)
+
+    def test_parse_wrong_line_format(self):
+        node_line = "This is a wrong line format."
+        with self.assertRaises(IndexError):
+            parse_ncbi_taxonomy_node(node_line)
-- 
GitLab