Commit 82af11b1 authored by Kenzo-Hugo Hillion
Browse files

add line parser for ncbi nodes.dmp file

parent cbad11aa
import logging
# NOTE(review): basicConfig() configures the root logger as a side effect of
# importing this module; confirm that this is intended for library code.
logging.basicConfig(level=logging.INFO)
# Module-level logger, named after this module.
_LOGGER = logging.getLogger(__name__)
# Keys of the parsed record, in the column order of a nodes.dmp line.
_NCBI_NODE_FIELDS = (
    "tax_id",
    "parent_tax_id",
    "rank",
    "embl_code",
    "division_id",
    "inherited_div_flag",
    "genetic_code_id",
    "inherited_GC_flag",
    "mitochondrial_genetic_code_id",
    "inherited_MGC_flag",
    "GenBank_hidden_flag",
    "hidden_subtree_root_flag",
    "comments",
)


def parse_ncbi_taxonomy_node(line):
    """
    Parse one line from the NCBI nodes.dmp file into a dict.

    From documentation:
    nodes.dmp file consists of taxonomy nodes.
    The description for each node includes the following fields:
        tax_id -- node id in GenBank taxonomy database
        parent tax_id -- parent node id in GenBank taxonomy database
        rank -- rank of this node (superkingdom, kingdom, ...)
        embl code -- locus-name prefix; not unique
        division id -- see division.dmp file
        inherited div flag (1 or 0) -- 1 if node inherits division from parent
        genetic code id -- see gencode.dmp file
        inherited GC flag (1 or 0) -- 1 if node inherits genetic code from parent
        mitochondrial genetic code id -- see gencode.dmp file
        inherited MGC flag (1 or 0) -- 1 if node inherits mitochondrial gencode from parent
        GenBank hidden flag (1 or 0) -- 1 if name is suppressed in GenBank entry lineage
        hidden subtree root flag (1 or 0) -- 1 if this subtree has no sequence data yet
        comments -- free-text comments and citations

    :param line: one '|'-separated line of the file (trailing newline allowed)
    :return: dict mapping each field name to its value as a
        whitespace-stripped string (no type conversion is performed);
        columns beyond the 13 documented ones are ignored
    :raises IndexError: if the line holds fewer than 13 '|'-separated fields
    """
    elements = [element.strip() for element in line.rstrip().split('|')]
    try:
        # Index explicitly (rather than zip) so that a short line raises
        # IndexError instead of silently producing a truncated record.
        return {
            field: elements[position]
            for position, field in enumerate(_NCBI_NODE_FIELDS)
        }
    except IndexError:
        _LOGGER.error(
            "Could not parse: %s. Are you sure it comes from nodes.dmp file?",
            line.rstrip(),
        )
        # Bare raise keeps the original exception and traceback intact.
        raise
from unittest import TestCase
from metagenedb.utils.parsers import parse_ncbi_taxonomy_node
class TestNCBITaxonomyNodeParser(TestCase):
    """Unit tests for parse_ncbi_taxonomy_node."""

    def test_parse_ncbi_taxonomy_node(self):
        """A well-formed line is turned into a dict of stripped strings."""
        node_line = "6 | 335928 | genus | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | |\n"
        expected_dict = {
            "tax_id": "6",
            "parent_tax_id": "335928",
            "rank": "genus",
            "embl_code": "",
            "division_id": "0",
            "inherited_div_flag": "1",
            "genetic_code_id": "11",
            "inherited_GC_flag": "1",
            "mitochondrial_genetic_code_id": "0",
            "inherited_MGC_flag": "1",
            "GenBank_hidden_flag": "0",
            "hidden_subtree_root_flag": "0",
            "comments": ""
        }
        test_dict = parse_ncbi_taxonomy_node(node_line)
        self.assertDictEqual(test_dict, expected_dict)

    def test_parse_wrong_line_format(self):
        """A line without enough '|'-separated fields is rejected."""
        node_line = "This is a wrong line format."
        # The parser indexes into the split fields, so a short line surfaces
        # as IndexError. Asserting the specific type keeps unrelated failures
        # (e.g. NameError, TypeError) from making this test pass.
        with self.assertRaises(IndexError):
            parse_ncbi_taxonomy_node(node_line)
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment