Commit bb32c198 authored by Kenzo-Hugo Hillion
Browse files

add parser for names.dmp from NCBI taxonomy

parent 82af11b1
...@@ -48,3 +48,29 @@ def parse_ncbi_taxonomy_node(line): ...@@ -48,3 +48,29 @@ def parse_ncbi_taxonomy_node(line):
except Exception as e: except Exception as e:
_LOGGER.error(f"Could not parse: {line.rstrip()}. Are you sure it comes from nodes.dmp file?") _LOGGER.error(f"Could not parse: {line.rstrip()}. Are you sure it comes from nodes.dmp file?")
raise(e) raise(e)
def parse_ncbi_taxonomy_name(line):
    """
    Parse one line of the NCBI taxonomy names.dmp file.

    From documentation:
    Taxonomy names file (names.dmp):
        tax_id      -- the id of node associated with this name
        name_txt    -- name itself
        unique name -- the unique variant of this name if name not unique
        name class  -- (synonym, common name, ...)

    The line is split on '|' and each field is stripped of surrounding
    whitespace. Anything after the fourth field (e.g. the empty piece left
    by the trailing '|') is ignored.

    :param line: raw line read from names.dmp
    :return: dict with keys tax_id, name_txt, unique_name and name_class,
        every value being a string
    :raises IndexError: if the line holds fewer than four '|'-separated
        fields; the failure is logged before being re-raised
    """
    elements = line.rstrip().split('|')
    try:
        return {
            "tax_id": elements[0].strip(),
            "name_txt": elements[1].strip(),
            "unique_name": elements[2].strip(),
            "name_class": elements[3].strip(),
        }
    except IndexError:
        # Indexing is the only operation that can fail here: str.split always
        # yields strings, so a short line is the sole parse error.
        _LOGGER.error(f"Could not parse: {line.rstrip()}. Are you sure it comes from names.dmp file?")
        # Bare raise keeps the original traceback intact.
        raise
from unittest import TestCase from unittest import TestCase
from metagenedb.utils.parsers import parse_ncbi_taxonomy_node from metagenedb.utils.parsers import parse_ncbi_taxonomy_node, parse_ncbi_taxonomy_name
class TestNCBITaxonomyNodeParser(TestCase): class TestNCBITaxonomyNodeParser(TestCase):
...@@ -27,5 +27,24 @@ class TestNCBITaxonomyNodeParser(TestCase): ...@@ -27,5 +27,24 @@ class TestNCBITaxonomyNodeParser(TestCase):
def test_parse_wrong_line_format(self): def test_parse_wrong_line_format(self):
node_line = "This is a wrong line format." node_line = "This is a wrong line format."
with self.assertRaises(Exception) as context: with self.assertRaises(Exception) as context: # noqa
test_dict = parse_ncbi_taxonomy_node(node_line) parse_ncbi_taxonomy_node(node_line)
class TestNCBITaxonomyNameParser(TestCase):
    """Unit tests for parse_ncbi_taxonomy_name."""

    def test_parse_ncbi_taxonomy_name(self):
        """A well-formed line is turned into the expected four-key dict."""
        name_line = "2 | Bacteria | Bacteria <prokaryotes> | scientific name |\n"
        parsed = parse_ncbi_taxonomy_name(name_line)
        self.assertDictEqual(
            parsed,
            {
                "tax_id": "2",
                "name_txt": "Bacteria",
                "unique_name": "Bacteria <prokaryotes>",
                "name_class": "scientific name",
            },
        )

    def test_parse_wrong_line_format(self):
        """A line without '|' separators makes the parser raise."""
        malformed_line = "This is a wrong line format."
        # Callable form of assertRaises: no unused context object to silence.
        self.assertRaises(Exception, parse_ncbi_taxonomy_name, malformed_line)
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment