Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Metagenomics
metagenedb
Commits
bb32c198
Commit
bb32c198
authored
Jul 15, 2019
by
Kenzo-Hugo Hillion
♻
Browse files
add parser for names.dmp from NCBI taxonomy
parent
82af11b1
Changes
2
Hide whitespace changes
Inline
Side-by-side
backend/metagenedb/utils/parsers.py
View file @
bb32c198
...
...
@@ -48,3 +48,29 @@ def parse_ncbi_taxonomy_node(line):
except
Exception
as
e
:
_LOGGER
.
error
(
f
"Could not parse:
{
line
.
rstrip
()
}
. Are you sure it comes from nodes.dmp file?"
)
raise
(
e
)
def parse_ncbi_taxonomy_name(line):
    """
    Parse one line from the NCBI taxonomy names.dmp file.

    The line is split on '|' and surrounding whitespace is stripped from
    each of the first four fields.

    From documentation:
    Taxonomy names file (names.dmp):
        tax_id -- the id of node associated with this name
        name_txt -- name itself
        unique name -- the unique variant of this name if name not unique
        name class -- (synonym, common name, ...)

    :param line: raw line (str) read from names.dmp
    :return: dict with the keys "tax_id", "name_txt", "unique_name" and
        "name_class", every value being a stripped string
    :raises IndexError: if the line holds fewer than four '|'-separated fields
    """
    elements = line.rstrip().split('|')
    try:
        parsed_line = {
            "tax_id": elements[0].strip(),
            "name_txt": elements[1].strip(),
            "unique_name": elements[2].strip(),
            "name_class": elements[3].strip(),
        }
        return parsed_line
    except Exception:
        # This parser handles names.dmp; the message used to point at nodes.dmp
        # (copy-paste from the node parser), which misled whoever read the log.
        _LOGGER.error(f"Could not parse: {line.rstrip()}. Are you sure it comes from names.dmp file?")
        # Bare raise re-raises the active exception with its original traceback.
        raise
backend/metagenedb/utils/test_parsers.py
View file @
bb32c198
from
unittest
import
TestCase
from
metagenedb.utils.parsers
import
parse_ncbi_taxonomy_node
from
metagenedb.utils.parsers
import
parse_ncbi_taxonomy_node
,
parse_ncbi_taxonomy_name
class
TestNCBITaxonomyNodeParser
(
TestCase
):
...
...
@@ -27,5 +27,24 @@ class TestNCBITaxonomyNodeParser(TestCase):
def
test_parse_wrong_line_format
(
self
):
node_line
=
"This is a wrong line format."
with
self
.
assertRaises
(
Exception
)
as
context
:
test_dict
=
parse_ncbi_taxonomy_node
(
node_line
)
with
self
.
assertRaises
(
Exception
)
as
context
:
# noqa
parse_ncbi_taxonomy_node
(
node_line
)
class TestNCBITaxonomyNameParser(TestCase):
    """Unit tests for parse_ncbi_taxonomy_name, the names.dmp line parser."""

    def test_parse_ncbi_taxonomy_name(self):
        """A well-formed line yields the four fields with whitespace stripped."""
        name_line = "2 | Bacteria | Bacteria <prokaryotes> | scientific name |\n"
        expected_dict = {
            "tax_id": "2",
            "name_txt": "Bacteria",
            "unique_name": "Bacteria <prokaryotes>",
            "name_class": "scientific name",
        }
        test_dict = parse_ncbi_taxonomy_name(name_line)
        self.assertDictEqual(test_dict, expected_dict)

    def test_parse_wrong_line_format(self):
        """A line without '|' separators must make the parser raise."""
        name_line = "This is a wrong line format."
        # The caught exception is never inspected, so it is not bound to a name
        # (this removes the unused variable that previously needed a noqa marker).
        with self.assertRaises(Exception):
            parse_ncbi_taxonomy_name(name_line)
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment