Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
metagenedb
Manage
Activity
Members
Labels
Plan
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container registry
Model registry
Operate
Environments
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Metagenomics
metagenedb
Commits
bb32c198
Commit
bb32c198
authored
6 years ago
by
Kenzo-Hugo Hillion
Browse files
Options
Downloads
Patches
Plain Diff
add parser for names.dmp from NCBI taxonomy
parent
82af11b1
No related branches found
No related tags found
1 merge request
!3
Integrate taxonomy to database
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
backend/metagenedb/utils/parsers.py
+26
-0
26 additions, 0 deletions
backend/metagenedb/utils/parsers.py
backend/metagenedb/utils/test_parsers.py
+22
-3
22 additions, 3 deletions
backend/metagenedb/utils/test_parsers.py
with
48 additions
and
3 deletions
backend/metagenedb/utils/parsers.py
+
26
−
0
View file @
bb32c198
...
...
@@ -48,3 +48,29 @@ def parse_ncbi_taxonomy_node(line):
except
Exception
as
e
:
_LOGGER
.
error
(
f
"
Could not parse:
{
line
.
rstrip
()
}
. Are you sure it comes from nodes.dmp file?
"
)
raise
(
e
)
def parse_ncbi_taxonomy_name(line):
    """
    Parse one line from the NCBI taxonomy names.dmp file.

    From documentation:

    Taxonomy names file (names.dmp):
        tax_id -- the id of node associated with this name
        name_txt -- name itself
        unique name -- the unique variant of this name if name not unique
        name class -- (synonym, common name, ...)

    The line is split on '|' and surrounding whitespace is stripped from
    each field.

    Args:
        line: a single raw line of names.dmp.

    Returns:
        dict with the keys "tax_id", "name_txt", "unique_name" and
        "name_class"; every value is a string.

    Raises:
        IndexError: if the line holds fewer than four '|'-separated fields.
    """
    elements = line.rstrip().split('|')
    try:
        parsed_line = {
            "tax_id": elements[0].strip(),
            "name_txt": elements[1].strip(),
            "unique_name": elements[2].strip(),
            "name_class": elements[3].strip(),
        }
    except IndexError:
        # Only a too-short line can fail here; log which line it was and
        # re-raise the original exception untouched.
        _LOGGER.error(f"Could not parse: {line.rstrip()}. Are you sure it comes from names.dmp file?")
        raise
    return parsed_line
This diff is collapsed.
Click to expand it.
backend/metagenedb/utils/test_parsers.py
+
22
−
3
View file @
bb32c198
from
unittest
import
TestCase
from
metagenedb.utils.parsers
import
parse_ncbi_taxonomy_node
from
metagenedb.utils.parsers
import
parse_ncbi_taxonomy_node
,
parse_ncbi_taxonomy_name
class
TestNCBITaxonomyNodeParser
(
TestCase
):
...
...
@@ -27,5 +27,24 @@ class TestNCBITaxonomyNodeParser(TestCase):
def
test_parse_wrong_line_format
(
self
):
node_line
=
"
This is a wrong line format.
"
with
self
.
assertRaises
(
Exception
)
as
context
:
test_dict
=
parse_ncbi_taxonomy_node
(
node_line
)
with
self
.
assertRaises
(
Exception
)
as
context
:
# noqa
parse_ncbi_taxonomy_node
(
node_line
)
class TestNCBITaxonomyNameParser(TestCase):
    """Unit tests for parse_ncbi_taxonomy_name."""

    def test_parse_ncbi_taxonomy_name(self):
        """A well-formed line is turned into the four expected fields."""
        name_line = "2 | Bacteria | Bacteria <prokaryotes> | scientific name |\n"
        parsed = parse_ncbi_taxonomy_name(name_line)
        expected = dict(
            tax_id="2",
            name_txt="Bacteria",
            unique_name="Bacteria <prokaryotes>",
            name_class="scientific name",
        )
        self.assertDictEqual(parsed, expected)

    def test_parse_wrong_line_format(self):
        """A line without '|' separators makes the parser raise."""
        malformed_line = "This is a wrong line format."
        with self.assertRaises(Exception):
            parse_ncbi_taxonomy_name(malformed_line)
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment