Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Metagenomics
metagenedb
Commits
82af11b1
Commit
82af11b1
authored
Jul 15, 2019
by
Kenzo-Hugo Hillion
♻
Browse files
add line parser for ncbi nodes.dmp file
parent
cbad11aa
Changes
3
Hide whitespace changes
Inline
Side-by-side
backend/metagenedb/utils/__init__.py
0 → 100644
View file @
82af11b1
backend/metagenedb/utils/parsers.py
0 → 100644
View file @
82af11b1
import
logging
# Logger for this module, named after the module itself.
_LOGGER = logging.getLogger(__name__)

# Root logging configuration requested at import time, with level INFO.
logging.basicConfig(level=logging.INFO)
def parse_ncbi_taxonomy_node(line):
    """
    Parse one line from the NCBI taxonomy nodes.dmp file.

    The line is split on '|'; the first 13 fields are stripped of surrounding
    whitespace and mapped, in order, to the keys listed below. Any fields
    after the 13th are ignored.

    From documentation:
    nodes.dmp file consists of taxonomy nodes.
    The description for each node includes the following fields:
        tax_id -- node id in GenBank taxonomy database
        parent tax_id -- parent node id in GenBank taxonomy database
        rank -- rank of this node (superkingdom, kingdom, ...)
        embl code -- locus-name prefix; not unique
        division id -- see division.dmp file
        inherited div flag (1 or 0) -- 1 if node inherits division from parent
        genetic code id -- see gencode.dmp file
        inherited GC flag (1 or 0) -- 1 if node inherits genetic code from parent
        mitochondrial genetic code id -- see gencode.dmp file
        inherited MGC flag (1 or 0) -- 1 if node inherits mitochondrial gencode from parent
        GenBank hidden flag (1 or 0) -- 1 if name is suppressed in GenBank entry lineage
        hidden subtree root flag (1 or 0) -- 1 if this subtree has no sequence data yet
        comments -- free-text comments and citations

    Args:
        line (str): one raw line, with or without its trailing newline.

    Returns:
        dict: field name -> field value; every value is a stripped string.

    Raises:
        IndexError: if the line holds fewer than 13 '|'-separated fields.
            The offending line is logged at ERROR level before raising.
    """
    # Order matters: position i in the split line maps to field_names[i].
    field_names = (
        "tax_id",
        "parent_tax_id",
        "rank",
        "embl_code",
        "division_id",
        "inherited_div_flag",
        "genetic_code_id",
        "inherited_GC_flag",
        "mitochondrial_genetic_code_id",
        "inherited_MGC_flag",
        "GenBank_hidden_flag",
        "hidden_subtree_root_flag",
        "comments",
    )
    elements = [element.strip() for element in line.rstrip().split('|')]
    # zip() would silently truncate a short line, so check the length
    # explicitly and keep raising IndexError as positional indexing did.
    if len(elements) < len(field_names):
        _LOGGER.error(
            "Could not parse: %s. Are you sure it comes from nodes.dmp file?",
            line.rstrip(),
        )
        raise IndexError(
            f"expected at least {len(field_names)} '|'-separated fields, "
            f"got {len(elements)}"
        )
    return dict(zip(field_names, elements))
backend/metagenedb/utils/test_parsers.py
0 → 100644
View file @
82af11b1
from
unittest
import
TestCase
from
metagenedb.utils.parsers
import
parse_ncbi_taxonomy_node
class TestNCBITaxonomyNodeParser(TestCase):
    """Unit tests for parse_ncbi_taxonomy_node."""

    def test_parse_ncbi_taxonomy_node(self):
        """A well-formed nodes.dmp line is parsed into the expected dict."""
        node_line = "6 | 335928 | genus | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | |\n"
        expected_dict = {
            "tax_id": "6",
            "parent_tax_id": "335928",
            "rank": "genus",
            "embl_code": "",
            "division_id": "0",
            "inherited_div_flag": "1",
            "genetic_code_id": "11",
            "inherited_GC_flag": "1",
            "mitochondrial_genetic_code_id": "0",
            "inherited_MGC_flag": "1",
            "GenBank_hidden_flag": "0",
            "hidden_subtree_root_flag": "0",
            "comments": "",
        }
        test_dict = parse_ncbi_taxonomy_node(node_line)
        self.assertDictEqual(test_dict, expected_dict)

    def test_parse_wrong_line_format(self):
        """A line without enough '|'-separated fields is rejected."""
        node_line = "This is a wrong line format."
        # The parser reads the split fields by position, so a line with too
        # few fields surfaces as IndexError. Asserting that specific type
        # keeps unrelated failures (e.g. NameError) from passing the test.
        with self.assertRaises(IndexError):
            parse_ncbi_taxonomy_node(node_line)
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment