Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Metagenomics
metagenedb
Commits
170bd11b
Commit
170bd11b
authored
Jul 16, 2019
by
Kenzo-Hugo Hillion
♻
Browse files
Refactor parsers for NCBI taxo and KEGG ko list
parent
bb32c198
Pipeline
#13195
failed with stage
in 2 minutes and 14 seconds
Changes
4
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
backend/metagenedb/utils/parsers.py
View file @
170bd11b
...
...
@@ -4,73 +4,104 @@ logging.basicConfig(level=logging.INFO)
_LOGGER
=
logging
.
getLogger
(
__name__
)
def
parse_ncbi_taxonomy_node
(
line
):
"""
parse line from ncbi nodes.dmp file
class
KEGGLineParser
(
object
):
From documentation:
@staticmethod
def ko_list(line):
    """
    Parse line from kegg KO list (http://rest.kegg.jp/list/ko) to return organized dict

    The line is expected to hold two tab-separated fields: ``ko:<function_id>``
    and ``<name>; <long name> [EC:<ec_number>]``.

    :param line: one line of the KO list, with or without its line ending
    :return: dict with keys 'function_id', 'name', 'long_name' and 'ec_number'
        ('ec_number' is '' when the line carries no ``[EC:...]`` tag)
    :raises Exception: the original error (typically IndexError) is logged and
        re-raised when the line does not follow the expected layout
    """
    try:
        # Drop only the line ending: a trailing newline would otherwise end up
        # in 'long_name' and prevent the closing ']' of the EC tag from being removed.
        elements = line.rstrip('\r\n').split('\t')
        function_id = elements[0].split(':')[1]
        if ';' in elements[1]:
            # Split on the first ';' only, the long name may itself contain ';'.
            names = elements[1].split(';', 1)
        else:
            _LOGGER.warning(f"Parsing issue with {function_id}, corresponding line: {line}")
            # Ugly fix to handle one specific case with no name: K23479
            names = [elements[1], '']
        if '[EC:' in names[1]:
            ec_number = names[1].split('[EC:')[1].rstrip(']')
        else:
            ec_number = ''
        return {
            'function_id': function_id,
            'name': names[0],
            'long_name': names[1].lstrip(),
            'ec_number': ec_number
        }
    except Exception:
        # Log for context, then let the caller decide what to do with the error.
        _LOGGER.error(f"Could not parse: {line.rstrip()}. Are you sure it comes from KEGG KO list?")
        raise
nodes.dmp file consists of taxonomy nodes.
The description for each node includes the following fields:
tax_id -- node id in GenBank taxonomy database
parent tax_id -- parent node id in GenBank taxonomy database
rank -- rank of this node (superkingdom, kingdom, ...)
embl code -- locus-name prefix; not unique
division id -- see division.dmp file
inherited div flag (1 or 0) -- 1 if node inherits division from parent
genetic code id -- see gencode.dmp file
inherited GC flag (1 or 0) -- 1 if node inherits genetic code from parent
mitochondrial genetic code id -- see gencode.dmp file
inherited MGC flag (1 or 0) -- 1 if node inherits mitochondrial gencode from parent
GenBank hidden flag (1 or 0) -- 1 if name is suppressed in GenBank entry lineage
hidden subtree root flag (1 or 0) -- 1 if this subtree has no sequence data yet
comments -- free-text comments and citations
"""
elements
=
line
.
rstrip
().
split
(
'|'
)
try
:
parsed_line
=
{
"tax_id"
:
elements
[
0
].
strip
(),
"parent_tax_id"
:
elements
[
1
].
strip
(),
"rank"
:
elements
[
2
].
strip
(),
"embl_code"
:
elements
[
3
].
strip
(),
"division_id"
:
elements
[
4
].
strip
(),
"inherited_div_flag"
:
elements
[
5
].
strip
(),
"genetic_code_id"
:
elements
[
6
].
strip
(),
"inherited_GC_flag"
:
elements
[
7
].
strip
(),
"mitochondrial_genetic_code_id"
:
elements
[
8
].
strip
(),
"inherited_MGC_flag"
:
elements
[
9
].
strip
(),
"GenBank_hidden_flag"
:
elements
[
10
].
strip
(),
"hidden_subtree_root_flag"
:
elements
[
11
].
strip
(),
"comments"
:
elements
[
12
].
strip
()
}
return
parsed_line
except
Exception
as
e
:
_LOGGER
.
error
(
f
"Could not parse:
{
line
.
rstrip
()
}
. Are you sure it comes from nodes.dmp file?"
)
raise
(
e
)
class
NCBITaxonomyLineParser
(
object
):
@
staticmethod
def
node
(
line
):
"""
parse line from ncbi nodes.dmp file
def
parse_ncbi_taxonomy_name
(
line
):
"""
parse line from ncbi names.dmp file
From documentation:
From documentation:
nodes.dmp file consists of taxonomy nodes.
The description for each node includes the following fields:
Taxonomy names file (names.dmp):
tax_id -- the id of node associated with this name
name_txt -- name itself
unique name -- the unique variant of this name if name not unique
name class -- (synonym, common name, ...)
"""
elements
=
line
.
rstrip
().
split
(
'|'
)
try
:
parsed_line
=
{
"tax_id"
:
elements
[
0
].
strip
(),
"name_txt"
:
elements
[
1
].
strip
(),
"unique_name"
:
elements
[
2
].
strip
(),
"name_class"
:
elements
[
3
].
strip
(),
}
return
parsed_line
except
Exception
as
e
:
_LOGGER
.
error
(
f
"Could not parse:
{
line
.
rstrip
()
}
. Are you sure it comes from nodes.dmp file?"
)
raise
(
e
)
tax_id -- node id in GenBank taxonomy database
parent tax_id -- parent node id in GenBank taxonomy database
rank -- rank of this node (superkingdom, kingdom, ...)
embl code -- locus-name prefix; not unique
division id -- see division.dmp file
inherited div flag (1 or 0) -- 1 if node inherits division from parent
genetic code id -- see gencode.dmp file
inherited GC flag (1 or 0) -- 1 if node inherits genetic code from parent
mitochondrial genetic code id -- see gencode.dmp file
inherited MGC flag (1 or 0) -- 1 if node inherits mitochondrial gencode from parent
GenBank hidden flag (1 or 0) -- 1 if name is suppressed in GenBank entry lineage
hidden subtree root flag (1 or 0) -- 1 if this subtree has no sequence data yet
comments -- free-text comments and citations
"""
elements
=
line
.
rstrip
().
split
(
'|'
)
try
:
return
{
"tax_id"
:
elements
[
0
].
strip
(),
"parent_tax_id"
:
elements
[
1
].
strip
(),
"rank"
:
elements
[
2
].
strip
(),
"embl_code"
:
elements
[
3
].
strip
(),
"division_id"
:
elements
[
4
].
strip
(),
"inherited_div_flag"
:
elements
[
5
].
strip
(),
"genetic_code_id"
:
elements
[
6
].
strip
(),
"inherited_GC_flag"
:
elements
[
7
].
strip
(),
"mitochondrial_genetic_code_id"
:
elements
[
8
].
strip
(),
"inherited_MGC_flag"
:
elements
[
9
].
strip
(),
"GenBank_hidden_flag"
:
elements
[
10
].
strip
(),
"hidden_subtree_root_flag"
:
elements
[
11
].
strip
(),
"comments"
:
elements
[
12
].
strip
()
}
except
:
_LOGGER
.
error
(
f
"Could not parse:
{
line
.
rstrip
()
}
. Are you sure it comes from nodes.dmp file?"
)
raise
@staticmethod
def name(line):
    """
    parse line from ncbi names.dmp file

    From documentation:

    Taxonomy names file (names.dmp):
        tax_id -- the id of node associated with this name
        name_txt -- name itself
        unique name -- the unique variant of this name if name not unique
        name class -- (synonym, common name, ...)

    :param line: one '|'-separated line of names.dmp
    :return: dict with keys 'tax_id', 'name_txt', 'unique_name' and 'name_class',
        each value stripped of surrounding whitespace
    :raises IndexError: logged and re-raised when the line holds fewer than four fields
    """
    field_names = ("tax_id", "name_txt", "unique_name", "name_class")
    elements = line.rstrip().split('|')
    try:
        # Index explicitly (no zip) so that a short line raises instead of
        # silently producing a partial dict.
        return {key: elements[position].strip() for position, key in enumerate(field_names)}
    except IndexError:
        _LOGGER.error(f"Could not parse: {line.rstrip()}. Are you sure it comes from names.dmp file?")
        raise
backend/metagenedb/utils/test_parsers.py
View file @
170bd11b
from
unittest
import
TestCase
from
metagenedb.utils.parsers
import
parse_ncbi_taxonomy_node
,
parse_ncbi_taxonomy_name
from
metagenedb.utils.parsers
import
KEGGLineParser
,
NCBITaxonomyLineParser
class
Test
NCBITaxonomyNod
eParser
(
TestCase
):
class
Test
KEGGLin
eParser
(
TestCase
):
def
test_parse_ncbi_taxonomy_node
(
self
):
def
test_ko_list
(
self
):
ko_line
=
"ko:K00809 DHPS, dys; deoxyhypusine synthase [EC:2.5.1.46]"
expected_dict
=
{
'function_id'
:
"K00809"
,
'name'
:
"DHPS, dys"
,
'long_name'
:
"deoxyhypusine synthase [EC:2.5.1.46]"
,
'ec_number'
:
"2.5.1.46"
}
test_dict
=
KEGGLineParser
.
ko_list
(
ko_line
)
self
.
assertDictEqual
(
test_dict
,
expected_dict
)
def
test_ko_list_wrong_format
(
self
):
ko_line
=
"This is a wrong line format, with; information and tab"
with
self
.
assertRaises
(
Exception
)
as
context
:
# noqa
KEGGLineParser
.
ko_list
(
ko_line
)
class
TestNCBITaxonomyLineParser
(
TestCase
):
def
test_node
(
self
):
node_line
=
"6 | 335928 | genus | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | |
\n
"
expected_dict
=
{
"tax_id"
:
"6"
,
...
...
@@ -22,29 +41,26 @@ class TestNCBITaxonomyNodeParser(TestCase):
"hidden_subtree_root_flag"
:
"0"
,
"comments"
:
""
}
test_dict
=
parse_ncbi_taxonomy_
node
(
node_line
)
test_dict
=
NCBITaxonomyLineParser
.
node
(
node_line
)
self
.
assertDictEqual
(
test_dict
,
expected_dict
)
def
test_
pars
e_wrong_
line_
format
(
self
):
node_line
=
"This is a wrong line format
.
"
def
test_
nod
e_wrong_format
(
self
):
node_line
=
"This is a wrong line format
, with; information and tab
"
with
self
.
assertRaises
(
Exception
)
as
context
:
# noqa
parse_ncbi_taxonomy_node
(
node_line
)
class
TestNCBITaxonomyNameParser
(
TestCase
):
NCBITaxonomyLineParser
.
node
(
node_line
)
def
test_
parse_ncbi_taxonomy_
name
(
self
):
n
od
e_line
=
"2 | Bacteria | Bacteria <prokaryotes> | scientific name |
\n
"
def
test_name
(
self
):
n
am
e_line
=
"2 | Bacteria | Bacteria <prokaryotes> | scientific name |
\n
"
expected_dict
=
{
"tax_id"
:
"2"
,
"name_txt"
:
"Bacteria"
,
"unique_name"
:
"Bacteria <prokaryotes>"
,
"name_class"
:
"scientific name"
,
}
test_dict
=
parse_ncbi_taxonomy_
name
(
n
od
e_line
)
test_dict
=
NCBITaxonomyLineParser
.
name
(
n
am
e_line
)
self
.
assertDictEqual
(
test_dict
,
expected_dict
)
def
test_
pars
e_wrong_
line_
format
(
self
):
n
od
e_line
=
"This is a wrong line format
.
"
def
test_
nam
e_wrong_format
(
self
):
n
am
e_line
=
"This is a wrong line format
, with; information and tab
"
with
self
.
assertRaises
(
Exception
)
as
context
:
# noqa
parse_ncbi_taxonomy_
name
(
n
od
e_line
)
NCBITaxonomyLineParser
.
name
(
n
am
e_line
)
backend/scripts/import_igc_data.py
→
backend/scripts/
populate_db/
import_igc_data.py
View file @
170bd11b
File moved
backend/scripts/load_kegg_ko.py
→
backend/scripts/
populate_db/
load_kegg_ko.py
View file @
170bd11b
...
...
@@ -8,6 +8,8 @@ import sys
import
django
from
django.core.exceptions
import
ValidationError
from
metagenedb.utils.parsers
import
KEGGLineParser
# Before model import, we need to call django.setup() to load apps
os
.
environ
.
setdefault
(
"DJANGO_SETTINGS_MODULE"
,
"metagenedb.settings"
)
django
.
setup
()
...
...
@@ -31,29 +33,6 @@ def parse_arguments():
sys
.
exit
(
1
)
def parse_ko(line):
    """
    Parse line from kegg KO list to return organized dict

    The line is expected to hold two tab-separated fields: ``ko:<function_id>``
    and ``<name>; <long name> [EC:<ec_number>]``.

    :param line: one line of the KO list, with or without its line ending
    :return: dict with keys 'function_id', 'name', 'long_name' and 'ec_number'
        ('ec_number' is '' when the line carries no ``[EC:...]`` tag)
    :raises IndexError: when the tab-separated fields or the ``ko:`` prefix are missing
    """
    # Drop only the line ending: a trailing newline would otherwise end up in
    # 'long_name' and prevent the closing ']' of the EC tag from being removed.
    content = line.rstrip('\r\n').split('\t')
    function_id = content[0].split(':')[1]
    description = content[1]
    if ';' in description:
        # Split on the first ';' only, the long name may itself contain ';'.
        names = description.split(';', 1)
    else:
        _LOGGER.warning(f"Parsing issue with {function_id}, corresponding line: {line}")
        # Ugly fix to handle one specific case with no name: K23479
        names = [description, '']
    long_name = names[1]
    ec_number = long_name.split('[EC:')[1].rstrip(']') if '[EC:' in long_name else ''
    return {
        'function_id': function_id,
        'name': names[0],
        'long_name': long_name.lstrip(),
        'ec_number': ec_number
    }
def
create_kegg_ko
(
kegg_ko
):
try
:
obj_kegg
=
KeggOrthology
.
objects
.
get
(
function_id
=
kegg_ko
.
get
(
'function_id'
))
...
...
@@ -73,7 +52,7 @@ def run():
skipped_kegg
=
0
total_kegg
=
len
(
all_ko
.
text
.
splitlines
())
for
line
in
all_ko
.
text
.
splitlines
():
kegg_ko
=
parse_ko
(
line
)
kegg_ko
=
KEGGLineParser
.
ko_list
(
line
)
try
:
create_kegg_ko
(
kegg_ko
)
inserted_kegg
+=
1
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment