Commit 170bd11b authored by Kenzo-Hugo Hillion's avatar Kenzo-Hugo Hillion
Browse files

Refactor parsers for NCBI taxo and KEGG ko list

parent bb32c198
Pipeline #13195 failed with stage
in 2 minutes and 14 seconds
......@@ -4,73 +4,104 @@ logging.basicConfig(level=logging.INFO)
_LOGGER = logging.getLogger(__name__)
def parse_ncbi_taxonomy_node(line):
"""
parse line from ncbi nodes.dmp file
class KEGGLineParser(object):
From documentation:
@staticmethod
def ko_list(line):
    """
    Parse line from kegg KO list (http://rest.kegg.jp/list/ko) to return organized dict

    The line is expected to look like
    ``ko:<function_id><TAB><name>; <long_name> [EC:<ec_number>]`` where the
    ``; <long_name>`` part and the ``[EC:...]`` suffix are both optional.

    :param line: one line of the KO list, with or without its line terminator
    :return: dict with the keys ``function_id``, ``name``, ``long_name`` and
             ``ec_number``; ``long_name`` and ``ec_number`` are empty strings
             when the line does not provide them
    :raises IndexError: when the first column has no ``:`` or there is no
                        tab-separated second column (logged before re-raising)
    """
    try:
        # Drop the line terminator first: otherwise it ends up in long_name and
        # keeps rstrip(']') from removing the bracket closing the EC number.
        elements = line.rstrip('\r\n').split('\t')
        function_id = elements[0].split(':')[1]
        # Cut on the first ';' only so a long name that itself contains ';'
        # is kept whole instead of being truncated.
        name, separator, long_name = elements[1].partition(';')
        if not separator:
            # Ugly fix to handle one specific case with no name: K23479
            _LOGGER.warning(f"Parsing issue with {function_id}, corresponding line: {line}")
        long_name = long_name.lstrip()
        if '[EC:' in long_name:
            ec_number = long_name.split('[EC:')[1].rstrip(']')
        else:
            ec_number = ''
        return {
            'function_id': function_id,
            'name': name,
            'long_name': long_name,
            'ec_number': ec_number
        }
    except Exception:
        # Log the offending line for the operator, then let the caller decide.
        _LOGGER.error(f"Could not parse: {line.rstrip()}. Are you sure it comes from KEGG KO list?")
        raise
nodes.dmp file consists of taxonomy nodes.
The description for each node includes the following fields:
tax_id -- node id in GenBank taxonomy database
parent tax_id -- parent node id in GenBank taxonomy database
rank -- rank of this node (superkingdom, kingdom, ...)
embl code -- locus-name prefix; not unique
division id -- see division.dmp file
inherited div flag (1 or 0) -- 1 if node inherits division from parent
genetic code id -- see gencode.dmp file
inherited GC flag (1 or 0) -- 1 if node inherits genetic code from parent
mitochondrial genetic code id -- see gencode.dmp file
inherited MGC flag (1 or 0) -- 1 if node inherits mitochondrial gencode from parent
GenBank hidden flag (1 or 0) -- 1 if name is suppressed in GenBank entry lineage
hidden subtree root flag (1 or 0) -- 1 if this subtree has no sequence data yet
comments -- free-text comments and citations
"""
elements = line.rstrip().split('|')
try:
parsed_line = {
"tax_id": elements[0].strip(),
"parent_tax_id": elements[1].strip(),
"rank": elements[2].strip(),
"embl_code": elements[3].strip(),
"division_id": elements[4].strip(),
"inherited_div_flag": elements[5].strip(),
"genetic_code_id": elements[6].strip(),
"inherited_GC_flag": elements[7].strip(),
"mitochondrial_genetic_code_id": elements[8].strip(),
"inherited_MGC_flag": elements[9].strip(),
"GenBank_hidden_flag": elements[10].strip(),
"hidden_subtree_root_flag": elements[11].strip(),
"comments": elements[12].strip()
}
return parsed_line
except Exception as e:
_LOGGER.error(f"Could not parse: {line.rstrip()}. Are you sure it comes from nodes.dmp file?")
raise(e)
class NCBITaxonomyLineParser(object):
@staticmethod
def node(line):
"""
parse line from ncbi nodes.dmp file
def parse_ncbi_taxonomy_name(line):
"""
parse line from ncbi names.dmp file
From documentation:
From documentation:
nodes.dmp file consists of taxonomy nodes.
The description for each node includes the following fields:
Taxonomy names file (names.dmp):
tax_id -- the id of node associated with this name
name_txt -- name itself
unique name -- the unique variant of this name if name not unique
name class -- (synonym, common name, ...)
"""
elements = line.rstrip().split('|')
try:
parsed_line = {
"tax_id": elements[0].strip(),
"name_txt": elements[1].strip(),
"unique_name": elements[2].strip(),
"name_class": elements[3].strip(),
}
return parsed_line
except Exception as e:
_LOGGER.error(f"Could not parse: {line.rstrip()}. Are you sure it comes from nodes.dmp file?")
raise(e)
tax_id -- node id in GenBank taxonomy database
parent tax_id -- parent node id in GenBank taxonomy database
rank -- rank of this node (superkingdom, kingdom, ...)
embl code -- locus-name prefix; not unique
division id -- see division.dmp file
inherited div flag (1 or 0) -- 1 if node inherits division from parent
genetic code id -- see gencode.dmp file
inherited GC flag (1 or 0) -- 1 if node inherits genetic code from parent
mitochondrial genetic code id -- see gencode.dmp file
inherited MGC flag (1 or 0) -- 1 if node inherits mitochondrial gencode from parent
GenBank hidden flag (1 or 0) -- 1 if name is suppressed in GenBank entry lineage
hidden subtree root flag (1 or 0) -- 1 if this subtree has no sequence data yet
comments -- free-text comments and citations
"""
elements = line.rstrip().split('|')
try:
return {
"tax_id": elements[0].strip(),
"parent_tax_id": elements[1].strip(),
"rank": elements[2].strip(),
"embl_code": elements[3].strip(),
"division_id": elements[4].strip(),
"inherited_div_flag": elements[5].strip(),
"genetic_code_id": elements[6].strip(),
"inherited_GC_flag": elements[7].strip(),
"mitochondrial_genetic_code_id": elements[8].strip(),
"inherited_MGC_flag": elements[9].strip(),
"GenBank_hidden_flag": elements[10].strip(),
"hidden_subtree_root_flag": elements[11].strip(),
"comments": elements[12].strip()
}
except:
_LOGGER.error(f"Could not parse: {line.rstrip()}. Are you sure it comes from nodes.dmp file?")
raise
@staticmethod
def name(line):
    """
    parse line from ncbi names.dmp file

    From documentation:

    Taxonomy names file (names.dmp):
        tax_id      -- the id of node associated with this name
        name_txt    -- name itself
        unique name -- the unique variant of this name if name not unique
        name class  -- (synonym, common name, ...)

    :param line: one '|'-separated line of names.dmp
    :return: dict with the keys ``tax_id``, ``name_txt``, ``unique_name`` and
             ``name_class``, each value stripped of surrounding whitespace
    :raises IndexError: when the line has fewer than four '|'-separated
                        fields (logged before re-raising)
    """
    elements = line.rstrip().split('|')
    try:
        return {
            "tax_id": elements[0].strip(),
            "name_txt": elements[1].strip(),
            "unique_name": elements[2].strip(),
            "name_class": elements[3].strip(),
        }
    except IndexError:
        # Too few fields: point the operator at the file this parser handles.
        _LOGGER.error(f"Could not parse: {line.rstrip()}. Are you sure it comes from names.dmp file?")
        raise
from unittest import TestCase
from metagenedb.utils.parsers import parse_ncbi_taxonomy_node, parse_ncbi_taxonomy_name
from metagenedb.utils.parsers import KEGGLineParser, NCBITaxonomyLineParser
class TestNCBITaxonomyNodeParser(TestCase):
class TestKEGGLineParser(TestCase):
def test_parse_ncbi_taxonomy_node(self):
def test_ko_list(self):
    """A well-formed KO line is split into id, name, long name and EC number."""
    # The parser separates the id from the description on a tab; spell it as an
    # explicit escape so editors or copy/paste cannot turn it into a space.
    ko_line = "ko:K00809\tDHPS, dys; deoxyhypusine synthase [EC:2.5.1.46]"
    expected_dict = {
        'function_id': "K00809",
        'name': "DHPS, dys",
        'long_name': "deoxyhypusine synthase [EC:2.5.1.46]",
        'ec_number': "2.5.1.46"
    }
    test_dict = KEGGLineParser.ko_list(ko_line)
    self.assertDictEqual(test_dict, expected_dict)
def test_ko_list_wrong_format(self):
    """A line without the ``ko:<id>`` prefix is rejected by the parser."""
    ko_line = "This is a wrong line format, with; information and tab"
    # The first column holds no ':' so indexing the split result fails;
    # asserting the precise type keeps unrelated errors from passing the test.
    with self.assertRaises(IndexError):
        KEGGLineParser.ko_list(ko_line)
class TestNCBITaxonomyLineParser(TestCase):
def test_node(self):
node_line = "6 | 335928 | genus | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | |\n"
expected_dict = {
"tax_id": "6",
......@@ -22,29 +41,26 @@ class TestNCBITaxonomyNodeParser(TestCase):
"hidden_subtree_root_flag": "0",
"comments": ""
}
test_dict = parse_ncbi_taxonomy_node(node_line)
test_dict = NCBITaxonomyLineParser.node(node_line)
self.assertDictEqual(test_dict, expected_dict)
def test_parse_wrong_line_format(self):
node_line = "This is a wrong line format."
def test_node_wrong_format(self):
node_line = "This is a wrong line format, with; information and tab"
with self.assertRaises(Exception) as context: # noqa
parse_ncbi_taxonomy_node(node_line)
class TestNCBITaxonomyNameParser(TestCase):
NCBITaxonomyLineParser.node(node_line)
def test_parse_ncbi_taxonomy_name(self):
node_line = "2 | Bacteria | Bacteria <prokaryotes> | scientific name |\n"
def test_name(self):
name_line = "2 | Bacteria | Bacteria <prokaryotes> | scientific name |\n"
expected_dict = {
"tax_id": "2",
"name_txt": "Bacteria",
"unique_name": "Bacteria <prokaryotes>",
"name_class": "scientific name",
}
test_dict = parse_ncbi_taxonomy_name(node_line)
test_dict = NCBITaxonomyLineParser.name(name_line)
self.assertDictEqual(test_dict, expected_dict)
def test_parse_wrong_line_format(self):
node_line = "This is a wrong line format."
def test_name_wrong_format(self):
name_line = "This is a wrong line format, with; information and tab"
with self.assertRaises(Exception) as context: # noqa
parse_ncbi_taxonomy_name(node_line)
NCBITaxonomyLineParser.name(name_line)
......@@ -8,6 +8,8 @@ import sys
import django
from django.core.exceptions import ValidationError
from metagenedb.utils.parsers import KEGGLineParser
# Before model import, we need to called django.setup() to Load apps
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "metagenedb.settings")
django.setup()
......@@ -31,29 +33,6 @@ def parse_arguments():
sys.exit(1)
def parse_ko(line):
    """
    Turn one tab-separated line of the kegg KO list into an organized dict.

    The first column is read as ``<prefix>:<function_id>`` and the second as
    ``<name>;<long_name>``, optionally ending with ``[EC:<ec_number>]``.
    Returns a dict with the keys ``function_id``, ``name``, ``long_name`` and
    ``ec_number``; the last two are empty strings when the line lacks them.
    """
    columns = line.split('\t')
    function_id = columns[0].split(':')[1]
    description = columns[1]

    if ';' not in description:
        _LOGGER.warning(f"Parsing issue with {function_id}, corresponding line: {line}")
        # Ugly fix to handle one specific case with no name: K23479
        short_name, long_part = description, ''
    else:
        pieces = description.split(';')
        short_name, long_part = pieces[0], pieces[1]

    # The EC number, when present, sits between '[EC:' and the closing bracket.
    ec_number = long_part.split('[EC:')[1].rstrip(']') if '[EC:' in long_part else ''

    return dict(
        function_id=function_id,
        name=short_name,
        long_name=long_part.lstrip(),
        ec_number=ec_number,
    )
def create_kegg_ko(kegg_ko):
try:
obj_kegg = KeggOrthology.objects.get(function_id=kegg_ko.get('function_id'))
......@@ -73,7 +52,7 @@ def run():
skipped_kegg = 0
total_kegg = len(all_ko.text.splitlines())
for line in all_ko.text.splitlines():
kegg_ko = parse_ko(line)
kegg_ko = KEGGLineParser.ko_list(line)
try:
create_kegg_ko(kegg_ko)
inserted_kegg += 1
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment