Commit 25377fe4 authored by Kenzo-Hugo Hillion
Browse files

reformat parser for IGC and add tests

parent d2949072
from .gene import GeneViewSet # noqa
__all__ = ['GeneViewSet']
from .gene import GeneAdmin
from .function import FunctionAdmin, KeggOrthologyAdmin
from .taxonomy import TaxonomyAdmin
__all__ = ['GeneAdmin', 'FunctionAdmin', 'KeggOrthologyAdmin', 'TaxonomyAdmin']
from .gene import GeneAdmin # noqa
from .function import FunctionAdmin, KeggOrthologyAdmin # noqa
from .taxonomy import TaxonomyAdmin # noqa
from .function import Function, KeggOrthology
from .gene import Gene
from .taxonomy import Taxonomy
__all__ = ['Function', 'KeggOrthology', 'Gene', 'Taxonomy']
from .function import Function, KeggOrthology # noqa
from .gene import Gene # noqa
from .taxonomy import Taxonomy # noqa
from .function import FunctionSerializer
from .gene import GeneSerializer
from .taxonomy import TaxonomySerializer
__all__ = ['FunctionSerializer', 'GeneSerializer', 'TaxonomySerializer']
from .function import FunctionSerializer # noqa
from .gene import GeneSerializer # noqa
from .taxonomy import TaxonomySerializer # noqa
from .igc import IGCLineParser # noqa
from .kegg import KEGGLineParser # noqa
from .ncbi_taxonomy import NCBITaxonomyLineParser # noqa
import logging
logging.basicConfig(level=logging.INFO)
_LOGGER = logging.getLogger(__name__)
class IGCLineParser(object):
    """Parser for lines of the IGC genes list (tab-separated annotation table)."""

    # Keys of the returned dict, listed in the column order of the IGC file.
    _GENE_KEYS = (
        'igc_id',
        'gene_id',
        'gene_length',
        'gene_completeness_status',
        'cohort_origin',
        'taxo_phylum',
        'taxo_genus',
        'kegg_ko',
        'eggnog',
        'sample_occurence_frequency',
        'individual_occurence_frequency',
        'kegg_functional_categories',
        'eggnog_functional_categories',
        'cohort_assembled',
    )

    @staticmethod
    def gene(line):
        """
        Parse line from IGC genes list to return organized dict

        IGC annotation columns:
            0: Gene ID                              Unique ID
            1: Gene Name                            Unique name
            2: Gene Length                          Length of nucleotide sequence
            3: Gene Completeness Status             Stating a gene is complete or partial according to the gene predictor
            4: Cohort Origin                        Stating the cohort contributing the representative gene
            5: Taxonomic Annotation(Phylum Level)   Annotated phylum for a gene
            6: Taxonomic Annotation(Genus Level)    Annotated genus for a gene
            7: KEGG Annotation                      Annotated KO(s) for a gene
            8: eggNOG Annotation                    Annotated eggNOG(s) for a gene
            9: Sample Occurence Frequency           Occurrence frequency in samples based on gene profile
            10: Individual Occurence Frequency      Occurrence frequency in individuals based on gene profile
            11: KEGG Functional Categories          KEGG functional category(ies) of the annotated KO(s)
            12: eggNOG Functional Categories        eggNOG functional category(ies) of the annotated eggNOG(s)
            13: Cohort Assembled                    Stating the metagenomic sequencing cohort(s) contributing the
                                                    representative gene or a redundant gene belonging to it

        :param line: one tab-separated line of the IGC genes list
        :return: dict with one entry per column above; every value is the raw
            column text (str), columns past the 14th are ignored
        :raises IndexError: when the line holds fewer than 14 columns
            (the failure is logged before being re-raised)
        """
        try:
            # Strip only the line terminator: a bare rstrip() would also eat
            # trailing tabs and silently drop empty columns at the end of the line.
            gene_info = line.rstrip('\r\n').split('\t')
            # Trailing whitespace of the last field is still removed, so lines
            # that parsed before give exactly the same values.
            gene_info[-1] = gene_info[-1].rstrip()
            return {
                key: gene_info[index]
                for index, key in enumerate(IGCLineParser._GENE_KEYS)
            }
        except Exception:
            _LOGGER.error(f"Could not parse: {line.rstrip()}. Are you sure it comes from IGC genes list?")
            raise
import logging
logging.basicConfig(level=logging.INFO)
_LOGGER = logging.getLogger(__name__)
class KEGGLineParser(object):
    """Parser for lines of the KEGG KO list."""

    @staticmethod
    def ko_list(line):
        """
        Parse line from kegg KO list (http://rest.kegg.jp/list/ko) to return organized dict

        Expected layout is ``ko:<id><TAB><names>; <long name> [EC:<numbers>]``.

        :param line: one line of the KO list, with or without its line terminator
        :return: dict with keys ``function_id``, ``name``, ``long_name`` and
            ``ec_number`` (empty string when the line carries no ``[EC:...]`` block)
        :raises IndexError: when the tab separator or the ``ko:`` prefix is missing
            (the failure is logged before being re-raised)
        """
        try:
            # Remove the line terminator first: left in place it ends up in
            # long_name and keeps the closing ']' of the EC block from being stripped.
            elements = line.rstrip('\r\n').split('\t')
            function_id = elements[0].split(':')[1]
            if ';' in elements[1]:
                # Split on the first ';' only, so a ';' inside the long name is kept.
                name, long_name = elements[1].split(';', 1)
            else:
                _LOGGER.warning(f"Parsing issue with {function_id}, corresponding line: {line}")
                # Ugly fix to handle one specific case with no name: K23479
                name, long_name = elements[1], ''
            if '[EC:' in long_name:
                ec_number = long_name.split('[EC:')[1].rstrip(']')
            else:
                ec_number = ''
            return {
                'function_id': function_id,
                'name': name,
                'long_name': long_name.lstrip(),
                'ec_number': ec_number
            }
        except Exception:
            _LOGGER.error(f"Could not parse: {line.rstrip()}. Are you sure it comes from KEGG KO list?")
            raise
......@@ -4,36 +4,6 @@ logging.basicConfig(level=logging.INFO)
_LOGGER = logging.getLogger(__name__)
class KEGGLineParser(object):
    """Parser for lines of the KEGG KO list."""

    @staticmethod
    def ko_list(line):
        """
        Parse line from kegg KO list (http://rest.kegg.jp/list/ko) to return organized dict

        Expected layout is ``ko:<id><TAB><names>; <long name> [EC:<numbers>]``.

        :param line: one line of the KO list, with or without its line terminator
        :return: dict with keys ``function_id``, ``name``, ``long_name`` and
            ``ec_number`` (empty string when the line carries no ``[EC:...]`` block)
        :raises IndexError: when the tab separator or the ``ko:`` prefix is missing
            (the failure is logged before being re-raised)
        """
        try:
            # Remove the line terminator first: left in place it ends up in
            # long_name and keeps the closing ']' of the EC block from being stripped.
            elements = line.rstrip('\r\n').split('\t')
            function_id = elements[0].split(':')[1]
            if ';' in elements[1]:
                # Split on the first ';' only, so a ';' inside the long name is kept.
                name, long_name = elements[1].split(';', 1)
            else:
                _LOGGER.warning(f"Parsing issue with {function_id}, corresponding line: {line}")
                # Ugly fix to handle one specific case with no name: K23479
                name, long_name = elements[1], ''
            if '[EC:' in long_name:
                ec_number = long_name.split('[EC:')[1].rstrip(']')
            else:
                ec_number = ''
            return {
                'function_id': function_id,
                'name': name,
                'long_name': long_name.lstrip(),
                'ec_number': ec_number
            }
        except Exception:
            _LOGGER.error(f"Could not parse: {line.rstrip()}. Are you sure it comes from KEGG KO list?")
            raise
class NCBITaxonomyLineParser(object):
@staticmethod
......
from unittest import TestCase
from metagenedb.common.utils.parsers import IGCLineParser
class TestIGCLineParser(TestCase):
    """Unit tests for IGCLineParser.gene."""

    def test_gene(self):
        """Each of the 14 tab-separated columns ends up under its own key."""
        columns = [
            'gene_id',
            'gene_name',
            'gene_length',
            'gene_completeness_status',
            'cohort_origin',
            'taxo_phylum',
            'taxo_genus',
            'kegg',
            'eggnog',
            'sample_occurence_freq',
            'ind_occurence_freq',
            'kegg_functional_cat',
            'eggnog_functional_cat',
            'cohort_assembled',
        ]
        # Keys expected in the parsed dict, in the same order as the columns.
        output_keys = [
            'igc_id',
            'gene_id',
            'gene_length',
            'gene_completeness_status',
            'cohort_origin',
            'taxo_phylum',
            'taxo_genus',
            'kegg_ko',
            'eggnog',
            'sample_occurence_frequency',
            'individual_occurence_frequency',
            'kegg_functional_categories',
            'eggnog_functional_categories',
            'cohort_assembled',
        ]
        expected = dict(zip(output_keys, columns))
        parsed = IGCLineParser.gene("\t".join(columns))
        self.assertDictEqual(parsed, expected)

    def test_gene_wrong_format(self):
        """A line that is not in the IGC column layout makes the parser raise."""
        bad_line = "This is a wrong line format, with; information and tab"
        with self.assertRaises(Exception):
            IGCLineParser.gene(bad_line)
from unittest import TestCase
from metagenedb.common.utils.parsers import KEGGLineParser
class TestKEGGLineParser(TestCase):
    """Unit tests for KEGGLineParser.ko_list."""

    def test_ko_list(self):
        """A regular KO line is split into id, name, long name and EC number."""
        # The separator between the KO id and the names must be a tab, since
        # the parser splits on '\t'; it is spelled as an escape so it cannot be
        # silently turned into a space by an editor.
        ko_line = "ko:K00809\tDHPS, dys; deoxyhypusine synthase [EC:2.5.1.46]"
        expected_dict = {
            'function_id': "K00809",
            'name': "DHPS, dys",
            'long_name': "deoxyhypusine synthase [EC:2.5.1.46]",
            'ec_number': "2.5.1.46"
        }
        test_dict = KEGGLineParser.ko_list(ko_line)
        self.assertDictEqual(test_dict, expected_dict)

    def test_ko_list_wrong_format(self):
        """A line that is not in the KO list layout makes the parser raise."""
        ko_line = "This is a wrong line format, with; information and tab"
        with self.assertRaises(Exception):
            KEGGLineParser.ko_list(ko_line)
from unittest import TestCase
from metagenedb.common.utils.parsers import KEGGLineParser, NCBITaxonomyLineParser
class TestKEGGLineParser(TestCase):
    """Unit tests for KEGGLineParser.ko_list."""

    def test_ko_list(self):
        """A regular KO line is split into id, name, long name and EC number."""
        # The separator between the KO id and the names must be a tab, since
        # the parser splits on '\t'; it is spelled as an escape so it cannot be
        # silently turned into a space by an editor.
        ko_line = "ko:K00809\tDHPS, dys; deoxyhypusine synthase [EC:2.5.1.46]"
        expected_dict = {
            'function_id': "K00809",
            'name': "DHPS, dys",
            'long_name': "deoxyhypusine synthase [EC:2.5.1.46]",
            'ec_number': "2.5.1.46"
        }
        test_dict = KEGGLineParser.ko_list(ko_line)
        self.assertDictEqual(test_dict, expected_dict)

    def test_ko_list_wrong_format(self):
        """A line that is not in the KO list layout makes the parser raise."""
        ko_line = "This is a wrong line format, with; information and tab"
        with self.assertRaises(Exception):
            KEGGLineParser.ko_list(ko_line)
from metagenedb.common.utils.parsers import NCBITaxonomyLineParser
class TestNCBITaxonomyLineParser(TestCase):
......
......@@ -8,6 +8,8 @@ from itertools import islice
import django
from rest_framework.exceptions import ValidationError
from metagenedb.common.utils.parsers import IGCLineParser
# Before importing the models, we need to call django.setup() to load the apps
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "metagenedb.settings")
django.setup()
......@@ -18,32 +20,17 @@ from metagenedb.apps.catalog.serializers import GeneSerializer # noqa
logging.basicConfig(level=logging.INFO)
_LOGGER = logging.getLogger(__name__)
SELECTED_KEYS = ['gene_id', 'gene_length', 'kegg_ko']
def parse_gene(raw_line, selected_keys=SELECTED_KEYS):
    """
    Use IGCLineParser and return selected keys

    :param raw_line: one tab-separated line of the IGC genes list
    :param selected_keys: keys of the parsed dict to keep (defaults to
        SELECTED_KEYS); names that the parser does not produce are ignored
    :return: dict restricted to the selected keys
    """
    # gene() is a static method, so no parser instance is needed.
    all_dict = IGCLineParser.gene(raw_line)
    return {key: value for key, value in all_dict.items() if key in selected_keys}
def upsert_gene(gene_dict):
......
......@@ -9,7 +9,7 @@ from scripts.populate_db.import_igc_data import parse_gene, upsert_gene
class TestParseGene(TestCase):
def test_parse_gene(self):
def setUp(self):
raw_data = [
'gene_id',
'gene_name',
......@@ -26,13 +26,42 @@ class TestParseGene(TestCase):
'eggnog_functional_cat',
'cohort_assembled'
]
raw_line = "\t".join(raw_data)
self.raw_line = "\t".join(raw_data)
def test_parse_gene_default_selected_keys(self):
    """
    This test is expected to fail, and must be updated, whenever SELECTED_KEYS changes
    """
    expected_dict = {
        'gene_id': 'gene_name',  # We use the gene name for our gene ID
        'gene_length': 'gene_length',
        'kegg_ko': 'kegg'
    }
    tested_dict = parse_gene(self.raw_line)
    self.assertDictEqual(tested_dict, expected_dict)
def test_parse_gene(self):
    """
    Explicitly passed selected_keys should be the only keys returned
    """
    selected_keys = ['gene_id', 'gene_length']
    expected_dict = {
        'gene_id': 'gene_name',
        'gene_length': 'gene_length'
    }
    tested_dict = parse_gene(self.raw_line, selected_keys=selected_keys)
    self.assertDictEqual(tested_dict, expected_dict)
def test_parse_gene_unknown_key(self):
    """
    A requested key that the parser does not produce is left out of the result
    """
    requested = ['gene_id', 'gene_length', 'secret_code']
    result = parse_gene(self.raw_line, selected_keys=requested)
    self.assertDictEqual(
        result,
        {'gene_id': 'gene_name', 'gene_length': 'gene_length'},
    )
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment