Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Metagenomics
metagenedb
Commits
25377fe4
Commit
25377fe4
authored
Aug 05, 2019
by
Kenzo-Hugo Hillion
♻
Browse files
reformat parser for IGC and add tests
parent
d2949072
Changes
13
Hide whitespace changes
Inline
Side-by-side
backend/metagenedb/api/catalog/views/__init__.py
View file @
25377fe4
from
.gene
import
GeneViewSet
# noqa
__all__
=
[
'GeneViewSet'
]
backend/metagenedb/apps/catalog/admin/__init__.py
View file @
25377fe4
from
.gene
import
GeneAdmin
from
.function
import
FunctionAdmin
,
KeggOrthologyAdmin
from
.taxonomy
import
TaxonomyAdmin
__all__
=
[
'GeneAdmin'
,
'FunctionAdmin'
,
'KeggOrthologyAdmin'
,
'TaxonomyAdmin'
]
from
.gene
import
GeneAdmin
# noqa
from
.function
import
FunctionAdmin
,
KeggOrthologyAdmin
# noqa
from
.taxonomy
import
TaxonomyAdmin
# noqa
backend/metagenedb/apps/catalog/models/__init__.py
View file @
25377fe4
from
.function
import
Function
,
KeggOrthology
from
.gene
import
Gene
from
.taxonomy
import
Taxonomy
__all__
=
[
'Function'
,
'KeggOrthology'
,
'Gene'
,
'Taxonomy'
]
from
.function
import
Function
,
KeggOrthology
# noqa
from
.gene
import
Gene
# noqa
from
.taxonomy
import
Taxonomy
# noqa
backend/metagenedb/apps/catalog/serializers/__init__.py
View file @
25377fe4
from
.function
import
FunctionSerializer
from
.gene
import
GeneSerializer
from
.taxonomy
import
TaxonomySerializer
__all__
=
[
'FunctionSerializer'
,
'GeneSerializer'
,
'TaxonomySerializer'
]
from
.function
import
FunctionSerializer
# noqa
from
.gene
import
GeneSerializer
# noqa
from
.taxonomy
import
TaxonomySerializer
# noqa
backend/metagenedb/common/utils/parsers/__init__.py
0 → 100644
View file @
25377fe4
from
.igc
import
IGCLineParser
# noqa
from
.kegg
import
KEGGLineParser
# noqa
from
.ncbi_taxonomy
import
NCBITaxonomyLineParser
# noqa
backend/metagenedb/common/utils/parsers/igc.py
0 → 100644
View file @
25377fe4
import
logging
logging
.
basicConfig
(
level
=
logging
.
INFO
)
_LOGGER
=
logging
.
getLogger
(
__name__
)
class IGCLineParser(object):
    """Parse single lines taken from IGC files into dictionaries."""

    # Keys of the returned dict, listed in the column order of the genes list.
    _GENE_COLUMNS = (
        'igc_id',
        'gene_id',
        'gene_length',
        'gene_completeness_status',
        'cohort_origin',
        'taxo_phylum',
        'taxo_genus',
        'kegg_ko',
        'eggnog',
        'sample_occurence_frequency',
        'individual_occurence_frequency',
        'kegg_functional_categories',
        'eggnog_functional_categories',
        'cohort_assembled',
    )

    @staticmethod
    def gene(line):
        """
        Parse line from IGC genes list to return organized dict

        IGC annotation columns:
            0: Gene ID - Unique ID
            1: Gene Name - Unique name
            2: Gene Length - Length of nucleotide sequence
            3: Gene Completeness Status - Stating a gene is complete or partial
               according to the gene predictor
            4: Cohort Origin - Stating the cohort contributing the representative gene
            5: Taxonomic Annotation(Phylum Level) - Annotated phylum for a gene
            6: Taxonomic Annotation(Genus Level) - Annotated genus for a gene
            7: KEGG Annotation - Annotated KO(s) for a gene
            8: eggNOG Annotation - Annotated eggNOG(s) for a gene
            9: Sample Occurence Frequency - Occurrence frequency in samples based on gene profile
            10: Individual Occurence Frequency - Occurrence frequency in individuals
                based on gene profile
            11: KEGG Functional Categories - KEGG functional category(ies) of the annotated KO(s)
            12: eggNOG Functional Categories - eggNOG functional category(ies) of the
                annotated eggNOG(s)
            13: Cohort Assembled - Stating the metagenomic sequencing cohort(s) contributing the
                representative gene or a redundant gene belonging to it

        :param line: one tab-separated record, with or without its line terminator
        :return: dict with one entry per column above; values are the raw strings
        :raises IndexError: if the line holds fewer than 14 tab-separated columns
        """
        try:
            # Strip the line terminator only: a bare rstrip() would also eat
            # trailing tabs, i.e. silently drop empty columns at the end of the record.
            gene_info = line.rstrip('\r\n').split('\t')
            # Trailing blanks on the final field were never part of the value.
            gene_info[-1] = gene_info[-1].rstrip()
            if len(gene_info) < len(IGCLineParser._GENE_COLUMNS):
                raise IndexError(
                    f"expected {len(IGCLineParser._GENE_COLUMNS)} columns, got {len(gene_info)}"
                )
            # zip() stops at the shortest input, so extra columns are ignored.
            return dict(zip(IGCLineParser._GENE_COLUMNS, gene_info))
        except Exception:
            _LOGGER.error(f"Could not parse: {line.rstrip()}. Are you sure it comes from IGC genes list?")
            raise
backend/metagenedb/common/utils/parsers/kegg.py
0 → 100644
View file @
25377fe4
import
logging
logging
.
basicConfig
(
level
=
logging
.
INFO
)
_LOGGER
=
logging
.
getLogger
(
__name__
)
class KEGGLineParser(object):
    """Parse single lines taken from KEGG listings into dictionaries."""

    @staticmethod
    def ko_list(line):
        """
        Parse line from kegg KO list (http://rest.kegg.jp/list/ko) to return organized dict

        The line is expected to look like ``<prefix>:<id>\\t<name>; <long name> [EC:<number>]``.

        :param line: one tab-separated record, with or without its line terminator
        :return: dict with keys 'function_id', 'name', 'long_name' and 'ec_number';
                 'ec_number' is '' when the long name carries no '[EC:' tag
        :raises IndexError: if the line has no tab or the first column has no ':'
        """
        try:
            # Remove the line terminator first; otherwise it ends up inside
            # long_name and prevents the closing ']' of the EC tag from being stripped.
            elements = line.rstrip('\r\n').split('\t')
            function_id = elements[0].split(':')[1]
            if ';' in elements[1]:
                # Split on the first ';' only so a long name containing ';' stays whole.
                names = elements[1].split(';', 1)
            else:
                _LOGGER.warning(f"Parsing issue with {function_id}, corresponding line: {line}")
                names = [elements[1], '']  # Ugly fix to handle one specific case with no name: K23479
            if '[EC:' in names[1]:
                ec_number = names[1].split('[EC:')[1].rstrip(']')
            else:
                ec_number = ''
            return {
                'function_id': function_id,
                'name': names[0],
                'long_name': names[1].lstrip(),
                'ec_number': ec_number
            }
        except Exception:
            _LOGGER.error(f"Could not parse: {line.rstrip()}. Are you sure it comes from KEGG KO list?")
            raise
backend/metagenedb/common/utils/parsers.py
→
backend/metagenedb/common/utils/parsers
/ncbi_taxonomy
.py
View file @
25377fe4
...
...
@@ -4,36 +4,6 @@ logging.basicConfig(level=logging.INFO)
_LOGGER
=
logging
.
getLogger
(
__name__
)
class
KEGGLineParser
(
object
):
@
staticmethod
def
ko_list
(
line
):
"""
Parse line from kegg KO list (http://rest.kegg.jp/list/ko) to return organized dict
"""
try
:
elements
=
line
.
split
(
'
\t
'
)
function_id
=
elements
[
0
].
split
(
':'
)[
1
]
if
';'
in
elements
[
1
]:
names
=
elements
[
1
].
split
(
';'
)
else
:
_LOGGER
.
warning
(
f
"Parsing issue with
{
function_id
}
, corresponding line:
{
line
}
"
)
names
=
[
elements
[
1
],
''
]
# Ugly fix to handle one specific case with no name: K23479
if
'[EC:'
in
names
[
1
]:
ec_number
=
names
[
1
].
split
(
'[EC:'
)[
1
].
rstrip
(
']'
)
else
:
ec_number
=
''
return
{
'function_id'
:
function_id
,
'name'
:
names
[
0
],
'long_name'
:
names
[
1
].
lstrip
(),
'ec_number'
:
ec_number
}
except
Exception
:
_LOGGER
.
error
(
f
"Could not parse:
{
line
.
rstrip
()
}
. Are you sure it comes from KEGG KO list?"
)
raise
class
NCBITaxonomyLineParser
(
object
):
@
staticmethod
...
...
backend/metagenedb/common/utils/parsers/test_igc.py
0 → 100644
View file @
25377fe4
from
unittest
import
TestCase
from
metagenedb.common.utils.parsers
import
IGCLineParser
class TestIGCLineParser(TestCase):
    """Unit tests for IGCLineParser.gene."""

    def test_gene(self):
        # A well-formed 14-column line: every column must land under its key.
        raw_data = [
            'gene_id',
            'gene_name',
            'gene_length',
            'gene_completeness_status',
            'cohort_origin',
            'taxo_phylum',
            'taxo_genus',
            'kegg',
            'eggnog',
            'sample_occurence_freq',
            'ind_occurence_freq',
            'kegg_functional_cat',
            'eggnog_functional_cat',
            'cohort_assembled'
        ]
        raw_line = "\t".join(raw_data)
        expected_dict = {
            'igc_id': raw_data[0],
            'gene_id': raw_data[1],
            'gene_length': raw_data[2],
            'gene_completeness_status': raw_data[3],
            'cohort_origin': raw_data[4],
            'taxo_phylum': raw_data[5],
            'taxo_genus': raw_data[6],
            'kegg_ko': raw_data[7],
            'eggnog': raw_data[8],
            'sample_occurence_frequency': raw_data[9],
            'individual_occurence_frequency': raw_data[10],
            'kegg_functional_categories': raw_data[11],
            'eggnog_functional_categories': raw_data[12],
            'cohort_assembled': raw_data[13]
        }
        test_dict = IGCLineParser.gene(raw_line)
        self.assertDictEqual(test_dict, expected_dict)

    def test_gene_wrong_format(self):
        # Too few columns: the parser logs and re-raises the IndexError.
        # Asserting the specific type keeps unrelated failures (e.g. a
        # NameError inside the parser) from passing this test.
        raw_line = "This is a wrong line format, with; information and tab"
        with self.assertRaises(IndexError):
            IGCLineParser.gene(raw_line)
backend/metagenedb/common/utils/parsers/test_kegg.py
0 → 100644
View file @
25377fe4
from
unittest
import
TestCase
from
metagenedb.common.utils.parsers
import
KEGGLineParser
class TestKEGGLineParser(TestCase):
    """Unit tests for KEGGLineParser.ko_list."""

    def test_ko_list(self):
        # Regular entry: id, short name, long name and EC number are all extracted.
        ko_line = "ko:K00809\tDHPS, dys; deoxyhypusine synthase [EC:2.5.1.46]"
        expected_dict = {
            'function_id': "K00809",
            'name': "DHPS, dys",
            'long_name': "deoxyhypusine synthase [EC:2.5.1.46]",
            'ec_number': "2.5.1.46"
        }
        test_dict = KEGGLineParser.ko_list(ko_line)
        self.assertDictEqual(test_dict, expected_dict)

    def test_ko_list_wrong_format(self):
        # No '<prefix>:<id>' first column: the parser logs and re-raises the
        # IndexError. Asserting the specific type keeps unrelated failures
        # from passing this test.
        ko_line = "This is a wrong line format, with; information and tab"
        with self.assertRaises(IndexError):
            KEGGLineParser.ko_list(ko_line)
backend/metagenedb/common/utils/
test_
parsers.py
→
backend/metagenedb/common/utils/parsers
/test_ncbi_taxonomy
.py
View file @
25377fe4
from
unittest
import
TestCase
from
metagenedb.common.utils.parsers
import
KEGGLineParser
,
NCBITaxonomyLineParser
class
TestKEGGLineParser
(
TestCase
):
def
test_ko_list
(
self
):
ko_line
=
"ko:K00809 DHPS, dys; deoxyhypusine synthase [EC:2.5.1.46]"
expected_dict
=
{
'function_id'
:
"K00809"
,
'name'
:
"DHPS, dys"
,
'long_name'
:
"deoxyhypusine synthase [EC:2.5.1.46]"
,
'ec_number'
:
"2.5.1.46"
}
test_dict
=
KEGGLineParser
.
ko_list
(
ko_line
)
self
.
assertDictEqual
(
test_dict
,
expected_dict
)
def
test_ko_list_wrong_format
(
self
):
ko_line
=
"This is a wrong line format, with; information and tab"
with
self
.
assertRaises
(
Exception
)
as
context
:
# noqa
KEGGLineParser
.
ko_list
(
ko_line
)
from
metagenedb.common.utils.parsers
import
NCBITaxonomyLineParser
class
TestNCBITaxonomyLineParser
(
TestCase
):
...
...
backend/scripts/populate_db/import_igc_data.py
View file @
25377fe4
...
...
@@ -8,6 +8,8 @@ from itertools import islice
import
django
from
rest_framework.exceptions
import
ValidationError
from
metagenedb.common.utils.parsers
import
IGCLineParser
# Before importing models, we need to call django.setup() to load the apps
os
.
environ
.
setdefault
(
"DJANGO_SETTINGS_MODULE"
,
"metagenedb.settings"
)
django
.
setup
()
...
...
@@ -18,32 +20,17 @@ from metagenedb.apps.catalog.serializers import GeneSerializer # noqa
logging
.
basicConfig
(
level
=
logging
.
INFO
)
_LOGGER
=
logging
.
getLogger
(
__name__
)
SELECTED_KEYS
=
[
'gene_id'
,
'gene_length'
,
'kegg_ko'
]
def
parse_gene
(
raw_line
):
def
parse_gene
(
raw_line
,
selected_keys
=
SELECTED_KEYS
):
"""
IGC annotation columns:
0: Gene ID Unique ID
1: Gene Name Unique name
2: Gene Length Length of nucleotide sequence
3: Gene Completeness Status Stating a gene is complete or partial according to the gene predictor
4: Cohort Origin Stating the cohort contributing the representative gene
5: Taxonomic Annotation(Phylum Level) Annotated phylum for a gene
6: Taxonomic Annotation(Genus Level) Annotated genus for a gene
7: KEGG Annotation Annotated KO(s) for a gene
8: eggNOG Annotation Annotated eggNOG(s) for a gene
9: Sample Occurence Frequency Occurrence frequency in samples based on gene profile
10:Individual Occurence Frequency Occurrence frequency in individuals based on gene profile
11: KEGG Functional Categories KEGG functional category(ies) of the annotated KO(s)
12: eggNOG Functional Categories eggNOG functional category(ies) of the annotated eggNOG(s)
13: Cohort Assembled Stating the metagenomic sequencing cohort(s) contributing the
representative gene or a redundant gene belonging to it
Use IGCLineParser and return selected keys
"""
gene_info
=
raw_line
.
rstrip
().
split
(
'
\t
'
)
return
{
'gene_id'
:
gene_info
[
1
],
'gene_length'
:
gene_info
[
2
],
'kegg_ko'
:
gene_info
[
7
]
}
gene_parser
=
IGCLineParser
()
all_dict
=
gene_parser
.
gene
(
raw_line
)
selected_dict
=
{
k
:
v
for
k
,
v
in
all_dict
.
items
()
if
k
in
selected_keys
}
return
selected_dict
def
upsert_gene
(
gene_dict
):
...
...
backend/scripts/populate_db/test_import_igc_data.py
View file @
25377fe4
...
...
@@ -9,7 +9,7 @@ from scripts.populate_db.import_igc_data import parse_gene, upsert_gene
class
TestParseGene
(
TestCase
):
def
test_parse_gene
(
self
):
def
setUp
(
self
):
raw_data
=
[
'gene_id'
,
'gene_name'
,
...
...
@@ -26,13 +26,42 @@ class TestParseGene(TestCase):
'eggnog_functional_cat'
,
'cohort_assembled'
]
raw_line
=
"
\t
"
.
join
(
raw_data
)
self
.
raw_line
=
"
\t
"
.
join
(
raw_data
)
def
test_parse_gene_default_selected_keys
(
self
):
"""
This test should failed and need to be updated when SELECTED_KEYS are changed
"""
expected_dict
=
{
'gene_id'
:
'gene_name'
,
# We use the gene name for our gene ID
'gene_id'
:
'gene_name'
,
'gene_length'
:
'gene_length'
,
'kegg_ko'
:
'kegg'
}
tested_dict
=
parse_gene
(
raw_line
)
tested_dict
=
parse_gene
(
self
.
raw_line
)
self
.
assertDictEqual
(
tested_dict
,
expected_dict
)
def
test_parse_gene
(
self
):
"""
This test should failed and need to be updated when SELECTED_KEYS are changed
"""
selected_keys
=
[
'gene_id'
,
'gene_length'
]
expected_dict
=
{
'gene_id'
:
'gene_name'
,
'gene_length'
:
'gene_length'
}
tested_dict
=
parse_gene
(
self
.
raw_line
,
selected_keys
=
selected_keys
)
self
.
assertDictEqual
(
tested_dict
,
expected_dict
)
def
test_parse_gene_unknown_key
(
self
):
"""
Unknown key should be ignored
"""
selected_keys
=
[
'gene_id'
,
'gene_length'
,
'secret_code'
]
expected_dict
=
{
'gene_id'
:
'gene_name'
,
'gene_length'
:
'gene_length'
}
tested_dict
=
parse_gene
(
self
.
raw_line
,
selected_keys
=
selected_keys
)
self
.
assertDictEqual
(
tested_dict
,
expected_dict
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment