Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Metagenomics
metagenedb
Commits
fd935b95
Commit
fd935b95
authored
Aug 05, 2019
by
Kenzo-Hugo Hillion
♻
Browse files
Merge branch '40-import-taxo-genes' into 'master'
Add taxonomy information to IGC genes Closes #40 See merge request
!8
parents
d2949072
f5da1cca
Pipeline
#13541
passed with stages
in 1 minute and 58 seconds
Changes
17
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
backend/metagenedb/api/catalog/views/__init__.py
View file @
fd935b95
from
.gene
import
GeneViewSet
# noqa
__all__
=
[
'GeneViewSet'
]
backend/metagenedb/apps/catalog/admin/__init__.py
View file @
fd935b95
from
.gene
import
GeneAdmin
from
.function
import
FunctionAdmin
,
KeggOrthologyAdmin
from
.taxonomy
import
TaxonomyAdmin
__all__
=
[
'GeneAdmin'
,
'FunctionAdmin'
,
'KeggOrthologyAdmin'
,
'TaxonomyAdmin'
]
from
.gene
import
GeneAdmin
# noqa
from
.function
import
FunctionAdmin
,
KeggOrthologyAdmin
# noqa
from
.taxonomy
import
TaxonomyAdmin
# noqa
backend/metagenedb/apps/catalog/admin/gene.py
View file @
fd935b95
...
...
@@ -6,9 +6,17 @@ from metagenedb.apps.catalog.models import Gene
@
admin
.
register
(
Gene
)
class
GeneAdmin
(
admin
.
ModelAdmin
):
list_display
=
(
'gene_id'
,
'gene_length'
,
'get_functions'
)
list_display
=
(
'gene_id'
,
'gene_length'
,
'get_functions'
,
'get_taxonomy'
)
search_fields
=
(
'gene_id'
,)
def
get_functions
(
self
,
obj
):
return
","
.
join
([
str
(
f
)
for
f
in
obj
.
functions
.
all
()])
if
obj
.
functions
.
all
():
return
","
.
join
([
str
(
f
)
for
f
in
obj
.
functions
.
all
()])
return
'-'
get_functions
.
short_description
=
'Functions'
def
get_taxonomy
(
self
,
obj
):
if
obj
.
taxonomy
:
return
f
"
{
obj
.
taxonomy
}
(
{
obj
.
taxonomy
.
rank
}
)"
return
'-'
get_taxonomy
.
short_description
=
'Taxonomy'
backend/metagenedb/apps/catalog/migrations/0006_gene_taxonomy.py
0 → 100644
View file @
fd935b95
# Generated by Django 2.2.1 on 2019-08-05 13:45
from
django.db
import
migrations
,
models
import
django.db.models.deletion
class
Migration
(
migrations
.
Migration
):
dependencies
=
[
(
'catalog'
,
'0005_gene_ordering'
),
]
operations
=
[
migrations
.
AddField
(
model_name
=
'gene'
,
name
=
'taxonomy'
,
field
=
models
.
ForeignKey
(
blank
=
True
,
null
=
True
,
on_delete
=
django
.
db
.
models
.
deletion
.
SET_NULL
,
related_name
=
'genes'
,
to
=
'catalog.Taxonomy'
),
),
]
backend/metagenedb/apps/catalog/models/__init__.py
View file @
fd935b95
from
.function
import
Function
,
KeggOrthology
from
.gene
import
Gene
from
.taxonomy
import
Taxonomy
__all__
=
[
'Function'
,
'KeggOrthology'
,
'Gene'
,
'Taxonomy'
]
from
.function
import
Function
,
KeggOrthology
# noqa
from
.gene
import
Gene
# noqa
from
.taxonomy
import
Taxonomy
# noqa
backend/metagenedb/apps/catalog/models/gene.py
View file @
fd935b95
...
...
@@ -7,6 +7,11 @@ class Gene(models.Model):
gene_id
=
models
.
CharField
(
max_length
=
100
,
unique
=
True
,
db_index
=
True
)
gene_length
=
models
.
IntegerField
()
functions
=
models
.
ManyToManyField
(
Function
)
taxonomy
=
models
.
ForeignKey
(
'Taxonomy'
,
related_name
=
'genes'
,
on_delete
=
models
.
SET_NULL
,
null
=
True
,
blank
=
True
)
def
__str__
(
self
):
return
self
.
gene_id
...
...
backend/metagenedb/apps/catalog/serializers/__init__.py
View file @
fd935b95
from
.function
import
FunctionSerializer
from
.gene
import
GeneSerializer
from
.taxonomy
import
TaxonomySerializer
__all__
=
[
'FunctionSerializer'
,
'GeneSerializer'
,
'TaxonomySerializer'
]
from
.function
import
FunctionSerializer
# noqa
from
.gene
import
GeneSerializer
# noqa
from
.taxonomy
import
TaxonomySerializer
# noqa
backend/metagenedb/apps/catalog/serializers/gene.py
View file @
fd935b95
from
rest_framework
import
serializers
from
metagenedb.apps.catalog.models
import
Gene
from
metagenedb.apps.catalog.models
import
Gene
,
Taxonomy
from
metagenedb.apps.catalog.serializers
import
FunctionSerializer
class
GeneSerializer
(
serializers
.
ModelSerializer
):
functions
=
FunctionSerializer
(
many
=
True
,
read_only
=
True
)
taxonomy
=
serializers
.
SlugRelatedField
(
queryset
=
Taxonomy
.
objects
.
all
(),
slug_field
=
'tax_id'
,
required
=
False
,
)
class
Meta
:
model
=
Gene
fields
=
(
'gene_id'
,
'gene_length'
,
'functions'
)
fields
=
(
'gene_id'
,
'gene_length'
,
'functions'
,
'taxonomy'
)
backend/metagenedb/common/utils/parsers/__init__.py
0 → 100644
View file @
fd935b95
from
.igc
import
IGCLineParser
# noqa
from
.kegg
import
KEGGLineParser
# noqa
from
.ncbi_taxonomy
import
NCBITaxonomyLineParser
# noqa
backend/metagenedb/common/utils/parsers/igc.py
0 → 100644
View file @
fd935b95
import
logging
logging
.
basicConfig
(
level
=
logging
.
INFO
)
_LOGGER
=
logging
.
getLogger
(
__name__
)
class
IGCLineParser
(
object
):
@
staticmethod
def
gene
(
line
):
"""
Parse a line from the IGC genes list and return an organized dict
IGC annotation columns:
0: Gene ID Unique ID
1: Gene Name Unique name
2: Gene Length Length of nucleotide sequence
3: Gene Completeness Status Stating a gene is complete or partial according to the gene predictor
4: Cohort Origin Stating the cohort contributing the representative gene
5: Taxonomic Annotation(Phylum Level) Annotated phylum for a gene
6: Taxonomic Annotation(Genus Level) Annotated genus for a gene
7: KEGG Annotation Annotated KO(s) for a gene
8: eggNOG Annotation Annotated eggNOG(s) for a gene
9: Sample Occurence Frequency Occurrence frequency in samples based on gene profile
10: Individual Occurence Frequency Occurrence frequency in individuals based on gene profile
11: KEGG Functional Categories KEGG functional category(ies) of the annotated KO(s)
12: eggNOG Functional Categories eggNOG functional category(ies) of the annotated eggNOG(s)
13: Cohort Assembled Stating the metagenomic sequencing cohort(s) contributing the
representative gene or a redundant gene belonging to it
"""
try
:
gene_info
=
line
.
rstrip
().
split
(
'
\t
'
)
return
{
'igc_id'
:
gene_info
[
0
],
'gene_id'
:
gene_info
[
1
],
'gene_length'
:
gene_info
[
2
],
'gene_completeness_status'
:
gene_info
[
3
],
'cohort_origin'
:
gene_info
[
4
],
'taxo_phylum'
:
gene_info
[
5
],
'taxo_genus'
:
gene_info
[
6
],
'kegg_ko'
:
gene_info
[
7
],
'eggnog'
:
gene_info
[
8
],
'sample_occurence_frequency'
:
gene_info
[
9
],
'individual_occurence_frequency'
:
gene_info
[
10
],
'kegg_functional_categories'
:
gene_info
[
11
],
'eggnog_functional_categories'
:
gene_info
[
12
],
'cohort_assembled'
:
gene_info
[
13
]
}
except
Exception
:
_LOGGER
.
error
(
f
"Could not parse:
{
line
.
rstrip
()
}
. Are you sure it comes from IGC genes list?"
)
raise
backend/metagenedb/common/utils/parsers/kegg.py
0 → 100644
View file @
fd935b95
import
logging
logging
.
basicConfig
(
level
=
logging
.
INFO
)
_LOGGER
=
logging
.
getLogger
(
__name__
)
class
KEGGLineParser
(
object
):
@
staticmethod
def
ko_list
(
line
):
"""
Parse line from kegg KO list (http://rest.kegg.jp/list/ko) to return organized dict
"""
try
:
elements
=
line
.
split
(
'
\t
'
)
function_id
=
elements
[
0
].
split
(
':'
)[
1
]
if
';'
in
elements
[
1
]:
names
=
elements
[
1
].
split
(
';'
)
else
:
_LOGGER
.
warning
(
f
"Parsing issue with
{
function_id
}
, corresponding line:
{
line
}
"
)
names
=
[
elements
[
1
],
''
]
# Ugly fix to handle one specific case with no name: K23479
if
'[EC:'
in
names
[
1
]:
ec_number
=
names
[
1
].
split
(
'[EC:'
)[
1
].
rstrip
(
']'
)
else
:
ec_number
=
''
return
{
'function_id'
:
function_id
,
'name'
:
names
[
0
],
'long_name'
:
names
[
1
].
lstrip
(),
'ec_number'
:
ec_number
}
except
Exception
:
_LOGGER
.
error
(
f
"Could not parse:
{
line
.
rstrip
()
}
. Are you sure it comes from KEGG KO list?"
)
raise
backend/metagenedb/common/utils/parsers.py
→
backend/metagenedb/common/utils/parsers
/ncbi_taxonomy
.py
View file @
fd935b95
...
...
@@ -4,36 +4,6 @@ logging.basicConfig(level=logging.INFO)
_LOGGER
=
logging
.
getLogger
(
__name__
)
class
KEGGLineParser
(
object
):
@
staticmethod
def
ko_list
(
line
):
"""
Parse line from kegg KO list (http://rest.kegg.jp/list/ko) to return organized dict
"""
try
:
elements
=
line
.
split
(
'
\t
'
)
function_id
=
elements
[
0
].
split
(
':'
)[
1
]
if
';'
in
elements
[
1
]:
names
=
elements
[
1
].
split
(
';'
)
else
:
_LOGGER
.
warning
(
f
"Parsing issue with
{
function_id
}
, corresponding line:
{
line
}
"
)
names
=
[
elements
[
1
],
''
]
# Ugly fix to handle one specific case with no name: K23479
if
'[EC:'
in
names
[
1
]:
ec_number
=
names
[
1
].
split
(
'[EC:'
)[
1
].
rstrip
(
']'
)
else
:
ec_number
=
''
return
{
'function_id'
:
function_id
,
'name'
:
names
[
0
],
'long_name'
:
names
[
1
].
lstrip
(),
'ec_number'
:
ec_number
}
except
Exception
:
_LOGGER
.
error
(
f
"Could not parse:
{
line
.
rstrip
()
}
. Are you sure it comes from KEGG KO list?"
)
raise
class
NCBITaxonomyLineParser
(
object
):
@
staticmethod
...
...
backend/metagenedb/common/utils/parsers/test_igc.py
0 → 100644
View file @
fd935b95
from
unittest
import
TestCase
from
metagenedb.common.utils.parsers
import
IGCLineParser
class
TestIGCLineParser
(
TestCase
):
def
test_gene
(
self
):
raw_data
=
[
'gene_id'
,
'gene_name'
,
'gene_length'
,
'gene_completeness_status'
,
'cohort_origin'
,
'taxo_phylum'
,
'taxo_genus'
,
'kegg'
,
'eggnog'
,
'sample_occurence_freq'
,
'ind_occurence_freq'
,
'kegg_functional_cat'
,
'eggnog_functional_cat'
,
'cohort_assembled'
]
raw_line
=
"
\t
"
.
join
(
raw_data
)
expected_dict
=
{
'igc_id'
:
raw_data
[
0
],
'gene_id'
:
raw_data
[
1
],
'gene_length'
:
raw_data
[
2
],
'gene_completeness_status'
:
raw_data
[
3
],
'cohort_origin'
:
raw_data
[
4
],
'taxo_phylum'
:
raw_data
[
5
],
'taxo_genus'
:
raw_data
[
6
],
'kegg_ko'
:
raw_data
[
7
],
'eggnog'
:
raw_data
[
8
],
'sample_occurence_frequency'
:
raw_data
[
9
],
'individual_occurence_frequency'
:
raw_data
[
10
],
'kegg_functional_categories'
:
raw_data
[
11
],
'eggnog_functional_categories'
:
raw_data
[
12
],
'cohort_assembled'
:
raw_data
[
13
]
}
test_dict
=
IGCLineParser
.
gene
(
raw_line
)
self
.
assertDictEqual
(
test_dict
,
expected_dict
)
def
test_gene_wrong_format
(
self
):
raw_line
=
"This is a wrong line format, with; information and tab"
with
self
.
assertRaises
(
Exception
)
as
context
:
# noqa
IGCLineParser
.
gene
(
raw_line
)
backend/metagenedb/common/utils/parsers/test_kegg.py
0 → 100644
View file @
fd935b95
from
unittest
import
TestCase
from
metagenedb.common.utils.parsers
import
KEGGLineParser
class
TestKEGGLineParser
(
TestCase
):
def
test_ko_list
(
self
):
ko_line
=
"ko:K00809 DHPS, dys; deoxyhypusine synthase [EC:2.5.1.46]"
expected_dict
=
{
'function_id'
:
"K00809"
,
'name'
:
"DHPS, dys"
,
'long_name'
:
"deoxyhypusine synthase [EC:2.5.1.46]"
,
'ec_number'
:
"2.5.1.46"
}
test_dict
=
KEGGLineParser
.
ko_list
(
ko_line
)
self
.
assertDictEqual
(
test_dict
,
expected_dict
)
def
test_ko_list_wrong_format
(
self
):
ko_line
=
"This is a wrong line format, with; information and tab"
with
self
.
assertRaises
(
Exception
)
as
context
:
# noqa
KEGGLineParser
.
ko_list
(
ko_line
)
backend/metagenedb/common/utils/
test_
parsers.py
→
backend/metagenedb/common/utils/parsers
/test_ncbi_taxonomy
.py
View file @
fd935b95
from
unittest
import
TestCase
from
metagenedb.common.utils.parsers
import
KEGGLineParser
,
NCBITaxonomyLineParser
class
TestKEGGLineParser
(
TestCase
):
def
test_ko_list
(
self
):
ko_line
=
"ko:K00809 DHPS, dys; deoxyhypusine synthase [EC:2.5.1.46]"
expected_dict
=
{
'function_id'
:
"K00809"
,
'name'
:
"DHPS, dys"
,
'long_name'
:
"deoxyhypusine synthase [EC:2.5.1.46]"
,
'ec_number'
:
"2.5.1.46"
}
test_dict
=
KEGGLineParser
.
ko_list
(
ko_line
)
self
.
assertDictEqual
(
test_dict
,
expected_dict
)
def
test_ko_list_wrong_format
(
self
):
ko_line
=
"This is a wrong line format, with; information and tab"
with
self
.
assertRaises
(
Exception
)
as
context
:
# noqa
KEGGLineParser
.
ko_list
(
ko_line
)
from
metagenedb.common.utils.parsers
import
NCBITaxonomyLineParser
class
TestNCBITaxonomyLineParser
(
TestCase
):
...
...
backend/scripts/populate_db/import_igc_data.py
View file @
fd935b95
...
...
@@ -8,42 +8,55 @@ from itertools import islice
import
django
from
rest_framework.exceptions
import
ValidationError
from
metagenedb.common.utils.parsers
import
IGCLineParser
# Before importing models, we need to call django.setup() to load apps
os
.
environ
.
setdefault
(
"DJANGO_SETTINGS_MODULE"
,
"metagenedb.settings"
)
django
.
setup
()
from
metagenedb.apps.catalog.models
import
Gene
,
Function
# noqa
from
metagenedb.apps.catalog.models
import
Gene
,
Function
,
Taxonomy
# noqa
from
metagenedb.apps.catalog.serializers
import
GeneSerializer
# noqa
logging
.
basicConfig
(
level
=
logging
.
INFO
)
_LOGGER
=
logging
.
getLogger
(
__name__
)
PHYLUM_COL
=
'taxo_phylum'
GENUS_COL
=
'taxo_genus'
SELECTED_KEYS
=
[
'gene_id'
,
'gene_length'
,
'kegg_ko'
,
PHYLUM_COL
,
GENUS_COL
]
def
parse_gene
(
raw_line
,
selected_keys
=
SELECTED_KEYS
):
"""
Use IGCLineParser and return selected keys
"""
gene_parser
=
IGCLineParser
()
all_dict
=
gene_parser
.
gene
(
raw_line
)
selected_dict
=
{
k
:
v
for
k
,
v
in
all_dict
.
items
()
if
k
in
selected_keys
}
return
selected_dict
def
parse_gene
(
raw_line
):
def
select_taxonomy
(
gene_dict
,
unknown_val
=
'unknown'
):
"""
IGC annotation columns:
0: Gene ID Unique ID
1: Gene Name Unique name
2: Gene Length Length of nucleotide sequence
3: Gene Completeness Status Stating a gene is complete or partial according to the gene predictor
4: Cohort Origin Stating the cohort contributing the representative gene
5: Taxonomic Annotation(Phylum Level) Annotated phylum for a gene
6: Taxonomic Annotation(Genus Level) Annotated genus for a gene
7: KEGG Annotation Annotated KO(s) for a gene
8: eggNOG Annotation Annotated eggNOG(s) for a gene
9: Sample Occurence Frequency Occurrence frequency in samples based on gene profile
10:Individual Occurence Frequency Occurrence frequency in individuals based on gene profile
11: KEGG Functional Categories KEGG functional category(ies) of the annotated KO(s)
12: eggNOG Functional Categories eggNOG functional category(ies) of the annotated eggNOG(s)
13: Cohort Assembled Stating the metagenomic sequencing cohort(s) contributing the
representative gene or a redundant gene belonging to it
Select the taxonomy to be assigned for the gene.
Genus has priority over phylum. If both are unknown, no taxonomy key is added
"""
gene_info
=
raw_line
.
rstrip
().
split
(
'
\t
'
)
return
{
'gene_id'
:
gene_info
[
1
],
'gene_length'
:
gene_info
[
2
],
'kegg_ko'
:
gene_info
[
7
]
}
phylum
=
gene_dict
.
pop
(
PHYLUM_COL
)
genus
=
gene_dict
.
pop
(
GENUS_COL
)
if
genus
!=
unknown_val
:
queryset
=
Taxonomy
.
objects
.
filter
(
name
=
genus
,
rank
=
"genus"
)
if
queryset
.
count
()
>
1
:
_LOGGER
.
warning
(
f
"More than 1 result found for genus
{
genus
}
. First result is kept."
)
gene_dict
.
update
(
{
'taxonomy'
:
queryset
[
0
].
tax_id
}
)
elif
phylum
!=
unknown_val
:
queryset
=
Taxonomy
.
objects
.
filter
(
name
=
phylum
,
rank
=
"phylum"
)
if
queryset
.
count
()
>
1
:
_LOGGER
.
warning
(
f
"More than 1 result found for phylum
{
phylum
}
. First result is kept."
)
gene_dict
.
update
(
{
'taxonomy'
:
queryset
[
0
].
tax_id
}
)
return
gene_dict
def
upsert_gene
(
gene_dict
):
...
...
@@ -59,8 +72,9 @@ def upsert_gene(gene_dict):
def
insert_gene_list
(
chunk_genes
):
for
gene_line
in
chunk_genes
:
gene_dict
=
parse_gene
(
gene_line
)
gene_dict_with_taxo
=
select_taxonomy
(
gene_dict
)
try
:
upsert_gene
(
gene_dict
)
upsert_gene
(
gene_dict
_with_taxo
)
except
ValidationError
as
e
:
_LOGGER
.
warning
(
f
"
{
e
.
__dict__
}
for gene_id:
{
gene_dict
.
get
(
'gene_id'
)
}
. Insertion skipped."
)
...
...
backend/scripts/populate_db/test_import_igc_data.py
View file @
fd935b95
...
...
@@ -4,12 +4,12 @@ from rest_framework.exceptions import ValidationError
from
rest_framework.test
import
APITestCase
from
metagenedb.apps.catalog.models
import
Gene
from
scripts.populate_db.import_igc_data
import
parse_gene
,
upsert_gene
from
scripts.populate_db.import_igc_data
import
parse_gene
,
upsert_gene
,
select_taxonomy
class
TestParseGene
(
TestCase
):
def
test_parse_gene
(
self
):
def
setUp
(
self
):
raw_data
=
[
'gene_id'
,
'gene_name'
,
...
...
@@ -26,13 +26,44 @@ class TestParseGene(TestCase):
'eggnog_functional_cat'
,
'cohort_assembled'
]
raw_line
=
"
\t
"
.
join
(
raw_data
)
self
.
raw_line
=
"
\t
"
.
join
(
raw_data
)
def
test_parse_gene_default_selected_keys
(
self
):
"""
This test should fail and needs to be updated when SELECTED_KEYS is changed
"""
expected_dict
=
{
'gene_id'
:
'gene_name'
,
# We use the gene name for our gene ID
'gene_id'
:
'gene_name'
,
'gene_length'
:
'gene_length'
,
'kegg_ko'
:
'kegg'
'kegg_ko'
:
'kegg'
,
'taxo_phylum'
:
'taxo_phylum'
,
'taxo_genus'
:
'taxo_genus'
,
}
tested_dict
=
parse_gene
(
self
.
raw_line
)
self
.
assertDictEqual
(
tested_dict
,
expected_dict
)
def
test_parse_gene
(
self
):
"""
This test should fail and needs to be updated when SELECTED_KEYS is changed
"""
selected_keys
=
[
'gene_id'
,
'gene_length'
]
expected_dict
=
{
'gene_id'
:
'gene_name'
,
'gene_length'
:
'gene_length'
}
tested_dict
=
parse_gene
(
self
.
raw_line
,
selected_keys
=
selected_keys
)
self
.
assertDictEqual
(
tested_dict
,
expected_dict
)
def
test_parse_gene_unknown_key
(
self
):
"""
Unknown key should be ignored
"""
selected_keys
=
[
'gene_id'
,
'gene_length'
,
'secret_code'
]
expected_dict
=
{
'gene_id'
:
'gene_name'
,
'gene_length'
:
'gene_length'
}
tested_dict
=
parse_gene
(
raw_line
)
tested_dict
=
parse_gene
(
self
.
raw_line
,
selected_keys
=
selected_keys
)
self
.
assertDictEqual
(
tested_dict
,
expected_dict
)
...
...
@@ -67,3 +98,29 @@ class TestUpsertGene(APITestCase):
self
.
assertEqual
(
Gene
.
objects
.
get
(
gene_id
=
"test_gene01"
).
gene_length
,
3556
)
upsert_gene
(
updated_gene
)
self
.
assertEqual
(
Gene
.
objects
.
get
(
gene_id
=
"test_gene01"
).
gene_length
,
356
)
class
TestSelectTaxonomy
(
TestCase
):
def
test_genus_only
(
self
):
pass
# @TODO with #31
def
test_phylum_only
(
self
):
pass
# @TODO with #31
def
test_genus_phylum
(
self
):
pass
# @TODO with #31
def
test_both_unknown
(
self
):
gene_dict
=
{
'gene_id'
:
'gene'
,
'gene_length'
:
135
,
'taxo_phylum'
:
'unknown'
,
'taxo_genus'
:
'unknown'
}
expected_dict
=
{
'gene_id'
:
'gene'
,
'gene_length'
:
135
}
tested_dict
=
select_taxonomy
(
gene_dict
)
self
.
assertDictEqual
(
tested_dict
,
expected_dict
)
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment