Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Metagenomics
metagenedb
Commits
db4cb890
Commit
db4cb890
authored
Dec 11, 2019
by
Kenzo-Hugo Hillion
♻
Browse files
Deal with EggNOG in creation of genes
parent
793e61eb
Pipeline
#19797
passed with stages
in 2 minutes and 27 seconds
Changes
5
Pipelines
1
Show whitespace changes
Inline
Side-by-side
backend/metagenedb/apps/catalog/serializers/function.py
View file @
db4cb890
...
...
@@ -29,7 +29,7 @@ class EggNogSerializer(serializers.ModelSerializer):
class
Meta
:
model
=
EggNog
list_serializer_class
=
EggNogListSerializer
fields
=
(
'function_id'
,
'name'
,
'functional_categor
y
'
)
fields
=
(
'function_id'
,
'name'
,
'functional_categor
ies
'
)
class
KeggOrthologyListSerializer
(
BulkListSerializer
):
...
...
backend/metagenedb/common/utils/parsers/igc.py
View file @
db4cb890
...
...
@@ -38,7 +38,7 @@ class IGCLineParser(object):
'taxo_phylum'
:
gene_info
[
5
],
'taxo_genus'
:
gene_info
[
6
],
'kegg_ko'
:
gene_info
[
7
].
split
(
';'
),
'eggnog'
:
gene_info
[
8
],
'eggnog'
:
gene_info
[
8
]
.
split
(
';'
)
,
'sample_occurence_frequency'
:
gene_info
[
9
],
'individual_occurence_frequency'
:
gene_info
[
10
],
'kegg_functional_categories'
:
gene_info
[
11
],
...
...
backend/metagenedb/common/utils/parsers/test_igc.py
View file @
db4cb890
...
...
@@ -32,7 +32,7 @@ class TestIGCLineParser(TestCase):
'taxo_phylum'
:
raw_data
[
5
],
'taxo_genus'
:
raw_data
[
6
],
'kegg_ko'
:
[
raw_data
[
7
]],
'eggnog'
:
raw_data
[
8
],
'eggnog'
:
[
raw_data
[
8
]
]
,
'sample_occurence_frequency'
:
raw_data
[
9
],
'individual_occurence_frequency'
:
raw_data
[
10
],
'kegg_functional_categories'
:
raw_data
[
11
],
...
...
@@ -57,7 +57,7 @@ class TestIGCLineParser(TestCase):
'taxo_phylum'
,
'taxo_genus'
,
'kegg;kegg2'
,
'eggnog'
,
'eggnog
1;eggnog2
'
,
'sample_occurence_freq'
,
'ind_occurence_freq'
,
'kegg_functional_cat'
,
...
...
@@ -74,7 +74,7 @@ class TestIGCLineParser(TestCase):
'taxo_phylum'
:
raw_data
[
5
],
'taxo_genus'
:
raw_data
[
6
],
'kegg_ko'
:
[
'kegg'
,
'kegg2'
],
'eggnog'
:
raw_data
[
8
],
'eggnog'
:
[
'eggnog1'
,
'eggnog2'
],
'sample_occurence_frequency'
:
raw_data
[
9
],
'individual_occurence_frequency'
:
raw_data
[
10
],
'kegg_functional_categories'
:
raw_data
[
11
],
...
...
backend/scripts/populate_db/import_igc_data.py
View file @
db4cb890
...
...
@@ -30,20 +30,31 @@ class ImportIGCGenes(object):
PHYLUM_COL
=
'taxo_phylum'
GENUS_COL
=
'taxo_genus'
SELECTED_KEYS
=
[
'gene_id'
,
'length'
,
'kegg_ko'
,
PHYLUM_COL
,
GENUS_COL
]
SELECTED_KEYS
=
[
'gene_id'
,
'length'
,
'kegg_ko'
,
'eggnog'
,
PHYLUM_COL
,
GENUS_COL
]
def
__init__
(
self
,
annotation_file
,
url
,
jwt_token
,
skip_tax
=
False
,
skip_functions
=
False
):
self
.
annotation_file
=
annotation_file
self
.
url
=
url
self
.
metagenedb_gene_api
=
self
.
METAGENEDB_GENE_API
(
base_url
=
self
.
url
,
jwt_token
=
jwt_token
)
self
.
metagenedb_taxonomy_api
=
self
.
METAGENEDB_TAXONOMY_API
(
base_url
=
self
.
url
,
jwt_token
=
jwt_token
)
self
.
metagenedb_function_api
=
self
.
METAGENEDB_FUNCTION_API
(
base_url
=
self
.
url
,
jwt_token
=
jwt_token
)
self
.
_open_api_endpoints
(
jwt_token
)
self
.
total_genes
=
self
.
_get_number_genes
()
self
.
_reset_counters
()
# Skip some insertion if specified in script options
self
.
skip_tax
=
skip_tax
self
.
skip_functions
=
skip_functions
def
_reset_counters
(
self
):
self
.
processed_genes
=
0
self
.
created_genes
=
0
self
.
updated_genes
=
0
self
.
skipped_genes
=
0
def
_open_api_endpoints
(
self
,
jwt_token
):
self
.
metagenedb_gene_api
=
self
.
METAGENEDB_GENE_API
(
base_url
=
self
.
url
,
jwt_token
=
jwt_token
)
self
.
metagenedb_taxonomy_api
=
self
.
METAGENEDB_TAXONOMY_API
(
base_url
=
self
.
url
,
jwt_token
=
jwt_token
)
self
.
metagenedb_function_api
=
self
.
METAGENEDB_FUNCTION_API
(
base_url
=
self
.
url
,
jwt_token
=
jwt_token
)
self
.
metagenedb_kegg_api
=
self
.
METAGENEDB_KEGG_API
(
base_url
=
self
.
url
,
jwt_token
=
jwt_token
)
self
.
metagenedb_eggnog_api
=
self
.
METAGENEDB_EGGNOG_API
(
base_url
=
self
.
url
,
jwt_token
=
jwt_token
)
def
_build_taxo_mapping
(
self
,
rank
,
page_size
=
1000
):
logger
.
info
(
"Building local mapping for %s level..."
,
rank
)
counter
=
1
...
...
@@ -63,8 +74,8 @@ class ImportIGCGenes(object):
counter
+=
1
return
mapping
def
build
_function_catalog
(
self
,
page_size
=
1000
):
logger
.
info
(
"Building local
function catalog..."
)
def
_retrieve
_function_catalog
(
self
,
api
,
page_size
=
1000
):
logger
.
info
(
"Building local
catalog from %s..."
,
api
.
ROUTE
)
counter
=
1
next_page
=
None
functions
=
set
()
...
...
@@ -73,24 +84,22 @@ class ImportIGCGenes(object):
'page'
:
counter
,
'page_size'
:
page_size
,
}
current_page
=
self
.
metagenedb_function_
api
.
get_all
(
params
=
params
)
current_page
=
api
.
get_all
(
params
=
params
)
next_page
=
current_page
[
'next'
]
functions
=
functions
.
union
(
set
(
[
item
[
'function_id'
]
for
item
in
current_page
[
'results'
]]
))
counter
+=
1
self
.
metagenedb_functions
=
functions
return
functions
def
build_function_mappings
(
self
,
page_size
=
1000
):
self
.
metagenedb_keggs
=
self
.
_retrieve_function_catalog
(
self
.
metagenedb_kegg_api
,
page_size
=
page_size
)
self
.
metagenedb_eggnogs
=
self
.
_retrieve_function_catalog
(
self
.
metagenedb_eggnog_api
,
page_size
=
page_size
)
def
build_mapping
(
self
,
page_size
=
1000
):
self
.
phylum_mapping
=
self
.
_build_taxo_mapping
(
"phylum"
,
page_size
=
page_size
)
self
.
genus_mapping
=
self
.
_build_taxo_mapping
(
"genus"
,
page_size
=
page_size
)
def
_reset_counters
(
self
):
self
.
processed_genes
=
0
self
.
created_genes
=
0
self
.
updated_genes
=
0
self
.
skipped_genes
=
0
def
_get_number_genes
(
self
):
if
not
os
.
path
.
isfile
(
self
.
annotation_file
):
return
0
...
...
@@ -133,18 +142,21 @@ class ImportIGCGenes(object):
def
_clean_functions
(
self
,
functions
):
clean_functions
=
[]
for
function
in
functions
:
if
function
in
self
.
metagenedb_functions
:
clean_functions
.
append
(
function
)
elif
function
!=
'unknown'
:
logger
.
warning
(
"Function %s not found in metagenedb"
,
function
)
if
function
[
'function_id'
]
in
getattr
(
self
,
f
"metagenedb_
{
function
[
'source'
]
}
s"
):
clean_functions
.
append
(
function
[
'function_id'
])
else
:
logger
.
warning
(
"Function %s not found from %s in metagenedb"
,
function
[
'function_id'
],
function
[
'source'
])
return
clean_functions
def
_clean_gene
(
self
,
gene_dict
):
gene_dict
[
'gene_name'
]
=
gene_dict
[
'gene_id'
]
gene_dict
[
'gene_id'
]
=
slugify
(
gene_dict
[
'gene_id'
])
gene_dict
[
'functions'
]
=
gene_dict
.
pop
(
'kegg_ko'
)
gene_dict
[
'functions'
]
=
[
{
'source'
:
'kegg'
,
'function_id'
:
v
}
for
v
in
gene_dict
.
pop
(
'kegg_ko'
)
if
v
!=
'unknown'
]
+
\
[{
'source'
:
'eggnog'
,
'function_id'
:
v
}
for
v
in
gene_dict
.
pop
(
'eggnog'
)
if
v
!=
'unknown'
]
gene_dict
=
self
.
_select_taxonomy
(
gene_dict
)
if
self
.
skip_functions
or
'unknown'
in
gene_dict
[
'functions'
]:
if
self
.
skip_functions
or
not
gene_dict
[
'functions'
]:
gene_dict
.
pop
(
'functions'
)
else
:
gene_dict
[
'functions'
]
=
self
.
_clean_functions
(
gene_dict
[
'functions'
])
...
...
@@ -154,7 +166,7 @@ class ImportIGCGenes(object):
if
not
self
.
skip_tax
:
self
.
build_mapping
()
if
not
self
.
skip_functions
:
self
.
build_function_
catalog
()
self
.
build_function_
mappings
()
with
open
(
self
.
annotation_file
,
'r'
)
as
file
:
while
True
:
chunk_genes
=
list
(
islice
(
file
,
chunk_size
))
...
...
backend/scripts/populate_db/test_import_igc_data.py
View file @
db4cb890
...
...
@@ -2,8 +2,16 @@ from unittest import TestCase
from
rest_framework.test
import
APITestCase
from
metagenedb.common.utils.mocks.metagenedb
import
MetageneDBCatalogTaxonomyAPIMock
,
MetageneDBCatalogFunctionAPIMock
from
metagenedb.apps.catalog.factory
import
TaxonomyFactory
,
FunctionFactory
from
metagenedb.common.utils.mocks.metagenedb
import
(
MetageneDBCatalogTaxonomyAPIMock
,
MetageneDBCatalogEggNogAPIMock
,
MetageneDBCatalogKeggOrthologyAPIMock
)
from
metagenedb.apps.catalog.factory
import
(
TaxonomyFactory
,
KeggOrthologyFactory
,
EggNogFactory
)
from
scripts.populate_db.import_igc_data
import
ImportIGCGenes
...
...
@@ -37,6 +45,7 @@ class TestParseGene(TestCase):
'gene_id'
:
'gene_name'
,
'length'
:
'length'
,
'kegg_ko'
:
[
'kegg'
],
'eggnog'
:
[
'eggnog'
],
'taxo_phylum'
:
'taxo_phylum'
,
'taxo_genus'
:
'taxo_genus'
,
}
...
...
@@ -77,7 +86,8 @@ class TestCleanGene(TestCase):
self
.
gene_dict
=
{
'gene_id'
:
'gene.01'
,
'length'
:
135
,
'kegg_ko'
:
[
'K00001'
]
'kegg_ko'
:
[
'K00001'
],
'eggnog'
:
[
'COG1'
]
}
def
test_clean_gene
(
self
):
...
...
@@ -85,7 +95,10 @@ class TestCleanGene(TestCase):
'gene_id'
:
'gene-01'
,
'gene_name'
:
'gene.01'
,
'length'
:
135
,
'functions'
:
[
'K00001'
]
'functions'
:
[
{
'source'
:
'kegg'
,
'function_id'
:
'K00001'
},
{
'source'
:
'eggnog'
,
'function_id'
:
'COG1'
}
]
}
test_gene_dict
=
self
.
import_igc_genes
.
_clean_gene
(
self
.
gene_dict
)
self
.
assertDictEqual
(
test_gene_dict
,
expected_gene_dict
)
...
...
@@ -104,16 +117,57 @@ class TestCleanGene(TestCase):
gene_dict
=
{
'gene_id'
:
'gene.01'
,
'length'
:
135
,
'kegg_ko'
:
'unknown'
'kegg_ko'
:
[
'unknown'
],
'eggnog'
:
[
'COG1'
]
}
expected_gene_dict
=
{
'gene_id'
:
'gene-01'
,
'gene_name'
:
'gene.01'
,
'functions'
:
[{
'function_id'
:
'COG1'
,
'source'
:
'eggnog'
}],
'length'
:
135
}
test_gene_dict
=
self
.
import_igc_genes
.
_clean_gene
(
gene_dict
)
self
.
assertDictEqual
(
test_gene_dict
,
expected_gene_dict
)
def
test_unknow_kegg_and_eggnog
(
self
):
gene_dict
=
{
'gene_id'
:
'gene.01'
,
'length'
:
135
,
'kegg_ko'
:
[
'unknown'
],
'eggnog'
:
[
'unknown'
]
}
expected_gene_dict
=
{
'gene_id'
:
'gene-01'
,
'gene_name'
:
'gene.01'
,
'length'
:
135
}
test_gene_dict
=
self
.
import_igc_genes
.
_clean_gene
(
gene_dict
)
self
.
assertDictEqual
(
test_gene_dict
,
expected_gene_dict
)
class
TestCleanFunctions
(
TestCase
):
def
setUp
(
self
):
self
.
import_igc_genes
=
ImportIGCGenes
(
'test'
,
'test_url'
,
'test_token'
)
self
.
import_igc_genes
.
metagenedb_eggnogs
=
set
([
'COG1'
,
'COG2'
])
self
.
import_igc_genes
.
metagenedb_keggs
=
set
([
'K00001'
,
'K00002'
])
def
test_clean_functions
(
self
):
functions
=
[
{
'function_id'
:
'K00001'
,
'source'
:
'kegg'
},
{
'function_id'
:
'COG1'
,
'source'
:
'eggnog'
}
]
expected_list
=
[
'K00001'
,
'COG1'
]
self
.
assertListEqual
(
self
.
import_igc_genes
.
_clean_functions
(
functions
),
expected_list
)
def
test_clean_functions_unknown_kegg
(
self
):
functions
=
[
{
'function_id'
:
'K00301'
,
'source'
:
'kegg'
},
{
'function_id'
:
'COG1'
,
'source'
:
'eggnog'
}
]
expected_list
=
[
'COG1'
]
self
.
assertListEqual
(
self
.
import_igc_genes
.
_clean_functions
(
functions
),
expected_list
)
class
TestSelectTaxonomy
(
TestCase
):
...
...
@@ -247,16 +301,23 @@ class TestBuildBuildFunctionCatalog(APITestCase):
@
classmethod
def
setUpTestData
(
cls
):
cls
.
functions
=
FunctionFactory
.
create_batch
(
100
)
cls
.
keggs
=
KeggOrthologyFactory
.
create_batch
(
100
)
cls
.
eggnogs
=
EggNogFactory
.
create_batch
(
100
)
def
setUp
(
self
):
self
.
import_igc_genes
=
ImportIGCGenes
(
'test'
,
'test_url'
,
'test_token'
)
self
.
api_mock
=
MetageneDBCatalogFunctionAPIMock
(
self
.
client
)
self
.
import_igc_genes
.
metagenedb_function_api
=
self
.
api_mock
self
.
kegg_api_mock
=
MetageneDBCatalogKeggOrthologyAPIMock
(
self
.
client
)
self
.
eggnog_api_mock
=
MetageneDBCatalogEggNogAPIMock
(
self
.
client
)
self
.
import_igc_genes
.
metagenedb_kegg_api
=
self
.
kegg_api_mock
self
.
import_igc_genes
.
metagenedb_eggnog_api
=
self
.
eggnog_api_mock
def
test_build_catalog
(
self
):
expected_catalog
=
set
(
[
function
.
function_id
for
function
in
self
.
functions
]
expected_kegg_catalog
=
set
(
[
function
.
function_id
for
function
in
self
.
keggs
]
)
expected_eggnog_catalog
=
set
(
[
function
.
function_id
for
function
in
self
.
eggnogs
]
)
self
.
import_igc_genes
.
build_function_catalog
(
page_size
=
100
)
self
.
assertSetEqual
(
self
.
import_igc_genes
.
metagenedb_functions
,
expected_catalog
)
self
.
import_igc_genes
.
build_function_mappings
(
page_size
=
100
)
self
.
assertSetEqual
(
self
.
import_igc_genes
.
metagenedb_keggs
,
expected_kegg_catalog
)
self
.
assertSetEqual
(
self
.
import_igc_genes
.
metagenedb_eggnogs
,
expected_eggnog_catalog
)
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment