Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Metagenomics
metagenedb
Commits
a77ffe7b
Commit
a77ffe7b
authored
Aug 26, 2019
by
Kenzo-Hugo Hillion
♻
Browse files
Refactor script into classes and start using API
parent
86a4744d
Changes
6
Hide whitespace changes
Inline
Side-by-side
backend/metagenedb/apps/catalog/models/gene.py
View file @
a77ffe7b
...
...
@@ -15,7 +15,7 @@ class Gene(models.Model):
)
def
__str__
(
self
):
return
self
.
gene_
slug
return
self
.
gene_
id
class
Meta
:
ordering
=
[
'-gene_id'
]
backend/metagenedb/common/utils/api/__init__.py
View file @
a77ffe7b
from
.togows
import
TogoWSEntry
# noqa
from
.metagenedb
import
MetageneDBCatalogGene
# noqa
from
.togows
import
TogoWSEntry
API
# noqa
from
.metagenedb
import
MetageneDBCatalogGene
API
# noqa
backend/metagenedb/common/utils/api/metagenedb.py
View file @
a77ffe7b
from
.baseapi
import
BaseAPI
class
MetageneDB
(
BaseAPI
):
class
MetageneDB
API
(
BaseAPI
):
BASE_URL
=
'http://localhost/'
def
__init__
(
self
,
base_url
=
BASE_URL
):
...
...
@@ -9,5 +9,5 @@ class MetageneDB(BaseAPI):
super
().
__init__
()
class
MetageneDBCatalogGene
(
MetageneDB
):
class
MetageneDBCatalogGene
API
(
MetageneDB
API
):
ROUTE
=
'api/catalog/v1/genes/'
backend/metagenedb/common/utils/api/togows.py
View file @
a77ffe7b
...
...
@@ -3,11 +3,11 @@ from urllib.parse import urljoin
from
.baseapi
import
BaseAPI
class
TogoWS
(
BaseAPI
):
class
TogoWS
API
(
BaseAPI
):
BASE_URL
=
'http://togows.org'
class
TogoWSEntry
(
TogoWS
):
class
TogoWSEntry
API
(
TogoWS
API
):
TYPE
=
'entry'
def
__init__
(
self
,
database
,
entry_format
=
'json'
):
...
...
backend/scripts/populate_db/import_igc_data.py
View file @
a77ffe7b
...
...
@@ -4,10 +4,13 @@ import logging
import
os
import
sys
from
itertools
import
islice
from
requests.exceptions
import
HTTPError
import
django
from
rest_framework.exceptions
import
ValidationError
from
slugify
import
slugify
from
metagenedb.common.utils.api
import
MetageneDBCatalogGeneAPI
from
metagenedb.common.utils.parsers
import
IGCLineParser
# Before importing the models, we need to call django.setup() to load the apps
...
...
@@ -59,37 +62,54 @@ def select_taxonomy(gene_dict, unknown_val='unknown'):
return
gene_dict
def upsert_gene(gene_dict):
    """
    Create or update a Gene from a plain dict, going through GeneSerializer.

    The gene is looked up by ``gene_dict.get('gene_id')``. When a matching
    Gene exists the serializer is bound to it (update); when the lookup
    raises ``Gene.DoesNotExist`` the serializer is built from the data alone
    (create).

    Raises whatever ``serializer.is_valid(raise_exception=True)`` raises
    when the data does not validate.
    """
    lookup_id = gene_dict.get('gene_id')
    try:
        existing = Gene.objects.get(gene_id=lookup_id)
    except Gene.DoesNotExist:
        serializer = GeneSerializer(data=gene_dict)
    else:
        serializer = GeneSerializer(existing, data=gene_dict)
    serializer.is_valid(raise_exception=True)
    serializer.save()
def insert_gene_list(chunk_genes):
    """
    Upsert every gene line of a chunk, skipping the ones that fail validation.

    Each line goes through ``parse_gene`` then ``select_taxonomy`` before
    being handed to ``upsert_gene``. A ``ValidationError`` is logged as a
    warning and the loop moves on to the next line.
    """
    for raw_line in chunk_genes:
        parsed = parse_gene(raw_line)
        with_taxonomy = select_taxonomy(parsed)
        try:
            upsert_gene(with_taxonomy)
        except ValidationError as err:
            message = f"{err.__dict__} for gene_id: {parsed.get('gene_id')}. Insertion skipped."
            _LOGGER.warning(message)
class
ImportIGCGenes
(
object
):
METAGENEDB_GENE_API
=
MetageneDBCatalogGeneAPI
def __init__(self, annotation_file, url, skip_tax=False, skip_functions=False):
    """
    Store the import settings and build the gene API client.

    :param annotation_file: path of the annotation file, opened later by
        ``load_annotation_file_to_db_in_chunks``
    :param url: value passed as ``base_url`` to ``METAGENEDB_GENE_API``
    :param skip_tax: when true, ``_clean_gene`` drops the 'taxonomy' entry
    :param skip_functions: when true, ``_clean_gene`` drops the 'functions' entry
    """
    # Skip some insertion if specified in script options
    self.skip_tax, self.skip_functions = skip_tax, skip_functions
    self.annotation_file = annotation_file
    self.url = url
    api_class = self.METAGENEDB_GENE_API
    self.metagenedb_gene_api = api_class(base_url=url)
def load_annotation_file_to_db_in_chunks(annotation_file, chunk_size=100000):
    """
    Read ``annotation_file`` in batches of ``chunk_size`` lines and insert them.

    Every batch is passed to ``insert_gene_list``; a running total is logged
    after each batch and once more when the file is exhausted.
    """
    total = 0
    with open(annotation_file, 'r') as handle:
        # iter() with a sentinel stops at the first empty batch, i.e. at end of file.
        for batch in iter(lambda: list(islice(handle, chunk_size)), []):
            total += len(batch)
            insert_gene_list(batch)
            _LOGGER.info(f"{total} genes processed so far...")
    _LOGGER.info(f"[DONE] {total} genes processed.")
def _clean_gene(self, gene_dict):
    """
    Prepare a gene dict, in place, before it is sent to the API.

    The 'gene_id' value is replaced by its slugified form. When
    ``self.skip_tax`` / ``self.skip_functions`` are set, the 'taxonomy' /
    'functions' entries are removed if present.

    :param gene_dict: gene description; must contain a 'gene_id' key
    :return: the same dict object, modified
    :raises KeyError: if 'gene_id' is missing
    """
    gene_dict['gene_id'] = slugify(gene_dict['gene_id'])
    # pop() with a default: a gene that carries no taxonomy/functions entry
    # must not abort the import with a KeyError just because skipping is on.
    if self.skip_tax:
        gene_dict.pop('taxonomy', None)
    if self.skip_functions:
        gene_dict.pop('functions', None)
    return gene_dict
def _upsert_gene(self, gene_dict):
    """
    Create or update one gene through ``self.metagenedb_gene_api``.

    The gene is first passed through ``_clean_gene``. A GET on its
    'gene_id' serves as an existence probe: if the GET raises ``HTTPError``
    the gene is POSTed, otherwise it is PUT.

    :param gene_dict: gene description; must contain a 'gene_id' key
    :raises HTTPError: if the POST or the PUT itself fails
    """
    clean_gene_dict = self._clean_gene(gene_dict)
    gene_id = clean_gene_dict['gene_id']
    try:
        # Only the existence probe is guarded. With the PUT inside the try,
        # a rejected update of an existing gene would be retried as a POST.
        # NOTE(review): any HTTPError from GET (not only a 404) is treated
        # as "does not exist" - confirm this is intended.
        self.metagenedb_gene_api.get(gene_id)
    except HTTPError:
        self.metagenedb_gene_api.post(clean_gene_dict)
    else:
        self.metagenedb_gene_api.put(gene_id, clean_gene_dict)
def _insert_gene_list(self, chunk_genes):
    """
    Upsert every gene line of a chunk, skipping the ones the API rejects.

    Each line goes through ``parse_gene`` then ``select_taxonomy`` before
    being handed to ``_upsert_gene``. A failure for one gene is logged as a
    warning and the loop continues with the next line.

    :param chunk_genes: iterable of raw annotation lines
    """
    for gene_line in chunk_genes:
        gene_dict = parse_gene(gene_line)
        gene_dict_with_taxo = select_taxonomy(gene_dict)
        try:
            self._upsert_gene(gene_dict_with_taxo)
        # _upsert_gene goes through the HTTP API, so a rejected gene surfaces
        # as HTTPError; ValidationError is kept for backward compatibility.
        except (HTTPError, ValidationError) as e:
            _LOGGER.warning(f"{e.__dict__} for gene_id: {gene_dict.get('gene_id')}. Insertion skipped.")
def load_annotation_file_to_db_in_chunks(self, chunk_size=100000):
    """
    Read ``self.annotation_file`` in batches of ``chunk_size`` lines and insert them.

    Every batch is passed to ``_insert_gene_list``; a running total is
    logged after each batch and once more when the file is exhausted.
    """
    def batches(handle):
        # Yield lists of at most chunk_size lines until the file runs dry.
        batch = list(islice(handle, chunk_size))
        while batch:
            yield batch
            batch = list(islice(handle, chunk_size))

    total = 0
    with open(self.annotation_file, 'r') as handle:
        for batch in batches(handle):
            total += len(batch)
            self._insert_gene_list(batch)
            _LOGGER.info(f"{total} genes processed so far...")
    _LOGGER.info(f"[DONE] {total} genes processed.")
def
parse_arguments
():
...
...
@@ -99,7 +119,9 @@ def parse_arguments():
parser
=
argparse
.
ArgumentParser
(
description
=
'Populate database from a given IGC annotation file.'
)
# Common arguments for analysis and annotations
parser
.
add_argument
(
'annotation'
,
help
=
'IGC annotation file'
)
parser
.
add_argument
(
'--delete_all'
,
action
=
'store_true'
,
help
=
'Empty database before insertion.'
)
parser
.
add_argument
(
'url'
,
help
=
'base URL of the instance.'
,
default
=
'http://localhost/'
)
parser
.
add_argument
(
'--skip_taxonomy'
,
action
=
'store_true'
,
help
=
'Skip taxonomy information from genes.'
)
parser
.
add_argument
(
'--skip_functions'
,
action
=
'store_true'
,
help
=
'Skip functions information from genes.'
)
try
:
return
parser
.
parse_args
()
...
...
@@ -109,9 +131,7 @@ def parse_arguments():
def
run
():
args
=
parse_arguments
()
if
args
.
delete_all
:
Gene
.
objects
.
all
().
delete
()
load_annotation_file_to_db_in_chunks
(
args
.
annotation
)
load_annotation_file_to_db_in_chunks
(
args
.
annotation
,
args
.
url
)
if
__name__
==
"__main__"
:
...
...
backend/scripts/populate_db/test_import_igc_data.py
View file @
a77ffe7b
from
requests.exceptions
import
HTTPError
from
unittest
import
TestCase
import
pytest
from
rest_framework.exceptions
import
ValidationError
from
django.urls
import
reverse
from
rest_framework.test
import
APITestCase
from
metagenedb.
apps.catalog.models
import
Gene
from
metagenedb.apps.catalog.factory
.taxonomy
import
TaxonomyFactory
from
scripts.populate_db.import_igc_data
import
parse_gene
,
upsert_gene
,
select_taxonomy
from
metagenedb.
common.utils.api
import
MetageneDBCatalog
Gene
API
from
metagenedb.apps.catalog.factory
import
TaxonomyFactory
from
scripts.populate_db.import_igc_data
import
parse_gene
,
select_taxonomy
,
ImportIGCGenes
class
TestParseGene
(
TestCase
):
...
...
@@ -69,40 +70,75 @@ class TestParseGene(TestCase):
self
.
assertDictEqual
(
tested_dict
,
expected_dict
)
class MetageneDBCatalogGeneAPIMock(MetageneDBCatalogGeneAPI):
    """
    Stand-in for the gene API that sends every call through the test client.

    It exists so the upsert behaviour can be tested; it is not meant to
    test insertion into the db. URLs are resolved with ``reverse`` from
    ``reverse_path`` instead of being built from a base URL.
    """

    def __init__(self, client):
        # The parent __init__ is not called here; only the test client and
        # the route namespace are stored.
        self.client = client
        self.reverse_path = 'api:catalog:v1:genes'

    def get_all(self):
        """Return the decoded JSON body of the list endpoint."""
        list_url = reverse(f'{self.reverse_path}-list')
        return self.client.get(list_url).json()

    def get(self, entry_id):
        """Return the decoded JSON of one entry; raise HTTPError on a 404."""
        detail_url = reverse(f'{self.reverse_path}-detail', kwargs={'gene_id': entry_id})
        response = self.client.get(detail_url)
        if response.status_code == 404:
            raise HTTPError
        return response.json()

    def post(self, data):
        """POST ``data`` as JSON to the list endpoint; raise HTTPError on a 400."""
        list_url = reverse(f'{self.reverse_path}-list')
        response = self.client.post(list_url, data, format='json')
        if response.status_code == 400:
            raise HTTPError
        return response.json()

    def put(self, entry_id, data):
        """PUT ``data`` as JSON to the entry's detail endpoint and return the decoded JSON."""
        detail_url = reverse(f'{self.reverse_path}-detail', kwargs={'gene_id': entry_id})
        response = self.client.put(detail_url, data, format='json')
        return response.json()
class
TestUpsertGene
(
APITestCase
):
def
setUp
(
self
):
self
.
import_igc_genes
=
ImportIGCGenes
(
'test'
,
'test'
)
self
.
api_mock
=
MetageneDBCatalogGeneAPIMock
(
self
.
client
)
self
.
import_igc_genes
.
metagenedb_gene_api
=
self
.
api_mock
def
test_insert_valid_gene_no_kegg
(
self
):
valid_gene
=
{
'gene_name'
:
'test_gene.01'
,
'gene_id'
:
'test
_
gene01'
,
'gene_id'
:
'test
-
gene01'
,
'length'
:
3556
}
upsert_gene
(
valid_gene
)
self
.
assertEqual
(
Gene
.
objects
.
all
()
.
count
()
,
1
)
self
.
import_igc_genes
.
_
upsert_gene
(
valid_gene
)
self
.
assertEqual
(
self
.
api_mock
.
get_
all
()
[
'
count
'
]
,
1
)
def
test_insert_invalid_length
(
self
):
invalid_gene
=
{
'gene_id'
:
'test
_
gene01'
,
'gene_id'
:
'test
-
gene01'
,
'length'
:
'wrong_format'
}
with
self
.
assertRaises
(
Validation
Error
)
as
context
:
# noqa
upsert_gene
(
invalid_gene
)
with
self
.
assertRaises
(
HTTP
Error
)
as
context
:
# noqa
self
.
import_igc_genes
.
_
upsert_gene
(
invalid_gene
)
def
test_update_gene
(
self
):
valid_gene
=
{
'gene_name'
:
'test_gene.01'
,
'gene_id'
:
'test
_
gene01'
,
'gene_id'
:
'test
-
gene01'
,
'length'
:
3556
}
updated_gene
=
{
'gene_name'
:
'test_gene.01'
,
'gene_id'
:
'test
_
gene01'
,
'gene_id'
:
'test
-
gene01'
,
'length'
:
356
}
upsert_gene
(
valid_gene
)
self
.
assertEqual
(
Gene
.
objects
.
get
(
gene_id
=
"
test
_
gene01
"
).
length
,
3556
)
upsert_gene
(
updated_gene
)
self
.
assertEqual
(
Gene
.
objects
.
get
(
gene_id
=
"
test
_
gene01
"
).
length
,
356
)
self
.
import_igc_genes
.
_
upsert_gene
(
valid_gene
)
self
.
assertEqual
(
self
.
api_mock
.
get
(
'
test
-
gene01
'
)[
'
length
'
]
,
3556
)
self
.
import_igc_genes
.
_
upsert_gene
(
updated_gene
)
self
.
assertEqual
(
self
.
api_mock
.
get
(
'
test
-
gene01
'
)[
'
length
'
]
,
356
)
@
pytest
.
mark
.
django_db
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment