Commit a494b00c authored by Kenzo-Hugo Hillion's avatar Kenzo-Hugo Hillion
Browse files

Merge branch '67-statistics-on-genes' into 'dev'

Try to fix the issue for gene length graph

See merge request !21
parents 6cf1eabb c0fccc05
Pipeline #18989 passed with stages
in 2 minutes and 32 seconds
......@@ -127,3 +127,21 @@ For the moment you can:
> **Note**: You can also execute the scripts locally, for instance from a `pipenv shell`. In that case, make
sure you change the way you log in to postgres, since access from your machine differs from access
from inside a container.
-----
## Dev tips
#### Profiling code
```python
from metagenedb.common.utils.profiling import profile
@profile("/my/file/path")
def my_function(a, b, c):
...
```
```bash
snakeviz /my/file/path
```
\ No newline at end of file
......@@ -58,6 +58,7 @@ packaging = "*"
python-slugify = "*"
master = {git = "https://github.com/khillion/bioapi.git"}
marshmallow = "*"
django-pandas = "*"
[requires]
python_version = "3.7"
{
"_meta": {
"hash": {
"sha256": "5998b6b97448fd635cc1b05787de28ac5ac3344ca6b8055831fca13790fc3f33"
"sha256": "4be3394e3c4abe5fc7b75328ef912eaba09e15365322b7493e256f2def2ff013"
},
"pipfile-spec": 6,
"requires": {
......@@ -86,6 +86,14 @@
"index": "pypi",
"version": "==2.2.0"
},
"django-pandas": {
"hashes": [
"sha256:738cc03ffb411eef3eb02334d1f5a5d40697099a92ac59eb39629c08a9c2d6fb",
"sha256:788f4652012a67d2c5849191b01af58255f7af815ab612bebca019854235a9bc"
],
"index": "pypi",
"version": "==0.6.1"
},
"djangorestframework": {
"hashes": [
"sha256:5488aed8f8df5ec1d70f04b2114abc52ae6729748a176c453313834a9ee179c8",
......@@ -812,11 +820,11 @@
},
"pluggy": {
"hashes": [
"sha256:0db4b7601aae1d35b4a033282da476845aa19185c1e6964b25cf324b5e4ec3e6",
"sha256:fa5fa1622fa6dd5c030e9cad086fa19ef6a0cf6d7a2d12318e10cb49d6d68f34"
"sha256:15b2acde666561e1298d71b523007ed7364de07029219b604cf808bfa1c765b0",
"sha256:966c145cd83c96502c3c3868f50408687b38434af77734af1e9ca461a4081d2d"
],
"index": "pypi",
"version": "==0.13.0"
"version": "==0.13.1"
},
"prometheus-client": {
"hashes": [
......@@ -866,10 +874,10 @@
},
"pygments": {
"hashes": [
"sha256:71e430bc85c88a430f000ac1d9b331d2407f681d6f6aec95e8bcfbc3df5b0127",
"sha256:881c4c157e45f30af185c1ffe8d549d48ac9127433f2c380c24b84572ad66297"
"sha256:83ec6c6133ca6b529b7ff5aa826328fd14b5bb02a58c37f4f06384e96a0f94ab",
"sha256:b7949de3d396836085fea596998b135a22610bbcc4f2abfe9e448e44cbc58388"
],
"version": "==2.4.2"
"version": "==2.5.1"
},
"pylint": {
"hashes": [
......@@ -888,17 +896,17 @@
},
"pyrsistent": {
"hashes": [
"sha256:eb6545dbeb1aa69ab1fb4809bfbf5a8705e44d92ef8fc7c2361682a47c46c778"
"sha256:f3b280d030afb652f79d67c5586157c5c1355c9a58dfc7940566e28d28f3df1b"
],
"version": "==0.15.5"
"version": "==0.15.6"
},
"pytest": {
"hashes": [
"sha256:8e256fe71eb74e14a4d20a5987bb5e1488f0511ee800680aaedc62b9358714e8",
"sha256:ff0090819f669aaa0284d0f4aad1a6d9d67a6efdc6dd4eb4ac56b704f890a0d6"
"sha256:63344a2e3bce2e4d522fd62b4fdebb647c019f1f9e4ca075debbd13219db4418",
"sha256:f67403f33b2b1d25a6756184077394167fe5e2f9d8bdaab30707d19ccec35427"
],
"index": "pypi",
"version": "==5.2.4"
"version": "==5.3.1"
},
"pytest-cov": {
"hashes": [
......
from marshmallow import Schema, fields, validate
class GeneLengthQueryParams(Schema):
    """
    Query parameters accepted by the gene length distribution endpoint.

    Both values are optional integers. They are range-checked here so that
    unusable values are rejected as validation errors instead of failing later
    while the windows are being built (a zero step is not a valid ``range``
    step, and a negative bound produces no window at all).
    """
    # Size of each window; must be strictly positive to be usable as a range step.
    window_size = fields.Integer(validate=validate.Range(min=1))
    # Upper bound after which all values are grouped together; cannot be negative.
    stop_at = fields.Integer(validate=validate.Range(min=0))
......@@ -91,7 +91,11 @@ class BulkViewSet(ModelViewSet):
try:
query_params = self._get_qparams(request.query_params) # noqa
except ValidationError as validation_error:
return Response(validation_error.normalized_messages(), status=status.HTTP_422_UNPROCESSABLE_ENTITY)
error_message = validation_error.normalized_messages()
error_message.update({
'allowed_query_params': ', '.join(self.query_params_parser().declared_fields.keys())
})
return Response(error_message, status=status.HTTP_422_UNPROCESSABLE_ENTITY)
queryset = self.filter_queryset(self.get_queryset())
page = self.paginate_queryset(queryset)
......@@ -106,7 +110,11 @@ class BulkViewSet(ModelViewSet):
try:
query_params = self._get_qparams(request.query_params) # noqa
except ValidationError as validation_error:
return Response(validation_error.normalized_messages(), status=status.HTTP_422_UNPROCESSABLE_ENTITY)
error_message = validation_error.normalized_messages()
error_message.update({
'allowed_query_params': ', '.join(self.query_params_parser().declared_fields.keys())
})
return Response(error_message, status=status.HTTP_422_UNPROCESSABLE_ENTITY)
instance = self.get_object()
serializer = self.get_serializer(instance)
return Response(serializer.data)
......@@ -38,7 +38,11 @@ class FunctionViewSet(BulkViewSet):
try:
query_params = self._get_qparams(request.query_params)
except ValidationError as validation_error:
return Response(validation_error.normalized_messages(), status=HTTP_422_UNPROCESSABLE_ENTITY)
error_message = validation_error.normalized_messages()
error_message.update({
'allowed_query_params': ', '.join(self.query_params_parser().declared_fields.keys())
})
return Response(error_message, status=HTTP_422_UNPROCESSABLE_ENTITY)
instance = self.get_object()
serializer = self.get_serializer(instance)
returned_data = serializer.data
......
import pandas as pd
from django_pandas.io import read_frame
from drf_yasg import openapi
from drf_yasg.utils import swagger_auto_schema
from rest_framework import filters, status
from marshmallow.exceptions import ValidationError
from rest_framework import filters
from rest_framework.decorators import action
from rest_framework.response import Response
from rest_framework.status import HTTP_204_NO_CONTENT, HTTP_422_UNPROCESSABLE_ENTITY
from metagenedb.common.utils.df_operations import get_mask
from metagenedb.apps.catalog.models import Gene
from metagenedb.api.catalog.qparams_validators.gene import GeneLengthQueryParams
from metagenedb.apps.catalog.serializers import GeneSerializer
from metagenedb.common.utils.df_operations import get_mask
from .bulk_viewset import BulkViewSet
......@@ -49,11 +52,13 @@ class GeneViewSet(BulkViewSet):
serializer_class = GeneSerializer
lookup_field = 'gene_id'
GENE_LENGTH_COL = 'length'
DEFAULT_WINDOW_SIZE = 1000
DEFAULT_STOP_AT = 10000
def get_permissions(self):
return super(self.__class__, self).get_permissions()
def _count_windows(self, df, window_size=10000, window_col=GENE_LENGTH_COL):
def _count_windows(self, df, window_size=DEFAULT_WINDOW_SIZE, window_col=GENE_LENGTH_COL, stop_at=DEFAULT_STOP_AT):
"""
Count how many line of the df belong to each windows defined by the window_size for the window_col
:param df:
......@@ -61,12 +66,18 @@ class GeneViewSet(BulkViewSet):
:param window_size: size of the window
:return: {'data': COUNTS_BY_WINDOW, 'labels': START-END}
"""
all_ranges = [(i, i + window_size) for i in range(0, df[window_col].max(), window_size)]
length_max = df[window_col].max()
stop_at = length_max if length_max < stop_at else stop_at
all_ranges = [[i, i + window_size] for i in range(0, stop_at + 1, window_size)]
all_ranges[-1][1] = length_max + 1 # last should contain all above the stop_at
data = []
labels = []
for rg in all_ranges:
labels.append(f"{rg[0]}-{rg[1]-1}")
labels.append(f"{rg[0]/1000}k-{rg[1]/1000}k")
data.append(df[get_mask(df, rg, window_col)].count()[window_col])
# Change labels
labels[0] = f"<{labels[0].split('-')[1]}"
labels[-1] = f">{labels[-1].split('-')[0]}"
return {
'counts': data,
'labels': labels
......@@ -81,15 +92,24 @@ class GeneViewSet(BulkViewSet):
operation_id='Gene length distribution',
)
@action(methods=['get'], detail=False)
def gene_length(self, request, window_size=10000):
if 'window_size' in request.query_params:
window_size = int(request.query_params.get('window_size'))
df = pd.DataFrame(list(self.queryset.values(self.GENE_LENGTH_COL)))
def gene_length(self, request):
try:
query_params = GeneLengthQueryParams().load(request.query_params)
except ValidationError as validation_error:
error_message = validation_error.normalized_messages()
error_message.update({
'allowed_query_params': ', '.join(GeneLengthQueryParams().declared_fields.keys())
})
return Response(error_message, status=HTTP_422_UNPROCESSABLE_ENTITY)
window_size = query_params.get('window_size', self.DEFAULT_WINDOW_SIZE)
stop_at = query_params.get('stop_at', self.DEFAULT_STOP_AT)
df = read_frame(Gene.objects.all(), fieldnames=[self.GENE_LENGTH_COL])
if df.empty:
return Response(
{'results': {}},
status=status.HTTP_204_NO_CONTENT
status=HTTP_204_NO_CONTENT
)
return Response(
{'results': self._count_windows(df, window_size)}
{'results': self._count_windows(df, window_size=window_size, stop_at=stop_at)}
)
import pandas as pd
from django.contrib.auth.models import User
from django.test import TestCase
from django.urls import reverse
import pandas as pd
from rest_framework import status
from rest_framework.test import APITestCase
from rest_framework_jwt.settings import api_settings
from metagenedb.api.catalog.views.gene import GeneViewSet
from metagenedb.apps.catalog.factory import GeneFactory
from metagenedb.common.utils.mocks.metagenedb import MetageneDBCatalogGeneAPIMock
class TestGenes(TestCase):
......@@ -50,9 +52,48 @@ class TestCountWindows(TestCase):
def test_simple_count_window10(self):
expected_dict = {
'labels': ['0-9', '10-19', '20-29', '30-39'],
'labels': ['<0.01k', '0.01k-0.02k', '0.02k-0.03k', '>0.03k'],
'counts': [0, 0, 2, 1]
}
geneviewset = GeneViewSet()
test_dict = geneviewset._count_windows(self.df, 10, window_col=self.window_col)
self.assertDictEqual(test_dict, expected_dict)
def test_simple_count_window10_stop20(self):
    """Window of 10 with stop_at=20: three windows are expected, all rows counted in the last."""
    counts_by_window = GeneViewSet()._count_windows(
        self.df,
        window_size=10,
        window_col=self.window_col,
        stop_at=20,
    )
    self.assertDictEqual(
        counts_by_window,
        {
            'labels': ['<0.01k', '0.01k-0.02k', '>0.02k'],
            'counts': [0, 0, 3],
        },
    )
class TestCountWindowsAPI(APITestCase):
    """Exercise the gene length endpoint through the API mock on six generated genes."""

    def setUp(self):
        self.gene_api = MetageneDBCatalogGeneAPIMock(self.client)
        # Creates genes of length 2000, 2350, 2700, 3050, 3400 and 3750.
        for gene_length in range(2000, 4000, 350):
            GeneFactory.create(length=gene_length)

    def test_gene_length_api(self):
        """Without query parameters the response holds four windows."""
        response_json = self.gene_api.get_gene_length()
        expected_dict = {
            'results': {
                'counts': [0, 0, 3, 3],
                'labels': ['<1.0k', '1.0k-2.0k', '2.0k-3.0k', '>3.0k'],
            },
        }
        self.assertDictEqual(response_json, expected_dict)

    def test_gene_length_api_stop_at_2000(self):
        """With stop_at=2000 all six genes are counted in the last window."""
        response_json = self.gene_api.get_gene_length(params={'stop_at': 2000})
        expected_dict = {
            'results': {
                'counts': [0, 0, 6],
                'labels': ['<1.0k', '1.0k-2.0k', '>2.0k'],
            },
        }
        self.assertDictEqual(response_json, expected_dict)
......@@ -15,4 +15,5 @@ class GeneFactory(DjangoModelFactory):
model = models.Gene
gene_id = FuzzyLowerText(prefix='gene-', length=15)
gene_name = fuzzy.FuzzyText(prefix='name-', length=15)
length = fuzzy.FuzzyInteger(200, 10000)
......@@ -47,6 +47,13 @@ class MetageneDBCatalogGeneAPIMock(MetageneDBAPIMock):
KEY_ID = 'gene_id'
REVERSE_PATH = 'catalog:v1:genes'
def get_gene_length(self, params=None):
    """
    GET the ``-gene-length`` route through the test client and return the decoded JSON body.

    :param params: optional query parameters forwarded to the GET request
    :return: JSON content of the response
    :raises HTTPError: when the response status code is one of ``BAD_REQUESTS``
    """
    url = reverse(f"{self.reverse_path}-gene-length")
    answer = self.client.get(url, params)
    if answer.status_code not in self.BAD_REQUESTS:
        return answer.json()
    raise HTTPError
class MetageneDBCatalogTaxonomyAPIMock(MetageneDBAPIMock):
KEY_ID = 'gene_id'
......
import cProfile
import functools
def profile(file_path):
    """
    Build a decorator that profiles each call of the decorated function with cProfile.

    The statistics of a call are dumped to ``file_path`` once the call is over,
    whether it returned normally or raised. Each call rewrites the same file.

    :param file_path: path of the file handed to ``cProfile.Profile.dump_stats``
    :return: decorator wrapping a function so that every call is profiled
    """
    def decorator_profile(func):
        @functools.wraps(func)
        def wrapper_profile(*args, **kwargs):
            cp = cProfile.Profile()
            cp.enable()
            try:
                return func(*args, **kwargs)
            finally:
                # Stop profiling and write the stats even when func raises, so the
                # profiler is never left enabled and failing calls can be inspected.
                cp.disable()
                cp.dump_stats(file_path)
        return wrapper_profile
    return decorator_profile
......@@ -2,6 +2,12 @@
<div v-if="geneLengthData.counts">
<canvas id="histogram"></canvas>
</div>
<div class="text-xs-center" v-else>
<v-progress-circular
indeterminate
color="secondary"
></v-progress-circular>
</div>
</template>
<script>
......
......@@ -35,7 +35,7 @@ export default {
data() {
return {
geneLengthData: {},
geneLengthWindowSize: 10000,
geneLengthWindowSize: 1000,
};
},
mounted() {
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment