Commit bd68a8c4 authored by Kenzo-Hugo Hillion
Browse files

Improve perf for counting windows

parent 87cb9d53
Pipeline #19172 passed with stages
in 2 minutes and 20 seconds
from django_pandas.io import read_frame from django.db.models import Max
from drf_yasg import openapi from drf_yasg import openapi
from drf_yasg.utils import swagger_auto_schema from drf_yasg.utils import swagger_auto_schema
from marshmallow.exceptions import ValidationError from marshmallow.exceptions import ValidationError
...@@ -10,7 +10,6 @@ from metagenedb.apps.catalog.models import Gene ...@@ -10,7 +10,6 @@ from metagenedb.apps.catalog.models import Gene
from metagenedb.api.catalog.filters import GeneFilter from metagenedb.api.catalog.filters import GeneFilter
from metagenedb.api.catalog.qparams_validators.gene import GeneLengthQueryParams, GeneQueryParams from metagenedb.api.catalog.qparams_validators.gene import GeneLengthQueryParams, GeneQueryParams
from metagenedb.apps.catalog.serializers import GeneSerializer from metagenedb.apps.catalog.serializers import GeneSerializer
from metagenedb.common.utils.df_operations import get_mask
from .bulk_viewset import BulkViewSet from .bulk_viewset import BulkViewSet
...@@ -59,15 +58,16 @@ class GeneViewSet(BulkViewSet): ...@@ -59,15 +58,16 @@ class GeneViewSet(BulkViewSet):
def get_permissions(self):
    """
    Return whatever the parent class's ``get_permissions()`` returns.

    The zero-argument ``super()`` is used on purpose: ``super(self.__class__, self)``
    resolves against the runtime class of ``self``, so as soon as this view is
    subclassed the call lands back on this very method and recurses forever.
    """
    return super().get_permissions()
def _count_windows(self, df, window_size=DEFAULT_WINDOW_SIZE, window_col=GENE_LENGTH_COL, stop_at=DEFAULT_STOP_AT): def _count_windows(self, queryset, window_size=DEFAULT_WINDOW_SIZE, window_col=GENE_LENGTH_COL,
stop_at=DEFAULT_STOP_AT):
""" """
Count how many line of the df belong to each windows defined by the window_size for the window_col Count how many entries by performing one query per range
:param df: :param queryset:
:param window_col: column concerned by the window :param window_col: column concerned by the window
:param window_size: size of the window :param window_size: size of the window
:return: {'data': COUNTS_BY_WINDOW, 'labels': START-END} :return: {'data': COUNTS_BY_WINDOW, 'labels': START-END}
""" """
length_max = df[window_col].max() length_max = queryset.aggregate(Max('length')).get('length__max', 0)
stop_at = length_max if length_max < stop_at else stop_at stop_at = length_max if length_max < stop_at else stop_at
all_ranges = [[i, i + window_size] for i in range(0, stop_at + 1, window_size)] all_ranges = [[i, i + window_size] for i in range(0, stop_at + 1, window_size)]
all_ranges[-1][1] = length_max + 1 # last should contain all above the stop_at all_ranges[-1][1] = length_max + 1 # last should contain all above the stop_at
...@@ -75,7 +75,7 @@ class GeneViewSet(BulkViewSet): ...@@ -75,7 +75,7 @@ class GeneViewSet(BulkViewSet):
labels = [] labels = []
for rg in all_ranges: for rg in all_ranges:
labels.append(f"{rg[0]/1000}k-{rg[1]/1000}k") labels.append(f"{rg[0]/1000}k-{rg[1]/1000}k")
data.append(df[get_mask(df, rg, window_col)].count()[window_col]) data.append(queryset.filter(length__gte=rg[0], length__lt=rg[1]).count())
# Change labels # Change labels
labels[0] = f"<{labels[0].split('-')[1]}" labels[0] = f"<{labels[0].split('-')[1]}"
labels[-1] = f">{labels[-1].split('-')[0]}" labels[-1] = f">{labels[-1].split('-')[0]}"
...@@ -105,12 +105,13 @@ class GeneViewSet(BulkViewSet): ...@@ -105,12 +105,13 @@ class GeneViewSet(BulkViewSet):
window_size = query_params.get('window_size', self.DEFAULT_WINDOW_SIZE) window_size = query_params.get('window_size', self.DEFAULT_WINDOW_SIZE)
stop_at = query_params.get('stop_at', self.DEFAULT_STOP_AT) stop_at = query_params.get('stop_at', self.DEFAULT_STOP_AT)
df = read_frame(Gene.objects.all(), fieldnames=[self.GENE_LENGTH_COL]) # df = read_frame(Gene.objects.all(), fieldnames=[self.GENE_LENGTH_COL])
if df.empty: queryset = Gene.objects.all()
if not queryset.exists():
return Response( return Response(
{'results': {}}, {},
status=HTTP_204_NO_CONTENT status=HTTP_204_NO_CONTENT
) )
return Response( return Response(
{'results': self._count_windows(df, window_size=window_size, stop_at=stop_at)} {'results': self._count_windows(queryset, window_size=window_size, stop_at=stop_at)}
) )
import pandas as pd
from django.contrib.auth.models import User from django.contrib.auth.models import User
from django.test import TestCase from django.test import TestCase
from django.urls import reverse from django.urls import reverse
...@@ -6,7 +5,6 @@ from rest_framework import status ...@@ -6,7 +5,6 @@ from rest_framework import status
from rest_framework.test import APITestCase from rest_framework.test import APITestCase
from rest_framework_jwt.settings import api_settings from rest_framework_jwt.settings import api_settings
from metagenedb.api.catalog.views.gene import GeneViewSet
from metagenedb.apps.catalog.factory import GeneFactory from metagenedb.apps.catalog.factory import GeneFactory
from metagenedb.common.utils.mocks.metagenedb import MetageneDBCatalogGeneAPIMock from metagenedb.common.utils.mocks.metagenedb import MetageneDBCatalogGeneAPIMock
...@@ -41,43 +39,17 @@ class TestGenes(TestCase): ...@@ -41,43 +39,17 @@ class TestGenes(TestCase):
self.assertEqual(resp.status_code, status.HTTP_200_OK) self.assertEqual(resp.status_code, status.HTTP_200_OK)
class TestCountWindows(TestCase):
    """Checks ``GeneViewSet._count_windows`` against a small in-memory DataFrame."""

    def setUp(self):
        # Single-column frame holding three lengths: 22, 29 and 35.
        self.window_col = "length"
        self.df = pd.DataFrame({self.window_col: [22, 29, 35]})

    def test_simple_count_window10(self):
        # Windows of size 10 with the default stop_at.
        observed = GeneViewSet()._count_windows(self.df, 10, window_col=self.window_col)
        self.assertDictEqual(
            observed,
            {
                'labels': ['<0.01k', '0.01k-0.02k', '0.02k-0.03k', '>0.03k'],
                'counts': [0, 0, 2, 1],
            },
        )

    def test_simple_count_window10_stop20(self):
        # With stop_at=20 the last window is expected to hold all three entries.
        observed = GeneViewSet()._count_windows(
            self.df,
            window_size=10,
            window_col=self.window_col,
            stop_at=20,
        )
        self.assertDictEqual(
            observed,
            {
                'labels': ['<0.01k', '0.01k-0.02k', '>0.02k'],
                'counts': [0, 0, 3],
            },
        )
class TestCountWindowsAPI(APITestCase): class TestCountWindowsAPI(APITestCase):
def setUp(self): def setUp(self):
self.gene_api = MetageneDBCatalogGeneAPIMock(self.client) self.gene_api = MetageneDBCatalogGeneAPIMock(self.client)
for i in range(2000, 4000, 350):
GeneFactory.create(length=i) def test_gene_length_no_content(self):
self.assertFalse(self.gene_api.get_gene_length())
def test_gene_length_api(self): def test_gene_length_api(self):
for i in range(2000, 4000, 350):
GeneFactory.create(length=i)
expected_dict = { expected_dict = {
'results': { 'results': {
'counts': [0, 0, 3, 3], 'counts': [0, 0, 3, 3],
...@@ -87,6 +59,8 @@ class TestCountWindowsAPI(APITestCase): ...@@ -87,6 +59,8 @@ class TestCountWindowsAPI(APITestCase):
self.assertDictEqual(self.gene_api.get_gene_length(), expected_dict) self.assertDictEqual(self.gene_api.get_gene_length(), expected_dict)
def test_gene_length_api_stop_at_2000(self): def test_gene_length_api_stop_at_2000(self):
for i in range(2000, 4000, 350):
GeneFactory.create(length=i)
expected_dict = { expected_dict = {
'results': { 'results': {
'counts': [0, 0, 6], 'counts': [0, 0, 6],
......
...@@ -52,6 +52,8 @@ class MetageneDBCatalogGeneAPIMock(MetageneDBAPIMock): ...@@ -52,6 +52,8 @@ class MetageneDBCatalogGeneAPIMock(MetageneDBAPIMock):
response = self.client.get(reverse(reverse_path), params) response = self.client.get(reverse(reverse_path), params)
if response.status_code in self.BAD_REQUESTS: if response.status_code in self.BAD_REQUESTS:
raise HTTPError raise HTTPError
if response.status_code == 204: # no content
return {}
return response.json() return response.json()
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment