Commit bd68a8c4 authored by Kenzo-Hugo Hillion
Browse files

Improve perf for counting windows

parent 87cb9d53
Pipeline #19172 passed with stages
in 2 minutes and 20 seconds
from django_pandas.io import read_frame
from django.db.models import Max
from drf_yasg import openapi
from drf_yasg.utils import swagger_auto_schema
from marshmallow.exceptions import ValidationError
......@@ -10,7 +10,6 @@ from metagenedb.apps.catalog.models import Gene
from metagenedb.api.catalog.filters import GeneFilter
from metagenedb.api.catalog.qparams_validators.gene import GeneLengthQueryParams, GeneQueryParams
from metagenedb.apps.catalog.serializers import GeneSerializer
from metagenedb.common.utils.df_operations import get_mask
from .bulk_viewset import BulkViewSet
......@@ -59,15 +58,16 @@ class GeneViewSet(BulkViewSet):
def get_permissions(self):
return super(self.__class__, self).get_permissions()
def _count_windows(self, df, window_size=DEFAULT_WINDOW_SIZE, window_col=GENE_LENGTH_COL, stop_at=DEFAULT_STOP_AT):
def _count_windows(self, queryset, window_size=DEFAULT_WINDOW_SIZE, window_col=GENE_LENGTH_COL,
stop_at=DEFAULT_STOP_AT):
"""
Count how many line of the df belong to each windows defined by the window_size for the window_col
:param df:
Count how many entries by performing one query per range
:param queryset:
:param window_col: column concerned by the window
:param window_size: size of the window
:return: {'data': COUNTS_BY_WINDOW, 'labels': START-END}
"""
length_max = df[window_col].max()
length_max = queryset.aggregate(Max('length')).get('length__max', 0)
stop_at = length_max if length_max < stop_at else stop_at
all_ranges = [[i, i + window_size] for i in range(0, stop_at + 1, window_size)]
all_ranges[-1][1] = length_max + 1 # last should contain all above the stop_at
......@@ -75,7 +75,7 @@ class GeneViewSet(BulkViewSet):
labels = []
for rg in all_ranges:
labels.append(f"{rg[0]/1000}k-{rg[1]/1000}k")
data.append(df[get_mask(df, rg, window_col)].count()[window_col])
data.append(queryset.filter(length__gte=rg[0], length__lt=rg[1]).count())
# Change labels
labels[0] = f"<{labels[0].split('-')[1]}"
labels[-1] = f">{labels[-1].split('-')[0]}"
......@@ -105,12 +105,13 @@ class GeneViewSet(BulkViewSet):
window_size = query_params.get('window_size', self.DEFAULT_WINDOW_SIZE)
stop_at = query_params.get('stop_at', self.DEFAULT_STOP_AT)
df = read_frame(Gene.objects.all(), fieldnames=[self.GENE_LENGTH_COL])
if df.empty:
# df = read_frame(Gene.objects.all(), fieldnames=[self.GENE_LENGTH_COL])
queryset = Gene.objects.all()
if not queryset.exists():
return Response(
{'results': {}},
{},
status=HTTP_204_NO_CONTENT
)
return Response(
{'results': self._count_windows(df, window_size=window_size, stop_at=stop_at)}
{'results': self._count_windows(queryset, window_size=window_size, stop_at=stop_at)}
)
import pandas as pd
from django.contrib.auth.models import User
from django.test import TestCase
from django.urls import reverse
......@@ -6,7 +5,6 @@ from rest_framework import status
from rest_framework.test import APITestCase
from rest_framework_jwt.settings import api_settings
from metagenedb.api.catalog.views.gene import GeneViewSet
from metagenedb.apps.catalog.factory import GeneFactory
from metagenedb.common.utils.mocks.metagenedb import MetageneDBCatalogGeneAPIMock
......@@ -41,43 +39,17 @@ class TestGenes(TestCase):
self.assertEqual(resp.status_code, status.HTTP_200_OK)
class TestCountWindows(TestCase):
def setUp(self):
self.window_col = "length"
self.df = pd.DataFrame(
[22, 29, 35],
columns=[self.window_col]
)
def test_simple_count_window10(self):
expected_dict = {
'labels': ['<0.01k', '0.01k-0.02k', '0.02k-0.03k', '>0.03k'],
'counts': [0, 0, 2, 1]
}
geneviewset = GeneViewSet()
test_dict = geneviewset._count_windows(self.df, 10, window_col=self.window_col)
self.assertDictEqual(test_dict, expected_dict)
def test_simple_count_window10_stop20(self):
expected_dict = {
'labels': ['<0.01k', '0.01k-0.02k', '>0.02k'],
'counts': [0, 0, 3]
}
geneviewset = GeneViewSet()
test_dict = geneviewset._count_windows(self.df, window_size=10,
window_col=self.window_col, stop_at=20)
self.assertDictEqual(test_dict, expected_dict)
class TestCountWindowsAPI(APITestCase):
def setUp(self):
self.gene_api = MetageneDBCatalogGeneAPIMock(self.client)
for i in range(2000, 4000, 350):
GeneFactory.create(length=i)
def test_gene_length_no_content(self):
self.assertFalse(self.gene_api.get_gene_length())
def test_gene_length_api(self):
for i in range(2000, 4000, 350):
GeneFactory.create(length=i)
expected_dict = {
'results': {
'counts': [0, 0, 3, 3],
......@@ -87,6 +59,8 @@ class TestCountWindowsAPI(APITestCase):
self.assertDictEqual(self.gene_api.get_gene_length(), expected_dict)
def test_gene_length_api_stop_at_2000(self):
for i in range(2000, 4000, 350):
GeneFactory.create(length=i)
expected_dict = {
'results': {
'counts': [0, 0, 6],
......
......@@ -52,6 +52,8 @@ class MetageneDBCatalogGeneAPIMock(MetageneDBAPIMock):
response = self.client.get(reverse(reverse_path), params)
if response.status_code in self.BAD_REQUESTS:
raise HTTPError
if response.status_code == 204: # no content
return {}
return response.json()
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment