Commit 3e4f2bfc authored by Kenzo-Hugo Hillion's avatar Kenzo-Hugo Hillion
Browse files

Compute static values for statistics every day at midnight

parent a37b5522
Pipeline #20136 failed with stages
in 2 minutes and 29 seconds
......@@ -5,7 +5,7 @@ ENV PYTHONUNBUFFERED 1
# Install pipenv
RUN pip install pipenv
RUN apt update && apt install vim -y
RUN apt update && apt install vim cron -y
WORKDIR /code
RUN rm -rf Dockerfile
......
SHELL=/bin/bash
# Recompute the catalog statistics once a day, at 00:00.
# The minute field must be 0: "* 0 * * *" would start the job every minute
# between 00:00 and 00:59.
0 0 * * * /usr/local/bin/python /code/scripts/manage.py compute_stats
......@@ -3,20 +3,7 @@ from marshmallow.validate import OneOf
from metagenedb.common.django_default.qparams_validators import PaginatedQueryParams
# Taxonomic ranks accepted as the ``level`` query parameter (used as the OneOf choices
# of TaxCountQueryParams.level).
TAXA_CHOICES = [
    'superkingdom', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'
]
class GeneLengthQueryParams(Schema):
    """Query parameters of the gene length distribution: two integer fields."""

    # Width of each gene length window.
    window_size = fields.Integer()
    # Length from which all longer genes are grouped in the last window.
    stop_at = fields.Integer()
class GeneQueryParams(PaginatedQueryParams):
    """Extend PaginatedQueryParams with two boolean query parameters."""

    # NOTE(review): presumably select genes without taxonomy / without function
    # annotation -- confirm against the gene filter that consumes them.
    no_taxonomy = fields.Boolean()
    no_functions = fields.Boolean()
class TaxCountQueryParams(Schema):
    """Query parameters of the taxonomy counts: a single ``level`` string."""

    # Taxonomic rank to count genes by; validated against TAXA_CHOICES.
    level = fields.String(validate=OneOf(choices=TAXA_CHOICES))
from collections import defaultdict
from django.db.models import Max
from drf_yasg import openapi
from drf_yasg.utils import swagger_auto_schema
from marshmallow.exceptions import ValidationError
from rest_framework.decorators import action
from rest_framework.permissions import AllowAny
from rest_framework.response import Response
from rest_framework.status import HTTP_204_NO_CONTENT, HTTP_422_UNPROCESSABLE_ENTITY
from metagenedb.apps.catalog.models import Gene
from metagenedb.api.catalog.filters import GeneFilter
from metagenedb.api.catalog.qparams_validators.gene import GeneLengthQueryParams, GeneQueryParams, TaxCountQueryParams
from metagenedb.api.catalog.qparams_validators.gene import GeneQueryParams
from metagenedb.apps.catalog.serializers import GeneSerializer
from .base import BulkViewSet
class DocGeneLength(object):
    """
    Define response for API documentation of gene length distribution method

    Example of a documented response body (labels are expressed in thousands, the
    first and last windows are open-ended):

        {
            "results": {
                "counts": [0, 0, 3, 3],
                "labels": ["<1.0k", "1.0k-2.0k", "2.0k-3.0k", ">3.0k"]
            }
        }
    """
    # The API tests get 1k-wide windows when no parameter is given, so 1000 (not 10000)
    # is documented as the default window size.
    window_size_param = openapi.Parameter('window_size', in_=openapi.IN_QUERY, description='Size of the window.',
                                          type=openapi.TYPE_INTEGER, default=1000)
    # "int" and "char" are not OpenAPI types: use the drf_yasg TYPE_* constants so that
    # the generated schema is valid.
    counts = openapi.Schema(type=openapi.TYPE_ARRAY, items=openapi.Schema(type=openapi.TYPE_INTEGER),
                            description="Counts for every window_size")
    labels = openapi.Schema(type=openapi.TYPE_ARRAY, items=openapi.Schema(type=openapi.TYPE_STRING),
                            description="Corresponding windows")
    results = openapi.Schema(type=openapi.TYPE_OBJECT, properties={'counts': counts, 'labels': labels},
                             description="results of your request")
    gene_length_schema = openapi.Schema(type=openapi.TYPE_OBJECT, properties={'results': results})
    gene_length_response = openapi.Response('Get the distribution of gene length for a given window size',
                                            schema=gene_length_schema)
class GeneViewSet(BulkViewSet):
queryset = Gene.objects.select_related('taxonomy').prefetch_related('functions').all()
serializer_class = GeneSerializer
......@@ -59,104 +18,3 @@ class GeneViewSet(BulkViewSet):
# Default ``stop_at`` of the gene_length action and of _count_windows.
DEFAULT_STOP_AT = 10000
# Taxonomic rank used by the taxonomy_counts action when no ``level`` is given.
DEFAULT_LEVEL = 'phylum'
def get_permissions(self):
    """
    Return the permission instances to apply to the current action.

    The ``gene_length`` and ``taxonomy_counts`` actions are open to anyone; every
    other action keeps the permissions of the parent viewset.
    """
    if self.action in ['gene_length', 'taxonomy_counts']:
        return [AllowAny()]
    # Zero-argument super(): super(self.__class__, self) recurses forever as soon as
    # this viewset is subclassed, because self.__class__ is then the subclass.
    return super().get_permissions()
def _count_windows(self, queryset, window_size=DEFAULT_WINDOW_SIZE, window_col=GENE_LENGTH_COL,
                   stop_at=DEFAULT_STOP_AT):
    """
    Count how many entries by performing one query per range

    :param queryset: queryset whose ``length`` column is counted by window
    :param window_size: size of the window
    :param window_col: column concerned by the window (currently unused: the
                       ``length`` column is always used)
    :param stop_at: length from which all remaining entries go in the last window
    :return: {'counts': COUNTS_BY_WINDOW, 'labels': WINDOW_LABELS}
    """
    # aggregate() always returns the key, with None when there is nothing to aggregate,
    # so a default given to .get() never applies: fall back to 0 explicitly.
    length_max = queryset.aggregate(Max('length')).get('length__max') or 0
    stop_at = min(length_max, stop_at)
    all_ranges = [[start, start + window_size] for start in range(0, stop_at + 1, window_size)]
    all_ranges[-1][1] = length_max + 1  # last should contain all above the stop_at
    data = []
    labels = []
    for start, end in all_ranges:
        labels.append(f"{start/1000}k-{end/1000}k")
        data.append(queryset.filter(length__gte=start, length__lt=end).count())
    # Open-ended labels for the first and last windows. They are built from the bounds
    # rather than by splitting the labels: with a single window, splitting the already
    # rewritten first label produced "><...".
    labels[0] = f"<{all_ranges[0][1]/1000}k"
    labels[-1] = f">{all_ranges[-1][0]/1000}k"
    return {
        'counts': data,
        'labels': labels
    }
@swagger_auto_schema(
    manual_parameters=[DocGeneLength.window_size_param],
    responses={
        '200': DocGeneLength.gene_length_response,
        '204': 'No genes on the catalog to build the distribution'
    },
    operation_id='Gene length distribution',
)
@action(methods=['get'], detail=False)
def gene_length(self, request):
    """
    Return the distribution of gene lengths by window.

    Responds with 422 and the validation messages when the query parameters are
    invalid, with 204 when there is no gene, and otherwise with the counts and
    labels computed by ``_count_windows`` under the ``results`` key.
    """
    schema = GeneLengthQueryParams()
    try:
        params = schema.load(request.query_params)
    except ValidationError as err:
        payload = err.normalized_messages()
        # Tell the client which parameters this endpoint understands.
        payload['allowed_query_params'] = ', '.join(schema.declared_fields.keys())
        return Response(payload, status=HTTP_422_UNPROCESSABLE_ENTITY)

    genes = Gene.objects.all()
    if not genes.exists():
        return Response({}, status=HTTP_204_NO_CONTENT)

    distribution = self._count_windows(
        genes,
        window_size=params.get('window_size', self.DEFAULT_WINDOW_SIZE),
        stop_at=params.get('stop_at', self.DEFAULT_STOP_AT),
    )
    return Response({'results': distribution})
def _taxonomy_counts(self, queryset, level=DEFAULT_LEVEL):
    """
    Count the genes of ``queryset`` by taxonomy name at the given rank.

    :param queryset: genes to count
    :param level: name of the taxonomy relation holding the rank
    :return: {'counts': {TAXONOMY_NAME: COUNT}}, with a 'No annotation' entry only
             when some genes have a null ``taxonomy__<level>``
    """
    null_lookup = f"taxonomy__{level}__isnull"
    name_key = f'taxonomy__{level}__name'
    per_name = defaultdict(lambda: 0)
    unannotated = queryset.filter(**{null_lookup: True}).values().count()
    if unannotated != 0:
        per_name['No annotation'] = unannotated
    for row in queryset.filter(**{null_lookup: False}).values(name_key):
        per_name[row[name_key]] += 1
    return {'counts': per_name}
@action(methods=['get'], detail=False)
def taxonomy_counts(self, request):
    """
    Return the number of genes by taxonomy name at the requested rank.

    Responds with 422 and the validation messages when the query parameters are
    invalid, with 204 when there is no gene, and otherwise with
    ``{'results': {'counts': {...}, 'level': LEVEL}}`` where ``level`` is the rank as
    given by the client (or the default one).
    """
    try:
        query_params = TaxCountQueryParams().load(request.query_params)
    except ValidationError as validation_error:
        error_message = validation_error.normalized_messages()
        # List the parameters of *this* endpoint (the gene length ones were advertised
        # here by mistake).
        error_message.update({
            'allowed_query_params': ', '.join(TaxCountQueryParams().declared_fields.keys())
        })
        return Response(error_message, status=HTTP_422_UNPROCESSABLE_ENTITY)
    requested_level = query_params.get('level', self.DEFAULT_LEVEL)
    # deal with class exception: the relation is named class_rank @TODO fix cleaner way
    level = 'class_rank' if requested_level == 'class' else requested_level
    queryset = Gene.objects.all().select_related(f'taxonomy__{level}')
    if not queryset.exists():
        return Response(
            {},
            status=HTTP_204_NO_CONTENT
        )
    counts = self._taxonomy_counts(queryset, level=level)
    counts['level'] = requested_level
    return Response(
        {'results': counts}
    )
from django.test import TestCase
from django.urls import reverse
from rest_framework import status
from rest_framework.test import APITestCase
from metagenedb.apps.catalog.factory import GeneFactory, TaxonomyFactory
from metagenedb.common.utils.mocks.metagenedb import MetageneDBCatalogGeneAPIMock
class TestGenes(TestCase):
......@@ -17,81 +13,3 @@ class TestGenes(TestCase):
url = reverse('api:catalog:v1:genes-list')
resp = self.client.get(url)
self.assertEqual(resp.status_code, status.HTTP_200_OK)
class TestCountWindowsAPI(APITestCase):
    """Checks of the gene length distribution through the gene API mock."""

    def setUp(self):
        self.gene_api = MetageneDBCatalogGeneAPIMock(self.client)

    def test_gene_length_no_content(self):
        # Without any gene, the mock returns a falsy value.
        response = self.gene_api.get_gene_length()
        self.assertFalse(response)

    def test_gene_length_api(self):
        # Six genes: 2000, 2350, 2700, 3050, 3400 and 3750 long.
        for gene_length in range(2000, 4000, 350):
            GeneFactory.create(length=gene_length)
        expected_results = {
            'counts': [0, 0, 3, 3],
            'labels': ['<1.0k', '1.0k-2.0k', '2.0k-3.0k', '>3.0k']
        }
        response = self.gene_api.get_gene_length()
        self.assertDictEqual(response, {'results': expected_results})

    def test_gene_length_api_stop_at_2000(self):
        # Same six genes, all grouped in the last window because of stop_at.
        for gene_length in range(2000, 4000, 350):
            GeneFactory.create(length=gene_length)
        expected_results = {
            'counts': [0, 0, 6],
            'labels': ['<1.0k', '1.0k-2.0k', '>2.0k']
        }
        response = self.gene_api.get_gene_length(params={'stop_at': 2000})
        self.assertDictEqual(response, {'results': expected_results})
class TestTaxonomyCountsAPI(APITestCase):
    """Checks of the taxonomy counts through the gene API mock."""

    def setUp(self):
        self.gene_api = MetageneDBCatalogGeneAPIMock(self.client)

    def test_taxonomy_counts_no_content(self):
        # Without any gene, the mock returns a falsy value.
        response = self.gene_api.get_tax_counts()
        self.assertFalse(response)

    def test_taxonomy_counts_api(self):
        tax_name = "TaxTest"
        taxonomy = TaxonomyFactory(rank='phylum', name=tax_name)
        taxonomy.phylum = taxonomy  # link taxonomy to itself as phylum
        taxonomy.save()
        GeneFactory.create(taxonomy=taxonomy)
        expected_results = {
            'level': 'phylum',
            'counts': {tax_name: 1}
        }
        response = self.gene_api.get_tax_counts()
        self.assertDictEqual(response, {'results': expected_results})

    def test_taxonomy_counts_api_class_level(self):
        tax_name = "TaxTest"
        taxonomy = TaxonomyFactory(rank='class_rank', name=tax_name)
        taxonomy.class_rank = taxonomy  # link taxonomy to itself as class
        taxonomy.save()
        GeneFactory.create(taxonomy=taxonomy)
        expected_results = {
            'level': 'class',
            'counts': {tax_name: 1}
        }
        response = self.gene_api.get_tax_counts(params={'level': 'class'})
        self.assertDictEqual(response, {'results': expected_results})
import logging
from django.core.exceptions import ValidationError
from django.core.management.base import BaseCommand
from slugify import slugify
from metagenedb.apps.catalog.models import Statistics
from metagenedb.apps.catalog.operations.statistics import GeneStatistics
# Set the log line format, then get this module's logger; its level is adjusted by
# Command.set_logger_level from the command verbosity.
logging.basicConfig(format='[%(asctime)s] %(levelname)s:%(name)s:%(message)s')
logger = logging.getLogger(__name__)
class ComputeStatistics:
    """Base class of the statistics computations: stores a payload as a Statistics row."""

    def _save_to_db(self, payload):
        """
        Create or update the Statistics entry identified by ``payload['stats_id']``.

        :param payload: field values of the Statistics entry; must contain ``stats_id``
        :raises KeyError: when ``payload`` has no ``stats_id``
        :raises ValidationError: when the resulting entry does not pass ``full_clean()``
        """
        # Look the entry up first instead of using a validation failure as the signal
        # that it already exists: an invalid payload for a new entry then surfaces as
        # its ValidationError rather than being masked by Statistics.DoesNotExist.
        try:
            statistics = Statistics.objects.get(stats_id=payload['stats_id'])
        except Statistics.DoesNotExist:
            statistics = Statistics(**payload)
        else:
            for k, v in payload.items():
                setattr(statistics, k, v)
        statistics.full_clean()
        statistics.save()
class ComputeCounts(ComputeStatistics):
    """Compute and store the gene counts given by the GeneStatistics counters."""

    # Names of the GeneStatistics methods to call, one stored entry per method.
    METHODS = [
        'count', 'count_has_function', 'count_has_taxonomy', 'count_has_function_has_taxonomy'
    ]

    def compute_count(self, method):
        """
        Call ``GeneStatistics.<method>()`` and store its result as ``{'count': RESULT}``.

        :param method: name of a GeneStatistics method taking no argument
        """
        logger.info("Call GeneStatistics.%s()", method)
        stats_id = slugify(f"GeneStatistics.{method}()")
        counter = getattr(GeneStatistics, method)
        self._save_to_db({'stats_id': stats_id, 'body': {'count': counter()}})

    def all(self):
        """Compute and store every count listed in METHODS."""
        for name in self.METHODS:
            self.compute_count(name)
class ComputeGeneLength(ComputeStatistics):
    """Compute and store the gene length distribution for a grid of parameters."""

    WINDOW_SIZES = [
        100, 200, 400, 600, 800, 1000
    ]
    # 4000, 5000, ..., 10000
    STOP_ATS = list(range(4000, 11000, 1000))

    def all(self):
        """Store one entry per (window size, stop at) combination."""
        combinations = [(size, stop) for size in self.WINDOW_SIZES for stop in self.STOP_ATS]
        for window_size, stop_at in combinations:
            logger.info("Call GeneStatistics.gene_length(%s, %s)", window_size, stop_at)
            stats_id = slugify(f"GeneStatistics.gene_length-{window_size}-{stop_at}")
            distribution = GeneStatistics.gene_length(window_size=window_size, stop_at=stop_at)
            self._save_to_db({'stats_id': stats_id, 'body': distribution})
class ComputeTaxonomyRepartition(ComputeStatistics):
    """Compute and store the taxonomy repartition of the genes for several ranks."""

    ALL_LEVEL = [
        'kingdom', 'superkingdom', 'phylum', 'class', 'order', 'family', 'genus'
    ]

    def all(self):
        """Store one entry per rank listed in ALL_LEVEL."""
        for rank in self.ALL_LEVEL:
            logger.info("Call GeneStatistics.taxonomy_repartition(%s)", rank)
            stats_id = slugify(f"GeneStatistics.taxonomy_repartition-{rank}")
            repartition = GeneStatistics.taxonomy_repartition(level=rank)
            self._save_to_db({'stats_id': stats_id, 'body': repartition})
class Command(BaseCommand):
    """Management command computing and storing all the gene catalog statistics."""

    help = "Compute gene catalog statistics."

    def set_logger_level(self, verbosity):
        """
        Map the command verbosity to the level of this module's logger.

        :param verbosity: above 2 selects DEBUG, above 1 selects INFO; the level is
                          left untouched otherwise
        """
        if verbosity > 2:
            logger.setLevel(logging.DEBUG)
        elif verbosity > 1:
            logger.setLevel(logging.INFO)

    def handle(self, *args, **options):
        """Compute every family of statistics: counts, gene length, taxonomy."""
        self.set_logger_level(int(options['verbosity']))
        # All three computations are needed: the counts and the gene length
        # distributions were left commented out, so they were never stored.
        ComputeCounts().all()
        ComputeGeneLength().all()
        ComputeTaxonomyRepartition().all()
from collections import defaultdict
from django.db.models import Max
from metagenedb.apps.catalog.models import Gene
class GeneStatistics:
model = Gene
@staticmethod
def count():
    """Return the number of entries of ``GeneStatistics.model``."""
    every_gene = GeneStatistics.model.objects.all()
    return every_gene.count()
@staticmethod
def count_has_function():
    """Return the number of distinct genes having a non-null ``functions`` relation."""
    with_function = GeneStatistics.model.objects.filter(functions__isnull=False)
    return with_function.distinct().count()
......@@ -11,3 +19,52 @@ class GeneStatistics:
@staticmethod
def count_has_taxonomy():
    """Return the number of genes having a non-null ``taxonomy``."""
    with_taxonomy = GeneStatistics.model.objects.filter(taxonomy__isnull=False)
    return with_taxonomy.count()
@staticmethod
def count_has_function_has_taxonomy():
    """Return the number of distinct genes having both a taxonomy and a function."""
    with_taxonomy = GeneStatistics.model.objects.filter(taxonomy__isnull=False)
    with_both = with_taxonomy.filter(functions__isnull=False)
    return with_both.distinct().count()
@staticmethod
def gene_length(window_size=1000, stop_at=10000):
    """
    Count how many gene by window of gene length.

    :param window_size: size of the window
    :param stop_at: length from which all remaining genes go in the last window
    :return: {'counts': COUNTS_BY_WINDOW, 'labels': WINDOW_LABELS}; both lists are
             empty when there is no gene
    """
    # Only counts are computed here, so related objects are not fetched.
    queryset = GeneStatistics.model.objects.all()
    # exists() asks the database for a single row; the truth value of a queryset
    # loads every row just to test emptiness.
    if not queryset.exists():
        return {
            'counts': [],
            'labels': []
        }
    # aggregate() always returns the key, with None when no length is set, so a
    # default given to .get() never applies: fall back to 0 explicitly.
    length_max = queryset.aggregate(Max('length')).get('length__max') or 0
    stop_at = min(length_max, stop_at)
    all_ranges = [[start, start + window_size] for start in range(0, stop_at + 1, window_size)]
    all_ranges[-1][1] = length_max + 1  # last should contain all above the stop_at
    data = []
    labels = []
    for start, end in all_ranges:
        labels.append(f"{start/1000}k-{end/1000}k")
        data.append(queryset.filter(length__gte=start, length__lt=end).count())
    # Open-ended labels for the first and last windows. They are built from the bounds
    # rather than by splitting the labels: with a single window, splitting the already
    # rewritten first label produced "><...".
    labels[0] = f"<{all_ranges[0][1]/1000}k"
    labels[-1] = f">{all_ranges[-1][0]/1000}k"
    return {
        'counts': data,
        'labels': labels
    }
@staticmethod
def taxonomy_repartition(level="phylum"):
    """
    Count the genes by taxonomy name at the given rank.

    :param level: taxonomic rank; ``class`` is mapped to the ``class_rank`` relation
    :return: mapping {TAXONOMY_NAME: COUNT}, with a 'No annotation' entry only when
             some genes have a null ``taxonomy__<level>``
    """
    if level == "class":
        level = "class_rank"
    genes = Gene.objects.all().select_related(f'taxonomy__{level}')
    null_lookup = f"taxonomy__{level}__isnull"
    name_key = f'taxonomy__{level}__name'
    repartition = defaultdict(lambda: 0)
    unannotated = genes.filter(**{null_lookup: True}).values().count()
    if unannotated != 0:
        repartition['No annotation'] = unannotated
    for row in genes.filter(**{null_lookup: False}).values(name_key):
        repartition[row[name_key]] += 1
    return repartition
from rest_framework.test import APITestCase
from metagenedb.apps.catalog.factory import GeneFactory, TaxonomyFactory
from .statistics import GeneStatistics
class TestCountWindowsAPI(APITestCase):
    """Checks of GeneStatistics.gene_length."""

    def test_gene_length_no_content(self):
        # Without any gene, both lists are empty.
        observed = GeneStatistics.gene_length()
        self.assertDictEqual(observed, {'counts': [], 'labels': []})

    def test_gene_length(self):
        # Six genes: 2000, 2350, 2700, 3050, 3400 and 3750 long.
        for gene_length in range(2000, 4000, 350):
            GeneFactory.create(length=gene_length)
        observed = GeneStatistics.gene_length()
        self.assertDictEqual(observed, {
            'counts': [0, 0, 3, 3],
            'labels': ['<1.0k', '1.0k-2.0k', '2.0k-3.0k', '>3.0k']
        })

    def test_gene_length_stop_at_2000(self):
        # Same six genes, all grouped in the last window because of stop_at.
        for gene_length in range(2000, 4000, 350):
            GeneFactory.create(length=gene_length)
        observed = GeneStatistics.gene_length(stop_at=2000)
        self.assertDictEqual(observed, {
            'counts': [0, 0, 6],
            'labels': ['<1.0k', '1.0k-2.0k', '>2.0k']
        })
class TestTaxonomyCountsAPI(APITestCase):
    """Checks of GeneStatistics.taxonomy_repartition."""

    def test_taxonomy_counts_no_content(self):
        # Without any gene, the repartition is empty.
        observed = GeneStatistics.taxonomy_repartition()
        self.assertDictEqual(observed, {})

    def test_taxonomy_repartition(self):
        tax_name = "TaxTest"
        taxonomy = TaxonomyFactory(rank='phylum', name=tax_name)
        taxonomy.phylum = taxonomy  # link taxonomy to itself as phylum
        taxonomy.save()
        GeneFactory.create(taxonomy=taxonomy)
        observed = GeneStatistics.taxonomy_repartition()
        self.assertDictEqual(observed, {tax_name: 1})

    def test_taxonomy_counts_class_level(self):
        tax_name = "TaxTest"
        taxonomy = TaxonomyFactory(rank='class_rank', name=tax_name)
        taxonomy.class_rank = taxonomy  # link taxonomy to itself as class
        taxonomy.save()
        GeneFactory.create(taxonomy=taxonomy)
        observed = GeneStatistics.taxonomy_repartition(level='class')
        self.assertDictEqual(observed, {tax_name: 1})
......@@ -8,3 +8,4 @@ python ${SCRIPTS_PATH}/manage.py collectstatic --no-input
python ${SCRIPTS_PATH}/manage.py makemigrations
python ${SCRIPTS_PATH}/manage.py migrate
# Install the crontab and start cron *before* the server: runserver stays in the
# foreground, so a command placed after it is never reached while the server runs.
crontab cron.txt && service cron start
python ${SCRIPTS_PATH}/manage.py runserver 0.0.0.0:${PORT}
......@@ -128,19 +128,15 @@ export default {
},
methods: {
getGeneLength() {
axios.get('/api/catalog/v1/genes/gene_length', {
params: {
'window_size': this.geneLengthWindowSize,
'stop_at': this.stopAt,
},
axios.get('/api/catalog/v1/statistics/genestatistics-gene-length-' + this.geneLengthWindowSize + '-' + this.stopAt, {
headers: {
Accept: 'application/json',
},
})
.then((response) => {
this.geneLengthData = {
data: response.data.results.counts,
labels: response.data.results.labels,
data: response.data.body.counts,
labels: response.data.body.labels,
label: "Number of genes",
borderColor: '#edb183',
backgroundColor: '#f15152',
......@@ -151,7 +147,7 @@ export default {
});
},
getGeneCountsAll() {
axios.get('/api/catalog/v1/genes', {
axios.get('/api/catalog/v1/statistics/genestatistics-count', {
headers: {
Accept: 'application/json',
},
......@@ -160,7 +156,7 @@ export default {
this.geneCountAll = {
class: "secondary",
icon: "bar_chart",
text: response.data.count,
text: response.data.body.count,
title: "Genes",
};
})
......@@ -187,10 +183,7 @@ export default {
});
},
getGeneCountsTaxo() {
axios.get('/api/catalog/v1/genes', {
params: {
no_taxonomy: false,
},
axios.get('/api/catalog/v1/statistics/genestatistics-count-has-taxonomy', {
headers: {
Accept: 'application/json',
},
......@@ -199,7 +192,7 @@ export default {
this.geneCountTaxo = {
class: "secondary",
icon: "bar_chart",