Skip to content
Snippets Groups Projects
Commit d15281f4 authored by Kenzo-Hugo Hillion's avatar Kenzo-Hugo Hillion :recycle:
Browse files

Merge branch 'improve-gene-length-dedicated-class' into 'dev'

Improve gene length dedicated class

See merge request !33
parents 5cf99b20 2ed62d3b
No related branches found
No related tags found
2 merge requests: !59 "Prod", !33 "Improve gene length dedicated class"
Pipeline #21060 passed with stages
in 2 minutes and 50 seconds
......@@ -5,7 +5,7 @@ from django.core.management.base import BaseCommand
from slugify import slugify
from metagenedb.apps.catalog.models import Statistics
from metagenedb.apps.catalog.operations.statistics import GeneStatistics
from metagenedb.apps.catalog.operations.statistics import GeneStatistics, GeneLengthDistribution
logging.basicConfig(format='[%(asctime)s] %(levelname)s:%(name)s:%(message)s')
logger = logging.getLogger(__name__)
......@@ -53,8 +53,10 @@ class ComputeCounts(ComputeStatistics):
class ComputeGeneLength(ComputeStatistics):
WINDOW_SIZES = list(range(100, 300, 100))
STOP_ATS = list(range(4000, 6000, 1000))
MIN_WINDOW_SIZE = 100
MAX_STOP_AT = 5000
WINDOW_SIZES = list(range(MIN_WINDOW_SIZE, 300, 100))
STOP_ATS = list(range(1000, MAX_STOP_AT + 1, 1000))
CATEGORIES = {
'all': None,
'with-functions': {'functions__isnull': False},
......@@ -68,14 +70,16 @@ class ComputeGeneLength(ComputeStatistics):
}
def all(self):
gene_stats = GeneStatistics()
for category, filters in self.CATEGORIES.items():
gene_stats = GeneLengthDistribution(
window_size=self.MIN_WINDOW_SIZE, stop_at=self.MAX_STOP_AT, filters=filters
)
for window_size in self.WINDOW_SIZES:
for stop_at in self.STOP_ATS:
logger.info("Call GeneStatistics.gene_length(%s, %s) for %s", window_size, stop_at, category)
payload = {
'stats_id': slugify(f"GeneStatistics.gene_length-{window_size}-{stop_at}-{category}"),
'body': gene_stats.gene_length(window_size=window_size, stop_at=stop_at, filters=filters)
'body': gene_stats.get_distribution(window_size=window_size, stop_at=stop_at)
}
self._save_to_db(payload)
......@@ -107,6 +111,6 @@ class Command(BaseCommand):
def handle(self, *args, **options):
self.set_logger_level(int(options['verbosity']))
ComputeCounts().all()
# ComputeCounts().all()
ComputeGeneLength().all()
ComputeTaxonomyRepartition().all()
# ComputeTaxonomyRepartition().all()
# Generated by Django 3.0.1 on 2019-12-30 10:31
from django.db.models.indexes import Index
from django.db import migrations, models
......@@ -20,5 +19,4 @@ class Migration(migrations.Migration):
name='length',
field=models.PositiveIntegerField(db_index=True),
),
migrations.AddIndex('Gene', Index(fields=['length'], name='length_index')),
]
......@@ -91,3 +91,119 @@ class GeneStatistics(Statistics):
'colors': colors,
'counts': results[1],
}
class GeneLengthDistribution(Statistics):
    """Count genes per window of gene length and re-shape the result on demand.

    The database is queried once, lazily, with the finest resolution
    (``window_size``) and the largest upper bound (``stop_at``) given to the
    constructor.  ``get_distribution`` then derives coarser windows and/or a
    lower ``stop_at`` from the cached counts without querying again.

    Labels are expressed in thousands (``'0.1k-0.2k'``); the first window is
    shown as ``'<upper'`` and the last one as ``'>lower'``.
    """

    model = Gene

    def __init__(self, window_size=100, stop_at=5000, filters=None, counts_key='counts', windows_key='labels'):
        """
        :param window_size: finest window width used for the database query.
        :param stop_at: largest length boundary; longer genes fall in the last window.
        :param filters: optional dict forwarded to ``get_queryset(filters=...)``.
        :param counts_key: key of the counts list in the returned dict.
        :param windows_key: key of the labels list in the returned dict.
        """
        self.base_window_size = window_size
        self.base_stop_at = stop_at
        # Effective upper bound; lowered by _perform_query when the longest
        # gene is shorter than base_stop_at.
        self.current_stop_at = stop_at
        self.filters = {} if filters is None else filters
        self.counts_key = counts_key
        self.windows_key = windows_key
        # Lazy cache filled by _perform_query.
        self._counts = None
        self._windows = None

    def _update_first_window(self, window_name):
        """Turn ``'a-b'`` into ``'<b'``; an already relabelled name is returned as is."""
        if '-' not in window_name:
            return window_name
        return f"<{window_name.split('-')[1]}"

    def _update_last_window(self, window_name):
        """Turn ``'a-b'`` into ``'>a'``; an already relabelled name is returned as is.

        Leaving relabelled names untouched avoids labels such as ``'>>0.4k'``
        when the requested stop_at falls on the last window, or ``'><0.3k'``
        when there is a single window.
        """
        if '-' not in window_name:
            return window_name
        return f">{window_name.split('-')[0]}"

    def _update_first_last_windows(self, windows):
        """Relabel the first and last entries of ``windows`` in place and return it."""
        if not windows:
            return windows
        windows[0] = self._update_first_window(windows[0])
        windows[-1] = self._update_last_window(windows[-1])
        return windows

    def _filtered_queryset(self):
        """Queryset restricted to ``self.filters`` (if any), loading only ``length``."""
        if self.filters:
            return self.get_queryset(filters=self.filters).distinct().only('length')
        return self.get_queryset().only('length')

    def _perform_query(self):
        """
        Count how many gene by window of gene length.

        Fills ``self._counts`` / ``self._windows`` and updates
        ``self.current_stop_at``.
        """
        queryset = self._filtered_queryset()
        # Max() aggregates to None (the key is still present) on an empty
        # queryset, so dict.get's default would never apply.
        length_max = queryset.aggregate(Max('length'))['length__max'] or 0
        self.current_stop_at = min(length_max, self.base_stop_at)
        all_ranges = [
            [start, start + self.base_window_size]
            for start in range(0, self.current_stop_at + 1, self.base_window_size)
        ]
        all_ranges[-1][1] = length_max + 1  # last should contain all above the stop_at
        data = []
        windows = []
        for lower, upper in all_ranges:
            windows.append(f"{lower/1000}k-{upper/1000}k")
            data.append(queryset.filter(length__gte=lower, length__lt=upper).count())
        # Change labels
        self._counts = data
        self._windows = self._update_first_last_windows(windows)

    @property
    def counts(self):
        """Counts per base window (triggers the query on first access)."""
        if getattr(self, '_counts', None) is None:
            self._perform_query()
        return self._counts

    @property
    def windows(self):
        """Labels per base window (triggers the query on first access)."""
        if getattr(self, '_windows', None) is None:
            self._perform_query()
        return self._windows

    def validate_arguments(self, window_size, stop_at):
        """Raise ValueError when the request cannot be derived from the base query."""
        if window_size < self.base_window_size or window_size % self.base_window_size != 0:
            raise ValueError(f"window_size has to be >= {self.base_window_size} and a multiple of it.")
        if stop_at > self.base_stop_at or stop_at % window_size != 0:
            raise ValueError(f"stop_at needs be <= {self.base_stop_at} and a multiple of {window_size}.")

    def _reduce_stop_at(self, counts, windows, window_size, stop_at):
        """Merge every window starting at ``stop_at`` or above into one last window."""
        indice_stop_at = stop_at // window_size
        new_last_window = self._update_last_window(windows[indice_stop_at])
        return {
            self.counts_key: counts[:indice_stop_at] + [sum(counts[indice_stop_at:])],
            self.windows_key: windows[:indice_stop_at] + [new_last_window]
        }

    def _group_by_different_window_size(self, counts, windows, window_size):
        """Sum consecutive base windows into windows of ``window_size`` and rebuild the labels."""
        group_nb = window_size // self.base_window_size
        new_counts = []
        new_windows = []
        for i in range(0, len(counts), group_nb):
            new_counts.append(sum(counts[i:i + group_nb]))
            new_windows.append(
                f"{(self.base_window_size * i)/1000}k-{(self.base_window_size * (i + group_nb))/1000}k"
            )
        return {
            self.counts_key: new_counts,
            self.windows_key: self._update_first_last_windows(new_windows)
        }

    def _handle_diff_window_diff_stop_at(self, counts, windows, window_size, stop_at):
        """Group to ``window_size`` first, then cut the grouped result at ``stop_at``."""
        new_group = self._group_by_different_window_size(counts, windows, window_size)
        return self._reduce_stop_at(
            new_group[self.counts_key], new_group[self.windows_key], window_size, stop_at
        )

    def _format_dict(self, window_size, stop_at):
        """Build the result dict for the requested ``window_size`` / ``stop_at``."""
        # Read the cached data first: the lazy query is what sets
        # current_stop_at, so comparing before it ran would use a stale bound.
        counts = self.counts
        windows = self.windows
        keep_all_windows = stop_at >= self.current_stop_at
        if window_size == self.base_window_size:
            if keep_all_windows:
                # Copies, so that callers cannot alter the cached lists.
                return {
                    self.counts_key: list(counts),
                    self.windows_key: list(windows)
                }
            return self._reduce_stop_at(counts, windows, window_size, stop_at)
        if keep_all_windows:
            return self._group_by_different_window_size(counts, windows, window_size)
        return self._handle_diff_window_diff_stop_at(counts, windows, window_size, stop_at)

    def get_distribution(self, window_size=100, stop_at=5000):
        """Return ``{counts_key: [...], windows_key: [...]}`` for the given resolution.

        Both lists are empty when no gene matches the configured filters.

        :raises ValueError: if ``window_size`` / ``stop_at`` are not compatible
            with the values given to the constructor.
        """
        # Check the filtered queryset: filters may match nothing even though
        # the table itself is not empty.
        if not self._filtered_queryset().exists():
            return {
                self.counts_key: [],
                self.windows_key: []
            }
        self.validate_arguments(window_size, stop_at)
        return self._format_dict(window_size, stop_at)
......@@ -4,7 +4,7 @@ from metagenedb.apps.catalog.factory import (
GeneFactory, GeneWithEggNOGFactory, GeneWithKeggFactory, TaxonomyFactory
)
from .statistics import GeneStatistics
from .statistics import GeneStatistics, GeneLengthDistribution
class BaseTestGeneStatistics(APITestCase):
......@@ -13,53 +13,6 @@ class BaseTestGeneStatistics(APITestCase):
self.gene_stats = GeneStatistics()
class TestCountWindows(BaseTestGeneStatistics):
    """Checks on ``GeneStatistics.gene_length`` for several database contents."""

    def test_gene_length_no_content(self):
        # No gene created: both lists are expected to be empty.
        observed = self.gene_stats.gene_length()
        self.assertDictEqual(observed, {'counts': [], 'labels': []})

    def test_gene_length(self):
        for length in range(2000, 4000, 350):
            GeneFactory.create(length=length)
        observed = self.gene_stats.gene_length()
        self.assertDictEqual(
            observed,
            {
                'counts': [0, 0, 3, 3],
                'labels': ['<1.0k', '1.0k-2.0k', '2.0k-3.0k', '>3.0k'],
            },
        )

    def test_gene_length_stop_at_2000(self):
        for length in range(2000, 4000, 350):
            GeneFactory.create(length=length)
        observed = self.gene_stats.gene_length(stop_at=2000)
        self.assertDictEqual(
            observed,
            {
                'counts': [0, 0, 6],
                'labels': ['<1.0k', '1.0k-2.0k', '>2.0k'],
            },
        )

    def test_gene_length_with_functions(self):
        # For every length: one gene without function and one with a KEGG function.
        for length in range(2000, 4000, 350):
            GeneFactory.create(length=length)
            GeneWithKeggFactory(length=length)
        labels = ['<1.0k', '1.0k-2.0k', '2.0k-3.0k', '>3.0k']

        self.assertDictEqual(
            self.gene_stats.gene_length(),
            {'counts': [0, 0, 6, 6], 'labels': labels},
        )

        only_with_function = {'functions__isnull': False}
        self.assertDictEqual(
            self.gene_stats.gene_length(filters=only_with_function),
            {'counts': [0, 0, 3, 3], 'labels': labels},
        )
class TestTaxonomyRepartition(BaseTestGeneStatistics):
def test_taxonomy_counts_no_content(self):
......@@ -133,3 +86,74 @@ class TestCounts(BaseTestGeneStatistics):
def test_count_has_taxonomy_has_function(self):
self.assertEqual(self.gene_stats.count_has_function_has_taxonomy(), 15)
class TestCountWindows(APITestCase):
    """Checks on ``GeneLengthDistribution.get_distribution``."""

    def _create_genes(self, upper_bound):
        """Create one gene per length in ``range(100, upper_bound, 60)``."""
        for length in range(100, upper_bound, 60):
            GeneFactory.create(length=length)

    def test_gene_length_no_content(self):
        # No gene created: both lists are expected to be empty.
        distribution = GeneLengthDistribution().get_distribution()
        self.assertDictEqual(distribution, {'counts': [], 'labels': []})

    def test_get_distribution(self):
        self._create_genes(400)
        distribution = GeneLengthDistribution().get_distribution()
        self.assertDictEqual(
            distribution,
            {
                'counts': [0, 2, 2, 1],
                'labels': ['<0.1k', '0.1k-0.2k', '0.2k-0.3k', '>0.3k'],
            },
        )

    def test_gene_length_stop_at_200(self):
        self._create_genes(400)
        distribution = GeneLengthDistribution().get_distribution(stop_at=200)
        self.assertDictEqual(
            distribution,
            {
                'counts': [0, 2, 3],
                'labels': ['<0.1k', '0.1k-0.2k', '>0.2k'],
            },
        )

    def test_gene_length_window_size_200(self):
        self._create_genes(400)
        distribution = GeneLengthDistribution().get_distribution(window_size=200)
        self.assertDictEqual(
            distribution,
            {
                'counts': [2, 3],
                'labels': ['<0.2k', '>0.2k'],
            },
        )

    def test_gene_length_window_size_200_stop_at_200(self):
        self._create_genes(500)
        distribution = GeneLengthDistribution().get_distribution(window_size=200, stop_at=200)
        self.assertDictEqual(
            distribution,
            {
                'counts': [2, 5],
                'labels': ['<0.2k', '>0.2k'],
            },
        )

    def test_gene_length_with_functions(self):
        # For every length: one gene without function and one with a KEGG function.
        for length in range(100, 400, 60):
            GeneFactory.create(length=length)
            GeneWithKeggFactory(length=length)
        labels = ['<0.1k', '0.1k-0.2k', '0.2k-0.3k', '>0.3k']

        unfiltered = GeneLengthDistribution()
        self.assertDictEqual(
            unfiltered.get_distribution(),
            {'counts': [0, 4, 4, 2], 'labels': labels},
        )

        with_function = GeneLengthDistribution(filters={'functions__isnull': False})
        self.assertDictEqual(
            with_function.get_distribution(),
            {'counts': [0, 2, 2, 1], 'labels': labels},
        )
......@@ -118,7 +118,7 @@ export default {
computed: {
stopAtChoice() {
let listStopAt = [];
for (let i=4000; i<=5000; i+=1000) {
for (let i=1000; i<=5000; i+=1000) {
listStopAt.push(i);
};
return listStopAt;
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment