Commit d6ad306e authored by Kenzo-Hugo Hillion
Browse files

add tool to compute all taxonomy entries present in at least one gene of the catalog

parent 357426f3
Pipeline #31343 passed with stages
in 3 minutes and 18 seconds
......@@ -140,9 +140,34 @@ class ComputeTaxonomyRepartition(ComputeStatistics):
self._save_to_db(payload)
class ComputeTaxonomyPresence(ComputeStatistics):
    """Compute, for each gene source and taxonomic level, the taxonomy present in genes.

    One payload is saved per (gene source, level) pair through ``_save_to_db``.
    """

    # Taxonomic levels for which a payload is computed, in processing order.
    ALL_LEVEL = [
        'kingdom', 'superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'
    ]

    def all(self):
        """Call ``GeneStatistics.present_taxonomy`` for every source and level and save each result."""
        for gene_source in self.GENE_SOURCES:
            # The 'all' source is handled by applying no filter on the genes.
            source_filters = {} if gene_source == 'all' else {'source': gene_source}
            statistics = GeneStatistics(filters=source_filters)
            for level in self.ALL_LEVEL:
                stats_id = slugify(f"GeneStatistics({gene_source}).present_taxonomy({level})")
                logger.info(
                    "Call GeneStatistics.present_taxonomy(%s) and saving under id <%s>",
                    level, stats_id
                )
                self._save_to_db({
                    'stats_id': stats_id,
                    'body': statistics.present_taxonomy(level=level),
                })
class Command(BaseCommand):
help = "Compute gene catalog statistics."
STEP_CHOICES = ['clean', 'counts', 'gene-length', 'taxonomy']
STEP_CHOICES = ['clean', 'counts', 'gene-length', 'taxonomy_repartition', 'taxonomy_presence']
def add_arguments(self, parser):
parser.add_argument('--only', help=f'Run only one step (choices: {self.STEP_CHOICES}).')
......@@ -168,5 +193,7 @@ class Command(BaseCommand):
ComputeCounts().all()
if only_step is None or only_step == "gene-length":
ComputeGeneLength().all()
if only_step is None or only_step == "taxonomy":
if only_step is None or only_step == "taxonomy_repartition":
ComputeTaxonomyRepartition().all()
if only_step is None or only_step == "taxonomy_presence":
ComputeTaxonomyPresence().all()
......@@ -3,7 +3,7 @@ from collections import defaultdict
from django.db.models import Max
from metagenedb.apps.catalog.models import Gene
from metagenedb.apps.catalog.models import Gene, Taxonomy
from metagenedb.common.utils.color_generator import generate_color_code
from metagenedb.common.utils.dict import extract_labels_and_values
......@@ -71,6 +71,21 @@ class GeneStatistics(Statistics):
'counts': results[1],
}
def present_taxonomy(self, level="phylum"):
    """Return the taxonomy entries found at ``level`` in at least one gene of the queryset.

    Genes whose taxonomy hierarchy has no entry for ``level`` are ignored.

    Args:
        level: Name of the hierarchy level to inspect (defaults to "phylum").

    Returns:
        A dict with two parallel lists, ``'tax_ids'`` and ``'tax_names'``, filled from
        the matching ``Taxonomy`` rows. No explicit ordering is applied here, so the
        order is whatever the ``Taxonomy`` query yields.
    """
    hierarchy_lookup = f"taxonomy__hierarchy__{level}"
    # NOTE(review): select_related is probably redundant since only values are read
    # below; kept to leave the query construction unchanged. Confirm before removing.
    annotated_genes = (
        self.get_queryset()
        .select_related(f'taxonomy__{level}')
        .filter(**{f"{hierarchy_lookup}__isnull": False})
    )
    # Many genes share the same taxonomy entry: keep each tax_id only once.
    unique_tax_ids = set(
        annotated_genes.values_list(f'{hierarchy_lookup}__tax_id', flat=True)
    )
    present_entries = list(Taxonomy.objects.filter(tax_id__in=list(unique_tax_ids)))
    return {
        'tax_ids': [entry.tax_id for entry in present_entries],
        'tax_names': [entry.name for entry in present_entries],
    }
class GeneLengthDistribution(Statistics):
model = Gene
......
from rest_framework.test import APITestCase
from metagenedb.common.utils.color_generator import generate_color_code
from metagenedb.apps.catalog.factory import (
GeneFactory, GeneWithEggNOGFactory, GeneWithKeggFactory, TaxonomyFactory
)
......@@ -13,11 +14,22 @@ class BaseTestGeneStatistics(APITestCase):
self.gene_stats = GeneStatistics()
class TestTaxonomyRepartition(BaseTestGeneStatistics):
class BaseTestTaxonomy(BaseTestGeneStatistics):
    """Base test case exposing a root -> phylum -> class chain of taxonomy fixtures."""

    @classmethod
    def setUpTestData(cls):
        """Create ``parent_root``, ``phylum`` and ``class_tax`` as class-level fixtures."""

        def attach(node, parent):
            # Link the node to its parent, persist it, then rebuild its hierarchy.
            node.parent = parent
            node.save()
            node.build_hierarchy()
            return node

        cls.parent_root = TaxonomyFactory(rank="root")
        cls.phylum = attach(TaxonomyFactory(rank='phylum'), cls.parent_root)
        cls.class_tax = attach(TaxonomyFactory(rank='class'), cls.phylum)
class TestTaxonomyRepartition(BaseTestTaxonomy):
def test_taxonomy_counts_no_content(self):
expected_dict = {
......@@ -27,35 +39,69 @@ class TestTaxonomyRepartition(BaseTestGeneStatistics):
}
self.assertDictEqual(self.gene_stats.taxonomy_repartition(), expected_dict)
def test_taxonom_counts_no_annotation(self):
    """A gene created with factory defaults is reported under 'No annotation'."""
    GeneFactory.create()
    label = 'No annotation'
    expected = {
        'labels': [label],
        'counts': [1],
        'colors': [generate_color_code(label)],
    }
    self.assertDictEqual(self.gene_stats.taxonomy_repartition(), expected)
def test_taxonomy_repartition(self):
tax_name = "TaxTest"
taxonomy = TaxonomyFactory(rank='phylum', name=tax_name)
taxonomy.parent = self.parent_root
taxonomy.save()
taxonomy.build_hierarchy()
gene = GeneFactory.create(taxonomy=taxonomy) # noqa
gene = GeneFactory.create(taxonomy=self.phylum) # noqa
expected_dict = {
'labels': [tax_name],
'labels': [self.phylum.name],
'counts': [1],
'colors': ['#c989eb']
'colors': [generate_color_code(self.phylum.name)]
}
self.assertDictEqual(self.gene_stats.taxonomy_repartition(), expected_dict)
def test_taxonomy_counts_class_level(self):
tax_name = "TaxTest"
taxonomy = TaxonomyFactory(rank='class', name=tax_name)
taxonomy.parent = self.parent_root
taxonomy.save()
taxonomy.build_hierarchy()
gene = GeneFactory.create(taxonomy=taxonomy) # noqa
gene = GeneFactory.create(taxonomy=self.class_tax) # noqa
expected_dict = {
'labels': [tax_name],
'labels': [self.class_tax.name],
'counts': [1],
'colors': ['#c989eb']
'colors': [generate_color_code(self.class_tax.name)]
}
self.assertDictEqual(self.gene_stats.taxonomy_repartition(level='class'), expected_dict)
class TestPresentTaxonomy(BaseTestTaxonomy):
    """Tests for ``GeneStatistics.present_taxonomy`` using the shared taxonomy fixtures."""

    @staticmethod
    def _presence_of(*taxonomies):
        # Build the payload expected from present_taxonomy for the given entries.
        return {
            'tax_ids': [taxonomy.tax_id for taxonomy in taxonomies],
            'tax_names': [taxonomy.name for taxonomy in taxonomies],
        }

    def test_present_taxonomy_no_content(self):
        """Without any gene, both lists are empty."""
        expected = self._presence_of()
        self.assertDictEqual(self.gene_stats.present_taxonomy(), expected)

    def test_present_taxonomy(self):
        """A single gene attached to the phylum reports that phylum."""
        GeneFactory.create(taxonomy=self.phylum)
        expected = self._presence_of(self.phylum)
        self.assertDictEqual(self.gene_stats.present_taxonomy(), expected)

    def test_present_taxonomy_multiple_genes(self):
        """Ten genes sharing the same phylum report it only once."""
        GeneFactory.create_batch(10, taxonomy=self.phylum)
        expected = self._presence_of(self.phylum)
        self.assertDictEqual(self.gene_stats.present_taxonomy(), expected)

    # NOTE(review): the name looks copied from the repartition tests; this checks
    # present_taxonomy at class level. Consider test_present_taxonomy_class_level.
    def test_taxonomy_counts_class_level(self):
        """A gene attached to the class-rank entry is reported when level='class'."""
        GeneFactory.create(taxonomy=self.class_tax)
        expected = self._presence_of(self.class_tax)
        self.assertDictEqual(self.gene_stats.present_taxonomy(level='class'), expected)
class TestCounts(BaseTestGeneStatistics):
@classmethod
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment