Commit f5da1cca authored by Kenzo-Hugo Hillion's avatar Kenzo-Hugo Hillion
Browse files

Add taxonomy to Gene model

parent 25377fe4
Pipeline #13540 passed with stages
in 2 minutes and 3 seconds
......@@ -6,9 +6,17 @@ from metagenedb.apps.catalog.models import Gene
# Admin configuration for the Gene model, registered through @admin.register.
# NOTE(review): this block looks like a rendered diff with indentation
# stripped: a replaced line is immediately followed by its replacement
# (two list_display assignments, two return paths in get_functions).
# Confirm against the real file before editing the code itself.
@admin.register(Gene)
class GeneAdmin(admin.ModelAdmin):
list_display = ('gene_id', 'gene_length', 'get_functions')
# Later assignment: same columns as above plus 'get_taxonomy'.
list_display = ('gene_id', 'gene_length', 'get_functions', 'get_taxonomy')
search_fields = ('gene_id',)
# Column helper: comma-joined str() of every object in obj.functions.all().
def get_functions(self, obj):
return ",".join([str(f) for f in obj.functions.all()])
# Variant that returns '-' when obj.functions.all() is falsy.
# NOTE(review): obj.functions.all() is called twice here, once for the
# truthiness check and once for the join.
if obj.functions.all():
return ",".join([str(f) for f in obj.functions.all()])
return '-'
# Label attached to the get_functions column.
get_functions.short_description = 'Functions'
# Column helper: "<taxonomy> (<rank>)" when obj.taxonomy is truthy, else '-'.
def get_taxonomy(self, obj):
if obj.taxonomy:
return f"{obj.taxonomy} ({obj.taxonomy.rank})"
return '-'
# Label attached to the get_taxonomy column.
get_taxonomy.short_description = 'Taxonomy'
# Generated by Django 2.2.1 on 2019-08-05 13:45
from django.db import migrations, models
import django.db.models.deletion
class Migration(migrations.Migration):
    """Add an optional ``taxonomy`` foreign key to the ``gene`` model."""

    # Declared prerequisite: catalog migration 0005_gene_ordering.
    dependencies = [('catalog', '0005_gene_ordering')]

    operations = [
        migrations.AddField(
            model_name='gene',
            name='taxonomy',
            # Nullable link to catalog.Taxonomy; SET_NULL on delete, reverse
            # accessor named `genes`.
            field=models.ForeignKey(
                to='catalog.Taxonomy',
                related_name='genes',
                on_delete=django.db.models.deletion.SET_NULL,
                null=True,
                blank=True,
            ),
        ),
    ]
......@@ -7,6 +7,11 @@ class Gene(models.Model):
# Fields of the Gene model (the class header is outside this excerpt).
# Identifier of the gene: unique, indexed, at most 100 characters.
gene_id = models.CharField(max_length=100, unique=True, db_index=True)
# Length of the gene as an integer.
# NOTE(review): unit is not stated here — presumably base pairs; confirm.
gene_length = models.IntegerField()
# Many-to-many link to Function.
functions = models.ManyToManyField(Function)
# Optional link to a Taxonomy row (null and blank allowed). SET_NULL is used
# on delete, and the reverse accessor on Taxonomy is `genes`.
taxonomy = models.ForeignKey(
'Taxonomy', related_name='genes',
on_delete=models.SET_NULL,
null=True, blank=True
)
# The string form of a gene is its gene_id.
def __str__(self):
return self.gene_id
......
from rest_framework import serializers
from metagenedb.apps.catalog.models import Gene
from metagenedb.apps.catalog.models import Gene, Taxonomy
from metagenedb.apps.catalog.serializers import FunctionSerializer
# Serializer for the Gene model.
# NOTE(review): two `fields` tuples follow each other in Meta; this looks
# like a rendered diff (old line, then its replacement). Confirm against the
# real file before editing the code itself.
class GeneSerializer(serializers.ModelSerializer):
# Nested list of functions, read-only.
functions = FunctionSerializer(many=True, read_only=True)
# Taxonomy is referenced through its tax_id (looked up among all Taxonomy
# objects) and is not required.
taxonomy = serializers.SlugRelatedField(
queryset=Taxonomy.objects.all(),
slug_field='tax_id',
required=False,
)
class Meta:
model = Gene
fields = ('gene_id', 'gene_length', 'functions')
# Later assignment: same tuple plus 'taxonomy'.
fields = ('gene_id', 'gene_length', 'functions', 'taxonomy')
......@@ -14,13 +14,15 @@ from metagenedb.common.utils.parsers import IGCLineParser
# Script bootstrap: the settings module is set and django.setup() is called
# before the model and serializer imports below.
# NOTE(review): the duplicated import and the two SELECTED_KEYS assignments
# look like old/new lines of a rendered diff — confirm against the real file.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "metagenedb.settings")
django.setup()
from metagenedb.apps.catalog.models import Gene, Function # noqa
from metagenedb.apps.catalog.models import Gene, Function, Taxonomy # noqa
from metagenedb.apps.catalog.serializers import GeneSerializer # noqa
logging.basicConfig(level=logging.INFO)
_LOGGER = logging.getLogger(__name__)
SELECTED_KEYS = ['gene_id', 'gene_length', 'kegg_ko']
# Names of the two taxonomy columns used by this script.
PHYLUM_COL = 'taxo_phylum'
GENUS_COL = 'taxo_genus'
# Later assignment: the keys above plus the two taxonomy columns.
SELECTED_KEYS = ['gene_id', 'gene_length', 'kegg_ko', PHYLUM_COL, GENUS_COL]
def parse_gene(raw_line, selected_keys=SELECTED_KEYS):
......@@ -33,6 +35,30 @@ def parse_gene(raw_line, selected_keys=SELECTED_KEYS):
return selected_dict
def _lookup_tax_id(name, rank):
    """Return the tax_id of the Taxonomy entry matching *name* and *rank*, or None."""
    # Fetch at most two rows: a single query is enough to tell "no match",
    # "exactly one" and "several" apart.
    matches = list(Taxonomy.objects.filter(name=name, rank=rank)[:2])
    if not matches:
        _LOGGER.warning(f"No result found for {rank} {name}.")
        return None
    if len(matches) > 1:
        _LOGGER.warning(f"More than 1 result found for {rank} {name}. First result is kept.")
    return matches[0].tax_id


def select_taxonomy(gene_dict, unknown_val='unknown'):
    """
    Select the taxonomy to be assigned to the gene.

    Genus has priority over phylum. The phylum and genus columns are always
    removed from ``gene_dict``; a ``'taxonomy'`` key holding the matching
    tax_id is added only when one of them resolves to a Taxonomy entry.
    If both are unknown, or no entry matches, no ``'taxonomy'`` key is set.

    :param gene_dict: parsed gene; must contain the phylum and genus columns.
        It is modified in place.
    :param unknown_val: value marking a rank as not assigned.
    :return: the same ``gene_dict`` object.
    :raises KeyError: if the phylum or genus column is missing.
    """
    phylum = gene_dict.pop(PHYLUM_COL)
    genus = gene_dict.pop(GENUS_COL)
    # Most specific rank first; fall through to the next rank when the name
    # is unknown or has no entry in the database.
    for rank, name in (("genus", genus), ("phylum", phylum)):
        if name == unknown_val:
            continue
        tax_id = _lookup_tax_id(name, rank)
        if tax_id is not None:
            gene_dict['taxonomy'] = tax_id
            break
    return gene_dict
def upsert_gene(gene_dict):
try:
gene_obj = Gene.objects.get(gene_id=gene_dict.get('gene_id'))
......@@ -46,8 +72,9 @@ def upsert_gene(gene_dict):
# Parse every raw line of the chunk, resolve its taxonomy and upsert the gene.
# A ValidationError raised by upsert_gene is logged as a warning and that
# gene is skipped.
# NOTE(review): the two upsert_gene calls look like the old and new line of a
# rendered diff — confirm against the real file before editing the code.
def insert_gene_list(chunk_genes):
for gene_line in chunk_genes:
gene_dict = parse_gene(gene_line)
# Called before the try block, so exceptions raised here are not caught below.
gene_dict_with_taxo = select_taxonomy(gene_dict)
try:
upsert_gene(gene_dict)
upsert_gene(gene_dict_with_taxo)
except ValidationError as e:
_LOGGER.warning(f"{e.__dict__} for gene_id: {gene_dict.get('gene_id')}. Insertion skipped.")
......
......@@ -4,7 +4,7 @@ from rest_framework.exceptions import ValidationError
from rest_framework.test import APITestCase
from metagenedb.apps.catalog.models import Gene
from scripts.populate_db.import_igc_data import parse_gene, upsert_gene
from scripts.populate_db.import_igc_data import parse_gene, upsert_gene, select_taxonomy
class TestParseGene(TestCase):
......@@ -35,7 +35,9 @@ class TestParseGene(TestCase):
expected_dict = {
'gene_id': 'gene_name',
'gene_length': 'gene_length',
'kegg_ko': 'kegg'
'kegg_ko': 'kegg',
'taxo_phylum': 'taxo_phylum',
'taxo_genus': 'taxo_genus',
}
tested_dict = parse_gene(self.raw_line)
self.assertDictEqual(tested_dict, expected_dict)
......@@ -96,3 +98,29 @@ class TestUpsertGene(APITestCase):
self.assertEqual(Gene.objects.get(gene_id="test_gene01").gene_length, 3556)
upsert_gene(updated_gene)
self.assertEqual(Gene.objects.get(gene_id="test_gene01").gene_length, 356)
class TestSelectTaxonomy(TestCase):
    """Tests for select_taxonomy."""

    # The three placeholders below are reported as skipped instead of passing
    # silently, so the missing coverage stays visible in the test report.

    def test_genus_only(self):
        self.skipTest("Not implemented yet: @TODO with #31")

    def test_phylum_only(self):
        self.skipTest("Not implemented yet: @TODO with #31")

    def test_genus_phylum(self):
        self.skipTest("Not implemented yet: @TODO with #31")

    def test_both_unknown(self):
        """Both ranks unknown: taxonomy columns are removed, no 'taxonomy' key is added."""
        gene_dict = {
            'gene_id': 'gene',
            'gene_length': 135,
            'taxo_phylum': 'unknown',
            'taxo_genus': 'unknown',
        }
        expected_dict = {
            'gene_id': 'gene',
            'gene_length': 135,
        }
        tested_dict = select_taxonomy(gene_dict)
        self.assertDictEqual(tested_dict, expected_dict)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment