Commit fd935b95 authored by Kenzo-Hugo Hillion
Browse files

Merge branch '40-import-taxo-genes' into 'master'

Add taxonomy information to IGC genes

Closes #40

See merge request !8
parents d2949072 f5da1cca
Pipeline #13541 passed with stages
in 1 minute and 58 seconds
from .gene import GeneViewSet # noqa
__all__ = ['GeneViewSet']
from .gene import GeneAdmin
from .function import FunctionAdmin, KeggOrthologyAdmin
from .taxonomy import TaxonomyAdmin
__all__ = ['GeneAdmin', 'FunctionAdmin', 'KeggOrthologyAdmin', 'TaxonomyAdmin']
from .gene import GeneAdmin # noqa
from .function import FunctionAdmin, KeggOrthologyAdmin # noqa
from .taxonomy import TaxonomyAdmin # noqa
......@@ -6,9 +6,17 @@ from metagenedb.apps.catalog.models import Gene
@admin.register(Gene)
class GeneAdmin(admin.ModelAdmin):
    """Admin configuration for ``Gene``: list columns and search on ``gene_id``."""

    list_display = ('gene_id', 'gene_length', 'get_functions', 'get_taxonomy')
    search_fields = ('gene_id',)

    def get_functions(self, obj):
        """Return the gene's functions joined by ',' or '-' when it has none."""
        # Evaluate the related queryset once instead of once for the
        # emptiness check and a second time for the join.
        functions = [str(f) for f in obj.functions.all()]
        if functions:
            return ",".join(functions)
        return '-'
    get_functions.short_description = 'Functions'

    def get_taxonomy(self, obj):
        """Return '<taxonomy> (<rank>)' for the gene or '-' when it has no taxonomy."""
        if obj.taxonomy:
            return f"{obj.taxonomy} ({obj.taxonomy.rank})"
        return '-'
    get_taxonomy.short_description = 'Taxonomy'
# Generated by Django 2.2.1 on 2019-08-05 13:45
from django.db import migrations, models
import django.db.models.deletion
class Migration(migrations.Migration):
    # Schema migration for the ``catalog`` app: adds an optional ``taxonomy``
    # foreign key on ``gene`` that points to ``catalog.Taxonomy``.

    # Applied after the migration listed here.
    dependencies = [
        ('catalog', '0005_gene_ordering'),
    ]

    operations = [
        migrations.AddField(
            model_name='gene',
            name='taxonomy',
            # Nullable/blank so genes without taxonomy stay valid; SET_NULL is
            # the on_delete policy; reverse accessor on Taxonomy is ``genes``.
            field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='genes', to='catalog.Taxonomy'),
        ),
    ]
from .function import Function, KeggOrthology
from .gene import Gene
from .taxonomy import Taxonomy
__all__ = ['Function', 'KeggOrthology', 'Gene', 'Taxonomy']
from .function import Function, KeggOrthology # noqa
from .gene import Gene # noqa
from .taxonomy import Taxonomy # noqa
......@@ -7,6 +7,11 @@ class Gene(models.Model):
gene_id = models.CharField(max_length=100, unique=True, db_index=True)
gene_length = models.IntegerField()
functions = models.ManyToManyField(Function)
taxonomy = models.ForeignKey(
'Taxonomy', related_name='genes',
on_delete=models.SET_NULL,
null=True, blank=True
)
def __str__(self):
    """Return the gene's ``gene_id`` as its string representation."""
    return self.gene_id
......
from .function import FunctionSerializer
from .gene import GeneSerializer
from .taxonomy import TaxonomySerializer
__all__ = ['FunctionSerializer', 'GeneSerializer', 'TaxonomySerializer']
from .function import FunctionSerializer # noqa
from .gene import GeneSerializer # noqa
from .taxonomy import TaxonomySerializer # noqa
from rest_framework import serializers
from metagenedb.apps.catalog.models import Gene
from metagenedb.apps.catalog.models import Gene, Taxonomy
from metagenedb.apps.catalog.serializers import FunctionSerializer
class GeneSerializer(serializers.ModelSerializer):
    """
    Serializer for ``Gene``.

    ``functions`` is rendered with the nested ``FunctionSerializer`` and is
    read-only. ``taxonomy`` is optional and is represented by the ``tax_id``
    of an existing ``Taxonomy`` row.
    """

    functions = FunctionSerializer(many=True, read_only=True)
    taxonomy = serializers.SlugRelatedField(
        queryset=Taxonomy.objects.all(),
        slug_field='tax_id',
        required=False,
    )

    class Meta:
        model = Gene
        # Declared once; 'taxonomy' must be listed for the field above to be exposed.
        fields = ('gene_id', 'gene_length', 'functions', 'taxonomy')
from .igc import IGCLineParser # noqa
from .kegg import KEGGLineParser # noqa
from .ncbi_taxonomy import NCBITaxonomyLineParser # noqa
import logging
logging.basicConfig(level=logging.INFO)
_LOGGER = logging.getLogger(__name__)
class IGCLineParser(object):
    """Parsers for tab-separated lines of IGC files."""

    @staticmethod
    def gene(line):
        """
        Parse line from IGC genes list to return organized dict

        IGC annotation columns:
            0: Gene ID                              Unique ID
            1: Gene Name                            Unique name
            2: Gene Length                          Length of nucleotide sequence
            3: Gene Completeness Status             Whether the gene is complete or partial according to the gene predictor
            4: Cohort Origin                        Stating the cohort contributing the representative gene
            5: Taxonomic Annotation(Phylum Level)   Annotated phylum for a gene
            6: Taxonomic Annotation(Genus Level)    Annotated genus for a gene
            7: KEGG Annotation                      Annotated KO(s) for a gene
            8: eggNOG Annotation                    Annotated eggNOG(s) for a gene
            9: Sample Occurence Frequency           Occurrence frequency in samples based on gene profile
            10: Individual Occurence Frequency      Occurrence frequency in individuals based on gene profile
            11: KEGG Functional Categories          KEGG functional category(ies) of the annotated KO(s)
            12: eggNOG Functional Categories        eggNOG functional category(ies) of the annotated eggNOG(s)
            13: Cohort Assembled                    Stating the metagenomic sequencing cohort(s) contributing the
                                                    representative gene or a redundant gene belonging to it

        The values are returned as the raw strings found in the line. Note that
        the 'gene_id' key holds column 1 (Gene Name) and 'igc_id' column 0.

        Any exception raised while parsing (e.g. IndexError when the line has
        fewer than 14 columns) is logged and re-raised.
        """
        try:
            # Strip only the line terminator: a bare rstrip() would also remove
            # trailing tabs and thereby drop empty columns at the end of the line.
            gene_info = line.rstrip('\r\n').split('\t')
            return {
                'igc_id': gene_info[0],
                'gene_id': gene_info[1],
                'gene_length': gene_info[2],
                'gene_completeness_status': gene_info[3],
                'cohort_origin': gene_info[4],
                'taxo_phylum': gene_info[5],
                'taxo_genus': gene_info[6],
                'kegg_ko': gene_info[7],
                'eggnog': gene_info[8],
                'sample_occurence_frequency': gene_info[9],
                'individual_occurence_frequency': gene_info[10],
                'kegg_functional_categories': gene_info[11],
                'eggnog_functional_categories': gene_info[12],
                'cohort_assembled': gene_info[13]
            }
        except Exception:
            _LOGGER.error(f"Could not parse: {line.rstrip()}. Are you sure it comes from IGC genes list?")
            raise
import logging
logging.basicConfig(level=logging.INFO)
_LOGGER = logging.getLogger(__name__)
class KEGGLineParser(object):
    """Parsers for lines of KEGG listing files."""

    @staticmethod
    def ko_list(line):
        """
        Parse line from kegg KO list (http://rest.kegg.jp/list/ko) to return organized dict

        The line is expected to hold '<prefix>:<function id>', a tab, then
        '<name>; <long name>' where the long name may end with '[EC:<number>]'.
        Returns a dict with keys 'function_id', 'name', 'long_name' and
        'ec_number' ('' when no EC number is present).

        Any exception raised while parsing is logged and re-raised.
        """
        try:
            # Drop the line terminator first, otherwise it ends up in the long
            # name and prevents the closing ']' of the EC number from being stripped.
            elements = line.rstrip('\r\n').split('\t')
            function_id = elements[0].split(':')[1]
            if ';' in elements[1]:
                # Split on the first ';' only so a ';' inside the long name is kept.
                names = elements[1].split(';', 1)
            else:
                _LOGGER.warning(f"Parsing issue with {function_id}, corresponding line: {line}")
                names = [elements[1], '']  # Ugly fix to handle one specific case with no name: K23479
            if '[EC:' in names[1]:
                ec_number = names[1].split('[EC:')[1].rstrip(']')
            else:
                ec_number = ''
            return {
                'function_id': function_id,
                'name': names[0],
                'long_name': names[1].lstrip(),
                'ec_number': ec_number
            }
        except Exception:
            _LOGGER.error(f"Could not parse: {line.rstrip()}. Are you sure it comes from KEGG KO list?")
            raise
......@@ -4,36 +4,6 @@ logging.basicConfig(level=logging.INFO)
_LOGGER = logging.getLogger(__name__)
class KEGGLineParser(object):
    """Parsers for lines of KEGG listing files."""

    @staticmethod
    def ko_list(line):
        """
        Parse line from kegg KO list (http://rest.kegg.jp/list/ko) to return organized dict

        The line is expected to hold '<prefix>:<function id>', a tab, then
        '<name>; <long name>' where the long name may end with '[EC:<number>]'.
        Returns a dict with keys 'function_id', 'name', 'long_name' and
        'ec_number' ('' when no EC number is present).

        Any exception raised while parsing is logged and re-raised.
        """
        try:
            # Drop the line terminator first, otherwise it ends up in the long
            # name and prevents the closing ']' of the EC number from being stripped.
            elements = line.rstrip('\r\n').split('\t')
            function_id = elements[0].split(':')[1]
            if ';' in elements[1]:
                # Split on the first ';' only so a ';' inside the long name is kept.
                names = elements[1].split(';', 1)
            else:
                _LOGGER.warning(f"Parsing issue with {function_id}, corresponding line: {line}")
                names = [elements[1], '']  # Ugly fix to handle one specific case with no name: K23479
            if '[EC:' in names[1]:
                ec_number = names[1].split('[EC:')[1].rstrip(']')
            else:
                ec_number = ''
            return {
                'function_id': function_id,
                'name': names[0],
                'long_name': names[1].lstrip(),
                'ec_number': ec_number
            }
        except Exception:
            _LOGGER.error(f"Could not parse: {line.rstrip()}. Are you sure it comes from KEGG KO list?")
            raise
class NCBITaxonomyLineParser(object):
@staticmethod
......
from unittest import TestCase
from metagenedb.common.utils.parsers import IGCLineParser
class TestIGCLineParser(TestCase):
    """Unit tests for ``IGCLineParser.gene``."""

    def test_gene(self):
        """Each of the 14 tab-separated columns ends up under its expected key."""
        columns = [
            'gene_id',
            'gene_name',
            'gene_length',
            'gene_completeness_status',
            'cohort_origin',
            'taxo_phylum',
            'taxo_genus',
            'kegg',
            'eggnog',
            'sample_occurence_freq',
            'ind_occurence_freq',
            'kegg_functional_cat',
            'eggnog_functional_cat',
            'cohort_assembled'
        ]
        # Output keys, listed in the same order as the input columns.
        keys = (
            'igc_id',
            'gene_id',
            'gene_length',
            'gene_completeness_status',
            'cohort_origin',
            'taxo_phylum',
            'taxo_genus',
            'kegg_ko',
            'eggnog',
            'sample_occurence_frequency',
            'individual_occurence_frequency',
            'kegg_functional_categories',
            'eggnog_functional_categories',
            'cohort_assembled',
        )
        expected = dict(zip(keys, columns))
        parsed = IGCLineParser.gene("\t".join(columns))
        self.assertDictEqual(parsed, expected)

    def test_gene_wrong_format(self):
        """A line that does not provide the expected columns makes the parser raise."""
        malformed = "This is a wrong line format, with; information and tab"
        with self.assertRaises(Exception):
            IGCLineParser.gene(malformed)
from unittest import TestCase
from metagenedb.common.utils.parsers import KEGGLineParser
class TestKEGGLineParser(TestCase):
    """Unit tests for ``KEGGLineParser.ko_list``."""

    def test_ko_list(self):
        """A regular KO line is split into function ID, names and EC number."""
        # The parser splits on a tab between the ID and the description; the
        # tab is written as an explicit escape so it cannot be lost or turned
        # into a space by an editor.
        ko_line = "ko:K00809\tDHPS, dys; deoxyhypusine synthase [EC:2.5.1.46]"
        expected_dict = {
            'function_id': "K00809",
            'name': "DHPS, dys",
            'long_name': "deoxyhypusine synthase [EC:2.5.1.46]",
            'ec_number': "2.5.1.46"
        }
        test_dict = KEGGLineParser.ko_list(ko_line)
        self.assertDictEqual(test_dict, expected_dict)

    def test_ko_list_wrong_format(self):
        """A line that is not in KO list format makes the parser raise."""
        ko_line = "This is a wrong line format, with; information and tab"
        with self.assertRaises(Exception):
            KEGGLineParser.ko_list(ko_line)
from unittest import TestCase
from metagenedb.common.utils.parsers import KEGGLineParser, NCBITaxonomyLineParser
class TestKEGGLineParser(TestCase):
    """Unit tests for ``KEGGLineParser.ko_list``."""

    def test_ko_list(self):
        """A regular KO line is split into function ID, names and EC number."""
        # The parser splits on a tab between the ID and the description; the
        # tab is written as an explicit escape so it cannot be lost or turned
        # into a space by an editor.
        ko_line = "ko:K00809\tDHPS, dys; deoxyhypusine synthase [EC:2.5.1.46]"
        expected_dict = {
            'function_id': "K00809",
            'name': "DHPS, dys",
            'long_name': "deoxyhypusine synthase [EC:2.5.1.46]",
            'ec_number': "2.5.1.46"
        }
        test_dict = KEGGLineParser.ko_list(ko_line)
        self.assertDictEqual(test_dict, expected_dict)

    def test_ko_list_wrong_format(self):
        """A line that is not in KO list format makes the parser raise."""
        ko_line = "This is a wrong line format, with; information and tab"
        with self.assertRaises(Exception):
            KEGGLineParser.ko_list(ko_line)
from metagenedb.common.utils.parsers import NCBITaxonomyLineParser
class TestNCBITaxonomyLineParser(TestCase):
......
......@@ -8,42 +8,55 @@ from itertools import islice
import django
from rest_framework.exceptions import ValidationError
from metagenedb.common.utils.parsers import IGCLineParser
# Before importing the models, we need to call django.setup() to load the apps
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "metagenedb.settings")
django.setup()
from metagenedb.apps.catalog.models import Gene, Function # noqa
from metagenedb.apps.catalog.models import Gene, Function, Taxonomy # noqa
from metagenedb.apps.catalog.serializers import GeneSerializer # noqa
logging.basicConfig(level=logging.INFO)
_LOGGER = logging.getLogger(__name__)
PHYLUM_COL = 'taxo_phylum'
GENUS_COL = 'taxo_genus'
SELECTED_KEYS = ['gene_id', 'gene_length', 'kegg_ko', PHYLUM_COL, GENUS_COL]
def parse_gene(raw_line, selected_keys=SELECTED_KEYS):
    """
    Parse a raw IGC gene line with IGCLineParser and keep only the entries
    whose key is listed in selected_keys.

    Keys of selected_keys that the parser does not produce are ignored.
    """
    parsed = IGCLineParser().gene(raw_line)
    kept = {}
    for key, value in parsed.items():
        if key in selected_keys:
            kept[key] = value
    return kept
def _find_tax_id(name, rank):
    """Return the tax_id of the Taxonomy entry with this name and rank, or None if there is none."""
    # Fetch at most two rows: enough to tell "no match", "one match" and
    # "several matches" apart with a single query.
    matches = list(Taxonomy.objects.filter(name=name, rank=rank)[:2])
    if not matches:
        _LOGGER.warning(f"No result found for {rank} {name}.")
        return None
    if len(matches) > 1:
        _LOGGER.warning(f"More than 1 result found for {rank} {name}. First result is kept.")
    return matches[0].tax_id


def select_taxonomy(gene_dict, unknown_val='unknown'):
    """
    Select the taxonomy to be assigned for the gene.

    genus has priority on phylum. When the genus is unknown, or no Taxonomy
    entry matches it, the phylum is tried instead. If neither yields an entry,
    no 'taxonomy' key is added.

    The phylum and genus keys are always removed from gene_dict, which is
    modified in place and returned. When a match is found, its tax_id is
    stored under the 'taxonomy' key.
    """
    phylum = gene_dict.pop(PHYLUM_COL)
    genus = gene_dict.pop(GENUS_COL)
    # Most specific rank first.
    for name, rank in ((genus, "genus"), (phylum, "phylum")):
        if name == unknown_val:
            continue
        tax_id = _find_tax_id(name, rank)
        if tax_id is not None:
            gene_dict['taxonomy'] = tax_id
            break
    return gene_dict
def upsert_gene(gene_dict):
......@@ -59,8 +72,9 @@ def upsert_gene(gene_dict):
def insert_gene_list(chunk_genes):
    """
    Parse every raw IGC line of chunk_genes, resolve its taxonomy and upsert
    the resulting gene.

    A gene whose upsert raises ValidationError is logged and skipped so the
    rest of the chunk is still processed.
    """
    for gene_line in chunk_genes:
        gene_dict = parse_gene(gene_line)
        gene_dict_with_taxo = select_taxonomy(gene_dict)
        try:
            # Upsert once, with the taxonomy already resolved.
            upsert_gene(gene_dict_with_taxo)
        except ValidationError as e:
            _LOGGER.warning(f"{e.__dict__} for gene_id: {gene_dict.get('gene_id')}. Insertion skipped.")
......
......@@ -4,12 +4,12 @@ from rest_framework.exceptions import ValidationError
from rest_framework.test import APITestCase
from metagenedb.apps.catalog.models import Gene
from scripts.populate_db.import_igc_data import parse_gene, upsert_gene
from scripts.populate_db.import_igc_data import parse_gene, upsert_gene, select_taxonomy
class TestParseGene(TestCase):
def test_parse_gene(self):
def setUp(self):
raw_data = [
'gene_id',
'gene_name',
......@@ -26,13 +26,44 @@ class TestParseGene(TestCase):
'eggnog_functional_cat',
'cohort_assembled'
]
raw_line = "\t".join(raw_data)
self.raw_line = "\t".join(raw_data)
def test_parse_gene_default_selected_keys(self):
    """
    This test should fail and needs to be updated when SELECTED_KEYS are changed
    """
    expected_dict = {
        'gene_id': 'gene_name',  # We use the gene name for our gene ID
        'gene_length': 'gene_length',
        'kegg_ko': 'kegg',
        'taxo_phylum': 'taxo_phylum',
        'taxo_genus': 'taxo_genus',
    }
    tested_dict = parse_gene(self.raw_line)
    self.assertDictEqual(tested_dict, expected_dict)
def test_parse_gene(self):
    """
    Only the keys passed explicitly through selected_keys should be returned
    """
    selected_keys = ['gene_id', 'gene_length']
    expected_dict = {
        'gene_id': 'gene_name',
        'gene_length': 'gene_length'
    }
    tested_dict = parse_gene(self.raw_line, selected_keys=selected_keys)
    self.assertDictEqual(tested_dict, expected_dict)
def test_parse_gene_unknown_key(self):
    """
    Unknown key should be ignored
    """
    selected_keys = ['gene_id', 'gene_length', 'secret_code']
    expected_dict = {
        'gene_id': 'gene_name',
        'gene_length': 'gene_length'
    }
    # The raw line is built in setUp and exposed as self.raw_line.
    tested_dict = parse_gene(self.raw_line, selected_keys=selected_keys)
    self.assertDictEqual(tested_dict, expected_dict)
......@@ -67,3 +98,29 @@ class TestUpsertGene(APITestCase):
self.assertEqual(Gene.objects.get(gene_id="test_gene01").gene_length, 3556)
upsert_gene(updated_gene)
self.assertEqual(Gene.objects.get(gene_id="test_gene01").gene_length, 356)
class TestSelectTaxonomy(TestCase):
    """Tests for ``select_taxonomy``."""

    # The three cases below are not written yet. They are reported as skipped
    # rather than silently passing, so the gap stays visible in test reports.

    def test_genus_only(self):
        self.skipTest("@TODO with #31")

    def test_phylum_only(self):
        self.skipTest("@TODO with #31")

    def test_genus_phylum(self):
        self.skipTest("@TODO with #31")

    def test_both_unknown(self):
        """With both ranks unknown, the taxonomy columns are dropped and nothing is added."""
        gene_dict = {
            'gene_id': 'gene',
            'gene_length': 135,
            'taxo_phylum': 'unknown',
            'taxo_genus': 'unknown'
        }
        expected_dict = {
            'gene_id': 'gene',
            'gene_length': 135
        }
        tested_dict = select_taxonomy(gene_dict)
        self.assertDictEqual(tested_dict, expected_dict)
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment