From 9756274519762b849c24e2e08123c7bfdd1f03a8 Mon Sep 17 00:00:00 2001 From: Kenzo-Hugo Hillion Date: Mon, 27 Apr 2020 15:45:23 +0200 Subject: [PATCH 01/21] add virgo source for genes --- .../migrations/0025_add_virgo_source.py | 18 ++++++++++++++++++ backend/metagenedb/apps/catalog/models/gene.py | 2 ++ 2 files changed, 20 insertions(+) create mode 100644 backend/metagenedb/apps/catalog/migrations/0025_add_virgo_source.py diff --git a/backend/metagenedb/apps/catalog/migrations/0025_add_virgo_source.py b/backend/metagenedb/apps/catalog/migrations/0025_add_virgo_source.py new file mode 100644 index 0000000..2a7df3d --- /dev/null +++ b/backend/metagenedb/apps/catalog/migrations/0025_add_virgo_source.py @@ -0,0 +1,18 @@ +# Generated by Django 3.0.4 on 2020-04-27 13:32 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('catalog', '0024_set_undef_default_source_gene'), + ] + + operations = [ + migrations.AlterField( + model_name='gene', + name='source', + field=models.CharField(choices=[('undef', 'Undefined'), ('igc', 'IGC'), ('virgo', 'Virgo')], default='undef', max_length=10), + ), + ] diff --git a/backend/metagenedb/apps/catalog/models/gene.py b/backend/metagenedb/apps/catalog/models/gene.py index 50bc01b..aee62eb 100644 --- a/backend/metagenedb/apps/catalog/models/gene.py +++ b/backend/metagenedb/apps/catalog/models/gene.py @@ -6,9 +6,11 @@ from .function import Function class Gene(models.Model): UNDEFINED = 'undef' IGC = 'igc' + VIRGO = 'virgo' SOURCE_CHOICES = [ (UNDEFINED, 'Undefined'), (IGC, 'IGC'), + (VIRGO, 'Virgo'), ] gene_id = models.SlugField(max_length=100, db_index=True, unique=True) -- GitLab From 519dd6a91dac5f829eeeab3507a8bb340450a58d Mon Sep 17 00:00:00 2001 From: Kenzo-Hugo Hillion Date: Mon, 27 Apr 2020 18:57:04 +0200 Subject: [PATCH 02/21] add virgo parser for KEGG --- .../common/utils/parsers/__init__.py | 1 + .../common/utils/parsers/test_virgo.py | 28 ++++++++++++++++++ 
.../metagenedb/common/utils/parsers/virgo.py | 29 +++++++++++++++++++ 3 files changed, 58 insertions(+) create mode 100644 backend/metagenedb/common/utils/parsers/test_virgo.py create mode 100644 backend/metagenedb/common/utils/parsers/virgo.py diff --git a/backend/metagenedb/common/utils/parsers/__init__.py b/backend/metagenedb/common/utils/parsers/__init__.py index d6aa345..b63a400 100644 --- a/backend/metagenedb/common/utils/parsers/__init__.py +++ b/backend/metagenedb/common/utils/parsers/__init__.py @@ -2,3 +2,4 @@ from .eggnog import EggNOGAnnotationLineParser # noqa from .igc import IGCLineParser # noqa from .kegg import KEGGLineParser # noqa from .ncbi_taxonomy import NCBITaxonomyLineParser # noqa +from .virgo import VirgoKEGGLineParser # noqa diff --git a/backend/metagenedb/common/utils/parsers/test_virgo.py b/backend/metagenedb/common/utils/parsers/test_virgo.py new file mode 100644 index 0000000..fc23c2e --- /dev/null +++ b/backend/metagenedb/common/utils/parsers/test_virgo.py @@ -0,0 +1,28 @@ +from unittest import TestCase + +from metagenedb.common.utils.parsers import VirgoKEGGLineParser + + +class TestVirgoKEGGLineParser(TestCase): + + def test_gene(self): + raw_data = [ + 'gene_id', + 'kegg_ko', + 'kegg_gene', + 'more_information', + ] + raw_line = "\t".join(raw_data) + expected_dict = { + 'virgo_id': raw_data[0], + 'kegg_ko': raw_data[1], + 'kegg_gene': raw_data[2], + 'more_info': raw_data[3], + } + test_dict = VirgoKEGGLineParser.gene(raw_line) + self.assertDictEqual(test_dict, expected_dict) + + def test_gene_wrong_format(self): + raw_line = "This is a wrong line format, with; information and tab" + with self.assertRaises(Exception) as context: # noqa + VirgoKEGGLineParser.gene(raw_line) diff --git a/backend/metagenedb/common/utils/parsers/virgo.py b/backend/metagenedb/common/utils/parsers/virgo.py new file mode 100644 index 0000000..04bb936 --- /dev/null +++ b/backend/metagenedb/common/utils/parsers/virgo.py @@ -0,0 +1,29 @@ +import logging + 
+_LOGGER = logging.getLogger(__name__) + + +class VirgoKEGGLineParser(object): + + @staticmethod + def gene(line): + """ + Parse line from Virgo KEGG annotations to return organized dict (8.A.kegg.ortholog.txt) + + IGC annotation columns: + 0: Gene ID Unique ID + 1: KEGG KO Annotation Annotated KO(s) for a gene + 2: KEGG Gene + 2: More information Information separated by ; + """ + try: + gene_info = line.rstrip().split('\t') + return { + 'virgo_id': gene_info[0], + 'kegg_ko': gene_info[1], + 'kegg_gene': gene_info[2], + 'more_info': gene_info[3], + } + except Exception: + _LOGGER.error(f"Could not parse: {line.rstrip()}. Are you sure it comes from Virgo KEGG annotation file?") + raise -- GitLab From 28028563225344d64139d87feecc7596699e1d46 Mon Sep 17 00:00:00 2001 From: Kenzo-Hugo Hillion Date: Tue, 28 Apr 2020 11:40:44 +0200 Subject: [PATCH 03/21] add parser for virgo gene length --- .../common/utils/parsers/__init__.py | 2 +- .../common/utils/parsers/test_virgo.py | 27 +++++++++++++++++-- .../metagenedb/common/utils/parsers/virgo.py | 24 ++++++++++++++++- 3 files changed, 49 insertions(+), 4 deletions(-) diff --git a/backend/metagenedb/common/utils/parsers/__init__.py b/backend/metagenedb/common/utils/parsers/__init__.py index b63a400..6e718d0 100644 --- a/backend/metagenedb/common/utils/parsers/__init__.py +++ b/backend/metagenedb/common/utils/parsers/__init__.py @@ -2,4 +2,4 @@ from .eggnog import EggNOGAnnotationLineParser # noqa from .igc import IGCLineParser # noqa from .kegg import KEGGLineParser # noqa from .ncbi_taxonomy import NCBITaxonomyLineParser # noqa -from .virgo import VirgoKEGGLineParser # noqa +from .virgo import VirgoGeneLengthLineParser, VirgoKEGGLineParser # noqa diff --git a/backend/metagenedb/common/utils/parsers/test_virgo.py b/backend/metagenedb/common/utils/parsers/test_virgo.py index fc23c2e..bde7cb9 100644 --- a/backend/metagenedb/common/utils/parsers/test_virgo.py +++ b/backend/metagenedb/common/utils/parsers/test_virgo.py @@ -1,6 
+1,29 @@ from unittest import TestCase -from metagenedb.common.utils.parsers import VirgoKEGGLineParser +from metagenedb.common.utils.parsers import ( + VirgoGeneLengthLineParser, VirgoKEGGLineParser +) + + +class TestVirgoGeneLengthLineParser(TestCase): + + def test_gene(self): + raw_data = [ + 'gene_id', + 'length', + ] + raw_line = "\t".join(raw_data) + expected_dict = { + 'gene_id': raw_data[0], + 'length': raw_data[1], + } + test_dict = VirgoGeneLengthLineParser.gene(raw_line) + self.assertDictEqual(test_dict, expected_dict) + + def test_gene_wrong_format(self): + raw_line = "This is a wrong line format, with; information and tab" + with self.assertRaises(Exception) as context: # noqa + VirgoGeneLengthLineParser.gene(raw_line) class TestVirgoKEGGLineParser(TestCase): @@ -14,7 +37,7 @@ class TestVirgoKEGGLineParser(TestCase): ] raw_line = "\t".join(raw_data) expected_dict = { - 'virgo_id': raw_data[0], + 'gene_id': raw_data[0], 'kegg_ko': raw_data[1], 'kegg_gene': raw_data[2], 'more_info': raw_data[3], diff --git a/backend/metagenedb/common/utils/parsers/virgo.py b/backend/metagenedb/common/utils/parsers/virgo.py index 04bb936..44cfbcc 100644 --- a/backend/metagenedb/common/utils/parsers/virgo.py +++ b/backend/metagenedb/common/utils/parsers/virgo.py @@ -3,6 +3,28 @@ import logging _LOGGER = logging.getLogger(__name__) +class VirgoGeneLengthLineParser(object): + + @staticmethod + def gene(line): + """ + Parse line from Virgo KEGG annotations to return organized dict (8.A.kegg.ortholog.txt) + + IGC annotation columns: + 0: Gene ID Unique ID + 1: Gene Length Length of nucleotide sequence + """ + try: + gene_info = line.rstrip().split('\t') + return { + 'gene_id': gene_info[0], + 'length': gene_info[1], + } + except Exception: + _LOGGER.error(f"Could not parse: {line.rstrip()}. 
Are you sure it comes from Virgo KEGG annotation file?") + raise + + class VirgoKEGGLineParser(object): @staticmethod @@ -19,7 +41,7 @@ class VirgoKEGGLineParser(object): try: gene_info = line.rstrip().split('\t') return { - 'virgo_id': gene_info[0], + 'gene_id': gene_info[0], 'kegg_ko': gene_info[1], 'kegg_gene': gene_info[2], 'more_info': gene_info[3], -- GitLab From d86cb8e1a36decb7de7038a59960a7071f1a1301 Mon Sep 17 00:00:00 2001 From: Kenzo-Hugo Hillion Date: Tue, 28 Apr 2020 18:23:25 +0200 Subject: [PATCH 04/21] move tests and update virgo gene creation --- .../management/commands/commons/__init__.py | 0 .../management/commands/import_virgo_genes.py | 114 ++++++++++++++++++ .../{ => tests}/test_build_hierarchy.py | 2 +- .../tests/test_files/virgo_gene_length.tsv | 2 + .../{ => tests}/test_import_igc_annotation.py | 0 .../{ => tests}/test_import_igc_sequences.py | 0 .../commands/tests/test_import_virgo_genes.py | 110 +++++++++++++++++ 7 files changed, 227 insertions(+), 1 deletion(-) create mode 100644 backend/metagenedb/apps/catalog/management/commands/commons/__init__.py create mode 100644 backend/metagenedb/apps/catalog/management/commands/import_virgo_genes.py rename backend/metagenedb/apps/catalog/management/commands/{ => tests}/test_build_hierarchy.py (94%) create mode 100644 backend/metagenedb/apps/catalog/management/commands/tests/test_files/virgo_gene_length.tsv rename backend/metagenedb/apps/catalog/management/commands/{ => tests}/test_import_igc_annotation.py (100%) rename backend/metagenedb/apps/catalog/management/commands/{ => tests}/test_import_igc_sequences.py (100%) create mode 100644 backend/metagenedb/apps/catalog/management/commands/tests/test_import_virgo_genes.py diff --git a/backend/metagenedb/apps/catalog/management/commands/commons/__init__.py b/backend/metagenedb/apps/catalog/management/commands/commons/__init__.py new file mode 100644 index 0000000..e69de29 diff --git 
a/backend/metagenedb/apps/catalog/management/commands/import_virgo_genes.py b/backend/metagenedb/apps/catalog/management/commands/import_virgo_genes.py new file mode 100644 index 0000000..12b277b --- /dev/null +++ b/backend/metagenedb/apps/catalog/management/commands/import_virgo_genes.py @@ -0,0 +1,114 @@ +import logging +from itertools import islice + +from django.core.management.base import BaseCommand +from slugify import slugify + +from metagenedb.apps.catalog.models import Gene +from metagenedb.common.utils.chunks import file_len +from metagenedb.common.utils.parsers import VirgoGeneLengthLineParser + +logging.basicConfig(format='[%(asctime)s] %(levelname)s:%(name)s:%(message)s') +logger = logging.getLogger(__name__) + + +class ImportVirgoGenes(object): + + SELECTED_KEYS = ['gene_id', 'length'] + SOURCE = "virgo" + + def __init__(self, annotation_file): + self.annotation_file = annotation_file + self.total_genes = file_len(annotation_file) + self._reset_counters() + + def _reset_counters(self): + self.processed_genes = 0 + self.created_genes = 0 + self.updated_genes = 0 + self.skipped_genes = 0 + + def _parse_gene(self, raw_line, selected_keys=SELECTED_KEYS): + """ + Use VirgoGeneLengthLineParser + """ + gene_parser = VirgoGeneLengthLineParser() + all_dict = gene_parser.gene(raw_line) + selected_dict = {k: v for k, v in all_dict.items() if k in selected_keys} + return selected_dict + + def _format_for_model(self, igc_dict): + gene_dict = {} + gene_dict['gene_id'] = slugify(igc_dict['gene_id']) + gene_dict['name'] = igc_dict['gene_id'] + gene_dict['length'] = igc_dict['length'] + gene_dict['source'] = self.SOURCE + return gene_dict + + def _update_genes(self, gene_instances, gene_dict): + for gene_id, gene_instance in gene_instances.items(): + for key, value in gene_dict[gene_id].items(): + setattr(gene_instance, key, value) + try: + Gene.objects.bulk_update( + list(gene_instances.values()), + ['name', 'length', 'source'] + ) + self.updated_genes += 
len(gene_instances.keys()) + except Exception as exception: + logger.warning(exception) + self.skipped_genes += len(gene_instances.keys()) + + def _create_genes(self, gene_list): + try: + Gene.objects.bulk_create( + [Gene(**item) for item in gene_list] + ) + self.created_genes += len(gene_list) + except Exception as exception: + logger.warning(exception) + self.skipped_genes += len(gene_list) + + def create_or_update_genes(self, gene_dict): + update_instances = Gene.objects.in_bulk(gene_dict.keys(), field_name='gene_id') + self._update_genes(update_instances, gene_dict) + gene_ids_to_create = set(gene_dict.keys()) - set(update_instances.keys()) + if gene_ids_to_create: + self._create_genes([gene_dict[gene_id] for gene_id in gene_ids_to_create]) + + def load_all(self, test=False, chunk_size=10000): + logger.info("Starting Virgo KEGG annotations import (creation or update) to DB") + with open(self.annotation_file, 'r') as file: + while True: + chunk_genes = list(islice(file, chunk_size)) + if not chunk_genes: + break + virgo_dict_list = [self._parse_gene(i) for i in chunk_genes] + virgo_clean_dict = {slugify(i['gene_id']): self._format_for_model(i) for i in virgo_dict_list} + self.processed_genes += chunk_size + self.create_or_update_genes(virgo_clean_dict) + logger.info("%s Genes processed so far...", self.processed_genes) + if test is True: + break + logger.info("[DONE] %s/%s Genes created.", self.created_genes, self.total_genes) + logger.info("[DONE] %s/%s Genes updated.", self.updated_genes, self.total_genes) + logger.info("[DONE] %s/%s Genes skipped.", self.skipped_genes, self.total_genes) + + +class Command(BaseCommand): + help = 'Create or update all EggNOG entries from annotations.tsv file.' 
+ + def add_arguments(self, parser): + parser.add_argument('annotation', help='8.A.kegg.ortholog.txt file from Virgo') + parser.add_argument('--test', action='store_true', help='Run only on first 10000 entries.') + + def set_logger_level(self, verbosity): + if verbosity > 2: + logger.setLevel(logging.DEBUG) + elif verbosity > 1: + logger.setLevel(logging.INFO) + + def handle(self, *args, **options): + self.set_logger_level(int(options['verbosity'])) + import_igc = ImportVirgoGenes(options['annotation']) + import_igc.load_all(test=options['test']) diff --git a/backend/metagenedb/apps/catalog/management/commands/test_build_hierarchy.py b/backend/metagenedb/apps/catalog/management/commands/tests/test_build_hierarchy.py similarity index 94% rename from backend/metagenedb/apps/catalog/management/commands/test_build_hierarchy.py rename to backend/metagenedb/apps/catalog/management/commands/tests/test_build_hierarchy.py index 68a33cf..81d76ce 100644 --- a/backend/metagenedb/apps/catalog/management/commands/test_build_hierarchy.py +++ b/backend/metagenedb/apps/catalog/management/commands/tests/test_build_hierarchy.py @@ -3,7 +3,7 @@ from rest_framework.test import APITestCase from metagenedb.apps.catalog.factory import TaxonomyFactory from metagenedb.apps.catalog.models import Taxonomy -from .build_hierarchy import HierarchyBuilder +from metagenedb.apps.catalog.management.commands.build_hierarchy import HierarchyBuilder class TestBuildHierarchy(APITestCase): diff --git a/backend/metagenedb/apps/catalog/management/commands/tests/test_files/virgo_gene_length.tsv b/backend/metagenedb/apps/catalog/management/commands/tests/test_files/virgo_gene_length.tsv new file mode 100644 index 0000000..6cf8383 --- /dev/null +++ b/backend/metagenedb/apps/catalog/management/commands/tests/test_files/virgo_gene_length.tsv @@ -0,0 +1,2 @@ +V1 101 +V2 102 diff --git a/backend/metagenedb/apps/catalog/management/commands/test_import_igc_annotation.py 
b/backend/metagenedb/apps/catalog/management/commands/tests/test_import_igc_annotation.py similarity index 100% rename from backend/metagenedb/apps/catalog/management/commands/test_import_igc_annotation.py rename to backend/metagenedb/apps/catalog/management/commands/tests/test_import_igc_annotation.py diff --git a/backend/metagenedb/apps/catalog/management/commands/test_import_igc_sequences.py b/backend/metagenedb/apps/catalog/management/commands/tests/test_import_igc_sequences.py similarity index 100% rename from backend/metagenedb/apps/catalog/management/commands/test_import_igc_sequences.py rename to backend/metagenedb/apps/catalog/management/commands/tests/test_import_igc_sequences.py diff --git a/backend/metagenedb/apps/catalog/management/commands/tests/test_import_virgo_genes.py b/backend/metagenedb/apps/catalog/management/commands/tests/test_import_virgo_genes.py new file mode 100644 index 0000000..44f5aa0 --- /dev/null +++ b/backend/metagenedb/apps/catalog/management/commands/tests/test_import_virgo_genes.py @@ -0,0 +1,110 @@ +import os +from unittest import TestCase + +import mock +from rest_framework.test import APITestCase + +from metagenedb.apps.catalog.models import Gene +from metagenedb.apps.catalog.management.commands.import_virgo_genes import ImportVirgoGenes +from metagenedb.apps.catalog.factory import ( + GeneFactory, +) + + +class BaseTestImportVirgoGenes(TestCase): + + def setUp(self): + function_to_mock = 'metagenedb.apps.catalog.management.commands.import_virgo_genes.file_len' + with mock.patch(function_to_mock) as MockFileLen: + MockFileLen.return_value = 10 + self.import_virgo_genes = ImportVirgoGenes('test') + + +class TestParseGene(BaseTestImportVirgoGenes): + + def setUp(self): + raw_data = [ + 'gene_ID', + 'length', + ] + self.raw_line = "\t".join(raw_data) + super().setUp() + + def test_parse_gene_default_selected_keys(self): + """ + This test should failed and need to be updated when SELECTED_KEYS are changed + """ + expected_dict = { 
+ 'gene_id': 'gene_ID', + 'length': 'length', + } + tested_dict = self.import_virgo_genes._parse_gene(self.raw_line) + self.assertDictEqual(tested_dict, expected_dict) + + def test_parse_gene(self): + """ + This test should failed and need to be updated when SELECTED_KEYS are changed + """ + selected_keys = ['gene_id'] + expected_dict = { + 'gene_id': 'gene_ID', + } + tested_dict = self.import_virgo_genes._parse_gene(self.raw_line, selected_keys=selected_keys) + self.assertDictEqual(tested_dict, expected_dict) + + +class TestCreateOrUpdateGenes(APITestCase, BaseTestImportVirgoGenes): + + @classmethod + def setUpTestData(cls): + cls.gene = GeneFactory() + + def test_create_1_update_1(self): + gene_to_update = { + 'gene_id': self.gene.gene_id, + 'name': 'Updated Gene', + 'length': 2235, + } + gene_to_create = { + 'gene_id': 'gene-create-123', + 'name': 'Created Gene', + 'length': 5629, + } + gene_dict = { + gene_to_update['gene_id']: gene_to_update, + gene_to_create['gene_id']: gene_to_create + } + self.import_virgo_genes.create_or_update_genes(gene_dict) + self.assertEqual(Gene.objects.all().count(), 2) + # Check updated gene + updated_gene = Gene.objects.get(gene_id=gene_to_update['gene_id']) + for key, value in gene_to_update.items(): + self.assertEqual(getattr(updated_gene, key), value) + # Check created gene + created_gene = Gene.objects.get(gene_id=gene_to_create['gene_id']) + for key, value in gene_to_create.items(): + self.assertEqual(getattr(created_gene, key), value) + + +class TestEndToEnd(APITestCase): + + def test_end_to_end(self): + test_file = os.path.join(os.path.dirname(__file__), "./test_files/virgo_gene_length.tsv") + loader = ImportVirgoGenes(test_file) + expected_genes = { + 'v1': { + 'source': 'virgo', + 'length': 101, + 'name': 'V1' + }, + 'v2': { + 'source': 'virgo', + 'length': 102, + 'name': 'V2' + }, + } + loader.load_all() + created_genes = Gene.objects.all().values() + for created_gene in created_genes: + for key in ['source', 'length', 
'name']: + self.assertEqual(created_gene[key], expected_genes[created_gene['gene_id']][key]) -- GitLab From 0d566f27276930034ae70d6dfa19155025bd056e Mon Sep 17 00:00:00 2001 From: Kenzo-Hugo Hillion Date: Wed, 29 Apr 2020 13:44:42 +0200 Subject: [PATCH 05/21] start refactoring gene import as command --- .../commands/commons/import_genes.py | 98 +++++++++++++++++++ .../commands/commons/test_import_genes.py | 98 +++++++++++++++++++ .../management/commands/import_virgo_genes.py | 89 ++--------------- .../commands/tests/test_import_virgo_genes.py | 80 --------------- 4 files changed, 202 insertions(+), 163 deletions(-) create mode 100644 backend/metagenedb/apps/catalog/management/commands/commons/import_genes.py create mode 100644 backend/metagenedb/apps/catalog/management/commands/commons/test_import_genes.py diff --git a/backend/metagenedb/apps/catalog/management/commands/commons/import_genes.py b/backend/metagenedb/apps/catalog/management/commands/commons/import_genes.py new file mode 100644 index 0000000..2d25d3f --- /dev/null +++ b/backend/metagenedb/apps/catalog/management/commands/commons/import_genes.py @@ -0,0 +1,98 @@ +import logging +from itertools import islice + +from slugify import slugify + +from metagenedb.apps.catalog.models import Gene +from metagenedb.common.utils.chunks import file_len + +logging.basicConfig(format='[%(asctime)s] %(levelname)s:%(name)s:%(message)s') +logger = logging.getLogger(__name__) + + +class BaseImportGenes(object): + IMPORT_TYPE = "gene" # For logs + SELECTED_KEYS = ['gene_id', 'length'] + UPDATED_FIELDS = ['length', 'name', 'source'] + SOURCE = 'unknown' + PARSER = None + + def __init__(self, annotation_file): + self.annotation_file = annotation_file + self.total_genes = file_len(annotation_file) + self._reset_counters() + + def _reset_counters(self): + self.processed_genes = 0 + self.created_genes = 0 + self.updated_genes = 0 + self.skipped_genes = 0 + + def _parse_gene(self, raw_line, selected_keys=SELECTED_KEYS): + 
gene_parser = self.PARSER + all_dict = gene_parser.gene(raw_line) + selected_dict = {k: v for k, v in all_dict.items() if k in selected_keys} + return selected_dict + + def _format_for_model(self, igc_dict): + gene_dict = {} + gene_dict['gene_id'] = slugify(igc_dict['gene_id']) + gene_dict['name'] = igc_dict['gene_id'] + gene_dict['length'] = igc_dict['length'] + gene_dict['source'] = self.SOURCE + return gene_dict + + def _update_genes(self, gene_instances, gene_dict): + for gene_id, gene_instance in gene_instances.items(): + for key, value in gene_dict[gene_id].items(): + setattr(gene_instance, key, value) + try: + Gene.objects.bulk_update( + list(gene_instances.values()), + self.UPDATED_FIELDS + ) + self.updated_genes += len(gene_instances.keys()) + except Exception as exception: + logger.warning(exception) + self.skipped_genes += len(gene_instances.keys()) + + def _create_genes(self, gene_list): + try: + Gene.objects.bulk_create( + [Gene(**item) for item in gene_list] + ) + self.created_genes += len(gene_list) + except Exception as exception: + logger.warning(exception) + self.skipped_genes += len(gene_list) + + def create_or_update_genes(self, gene_dict): + update_instances = Gene.objects.in_bulk(gene_dict.keys(), field_name='gene_id') + self._update_genes(update_instances, gene_dict) + gene_ids_to_create = set(gene_dict.keys()) - set(update_instances.keys()) + if gene_ids_to_create: + self._create_genes([gene_dict[gene_id] for gene_id in gene_ids_to_create]) + + def _handle_chunk(self, chunk_genes): + """ + Overide for all different sources + """ + gene_dict_list = [self._parse_gene(i) for i in chunk_genes] + gene_clean_dict = {slugify(i['gene_id']): self._format_for_model(i) for i in gene_dict_list} + self.create_or_update_genes(gene_clean_dict) + + def load_all(self, test=False, chunk_size=10000): + logger.info("Starting %s import (creation or update) to DB", self.IMPORT_TYPE) + with open(self.annotation_file, 'r') as file: + while True: + chunk_genes = 
list(islice(file, chunk_size)) + self.processed_genes += chunk_size + logger.info("%s Genes processed so far...", self.processed_genes) + if not chunk_genes: + break + self._handle_chunk(chunk_genes) + if test is True: + break + logger.info("[DONE] %s/%s Genes created.", self.created_genes, self.total_genes) + logger.info("[DONE] %s/%s Genes updated.", self.updated_genes, self.total_genes) + logger.info("[DONE] %s/%s Genes skipped.", self.skipped_genes, self.total_genes) diff --git a/backend/metagenedb/apps/catalog/management/commands/commons/test_import_genes.py b/backend/metagenedb/apps/catalog/management/commands/commons/test_import_genes.py new file mode 100644 index 0000000..f9f0049 --- /dev/null +++ b/backend/metagenedb/apps/catalog/management/commands/commons/test_import_genes.py @@ -0,0 +1,98 @@ +from unittest import TestCase + +import mock +from rest_framework.test import APITestCase + +from metagenedb.apps.catalog.models import Gene +from metagenedb.apps.catalog.management.commands.commons.import_genes import BaseImportGenes +from metagenedb.apps.catalog.factory import ( + GeneFactory, +) + + +class ParserTest: + """Simple parser for test purposes""" + + @staticmethod + def gene(line): + gene_info = line.rstrip().split('\t') + return { + 'gene_id': gene_info[0], + 'length': gene_info[1], + } + + +class BaseTestImportGenes(TestCase): + + def setUp(self): + function_to_mock = 'metagenedb.apps.catalog.management.commands.commons.import_genes.file_len' + with mock.patch(function_to_mock) as MockFileLen: + MockFileLen.return_value = 10 + self.import_genes = BaseImportGenes('test') + self.import_genes.PARSER = ParserTest + + +class TestParseGene(BaseTestImportGenes): + + def setUp(self): + raw_data = [ + 'gene_ID', + 'length', + ] + self.raw_line = "\t".join(raw_data) + super().setUp() + + def test_parse_gene_default_selected_keys(self): + """ + This test should failed and need to be updated when SELECTED_KEYS are changed + """ + expected_dict = { + 'gene_id': 
'gene_ID', + 'length': 'length', + } + tested_dict = self.import_genes._parse_gene(self.raw_line) + self.assertDictEqual(tested_dict, expected_dict) + + def test_parse_gene(self): + """ + This test should failed and need to be updated when SELECTED_KEYS are changed + """ + selected_keys = ['gene_id'] + expected_dict = { + 'gene_id': 'gene_ID', + } + tested_dict = self.import_genes._parse_gene(self.raw_line, selected_keys=selected_keys) + self.assertDictEqual(tested_dict, expected_dict) + + +class TestCreateOrUpdateGenes(APITestCase, BaseTestImportGenes): + + @classmethod + def setUpTestData(cls): + cls.gene = GeneFactory() + + def test_create_1_update_1(self): + gene_to_update = { + 'gene_id': self.gene.gene_id, + 'name': 'Updated Gene', + 'length': 2235, + } + gene_to_create = { + 'gene_id': 'gene-create-123', + 'name': 'Created Gene', + 'length': 5629, + } + gene_dict = { + gene_to_update['gene_id']: gene_to_update, + gene_to_create['gene_id']: gene_to_create + } + self.import_genes.create_or_update_genes(gene_dict) + self.assertEqual(Gene.objects.all().count(), 2) + # Check updated gene + updated_gene = Gene.objects.get(gene_id=gene_to_update['gene_id']) + for key, value in gene_to_update.items(): + self.assertEqual(getattr(updated_gene, key), value) + # Check created gene + created_gene = Gene.objects.get(gene_id=gene_to_create['gene_id']) + for key, value in gene_to_create.items(): + self.assertEqual(getattr(created_gene, key), value) diff --git a/backend/metagenedb/apps/catalog/management/commands/import_virgo_genes.py b/backend/metagenedb/apps/catalog/management/commands/import_virgo_genes.py index 12b277b..8636f44 100644 --- a/backend/metagenedb/apps/catalog/management/commands/import_virgo_genes.py +++ b/backend/metagenedb/apps/catalog/management/commands/import_virgo_genes.py @@ -1,98 +1,21 @@ import logging -from itertools import islice from django.core.management.base import BaseCommand -from slugify import slugify -from metagenedb.apps.catalog.models 
import Gene -from metagenedb.common.utils.chunks import file_len +from metagenedb.apps.catalog.management.commands.commons.import_genes import BaseImportGenes from metagenedb.common.utils.parsers import VirgoGeneLengthLineParser logging.basicConfig(format='[%(asctime)s] %(levelname)s:%(name)s:%(message)s') logger = logging.getLogger(__name__) -class ImportVirgoGenes(object): +class ImportVirgoGenes(BaseImportGenes): + IMPORT_TYPE = "Virgo gene length" # For logs SELECTED_KEYS = ['gene_id', 'length'] - SOURCE = "virgo" - - def __init__(self, annotation_file): - self.annotation_file = annotation_file - self.total_genes = file_len(annotation_file) - self._reset_counters() - - def _reset_counters(self): - self.processed_genes = 0 - self.created_genes = 0 - self.updated_genes = 0 - self.skipped_genes = 0 - - def _parse_gene(self, raw_line, selected_keys=SELECTED_KEYS): - """ - Use VirgoGeneLengthLineParser - """ - gene_parser = VirgoGeneLengthLineParser() - all_dict = gene_parser.gene(raw_line) - selected_dict = {k: v for k, v in all_dict.items() if k in selected_keys} - return selected_dict - - def _format_for_model(self, igc_dict): - gene_dict = {} - gene_dict['gene_id'] = slugify(igc_dict['gene_id']) - gene_dict['name'] = igc_dict['gene_id'] - gene_dict['length'] = igc_dict['length'] - gene_dict['source'] = self.SOURCE - return gene_dict - - def _update_genes(self, gene_instances, gene_dict): - for gene_id, gene_instance in gene_instances.items(): - for key, value in gene_dict[gene_id].items(): - setattr(gene_instance, key, value) - try: - Gene.objects.bulk_update( - list(gene_instances.values()), - ['name', 'length', 'source'] - ) - self.updated_genes += len(gene_instances.keys()) - except Exception as exception: - logger.warning(exception) - self.skipped_genes += len(gene_instances.keys()) - - def _create_genes(self, gene_list): - try: - Gene.objects.bulk_create( - [Gene(**item) for item in gene_list] - ) - self.created_genes += len(gene_list) - except Exception as 
exception: - logger.warning(exception) - self.skipped_genes += len(gene_list) - - def create_or_update_genes(self, gene_dict): - update_instances = Gene.objects.in_bulk(gene_dict.keys(), field_name='gene_id') - self._update_genes(update_instances, gene_dict) - gene_ids_to_create = set(gene_dict.keys()) - set(update_instances.keys()) - if gene_ids_to_create: - self._create_genes([gene_dict[gene_id] for gene_id in gene_ids_to_create]) - - def load_all(self, test=False, chunk_size=10000): - logger.info("Starting Virgo KEGG annotations import (creation or update) to DB") - with open(self.annotation_file, 'r') as file: - while True: - chunk_genes = list(islice(file, chunk_size)) - if not chunk_genes: - break - virgo_dict_list = [self._parse_gene(i) for i in chunk_genes] - virgo_clean_dict = {slugify(i['gene_id']): self._format_for_model(i) for i in virgo_dict_list} - self.processed_genes += chunk_size - self.create_or_update_genes(virgo_clean_dict) - logger.info("%s Genes processed so far...", self.processed_genes) - if test is True: - break - logger.info("[DONE] %s/%s Genes created.", self.created_genes, self.total_genes) - logger.info("[DONE] %s/%s Genes updated.", self.updated_genes, self.total_genes) - logger.info("[DONE] %s/%s Genes skipped.", self.skipped_genes, self.total_genes) + UPDATED_FIELDS = ['length', 'name', 'source'] + SOURCE = 'virgo' + PARSER = VirgoGeneLengthLineParser class Command(BaseCommand): diff --git a/backend/metagenedb/apps/catalog/management/commands/tests/test_import_virgo_genes.py b/backend/metagenedb/apps/catalog/management/commands/tests/test_import_virgo_genes.py index 44f5aa0..6166e00 100644 --- a/backend/metagenedb/apps/catalog/management/commands/tests/test_import_virgo_genes.py +++ b/backend/metagenedb/apps/catalog/management/commands/tests/test_import_virgo_genes.py @@ -1,89 +1,9 @@ import os -from unittest import TestCase -import mock from rest_framework.test import APITestCase from metagenedb.apps.catalog.models import Gene from 
metagenedb.apps.catalog.management.commands.import_virgo_genes import ImportVirgoGenes -from metagenedb.apps.catalog.factory import ( - GeneFactory, -) - - -class BaseTestImportVirgoGenes(TestCase): - - def setUp(self): - function_to_mock = 'metagenedb.apps.catalog.management.commands.import_virgo_genes.file_len' - with mock.patch(function_to_mock) as MockFileLen: - MockFileLen.return_value = 10 - self.import_virgo_genes = ImportVirgoGenes('test') - - -class TestParseGene(BaseTestImportVirgoGenes): - - def setUp(self): - raw_data = [ - 'gene_ID', - 'length', - ] - self.raw_line = "\t".join(raw_data) - super().setUp() - - def test_parse_gene_default_selected_keys(self): - """ - This test should failed and need to be updated when SELECTED_KEYS are changed - """ - expected_dict = { - 'gene_id': 'gene_ID', - 'length': 'length', - } - tested_dict = self.import_virgo_genes._parse_gene(self.raw_line) - self.assertDictEqual(tested_dict, expected_dict) - - def test_parse_gene(self): - """ - This test should failed and need to be updated when SELECTED_KEYS are changed - """ - selected_keys = ['gene_id'] - expected_dict = { - 'gene_id': 'gene_ID', - } - tested_dict = self.import_virgo_genes._parse_gene(self.raw_line, selected_keys=selected_keys) - self.assertDictEqual(tested_dict, expected_dict) - - -class TestCreateOrUpdateGenes(APITestCase, BaseTestImportVirgoGenes): - - @classmethod - def setUpTestData(cls): - cls.gene = GeneFactory() - - def test_create_1_update_1(self): - gene_to_update = { - 'gene_id': self.gene.gene_id, - 'name': 'Updated Gene', - 'length': 2235, - } - gene_to_create = { - 'gene_id': 'gene-create-123', - 'name': 'Created Gene', - 'length': 5629, - } - gene_dict = { - gene_to_update['gene_id']: gene_to_update, - gene_to_create['gene_id']: gene_to_create - } - self.import_virgo_genes.create_or_update_genes(gene_dict) - self.assertEqual(Gene.objects.all().count(), 2) - # Check updated gene - updated_gene = 
Gene.objects.get(gene_id=gene_to_update['gene_id']) - for key, value in gene_to_update.items(): - self.assertEqual(getattr(updated_gene, key), value) - # Check created gene - created_gene = Gene.objects.get(gene_id=gene_to_create['gene_id']) - for key, value in gene_to_create.items(): - self.assertEqual(getattr(created_gene, key), value) class TestEndToEnd(APITestCase): -- GitLab From eaee3b9384447c42e92db5d05d3770682f268299 Mon Sep 17 00:00:00 2001 From: Kenzo-Hugo Hillion Date: Wed, 29 Apr 2020 16:05:15 +0200 Subject: [PATCH 06/21] refactor import for igc and allow generation of simple db for tests --- .../apps/catalog/factory/taxonomy.py | 53 +++++ .../commands/commons/import_genes.py | 11 +- .../commands/commons/test_import_genes.py | 11 - .../commands/import_igc_annotation.py | 93 ++------- .../management/commands/import_virgo_kegg.py | 189 ++++++++++++++++++ .../tests/test_files/igc_annotation.tsv | 2 + .../tests/test_import_igc_annotation.py | 133 ++++-------- 7 files changed, 300 insertions(+), 192 deletions(-) create mode 100644 backend/metagenedb/apps/catalog/management/commands/import_virgo_kegg.py create mode 100644 backend/metagenedb/apps/catalog/management/commands/tests/test_files/igc_annotation.tsv diff --git a/backend/metagenedb/apps/catalog/factory/taxonomy.py b/backend/metagenedb/apps/catalog/factory/taxonomy.py index f57b96c..2504f40 100644 --- a/backend/metagenedb/apps/catalog/factory/taxonomy.py +++ b/backend/metagenedb/apps/catalog/factory/taxonomy.py @@ -17,3 +17,56 @@ class TaxonomyFactory(DjangoModelFactory): rank = fuzzy.FuzzyChoice(SELECTED_RANK) tax_id = FuzzyLowerText(prefix='tax-', length=15) name = fuzzy.FuzzyText(length=20) + + +class DbGenerator: + + def __init__(self): + self.created_ids = set() # store already created IDs to skip them + + def generate_db_from_tree(self, tree): + for rank, desc in tree.items(): + if desc['tax_id'] not in self.created_ids: + TaxonomyFactory.create( + tax_id=desc['tax_id'], + name=desc['name'], + 
rank=rank, + ) + self.created_ids.add(desc['tax_id']) + + +def _generate_lactobacillus_db(db_generator): + """ + Generate db with few ranks corresponding to Lactobacillus genus + """ + tree = { + "class": {"name": "Bacilli", "tax_id": "91061"}, + "genus": {"name": "Lactobacillus", "tax_id": "1578"}, + "order": {"name": "Lactobacillales", "tax_id": "186826"}, + "family": {"name": "Lactobacillaceae", "tax_id": "33958"}, + "phylum": {"name": "Firmicutes", "tax_id": "1239"}, + "no_rank": {"name": "cellular organisms", "tax_id": "131567"}, + "superkingdom": {"name": "Bacteria", "tax_id": "2"}, + "species_group": {"name": "Lactobacillus casei group", "tax_id": "655183"} + } + db_generator.generate_db_from_tree(tree) + + +def _generate_escherichia_db(db_generator): + tree = { + "class": {"name": "Gammaproteobacteria", "tax_id": "1236"}, + "genus": {"name": "Escherichia", "tax_id": "561"}, + "order": {"name": "Enterobacterales", "tax_id": "91347"}, + "family": {"name": "Enterobacteriaceae", "tax_id": "543"}, + "phylum": {"name": "Proteobacteria", "tax_id": "1224"}, + "no_rank": {"name": "cellular organisms", "tax_id": "131567"}, + "species": {"name": "Escherichia coli", "tax_id": "562"}, + "superkingdom": {"name": "Bacteria", "tax_id": "2"} + } + db_generator.generate_db_from_tree(tree) + + +def generate_simple_db(): + db_generator = DbGenerator() + _generate_escherichia_db(db_generator) + _generate_lactobacillus_db(db_generator) diff --git a/backend/metagenedb/apps/catalog/management/commands/commons/import_genes.py b/backend/metagenedb/apps/catalog/management/commands/commons/import_genes.py index 2d25d3f..edfb66a 100644 --- a/backend/metagenedb/apps/catalog/management/commands/commons/import_genes.py +++ b/backend/metagenedb/apps/catalog/management/commands/commons/import_genes.py @@ -14,7 +14,7 @@ class BaseImportGenes(object): IMPORT_TYPE = "gene" # For logs SELECTED_KEYS = ['gene_id', 'length'] UPDATED_FIELDS = ['length', 'name', 'source'] - SOURCE = 'unknown' + 
SOURCE = 'undef' PARSER = None def __init__(self, annotation_file): @@ -28,10 +28,10 @@ class BaseImportGenes(object): self.updated_genes = 0 self.skipped_genes = 0 - def _parse_gene(self, raw_line, selected_keys=SELECTED_KEYS): + def _parse_gene(self, raw_line): gene_parser = self.PARSER all_dict = gene_parser.gene(raw_line) - selected_dict = {k: v for k, v in all_dict.items() if k in selected_keys} + selected_dict = {k: v for k, v in all_dict.items() if k in self.SELECTED_KEYS} return selected_dict def _format_for_model(self, igc_dict): @@ -40,6 +40,7 @@ class BaseImportGenes(object): gene_dict['name'] = igc_dict['gene_id'] gene_dict['length'] = igc_dict['length'] gene_dict['source'] = self.SOURCE + print(gene_dict) return gene_dict def _update_genes(self, gene_instances, gene_dict): @@ -86,11 +87,11 @@ class BaseImportGenes(object): with open(self.annotation_file, 'r') as file: while True: chunk_genes = list(islice(file, chunk_size)) - self.processed_genes += chunk_size - logger.info("%s Genes processed so far...", self.processed_genes) if not chunk_genes: break self._handle_chunk(chunk_genes) + self.processed_genes += chunk_size + logger.info("%s Genes processed so far...", self.processed_genes) if test is True: break logger.info("[DONE] %s/%s Genes created.", self.created_genes, self.total_genes) diff --git a/backend/metagenedb/apps/catalog/management/commands/commons/test_import_genes.py b/backend/metagenedb/apps/catalog/management/commands/commons/test_import_genes.py index f9f0049..751ac12 100644 --- a/backend/metagenedb/apps/catalog/management/commands/commons/test_import_genes.py +++ b/backend/metagenedb/apps/catalog/management/commands/commons/test_import_genes.py @@ -53,17 +53,6 @@ class TestParseGene(BaseTestImportGenes): tested_dict = self.import_genes._parse_gene(self.raw_line) self.assertDictEqual(tested_dict, expected_dict) - def test_parse_gene(self): - """ - This test should failed and need to be updated when SELECTED_KEYS are changed - """ - 
selected_keys = ['gene_id'] - expected_dict = { - 'gene_id': 'gene_ID', - } - tested_dict = self.import_genes._parse_gene(self.raw_line, selected_keys=selected_keys) - self.assertDictEqual(tested_dict, expected_dict) - class TestCreateOrUpdateGenes(APITestCase, BaseTestImportGenes): diff --git a/backend/metagenedb/apps/catalog/management/commands/import_igc_annotation.py b/backend/metagenedb/apps/catalog/management/commands/import_igc_annotation.py index 20c38a7..9fdcc70 100644 --- a/backend/metagenedb/apps/catalog/management/commands/import_igc_annotation.py +++ b/backend/metagenedb/apps/catalog/management/commands/import_igc_annotation.py @@ -1,37 +1,31 @@ import logging -from itertools import islice from django.core.management.base import BaseCommand from slugify import slugify +from metagenedb.apps.catalog.management.commands.commons.import_genes import BaseImportGenes from metagenedb.apps.catalog.models import Function, Gene, GeneFunction, Taxonomy -from metagenedb.common.utils.chunks import file_len from metagenedb.common.utils.parsers import IGCLineParser logging.basicConfig(format='[%(asctime)s] %(levelname)s:%(name)s:%(message)s') logger = logging.getLogger(__name__) -class ImportIGCGenes(object): - +class ImportIGCGenes(BaseImportGenes): PHYLUM_COL = 'taxo_phylum' GENUS_COL = 'taxo_genus' SELECTED_KEYS = ['gene_id', 'length', 'kegg_ko', 'eggnog', PHYLUM_COL, GENUS_COL] + IMPORT_TYPE = "IGC genes" # For logs + UPDATED_FIELDS = ['length', 'name', 'source'] + SOURCE = 'igc' + PARSER = IGCLineParser def __init__(self, annotation_file, skip_tax=False, skip_functions=False): - self.annotation_file = annotation_file - self.total_genes = file_len(annotation_file) - self._reset_counters() + super().__init__(annotation_file) # Skip some insertion if specified in script options self.skip_tax = skip_tax self.skip_functions = skip_functions - def _reset_counters(self): - self.processed_genes = 0 - self.created_genes = 0 - self.updated_genes = 0 - self.skipped_genes = 
0 - def _build_taxo_mapping(self, rank): logger.info("Building local mapping for %s level...", rank) instances = Taxonomy.objects.filter(rank=rank) @@ -66,15 +60,6 @@ class ImportIGCGenes(object): self._kegg_mapping = self._build_function_mapping("kegg") return self._kegg_mapping - def _parse_gene(self, raw_line, selected_keys=SELECTED_KEYS): - """ - Use IGCLineParser and return selected keys - """ - gene_parser = IGCLineParser() - all_dict = gene_parser.gene(raw_line) - selected_dict = {k: v for k, v in all_dict.items() if k in selected_keys} - return selected_dict - def _retrieve_taxonomy(self, genus_name, phylum_name, unknown_val='unknown'): taxonomy_instance = None if genus_name != unknown_val: @@ -93,45 +78,11 @@ class ImportIGCGenes(object): return functions def _format_for_model(self, igc_dict): - gene_dict = {} - gene_dict['name'] = igc_dict['gene_id'] - gene_dict['gene_id'] = slugify(igc_dict['gene_id']) - gene_dict['length'] = igc_dict['length'] + gene_dict = super()._format_for_model(igc_dict) if not self.skip_tax: gene_dict['taxonomy'] = self._retrieve_taxonomy(igc_dict.get('taxo_genus'), igc_dict.get('taxo_phylum')) return gene_dict - def _update_genes(self, gene_instances, gene_dict): - for gene_id, gene_instance in gene_instances.items(): - for key, value in gene_dict[gene_id].items(): - setattr(gene_instance, key, value) - try: - Gene.objects.bulk_update( - list(gene_instances.values()), - ['name', 'taxonomy', 'length'] - ) - self.updated_genes += len(gene_instances.keys()) - except Exception as exception: - logger.warning(exception) - self.skipped_genes += len(gene_instances.keys()) - - def _create_genes(self, gene_list): - try: - Gene.objects.bulk_create( - [Gene(**item) for item in gene_list] - ) - self.created_genes += len(gene_list) - except Exception as exception: - logger.warning(exception) - self.skipped_genes += len(gene_list) - - def create_or_update_genes(self, gene_dict): - update_instances = Gene.objects.in_bulk(gene_dict.keys(), 
field_name='gene_id') - self._update_genes(update_instances, gene_dict) - gene_ids_to_create = set(gene_dict.keys()) - set(update_instances.keys()) - if gene_ids_to_create: - self._create_genes([gene_dict[gene_id] for gene_id in gene_ids_to_create]) - def _clean_functions(self, functions, unknown_val='unknown'): """ Get rid of functions that are not in the db or entitled unknown @@ -173,26 +124,14 @@ class ImportIGCGenes(object): self._generate_gene_function_mapping(cleaned_functions, genes) ) - def load_all(self, test=False, chunk_size=10000): - logger.info("Starting IGC genes import (creation or update) to DB") - with open(self.annotation_file, 'r') as file: - while True: - chunk_genes = list(islice(file, chunk_size)) - if not chunk_genes: - break - igc_dict_list = [self._parse_gene(i) for i in chunk_genes] - functions = self._remove_functions(igc_dict_list) - igc_clean_dict = {slugify(i['gene_id']): self._format_for_model(i) for i in igc_dict_list} - self.processed_genes += chunk_size - self.create_or_update_genes(igc_clean_dict) - if not self.skip_functions: - self.link_genes_to_functions(functions) - logger.info("%s Genes processed so far...", self.processed_genes) - if test is True: - break - logger.info("[DONE] %s/%s Genes created.", self.created_genes, self.total_genes) - logger.info("[DONE] %s/%s Genes updated.", self.updated_genes, self.total_genes) - logger.info("[DONE] %s/%s Genes skipped.", self.skipped_genes, self.total_genes) + def _handle_chunk(self, chunk_genes): + gene_dict_list = [self._parse_gene(i) for i in chunk_genes] + functions = self._remove_functions(gene_dict_list) + gene_clean_dict = {slugify(i['gene_id']): self._format_for_model(i) for i in gene_dict_list} + print(gene_clean_dict) + self.create_or_update_genes(gene_clean_dict) + if not self.skip_functions: + self.link_genes_to_functions(functions) class Command(BaseCommand): diff --git a/backend/metagenedb/apps/catalog/management/commands/import_virgo_kegg.py 
b/backend/metagenedb/apps/catalog/management/commands/import_virgo_kegg.py new file mode 100644 index 0000000..9f71df4 --- /dev/null +++ b/backend/metagenedb/apps/catalog/management/commands/import_virgo_kegg.py @@ -0,0 +1,189 @@ +import logging +from itertools import islice + +from django.core.management.base import BaseCommand +from slugify import slugify + +from metagenedb.apps.catalog.models import Function, Gene, GeneFunction, Taxonomy +from metagenedb.common.utils.chunks import file_len +from metagenedb.common.utils.parsers import VirgoKEGGLineParser + +logging.basicConfig(format='[%(asctime)s] %(levelname)s:%(name)s:%(message)s') +logger = logging.getLogger(__name__) + + +class ImportVirgoGenes(object): + + SELECTED_KEYS = ['gene_id', 'kegg_ko'] + + def __init__(self, annotation_file): + self.annotation_file = annotation_file + self.total_genes = file_len(annotation_file) + self._reset_counters() + + def _reset_counters(self): + self.processed_genes = 0 + self.created_genes = 0 + self.updated_genes = 0 + self.skipped_genes = 0 + + def _build_taxo_mapping(self, rank): + logger.info("Building local mapping for %s level...", rank) + instances = Taxonomy.objects.filter(rank=rank) + return {instance.name: instance for instance in instances} + + def _build_function_mapping(self, source): + logger.info("Building local mapping for %s function...", source) + instances = Function.objects.filter(source=source) + return {instance.function_id: instance for instance in instances} + + @property + def eggnog_mapping(self): + if getattr(self, '_eggnog_mapping', None) is None: + self._eggnog_mapping = self._build_function_mapping("eggnog") + return self._eggnog_mapping + + @property + def kegg_mapping(self): + if getattr(self, '_kegg_mapping', None) is None: + self._kegg_mapping = self._build_function_mapping("kegg") + return self._kegg_mapping + + def _parse_gene(self, raw_line, selected_keys=SELECTED_KEYS): + """ + Use VirgoKEGGLineParser and return selected keys + """ + 
gene_parser = VirgoKEGGLineParser() + all_dict = gene_parser.gene(raw_line) + selected_dict = {k: v for k, v in all_dict.items() if k in selected_keys} + return selected_dict + + def _remove_functions(self, gene_dicts): + functions = {} + for gene_dict in gene_dicts: + functions[slugify(gene_dict['gene_id'])] = { + 'kegg': gene_dict.pop('kegg_ko'), + 'eggnog': gene_dict.pop('eggnog') + } + return functions + + def _format_for_model(self, igc_dict): + gene_dict = {} + gene_dict['name'] = igc_dict['gene_id'] + gene_dict['gene_id'] = slugify(igc_dict['gene_id']) + gene_dict['length'] = igc_dict['length'] + if not self.skip_tax: + gene_dict['taxonomy'] = self._retrieve_taxonomy(igc_dict.get('taxo_genus'), igc_dict.get('taxo_phylum')) + return gene_dict + + def _update_genes(self, gene_instances, gene_dict): + for gene_id, gene_instance in gene_instances.items(): + for key, value in gene_dict[gene_id].items(): + setattr(gene_instance, key, value) + try: + Gene.objects.bulk_update( + list(gene_instances.values()), + ['name', 'taxonomy', 'length'] + ) + self.updated_genes += len(gene_instances.keys()) + except Exception as exception: + logger.warning(exception) + self.skipped_genes += len(gene_instances.keys()) + + def _create_genes(self, gene_list): + try: + Gene.objects.bulk_create( + [Gene(**item) for item in gene_list] + ) + self.created_genes += len(gene_list) + except Exception as exception: + logger.warning(exception) + self.skipped_genes += len(gene_list) + + def create_or_update_genes(self, gene_dict): + update_instances = Gene.objects.in_bulk(gene_dict.keys(), field_name='gene_id') + self._update_genes(update_instances, gene_dict) + gene_ids_to_create = set(gene_dict.keys()) - set(update_instances.keys()) + if gene_ids_to_create: + self._create_genes([gene_dict[gene_id] for gene_id in gene_ids_to_create]) + + def _clean_functions(self, functions, unknown_val='unknown'): + """ + Get rid of functions that are not in the db or entitled unknown + """ + 
cleaned_functions = {} + for gene_id, all_functions in functions.items(): + new_functions = [] + for kegg in all_functions['kegg']: + if kegg == unknown_val: + continue + elif kegg in self.kegg_mapping.keys(): + new_functions.append(self.kegg_mapping[kegg]) + for eggnog in all_functions['eggnog']: + if eggnog == unknown_val: + continue + elif eggnog in self.eggnog_mapping.keys(): + new_functions.append(self.eggnog_mapping[eggnog]) + if new_functions: + cleaned_functions[gene_id] = new_functions + return cleaned_functions + + def _generate_gene_function_mapping(self, functions, genes): + """ + Generate a list of GeneFunction pair to create relation between them + """ + mapping = [] + for gene_id, function_list in functions.items(): + for function in function_list: + mapping.append(GeneFunction(gene=genes[gene_id], function=function)) + return mapping + + def link_genes_to_functions(self, functions): + cleaned_functions = self._clean_functions(functions) + genes = Gene.objects.in_bulk(cleaned_functions.keys(), field_name='gene_id') + # Get all link with corresponding genes & Delete them + GeneFunction.objects.filter(gene__in=genes.values()).delete() + # Generate table for bulk_create of function <-> gene and create it + GeneFunction.objects.bulk_create( + self._generate_gene_function_mapping(cleaned_functions, genes) + ) + + def load_all(self, test=False, chunk_size=10000): + logger.info("Starting Virgo KEGG annotations import (creation or update) to DB") + with open(self.annotation_file, 'r') as file: + while True: + chunk_genes = list(islice(file, chunk_size)) + if not chunk_genes: + break + virgo_dict_list = [self._parse_gene(i) for i in chunk_genes] + functions = self._remove_functions(virgo_dict_list) + igc_clean_dict = {slugify(i['gene_id']): self._format_for_model(i) for i in virgo_dict_list} + self.processed_genes += chunk_size + self.create_or_update_genes(igc_clean_dict) + if not self.skip_functions: + self.link_genes_to_functions(functions) + 
logger.info("%s Genes processed so far...", self.processed_genes) + if test is True: + break + logger.info("[DONE] %s/%s Genes created.", self.created_genes, self.total_genes) + logger.info("[DONE] %s/%s Genes updated.", self.updated_genes, self.total_genes) + logger.info("[DONE] %s/%s Genes skipped.", self.skipped_genes, self.total_genes) + + +class Command(BaseCommand): + help = 'Create or update all EggNOG entries from annotations.tsv file.' + + def add_arguments(self, parser): + parser.add_argument('annotation', help='8.A.kegg.ortholog.txt file from Virgo') + parser.add_argument('--test', action='store_true', help='Run only on first 10000 entries.') + + def set_logger_level(self, verbosity): + if verbosity > 2: + logger.setLevel(logging.DEBUG) + elif verbosity > 1: + logger.setLevel(logging.INFO) + + def handle(self, *args, **options): + self.set_logger_level(int(options['verbosity'])) + import_igc = ImportVirgoGenes(options['annotation']) + import_igc.load_all(test=options['test']) diff --git a/backend/metagenedb/apps/catalog/management/commands/tests/test_files/igc_annotation.tsv b/backend/metagenedb/apps/catalog/management/commands/tests/test_files/igc_annotation.tsv new file mode 100644 index 0000000..6dba205 --- /dev/null +++ b/backend/metagenedb/apps/catalog/management/commands/tests/test_files/igc_annotation.tsv @@ -0,0 +1,2 @@ +1 Gene_1 123 Complete CHN Proteobacteria Escherichia K01824 COG5184 0.224151539068666 0.236448598130841 Lipid Metabolism Cell cycle control, cell division, chromosome partitioning;Cytoskeleton EUR;CHN;USA +2 Gene_2 456 Complete EUR Firmicutes Veillonella K01824 COG5184 0.352801894238358 0.351401869158878 Lipid Metabolism Cell cycle control, cell division, chromosome partitioning;Cytoskeleton EUR;CHN;USA diff --git a/backend/metagenedb/apps/catalog/management/commands/tests/test_import_igc_annotation.py b/backend/metagenedb/apps/catalog/management/commands/tests/test_import_igc_annotation.py index cea4c1d..1b84885 100644 --- 
a/backend/metagenedb/apps/catalog/management/commands/tests/test_import_igc_annotation.py +++ b/backend/metagenedb/apps/catalog/management/commands/tests/test_import_igc_annotation.py @@ -1,3 +1,4 @@ +import os from unittest import TestCase import mock @@ -10,79 +11,18 @@ from metagenedb.apps.catalog.factory import ( GeneFactory, TaxonomyFactory, ) +from metagenedb.apps.catalog.factory.taxonomy import generate_simple_db class BaseTestImportIGCGenes(TestCase): def setUp(self): - function_to_mock = 'metagenedb.apps.catalog.management.commands.import_igc_annotation.file_len' + function_to_mock = 'metagenedb.apps.catalog.management.commands.commons.import_genes.file_len' with mock.patch(function_to_mock) as MockFileLen: MockFileLen.return_value = 10 self.import_igc_genes = ImportIGCGenes('test') -class TestParseGene(BaseTestImportIGCGenes): - - def setUp(self): - raw_data = [ - 'gene_id', - 'name', - 'length', - 'gene_completeness_status', - 'cohort_origin', - 'taxo_phylum', - 'taxo_genus', - 'kegg', - 'eggnog', - 'sample_occurence_freq', - 'ind_occurence_freq', - 'kegg_functional_cat', - 'eggnog_functional_cat', - 'cohort_assembled' - ] - self.raw_line = "\t".join(raw_data) - super().setUp() - - def test_parse_gene_default_selected_keys(self): - """ - This test should failed and need to be updated when SELECTED_KEYS are changed - """ - expected_dict = { - 'gene_id': 'name', - 'length': 'length', - 'kegg_ko': ['kegg'], - 'eggnog': ['eggnog'], - 'taxo_phylum': 'taxo_phylum', - 'taxo_genus': 'taxo_genus', - } - tested_dict = self.import_igc_genes._parse_gene(self.raw_line) - self.assertDictEqual(tested_dict, expected_dict) - - def test_parse_gene(self): - """ - This test should failed and need to be updated when SELECTED_KEYS are changed - """ - selected_keys = ['gene_id', 'length'] - expected_dict = { - 'gene_id': 'name', - 'length': 'length' - } - tested_dict = self.import_igc_genes._parse_gene(self.raw_line, selected_keys=selected_keys) - 
self.assertDictEqual(tested_dict, expected_dict) - - def test_parse_gene_unknown_key(self): - """ - Unknown key should be ignored - """ - selected_keys = ['gene_id', 'length', 'secret_code'] - expected_dict = { - 'gene_id': 'name', - 'length': 'length' - } - tested_dict = self.import_igc_genes._parse_gene(self.raw_line, selected_keys=selected_keys) - self.assertDictEqual(tested_dict, expected_dict) - - class TestRetrieveTaxonomy(APITestCase, BaseTestImportIGCGenes): @classmethod @@ -137,42 +77,6 @@ class TestRemoveFunctions(BaseTestImportIGCGenes): self.assertDictEqual(tested_dict, expected_functions) -class TestCreateOrUpdateGenes(APITestCase, BaseTestImportIGCGenes): - - @classmethod - def setUpTestData(cls): - cls.gene = GeneFactory() - cls.taxo_list = TaxonomyFactory.create_batch(2) - - def test_create_1_update_1(self): - gene_to_update = { - 'gene_id': self.gene.gene_id, - 'name': 'Updated Gene', - 'length': 2235, - 'taxonomy': self.taxo_list[0] - } - gene_to_create = { - 'gene_id': 'gene-create-123', - 'name': 'Created Gene', - 'length': 5629, - 'taxonomy': self.taxo_list[1] - } - gene_dict = { - gene_to_update['gene_id']: gene_to_update, - gene_to_create['gene_id']: gene_to_create - } - self.import_igc_genes.create_or_update_genes(gene_dict) - self.assertEqual(Gene.objects.all().count(), 2) - # Check updated gene - updated_gene = Gene.objects.get(gene_id=gene_to_update['gene_id']) - for key, value in gene_to_update.items(): - self.assertEqual(getattr(updated_gene, key), value) - # Check created gene - created_gene = Gene.objects.get(gene_id=gene_to_create['gene_id']) - for key, value in gene_to_create.items(): - self.assertEqual(getattr(created_gene, key), value) - - class TestRemoveUnknownFunctions(APITestCase, BaseTestImportIGCGenes): @classmethod @@ -248,3 +152,34 @@ class TestLinkGenesToFunctions(APITestCase, BaseTestImportIGCGenes): self.assertEqual(gene_functions.count(), 2) for link in gene_functions: self.assertEqual(link.gene.gene_id, 
self.gene.gene_id) + + +class TestEndToEnd(APITestCase): + + @classmethod + def setUpTestData(cls): + generate_simple_db() + + def test_end_to_end(self): + test_file = os.path.join(os.path.dirname(__file__), "./test_files/igc_annotation.tsv") + loader = ImportIGCGenes(test_file) + expected_genes = { + 'gene-1': { + 'source': 'igc', + 'length': 123, + 'name': 'Gene_1', + 'tax_id': '561' + }, + 'gene-2': { + 'source': 'igc', + 'length': 456, + 'name': 'Gene_2', + 'tax_id': '1239' + }, + } + loader.load_all() + created_genes = Gene.objects.all() + for created_gene in created_genes: + for key in ['source', 'length', 'name']: + self.assertEqual(getattr(created_gene, key), expected_genes[created_gene.gene_id][key]) + self.assertEqual(created_gene.taxonomy.tax_id, expected_genes[created_gene.gene_id]['tax_id']) -- GitLab From d316b52835e88adfb046e332317f4b2a8f1887a7 Mon Sep 17 00:00:00 2001 From: Kenzo-Hugo Hillion Date: Wed, 29 Apr 2020 16:43:26 +0200 Subject: [PATCH 07/21] add fake functions db to tests for IGC --- .../apps/catalog/factory/function.py | 15 +++++++++++++ .../commands/commons/import_genes.py | 1 - .../commands/import_igc_annotation.py | 1 - .../tests/test_files/igc_annotation.tsv | 4 ++-- .../tests/test_import_igc_annotation.py | 22 ++++++++++++++++--- 5 files changed, 36 insertions(+), 7 deletions(-) diff --git a/backend/metagenedb/apps/catalog/factory/function.py b/backend/metagenedb/apps/catalog/factory/function.py index 1ab878b..a8b077e 100644 --- a/backend/metagenedb/apps/catalog/factory/function.py +++ b/backend/metagenedb/apps/catalog/factory/function.py @@ -33,3 +33,18 @@ class EggNOGFactory(BaseFunctionFactory): class KeggOrthologyFactory(BaseFunctionFactory): class Meta: model = models.KeggOrthology + + +def _create_fake_kegg_db(): + KeggOrthologyFactory.create(function_id="K12345", name="Kegg1") + KeggOrthologyFactory.create(function_id="K67890", name="Kegg2") + + +def _create_fake_eggnog_db(): + EggNOGFactory.create(function_id="COG1234", 
name="COG1") + EggNOGFactory.create(function_id="COG5678", name="COG2") + + +def generate_fake_functions_db(): + _create_fake_eggnog_db() + _create_fake_kegg_db() diff --git a/backend/metagenedb/apps/catalog/management/commands/commons/import_genes.py b/backend/metagenedb/apps/catalog/management/commands/commons/import_genes.py index edfb66a..bf648f5 100644 --- a/backend/metagenedb/apps/catalog/management/commands/commons/import_genes.py +++ b/backend/metagenedb/apps/catalog/management/commands/commons/import_genes.py @@ -40,7 +40,6 @@ class BaseImportGenes(object): gene_dict['name'] = igc_dict['gene_id'] gene_dict['length'] = igc_dict['length'] gene_dict['source'] = self.SOURCE - print(gene_dict) return gene_dict def _update_genes(self, gene_instances, gene_dict): diff --git a/backend/metagenedb/apps/catalog/management/commands/import_igc_annotation.py b/backend/metagenedb/apps/catalog/management/commands/import_igc_annotation.py index 9fdcc70..7d3294c 100644 --- a/backend/metagenedb/apps/catalog/management/commands/import_igc_annotation.py +++ b/backend/metagenedb/apps/catalog/management/commands/import_igc_annotation.py @@ -128,7 +128,6 @@ class ImportIGCGenes(BaseImportGenes): gene_dict_list = [self._parse_gene(i) for i in chunk_genes] functions = self._remove_functions(gene_dict_list) gene_clean_dict = {slugify(i['gene_id']): self._format_for_model(i) for i in gene_dict_list} - print(gene_clean_dict) self.create_or_update_genes(gene_clean_dict) if not self.skip_functions: self.link_genes_to_functions(functions) diff --git a/backend/metagenedb/apps/catalog/management/commands/tests/test_files/igc_annotation.tsv b/backend/metagenedb/apps/catalog/management/commands/tests/test_files/igc_annotation.tsv index 6dba205..1f912b1 100644 --- a/backend/metagenedb/apps/catalog/management/commands/tests/test_files/igc_annotation.tsv +++ b/backend/metagenedb/apps/catalog/management/commands/tests/test_files/igc_annotation.tsv @@ -1,2 +1,2 @@ -1 Gene_1 123 Complete CHN 
Proteobacteria Escherichia K01824 COG5184 0.224151539068666 0.236448598130841 Lipid Metabolism Cell cycle control, cell division, chromosome partitioning;Cytoskeleton EUR;CHN;USA -2 Gene_2 456 Complete EUR Firmicutes Veillonella K01824 COG5184 0.352801894238358 0.351401869158878 Lipid Metabolism Cell cycle control, cell division, chromosome partitioning;Cytoskeleton EUR;CHN;USA +1 Gene_1 123 Complete CHN Proteobacteria Escherichia K12345 COG1234 0.224151539068666 0.236448598130841 Lipid Metabolism Cell cycle control, cell division, chromosome partitioning;Cytoskeleton EUR;CHN;USA +2 Gene_2 456 Complete EUR Firmicutes Veillonella K67890 COG5678 0.352801894238358 0.351401869158878 Lipid Metabolism Cell cycle control, cell division, chromosome partitioning;Cytoskeleton EUR;CHN;USA diff --git a/backend/metagenedb/apps/catalog/management/commands/tests/test_import_igc_annotation.py b/backend/metagenedb/apps/catalog/management/commands/tests/test_import_igc_annotation.py index 1b84885..c88542d 100644 --- a/backend/metagenedb/apps/catalog/management/commands/tests/test_import_igc_annotation.py +++ b/backend/metagenedb/apps/catalog/management/commands/tests/test_import_igc_annotation.py @@ -11,6 +11,7 @@ from metagenedb.apps.catalog.factory import ( GeneFactory, TaxonomyFactory, ) +from metagenedb.apps.catalog.factory.function import generate_fake_functions_db from metagenedb.apps.catalog.factory.taxonomy import generate_simple_db @@ -159,6 +160,7 @@ class TestEndToEnd(APITestCase): @classmethod def setUpTestData(cls): generate_simple_db() + generate_fake_functions_db() def test_end_to_end(self): test_file = os.path.join(os.path.dirname(__file__), "./test_files/igc_annotation.tsv") @@ -168,18 +170,32 @@ class TestEndToEnd(APITestCase): 'source': 'igc', 'length': 123, 'name': 'Gene_1', - 'tax_id': '561' + 'tax_id': '561', + 'functions': { + 'kegg': 'K12345', + 'eggnog': 'COG1234' + } }, 'gene-2': { 'source': 'igc', 'length': 456, 'name': 'Gene_2', - 'tax_id': '1239' + 
'tax_id': '1239', + 'functions': { + 'kegg': 'K67890', + 'eggnog': 'COG5678' + } }, } loader.load_all() - created_genes = Gene.objects.all() + created_genes = Gene.objects.all().prefetch_related('functions') for created_gene in created_genes: for key in ['source', 'length', 'name']: self.assertEqual(getattr(created_gene, key), expected_genes[created_gene.gene_id][key]) self.assertEqual(created_gene.taxonomy.tax_id, expected_genes[created_gene.gene_id]['tax_id']) + # Check functions + for function in created_gene.functions.all(): + self.assertIn(function.source, ['kegg', 'eggnog']) + self.assertEqual( + function.function_id, expected_genes[created_gene.gene_id]['functions'][function.source] + ) -- GitLab From 415c1eb09e78cd9fef020285c9fd6de7c9005191 Mon Sep 17 00:00:00 2001 From: Kenzo-Hugo Hillion Date: Thu, 30 Apr 2020 15:12:48 +0200 Subject: [PATCH 08/21] link Virgo genes to KEGG (need refactor...) --- .../management/commands/import_virgo_genes.py | 8 +- .../management/commands/import_virgo_kegg.py | 175 +++++------------- 2 files changed, 54 insertions(+), 129 deletions(-) diff --git a/backend/metagenedb/apps/catalog/management/commands/import_virgo_genes.py b/backend/metagenedb/apps/catalog/management/commands/import_virgo_genes.py index 8636f44..d37456f 100644 --- a/backend/metagenedb/apps/catalog/management/commands/import_virgo_genes.py +++ b/backend/metagenedb/apps/catalog/management/commands/import_virgo_genes.py @@ -19,10 +19,10 @@ class ImportVirgoGenes(BaseImportGenes): class Command(BaseCommand): - help = 'Create or update all EggNOG entries from annotations.tsv file.' + help = 'Create or update all Virgo genes (name and length from `0.geneLength.txt` file).' 
def add_arguments(self, parser): - parser.add_argument('annotation', help='8.A.kegg.ortholog.txt file from Virgo') + parser.add_argument('annotation', help='0.geneLength.txt file from Virgo') parser.add_argument('--test', action='store_true', help='Run only on first 10000 entries.') def set_logger_level(self, verbosity): @@ -33,5 +33,5 @@ class Command(BaseCommand): def handle(self, *args, **options): self.set_logger_level(int(options['verbosity'])) - import_igc = ImportVirgoGenes(options['annotation']) - import_igc.load_all(test=options['test']) + import_virgo = ImportVirgoGenes(options['annotation']) + import_virgo.load_all(test=options['test']) diff --git a/backend/metagenedb/apps/catalog/management/commands/import_virgo_kegg.py b/backend/metagenedb/apps/catalog/management/commands/import_virgo_kegg.py index 9f71df4..59e7f4f 100644 --- a/backend/metagenedb/apps/catalog/management/commands/import_virgo_kegg.py +++ b/backend/metagenedb/apps/catalog/management/commands/import_virgo_kegg.py @@ -1,111 +1,44 @@ import logging -from itertools import islice from django.core.management.base import BaseCommand from slugify import slugify -from metagenedb.apps.catalog.models import Function, Gene, GeneFunction, Taxonomy -from metagenedb.common.utils.chunks import file_len +from metagenedb.apps.catalog.management.commands.commons.import_genes import BaseImportGenes +from metagenedb.apps.catalog.models import Function, Gene, GeneFunction from metagenedb.common.utils.parsers import VirgoKEGGLineParser logging.basicConfig(format='[%(asctime)s] %(levelname)s:%(name)s:%(message)s') logger = logging.getLogger(__name__) -class ImportVirgoGenes(object): +class ImportVirgoGeneKeggAnnotation(BaseImportGenes): + IMPORT_TYPE = "Virgo KEGG annotations" # For logs SELECTED_KEYS = ['gene_id', 'kegg_ko'] - - def __init__(self, annotation_file): - self.annotation_file = annotation_file - self.total_genes = file_len(annotation_file) - self._reset_counters() - - def _reset_counters(self): - 
self.processed_genes = 0 - self.created_genes = 0 - self.updated_genes = 0 - self.skipped_genes = 0 - - def _build_taxo_mapping(self, rank): - logger.info("Building local mapping for %s level...", rank) - instances = Taxonomy.objects.filter(rank=rank) - return {instance.name: instance for instance in instances} + UPDATED_FIELDS = ['name'] + SOURCE = 'virgo' + PARSER = VirgoKEGGLineParser def _build_function_mapping(self, source): - logger.info("Building local mapping for %s function...", source) + logger.info("Building local mapping for %s functions...", source) instances = Function.objects.filter(source=source) return {instance.function_id: instance for instance in instances} - @property - def eggnog_mapping(self): - if getattr(self, '_eggnog_mapping', None) is None: - self._eggnog_mapping = self._build_function_mapping("eggnog") - return self._eggnog_mapping - @property def kegg_mapping(self): if getattr(self, '_kegg_mapping', None) is None: self._kegg_mapping = self._build_function_mapping("kegg") return self._kegg_mapping - def _parse_gene(self, raw_line, selected_keys=SELECTED_KEYS): + def _generate_gene_function_mapping(self, functions, genes): """ - Use VirgoKEGGLineParser and return selected keys + Generate a list of GeneFunction pair to create relation between them """ - gene_parser = VirgoKEGGLineParser() - all_dict = gene_parser.gene(raw_line) - selected_dict = {k: v for k, v in all_dict.items() if k in selected_keys} - return selected_dict - - def _remove_functions(self, gene_dicts): - functions = {} - for gene_dict in gene_dicts: - functions[slugify(gene_dict['gene_id'])] = { - 'kegg': gene_dict.pop('kegg_ko'), - 'eggnog': gene_dict.pop('eggnog') - } - return functions - - def _format_for_model(self, igc_dict): - gene_dict = {} - gene_dict['name'] = igc_dict['gene_id'] - gene_dict['gene_id'] = slugify(igc_dict['gene_id']) - gene_dict['length'] = igc_dict['length'] - if not self.skip_tax: - gene_dict['taxonomy'] = 
self._retrieve_taxonomy(igc_dict.get('taxo_genus'), igc_dict.get('taxo_phylum')) - return gene_dict - - def _update_genes(self, gene_instances, gene_dict): - for gene_id, gene_instance in gene_instances.items(): - for key, value in gene_dict[gene_id].items(): - setattr(gene_instance, key, value) - try: - Gene.objects.bulk_update( - list(gene_instances.values()), - ['name', 'taxonomy', 'length'] - ) - self.updated_genes += len(gene_instances.keys()) - except Exception as exception: - logger.warning(exception) - self.skipped_genes += len(gene_instances.keys()) - - def _create_genes(self, gene_list): - try: - Gene.objects.bulk_create( - [Gene(**item) for item in gene_list] - ) - self.created_genes += len(gene_list) - except Exception as exception: - logger.warning(exception) - self.skipped_genes += len(gene_list) - - def create_or_update_genes(self, gene_dict): - update_instances = Gene.objects.in_bulk(gene_dict.keys(), field_name='gene_id') - self._update_genes(update_instances, gene_dict) - gene_ids_to_create = set(gene_dict.keys()) - set(update_instances.keys()) - if gene_ids_to_create: - self._create_genes([gene_dict[gene_id] for gene_id in gene_ids_to_create]) + mapping = [] + for gene_id, function_list in functions.items(): + for function in function_list: + mapping.append(GeneFunction(gene=genes[gene_id], function=function)) + return mapping def _clean_functions(self, functions, unknown_val='unknown'): """ @@ -114,30 +47,15 @@ class ImportVirgoGenes(object): cleaned_functions = {} for gene_id, all_functions in functions.items(): new_functions = [] - for kegg in all_functions['kegg']: - if kegg == unknown_val: - continue - elif kegg in self.kegg_mapping.keys(): - new_functions.append(self.kegg_mapping[kegg]) - for eggnog in all_functions['eggnog']: - if eggnog == unknown_val: - continue - elif eggnog in self.eggnog_mapping.keys(): - new_functions.append(self.eggnog_mapping[eggnog]) + kegg_annotation = all_functions['kegg'] + if kegg_annotation == unknown_val: + 
continue + elif kegg_annotation in self.kegg_mapping.keys(): + new_functions.append(self.kegg_mapping[kegg_annotation]) if new_functions: cleaned_functions[gene_id] = new_functions return cleaned_functions - def _generate_gene_function_mapping(self, functions, genes): - """ - Generate a list of GeneFunction pair to create relation between them - """ - mapping = [] - for gene_id, function_list in functions.items(): - for function in function_list: - mapping.append(GeneFunction(gene=genes[gene_id], function=function)) - return mapping - def link_genes_to_functions(self, functions): cleaned_functions = self._clean_functions(functions) genes = Gene.objects.in_bulk(cleaned_functions.keys(), field_name='gene_id') @@ -148,30 +66,37 @@ class ImportVirgoGenes(object): self._generate_gene_function_mapping(cleaned_functions, genes) ) - def load_all(self, test=False, chunk_size=10000): - logger.info("Starting Virgo KEGG annotations import (creation or update) to DB") - with open(self.annotation_file, 'r') as file: - while True: - chunk_genes = list(islice(file, chunk_size)) - if not chunk_genes: - break - virgo_dict_list = [self._parse_gene(i) for i in chunk_genes] - functions = self._remove_functions(virgo_dict_list) - igc_clean_dict = {slugify(i['gene_id']): self._format_for_model(i) for i in virgo_dict_list} - self.processed_genes += chunk_size - self.create_or_update_genes(igc_clean_dict) - if not self.skip_functions: - self.link_genes_to_functions(functions) - logger.info("%s Genes processed so far...", self.processed_genes) - if test is True: - break - logger.info("[DONE] %s/%s Genes created.", self.created_genes, self.total_genes) - logger.info("[DONE] %s/%s Genes updated.", self.updated_genes, self.total_genes) - logger.info("[DONE] %s/%s Genes skipped.", self.skipped_genes, self.total_genes) + def _remove_functions(self, gene_dicts): + functions = {} + for gene_dict in gene_dicts: + functions[slugify(gene_dict['gene_id'])] = { + 'kegg': gene_dict.pop('kegg_ko'), + } + 
return functions + + def _format_for_model(self, igc_dict): + """ + @TODO remove in the future and makes function from parent class more modulable + """ + gene_dict = {} + gene_dict['gene_id'] = slugify(igc_dict['gene_id']) + gene_dict['name'] = igc_dict['gene_id'] + gene_dict['source'] = self.SOURCE + return gene_dict + + def _handle_chunk(self, chunk_genes): + """ + Overide for all different sources + """ + gene_dict_list = [self._parse_gene(i) for i in chunk_genes] + functions = self._remove_functions(gene_dict_list) + gene_clean_dict = {slugify(i['gene_id']): self._format_for_model(i) for i in gene_dict_list} + self.create_or_update_genes(gene_clean_dict) + self.link_genes_to_functions(functions) class Command(BaseCommand): - help = 'Create or update all EggNOG entries from annotations.tsv file.' + help = 'Create or update all KEGG annotation for Virgo genes (from `8.A.kegg.ortholog.txt` file).' def add_arguments(self, parser): parser.add_argument('annotation', help='8.A.kegg.ortholog.txt file from Virgo') @@ -185,5 +110,5 @@ class Command(BaseCommand): def handle(self, *args, **options): self.set_logger_level(int(options['verbosity'])) - import_igc = ImportVirgoGenes(options['annotation']) - import_igc.load_all(test=options['test']) + import_annotations = ImportVirgoGeneKeggAnnotation(options['annotation']) + import_annotations.load_all(test=options['test']) -- GitLab From 220bb3561f4d9008941946f7f48c132516deedad Mon Sep 17 00:00:00 2001 From: Kenzo-Hugo Hillion Date: Mon, 4 May 2020 12:29:44 +0200 Subject: [PATCH 09/21] refactor handling of functions --- .../commands/commons/handle_functions.py | 78 ++++++++++++ .../commands/commons/test_handle_functions.py | 111 ++++++++++++++++++ .../commands/import_igc_annotation.py | 72 +----------- .../management/commands/import_virgo_kegg.py | 35 +----- .../tests/test_import_igc_annotation.py | 99 +--------------- 5 files changed, 195 insertions(+), 200 deletions(-) create mode 100644 
backend/metagenedb/apps/catalog/management/commands/commons/handle_functions.py create mode 100644 backend/metagenedb/apps/catalog/management/commands/commons/test_handle_functions.py diff --git a/backend/metagenedb/apps/catalog/management/commands/commons/handle_functions.py b/backend/metagenedb/apps/catalog/management/commands/commons/handle_functions.py new file mode 100644 index 0000000..86c7264 --- /dev/null +++ b/backend/metagenedb/apps/catalog/management/commands/commons/handle_functions.py @@ -0,0 +1,78 @@ +import logging + +from slugify import slugify + +from metagenedb.apps.catalog.models import Function, Gene, GeneFunction + +logging.basicConfig(format='[%(asctime)s] %(levelname)s:%(name)s:%(message)s') +logger = logging.getLogger(__name__) + + +class HandleFunctions: + + def _build_function_mapping(self, source): + logger.info("Building local mapping for %s function...", source) + instances = Function.objects.filter(source=source) + return {instance.function_id: instance for instance in instances} + + @property + def eggnog_mapping(self): + if getattr(self, '_eggnog_mapping', None) is None: + self._eggnog_mapping = self._build_function_mapping("eggnog") + return self._eggnog_mapping + + @property + def kegg_mapping(self): + if getattr(self, '_kegg_mapping', None) is None: + self._kegg_mapping = self._build_function_mapping("kegg") + return self._kegg_mapping + + def _clean_functions(self, functions, unknown_val='unknown'): + """ + Get rid of functions that are not in the db or entitled unknown + """ + cleaned_functions = {} + for gene_id, all_functions in functions.items(): + new_functions = [] + for kegg in all_functions['kegg']: + if kegg == unknown_val: + continue + elif kegg in self.kegg_mapping.keys(): + new_functions.append(self.kegg_mapping[kegg]) + for eggnog in all_functions['eggnog']: + if eggnog == unknown_val: + continue + elif eggnog in self.eggnog_mapping.keys(): + new_functions.append(self.eggnog_mapping[eggnog]) + if new_functions: + 
cleaned_functions[gene_id] = new_functions + return cleaned_functions + + def _remove_functions(self, gene_dicts): + functions = {} + for gene_dict in gene_dicts: + functions[slugify(gene_dict['gene_id'])] = { + 'kegg': gene_dict.pop('kegg_ko'), + 'eggnog': gene_dict.pop('eggnog') + } + return functions + + def _generate_gene_function_mapping(self, functions, genes): + """ + Generate a list of GeneFunction pair to create relation between them + """ + mapping = [] + for gene_id, function_list in functions.items(): + for function in function_list: + mapping.append(GeneFunction(gene=genes[gene_id], function=function)) + return mapping + + def link_genes_to_functions(self, functions): + cleaned_functions = self._clean_functions(functions) + genes = Gene.objects.in_bulk(cleaned_functions.keys(), field_name='gene_id') + # Get all link with corresponding genes & Delete them + GeneFunction.objects.filter(gene__in=genes.values()).delete() + # Generate table for bulk_create of function <-> gene and create it + GeneFunction.objects.bulk_create( + self._generate_gene_function_mapping(cleaned_functions, genes) + ) diff --git a/backend/metagenedb/apps/catalog/management/commands/commons/test_handle_functions.py b/backend/metagenedb/apps/catalog/management/commands/commons/test_handle_functions.py new file mode 100644 index 0000000..cc925d5 --- /dev/null +++ b/backend/metagenedb/apps/catalog/management/commands/commons/test_handle_functions.py @@ -0,0 +1,111 @@ +from unittest import TestCase + +from rest_framework.test import APITestCase + +from metagenedb.apps.catalog.models import GeneFunction +from metagenedb.apps.catalog.management.commands.commons.handle_functions import HandleFunctions +from metagenedb.apps.catalog.factory import ( + FunctionFactory, + GeneFactory, +) + + +class BaseTestHandleFunctions(TestCase): + + def setUp(self): + self.handle_functions = HandleFunctions() + + +class TestRemoveFunctions(BaseTestHandleFunctions): + + def test_remove_functions(self): + 
input_dicts = [{ + 'gene_id': 'Test_gene', + 'kegg_ko': ['K0001'], + 'eggnog': ['COG1', 'COG2'] + }] + expected_functions = { + 'test-gene': { + 'kegg': ['K0001'], + 'eggnog': ['COG1', 'COG2'] + } + } + tested_dict = self.handle_functions._remove_functions(input_dicts) + self.assertDictEqual(tested_dict, expected_functions) + + +class TestCleanFunctions(APITestCase, BaseTestHandleFunctions): + + @classmethod + def setUpTestData(cls): + cls.kegg = FunctionFactory(source='kegg') + cls.eggnog = FunctionFactory(source='eggnog') + + def test_clean_functions_kegg_only(self): + functions = { + 'gene-kegg': { + 'kegg': [self.kegg.function_id, 'KO12345'], + 'eggnog': ['unknown'] + }, + } + expected_functions = { + 'gene-kegg': [self.kegg] + } + self.assertDictEqual(self.handle_functions._clean_functions(functions), expected_functions) + + def test_clean_functions_eggnog_only(self): + functions = { + 'gene-kegg': { + 'kegg': ['unknown'], + 'eggnog': [self.eggnog.function_id, 'COG12345'] + }, + } + expected_functions = { + 'gene-kegg': [self.eggnog] + } + self.assertDictEqual(self.handle_functions._clean_functions(functions), expected_functions) + + def test_clean_functions_kegg_eggnog(self): + functions = { + 'gene-kegg': { + 'kegg': [self.kegg.function_id, 'KO12345'], + 'eggnog': [self.eggnog.function_id, 'COG12345'] + }, + } + expected_functions = { + 'gene-kegg': [self.kegg, self.eggnog] + } + self.assertDictEqual(self.handle_functions._clean_functions(functions), expected_functions) + + def test_clean_functions_both_unknown(self): + functions = { + 'gene-kegg': { + 'kegg': ['unknown'], + 'eggnog': ['unknown'] + }, + } + expected_functions = {} + self.assertDictEqual(self.handle_functions._clean_functions(functions), expected_functions) + + +class TestLinkGenesToFunctions(APITestCase, BaseTestHandleFunctions): + + @classmethod + def setUpTestData(cls): + cls.kegg = FunctionFactory(source='kegg') + cls.eggnog = FunctionFactory(source='eggnog') + cls.gene = GeneFactory() + 
+ def test_link_kegg_and_eggnog(self): + self.assertEqual(GeneFunction.objects.all().count(), 0) + functions = { + self.gene.gene_id: { + 'kegg': [self.kegg.function_id], + 'eggnog': [self.eggnog.function_id] + } + } + self.handle_functions.link_genes_to_functions(functions) + gene_functions = GeneFunction.objects.all() + self.assertEqual(gene_functions.count(), 2) + for link in gene_functions: + self.assertEqual(link.gene.gene_id, self.gene.gene_id) diff --git a/backend/metagenedb/apps/catalog/management/commands/import_igc_annotation.py b/backend/metagenedb/apps/catalog/management/commands/import_igc_annotation.py index 7d3294c..5d64d52 100644 --- a/backend/metagenedb/apps/catalog/management/commands/import_igc_annotation.py +++ b/backend/metagenedb/apps/catalog/management/commands/import_igc_annotation.py @@ -3,15 +3,16 @@ import logging from django.core.management.base import BaseCommand from slugify import slugify +from metagenedb.apps.catalog.management.commands.commons.handle_functions import HandleFunctions from metagenedb.apps.catalog.management.commands.commons.import_genes import BaseImportGenes -from metagenedb.apps.catalog.models import Function, Gene, GeneFunction, Taxonomy +from metagenedb.apps.catalog.models import Taxonomy from metagenedb.common.utils.parsers import IGCLineParser logging.basicConfig(format='[%(asctime)s] %(levelname)s:%(name)s:%(message)s') logger = logging.getLogger(__name__) -class ImportIGCGenes(BaseImportGenes): +class ImportIGCGenes(BaseImportGenes, HandleFunctions): PHYLUM_COL = 'taxo_phylum' GENUS_COL = 'taxo_genus' SELECTED_KEYS = ['gene_id', 'length', 'kegg_ko', 'eggnog', PHYLUM_COL, GENUS_COL] @@ -43,23 +44,6 @@ class ImportIGCGenes(BaseImportGenes): self._genus_mapping = self._build_taxo_mapping("genus") return self._genus_mapping - def _build_function_mapping(self, source): - logger.info("Building local mapping for %s function...", source) - instances = Function.objects.filter(source=source) - return 
{instance.function_id: instance for instance in instances} - - @property - def eggnog_mapping(self): - if getattr(self, '_eggnog_mapping', None) is None: - self._eggnog_mapping = self._build_function_mapping("eggnog") - return self._eggnog_mapping - - @property - def kegg_mapping(self): - if getattr(self, '_kegg_mapping', None) is None: - self._kegg_mapping = self._build_function_mapping("kegg") - return self._kegg_mapping - def _retrieve_taxonomy(self, genus_name, phylum_name, unknown_val='unknown'): taxonomy_instance = None if genus_name != unknown_val: @@ -68,62 +52,12 @@ class ImportIGCGenes(BaseImportGenes): taxonomy_instance = self.phylum_mapping.get(phylum_name, None) return taxonomy_instance - def _remove_functions(self, gene_dicts): - functions = {} - for gene_dict in gene_dicts: - functions[slugify(gene_dict['gene_id'])] = { - 'kegg': gene_dict.pop('kegg_ko'), - 'eggnog': gene_dict.pop('eggnog') - } - return functions - def _format_for_model(self, igc_dict): gene_dict = super()._format_for_model(igc_dict) if not self.skip_tax: gene_dict['taxonomy'] = self._retrieve_taxonomy(igc_dict.get('taxo_genus'), igc_dict.get('taxo_phylum')) return gene_dict - def _clean_functions(self, functions, unknown_val='unknown'): - """ - Get rid of functions that are not in the db or entitled unknown - """ - cleaned_functions = {} - for gene_id, all_functions in functions.items(): - new_functions = [] - for kegg in all_functions['kegg']: - if kegg == unknown_val: - continue - elif kegg in self.kegg_mapping.keys(): - new_functions.append(self.kegg_mapping[kegg]) - for eggnog in all_functions['eggnog']: - if eggnog == unknown_val: - continue - elif eggnog in self.eggnog_mapping.keys(): - new_functions.append(self.eggnog_mapping[eggnog]) - if new_functions: - cleaned_functions[gene_id] = new_functions - return cleaned_functions - - def _generate_gene_function_mapping(self, functions, genes): - """ - Generate a list of GeneFunction pair to create relation between them - """ - 
mapping = [] - for gene_id, function_list in functions.items(): - for function in function_list: - mapping.append(GeneFunction(gene=genes[gene_id], function=function)) - return mapping - - def link_genes_to_functions(self, functions): - cleaned_functions = self._clean_functions(functions) - genes = Gene.objects.in_bulk(cleaned_functions.keys(), field_name='gene_id') - # Get all link with corresponding genes & Delete them - GeneFunction.objects.filter(gene__in=genes.values()).delete() - # Generate table for bulk_create of function <-> gene and create it - GeneFunction.objects.bulk_create( - self._generate_gene_function_mapping(cleaned_functions, genes) - ) - def _handle_chunk(self, chunk_genes): gene_dict_list = [self._parse_gene(i) for i in chunk_genes] functions = self._remove_functions(gene_dict_list) diff --git a/backend/metagenedb/apps/catalog/management/commands/import_virgo_kegg.py b/backend/metagenedb/apps/catalog/management/commands/import_virgo_kegg.py index 59e7f4f..9312bea 100644 --- a/backend/metagenedb/apps/catalog/management/commands/import_virgo_kegg.py +++ b/backend/metagenedb/apps/catalog/management/commands/import_virgo_kegg.py @@ -3,15 +3,15 @@ import logging from django.core.management.base import BaseCommand from slugify import slugify +from metagenedb.apps.catalog.management.commands.commons.handle_functions import HandleFunctions from metagenedb.apps.catalog.management.commands.commons.import_genes import BaseImportGenes -from metagenedb.apps.catalog.models import Function, Gene, GeneFunction from metagenedb.common.utils.parsers import VirgoKEGGLineParser logging.basicConfig(format='[%(asctime)s] %(levelname)s:%(name)s:%(message)s') logger = logging.getLogger(__name__) -class ImportVirgoGeneKeggAnnotation(BaseImportGenes): +class ImportVirgoGeneKeggAnnotation(BaseImportGenes, HandleFunctions): IMPORT_TYPE = "Virgo KEGG annotations" # For logs SELECTED_KEYS = ['gene_id', 'kegg_ko'] @@ -19,27 +19,6 @@ class 
ImportVirgoGeneKeggAnnotation(BaseImportGenes): SOURCE = 'virgo' PARSER = VirgoKEGGLineParser - def _build_function_mapping(self, source): - logger.info("Building local mapping for %s functions...", source) - instances = Function.objects.filter(source=source) - return {instance.function_id: instance for instance in instances} - - @property - def kegg_mapping(self): - if getattr(self, '_kegg_mapping', None) is None: - self._kegg_mapping = self._build_function_mapping("kegg") - return self._kegg_mapping - - def _generate_gene_function_mapping(self, functions, genes): - """ - Generate a list of GeneFunction pair to create relation between them - """ - mapping = [] - for gene_id, function_list in functions.items(): - for function in function_list: - mapping.append(GeneFunction(gene=genes[gene_id], function=function)) - return mapping - def _clean_functions(self, functions, unknown_val='unknown'): """ Get rid of functions that are not in the db or entitled unknown @@ -56,16 +35,6 @@ class ImportVirgoGeneKeggAnnotation(BaseImportGenes): cleaned_functions[gene_id] = new_functions return cleaned_functions - def link_genes_to_functions(self, functions): - cleaned_functions = self._clean_functions(functions) - genes = Gene.objects.in_bulk(cleaned_functions.keys(), field_name='gene_id') - # Get all link with corresponding genes & Delete them - GeneFunction.objects.filter(gene__in=genes.values()).delete() - # Generate table for bulk_create of function <-> gene and create it - GeneFunction.objects.bulk_create( - self._generate_gene_function_mapping(cleaned_functions, genes) - ) - def _remove_functions(self, gene_dicts): functions = {} for gene_dict in gene_dicts: diff --git a/backend/metagenedb/apps/catalog/management/commands/tests/test_import_igc_annotation.py b/backend/metagenedb/apps/catalog/management/commands/tests/test_import_igc_annotation.py index c88542d..981db09 100644 --- a/backend/metagenedb/apps/catalog/management/commands/tests/test_import_igc_annotation.py +++ 
b/backend/metagenedb/apps/catalog/management/commands/tests/test_import_igc_annotation.py @@ -4,11 +4,9 @@ from unittest import TestCase import mock from rest_framework.test import APITestCase -from metagenedb.apps.catalog.models import Gene, GeneFunction +from metagenedb.apps.catalog.models import Gene from metagenedb.apps.catalog.management.commands.import_igc_annotation import ImportIGCGenes from metagenedb.apps.catalog.factory import ( - FunctionFactory, - GeneFactory, TaxonomyFactory, ) from metagenedb.apps.catalog.factory.function import generate_fake_functions_db @@ -60,101 +58,6 @@ class TestRetrieveTaxonomy(APITestCase, BaseTestImportIGCGenes): self.assertEqual(tested_taxonomy, None) -class TestRemoveFunctions(BaseTestImportIGCGenes): - - def test_remove_functions(self): - input_dicts = [{ - 'gene_id': 'Test_gene', - 'kegg_ko': ['K0001'], - 'eggnog': ['COG1', 'COG2'] - }] - expected_functions = { - 'test-gene': { - 'kegg': ['K0001'], - 'eggnog': ['COG1', 'COG2'] - } - } - tested_dict = self.import_igc_genes._remove_functions(input_dicts) - self.assertDictEqual(tested_dict, expected_functions) - - -class TestRemoveUnknownFunctions(APITestCase, BaseTestImportIGCGenes): - - @classmethod - def setUpTestData(cls): - cls.kegg = FunctionFactory(source='kegg') - cls.eggnog = FunctionFactory(source='eggnog') - - def test_clean_functions_kegg_only(self): - functions = { - 'gene-kegg': { - 'kegg': [self.kegg.function_id, 'KO12345'], - 'eggnog': ['unknown'] - }, - } - expected_functions = { - 'gene-kegg': [self.kegg] - } - self.assertDictEqual(self.import_igc_genes._clean_functions(functions), expected_functions) - - def test_clean_functions_eggnog_only(self): - functions = { - 'gene-kegg': { - 'kegg': ['unknown'], - 'eggnog': [self.eggnog.function_id, 'COG12345'] - }, - } - expected_functions = { - 'gene-kegg': [self.eggnog] - } - self.assertDictEqual(self.import_igc_genes._clean_functions(functions), expected_functions) - - def 
test_clean_functions_kegg_eggnog(self): - functions = { - 'gene-kegg': { - 'kegg': [self.kegg.function_id, 'KO12345'], - 'eggnog': [self.eggnog.function_id, 'COG12345'] - }, - } - expected_functions = { - 'gene-kegg': [self.kegg, self.eggnog] - } - self.assertDictEqual(self.import_igc_genes._clean_functions(functions), expected_functions) - - def test_clean_functions_both_unknown(self): - functions = { - 'gene-kegg': { - 'kegg': ['unknown'], - 'eggnog': ['unknown'] - }, - } - expected_functions = {} - self.assertDictEqual(self.import_igc_genes._clean_functions(functions), expected_functions) - - -class TestLinkGenesToFunctions(APITestCase, BaseTestImportIGCGenes): - - @classmethod - def setUpTestData(cls): - cls.kegg = FunctionFactory(source='kegg') - cls.eggnog = FunctionFactory(source='eggnog') - cls.gene = GeneFactory() - - def test_link_kegg_and_eggnog(self): - self.assertEqual(GeneFunction.objects.all().count(), 0) - functions = { - self.gene.gene_id: { - 'kegg': [self.kegg.function_id], - 'eggnog': [self.eggnog.function_id] - } - } - self.import_igc_genes.link_genes_to_functions(functions) - gene_functions = GeneFunction.objects.all() - self.assertEqual(gene_functions.count(), 2) - for link in gene_functions: - self.assertEqual(link.gene.gene_id, self.gene.gene_id) - - class TestEndToEnd(APITestCase): @classmethod -- GitLab From 6bbb575ffa4cab77c13c545838dfd6efd4601298 Mon Sep 17 00:00:00 2001 From: Kenzo-Hugo Hillion Date: Mon, 4 May 2020 15:24:30 +0200 Subject: [PATCH 10/21] add end to end test for kegg annotation of virgo --- .../commands/tests/test_files/virgo_kegg.tsv | 2 + .../commands/tests/test_import_virgo_kegg.py | 52 +++++++++++++++++++ 2 files changed, 54 insertions(+) create mode 100644 backend/metagenedb/apps/catalog/management/commands/tests/test_files/virgo_kegg.tsv create mode 100644 backend/metagenedb/apps/catalog/management/commands/tests/test_import_virgo_kegg.py diff --git 
a/backend/metagenedb/apps/catalog/management/commands/tests/test_files/virgo_kegg.tsv b/backend/metagenedb/apps/catalog/management/commands/tests/test_files/virgo_kegg.tsv new file mode 100644 index 0000000..1a28b25 --- /dev/null +++ b/backend/metagenedb/apps/catalog/management/commands/tests/test_files/virgo_kegg.tsv @@ -0,0 +1,2 @@ +V1 K12345 ljo:LJ0360 dvvi:GSVIVP00035275001 GSVIVT00035275001; assembled CDS; K02948 small subunit ribosomal protein S11 +V2 K67890 shg:Sph21_4943 dtni:5367 ; diff --git a/backend/metagenedb/apps/catalog/management/commands/tests/test_import_virgo_kegg.py b/backend/metagenedb/apps/catalog/management/commands/tests/test_import_virgo_kegg.py new file mode 100644 index 0000000..8723977 --- /dev/null +++ b/backend/metagenedb/apps/catalog/management/commands/tests/test_import_virgo_kegg.py @@ -0,0 +1,52 @@ +import os + +from rest_framework.test import APITestCase + +from metagenedb.apps.catalog.models import Gene +from metagenedb.apps.catalog.management.commands.import_virgo_genes import ImportVirgoGenes +from metagenedb.apps.catalog.management.commands.import_virgo_kegg import ImportVirgoGeneKeggAnnotation +from metagenedb.apps.catalog.factory.function import generate_fake_functions_db + + +class TestEndToEnd(APITestCase): + """ + This test depends on two files, one to create genes (gene_length) and the other for this test (kegg) + """ + + @classmethod + def setUpTestData(cls): + generate_fake_functions_db() + virgo_gene_length_file = os.path.join(os.path.dirname(__file__), "./test_files/virgo_gene_length.tsv") + loader = ImportVirgoGenes(virgo_gene_length_file) + loader.load_all() + + def test_end_to_end(self): + test_file = os.path.join(os.path.dirname(__file__), "./test_files/virgo_kegg.tsv") + loader = ImportVirgoGeneKeggAnnotation(test_file) + expected_genes = { + 'v1': { + 'source': 'virgo', + 'name': 'V1', + 'functions': { + 'kegg': 'K12345', + } + }, + 'v2': { + 'source': 'virgo', + 'name': 'V2', + 'functions': { + 'kegg': 
'K67890', + } + }, + } + loader.load_all() + created_genes = Gene.objects.all().prefetch_related('functions') + for created_gene in created_genes: + for key in ['source', 'name']: + self.assertEqual(getattr(created_gene, key), expected_genes[created_gene.gene_id][key]) + # Check functions + for function in created_gene.functions.all(): + self.assertIn(function.source, ['kegg', 'eggnog']) + self.assertEqual( + function.function_id, expected_genes[created_gene.gene_id]['functions'][function.source] + ) -- GitLab From 0d87ecd0b0a754c2cf430894257069faffb609c3 Mon Sep 17 00:00:00 2001 From: Kenzo-Hugo Hillion Date: Mon, 4 May 2020 16:40:35 +0200 Subject: [PATCH 11/21] use factory to create Virgo genes --- .../management/commands/import_virgo_kegg.py | 5 ++++- .../commands/tests/test_import_virgo_kegg.py | 15 ++++----------- 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/backend/metagenedb/apps/catalog/management/commands/import_virgo_kegg.py b/backend/metagenedb/apps/catalog/management/commands/import_virgo_kegg.py index 9312bea..ced1911 100644 --- a/backend/metagenedb/apps/catalog/management/commands/import_virgo_kegg.py +++ b/backend/metagenedb/apps/catalog/management/commands/import_virgo_kegg.py @@ -68,7 +68,10 @@ class Command(BaseCommand): help = 'Create or update all KEGG annotation for Virgo genes (from `8.A.kegg.ortholog.txt` file).' def add_arguments(self, parser): - parser.add_argument('annotation', help='8.A.kegg.ortholog.txt file from Virgo') + parser.add_argument( + 'annotation', + help='8.A.kegg.ortholog.txt file from Virgo. Genes need to exist in DB for this script to work.' 
+ ) parser.add_argument('--test', action='store_true', help='Run only on first 10000 entries.') def set_logger_level(self, verbosity): diff --git a/backend/metagenedb/apps/catalog/management/commands/tests/test_import_virgo_kegg.py b/backend/metagenedb/apps/catalog/management/commands/tests/test_import_virgo_kegg.py index 8723977..bb37e2b 100644 --- a/backend/metagenedb/apps/catalog/management/commands/tests/test_import_virgo_kegg.py +++ b/backend/metagenedb/apps/catalog/management/commands/tests/test_import_virgo_kegg.py @@ -3,36 +3,30 @@ import os from rest_framework.test import APITestCase from metagenedb.apps.catalog.models import Gene -from metagenedb.apps.catalog.management.commands.import_virgo_genes import ImportVirgoGenes from metagenedb.apps.catalog.management.commands.import_virgo_kegg import ImportVirgoGeneKeggAnnotation +from metagenedb.apps.catalog.factory import GeneFactory from metagenedb.apps.catalog.factory.function import generate_fake_functions_db class TestEndToEnd(APITestCase): - """ - This test depends on two files, one to create genes (gene_length) and the other for this test (kegg) - """ @classmethod def setUpTestData(cls): generate_fake_functions_db() - virgo_gene_length_file = os.path.join(os.path.dirname(__file__), "./test_files/virgo_gene_length.tsv") - loader = ImportVirgoGenes(virgo_gene_length_file) - loader.load_all() + GeneFactory.create(gene_id="v1") + GeneFactory.create(gene_id="v2") def test_end_to_end(self): test_file = os.path.join(os.path.dirname(__file__), "./test_files/virgo_kegg.tsv") loader = ImportVirgoGeneKeggAnnotation(test_file) expected_genes = { 'v1': { - 'source': 'virgo', 'name': 'V1', 'functions': { 'kegg': 'K12345', } }, 'v2': { - 'source': 'virgo', 'name': 'V2', 'functions': { 'kegg': 'K67890', @@ -42,8 +36,7 @@ class TestEndToEnd(APITestCase): loader.load_all() created_genes = Gene.objects.all().prefetch_related('functions') for created_gene in created_genes: - for key in ['source', 'name']: - 
self.assertEqual(getattr(created_gene, key), expected_genes[created_gene.gene_id][key]) + self.assertEqual(getattr(created_gene, 'name'), expected_genes[created_gene.gene_id]['name']) # Check functions for function in created_gene.functions.all(): self.assertIn(function.source, ['kegg', 'eggnog']) -- GitLab From bc356009d78f3bae14a9ab469cacbfa1afa700e0 Mon Sep 17 00:00:00 2001 From: Kenzo-Hugo Hillion Date: Mon, 4 May 2020 16:42:25 +0200 Subject: [PATCH 12/21] fix help message --- .../apps/catalog/management/commands/import_igc_annotation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/metagenedb/apps/catalog/management/commands/import_igc_annotation.py b/backend/metagenedb/apps/catalog/management/commands/import_igc_annotation.py index 5d64d52..557eda1 100644 --- a/backend/metagenedb/apps/catalog/management/commands/import_igc_annotation.py +++ b/backend/metagenedb/apps/catalog/management/commands/import_igc_annotation.py @@ -68,7 +68,7 @@ class ImportIGCGenes(BaseImportGenes, HandleFunctions): class Command(BaseCommand): - help = 'Create or update all EggNOG entries from annotations.tsv file.' + help = 'Create or update IGC genes from IGC annotations file.' 
def add_arguments(self, parser): parser.add_argument('annotation', help='IGC.annotation_OF.summary file from IGC') -- GitLab From 1a863324a542d8cf3bd1b44b27f9a64157ebdd98 Mon Sep 17 00:00:00 2001 From: Kenzo-Hugo Hillion Date: Mon, 4 May 2020 17:05:34 +0200 Subject: [PATCH 13/21] add end to end test for sequences creation of genes --- .../commands/commons/import_gene_sequences.py | 58 +++++++++++++++++++ .../commands/commons/test_files/genes.fa | 4 ++ .../commons/test_import_gene_sequences.py | 52 +++++++++++++++++ .../commands/import_igc_sequences.py | 52 +---------------- .../commands/import_virgo_sequences.py | 35 +++++++++++ .../tests/test_import_igc_sequences.py | 26 --------- 6 files changed, 152 insertions(+), 75 deletions(-) create mode 100644 backend/metagenedb/apps/catalog/management/commands/commons/import_gene_sequences.py create mode 100644 backend/metagenedb/apps/catalog/management/commands/commons/test_files/genes.fa create mode 100644 backend/metagenedb/apps/catalog/management/commands/commons/test_import_gene_sequences.py create mode 100644 backend/metagenedb/apps/catalog/management/commands/import_virgo_sequences.py delete mode 100644 backend/metagenedb/apps/catalog/management/commands/tests/test_import_igc_sequences.py diff --git a/backend/metagenedb/apps/catalog/management/commands/commons/import_gene_sequences.py b/backend/metagenedb/apps/catalog/management/commands/commons/import_gene_sequences.py new file mode 100644 index 0000000..6fdc5b7 --- /dev/null +++ b/backend/metagenedb/apps/catalog/management/commands/commons/import_gene_sequences.py @@ -0,0 +1,58 @@ +import logging + +import pyfastx +from slugify import slugify + +from metagenedb.apps.catalog.models import Gene + +logging.basicConfig(format='[%(asctime)s] %(levelname)s:%(name)s:%(message)s') +logger = logging.getLogger(__name__) + + +class ImportGeneSequences(object): + CATALOG = "CAT_NAME" + + def __init__(self, sequence_file): + self.sequence_file = sequence_file + 
self._reset_counters() + + def _reset_counters(self): + self.processed_genes = 0 + self.updated_genes = 0 + self.skipped_genes = 0 + + def update_sequences(self, sequences): + genes = Gene.objects.filter(gene_id__in=sequences.keys()) + genes_retrieved = genes.count() + for gene in genes: + gene.sequence = sequences[gene.gene_id] + try: + Gene.objects.bulk_update(genes, ['sequence']) + self.updated_genes += genes_retrieved + self.skipped_genes += len(sequences) - genes_retrieved + except Exception: + logger.warning("Could not update genes... skipped.") + self.skipped_genes += len(sequences) + + def load_all(self, test=False, chunk_size=10000, skip_n_sequences=0): + logger.info("Starting %s Gene sequences import (update) to DB", self.CATALOG) + if skip_n_sequences > 0: + logger.info("Skipping first %s sequences", skip_n_sequences) + current_sequences = {} + for name, seq in pyfastx.Fasta(self.sequence_file, build_index=False): + if self.processed_genes < skip_n_sequences: + self.processed_genes += 1 + self.skipped_genes += 1 + continue + current_sequences[slugify(name.split()[0])] = seq + self.processed_genes += 1 + if self.processed_genes % chunk_size == 0: + self.update_sequences(current_sequences) + logger.info("%s Gene sequences processed so far...", self.processed_genes) + current_sequences = {} + if test is True: + break + if len(current_sequences) > 0: + self.update_sequences(current_sequences) + logger.info("[DONE] %s/%s Gene sequences updated.", self.updated_genes, self.processed_genes) + logger.info("[DONE] %s/%s Genes skipped.", self.skipped_genes, self.processed_genes) diff --git a/backend/metagenedb/apps/catalog/management/commands/commons/test_files/genes.fa b/backend/metagenedb/apps/catalog/management/commands/commons/test_files/genes.fa new file mode 100644 index 0000000..db0709e --- /dev/null +++ b/backend/metagenedb/apps/catalog/management/commands/commons/test_files/genes.fa @@ -0,0 +1,4 @@ +>Gene1 +ACGT +>Gene2 +ATCG \ No newline at end of file 
diff --git a/backend/metagenedb/apps/catalog/management/commands/commons/test_import_gene_sequences.py b/backend/metagenedb/apps/catalog/management/commands/commons/test_import_gene_sequences.py new file mode 100644 index 0000000..95e7ab8 --- /dev/null +++ b/backend/metagenedb/apps/catalog/management/commands/commons/test_import_gene_sequences.py @@ -0,0 +1,52 @@ +import os + +from rest_framework.test import APITestCase + +from metagenedb.apps.catalog.models import Gene +from metagenedb.apps.catalog.management.commands.commons.import_gene_sequences import ImportGeneSequences +from metagenedb.apps.catalog.factory import ( + GeneFactory, +) + + +class TestUpdateSequences(APITestCase): + + @classmethod + def setUpTestData(cls): + cls.gene = GeneFactory() + + def setUp(self): + self.import_igc_seq = ImportGeneSequences("test") # we never make real reference to the sequence_file + + def test_update_sequence(self): + seq = "ACTG" + sequences = { + self.gene.gene_id: seq + } + self.assertFalse(Gene.objects.get(gene_id=self.gene.gene_id).sequence) + self.import_igc_seq.update_sequences(sequences) + self.assertEqual(Gene.objects.get(gene_id=self.gene.gene_id).sequence, seq) + + +class TestEndToEnd(APITestCase): + + @classmethod + def setUpTestData(cls): + GeneFactory.create(gene_id="gene1") + GeneFactory.create(gene_id="gene2") + + def test_end_to_end(self): + test_file = os.path.join(os.path.dirname(__file__), "./test_files/genes.fa") + loader = ImportGeneSequences(test_file) + expected_genes = { + 'gene1': { + 'sequence': 'ACGT' + }, + 'gene2': { + 'sequence': 'ATCG' + }, + } + loader.load_all() + created_genes = Gene.objects.all() + for created_gene in created_genes: + self.assertEqual(getattr(created_gene, 'sequence'), expected_genes[created_gene.gene_id]['sequence']) diff --git a/backend/metagenedb/apps/catalog/management/commands/import_igc_sequences.py b/backend/metagenedb/apps/catalog/management/commands/import_igc_sequences.py index 4d751b3..0988c2c 100644 --- 
a/backend/metagenedb/apps/catalog/management/commands/import_igc_sequences.py +++ b/backend/metagenedb/apps/catalog/management/commands/import_igc_sequences.py @@ -1,61 +1,15 @@ import logging -import pyfastx from django.core.management.base import BaseCommand -from slugify import slugify -from metagenedb.apps.catalog.models import Gene +from metagenedb.apps.catalog.management.commands.commons.import_gene_sequences import ImportGeneSequences logging.basicConfig(format='[%(asctime)s] %(levelname)s:%(name)s:%(message)s') logger = logging.getLogger(__name__) -class ImportIGCGeneSequences(object): - - def __init__(self, sequence_file): - self.sequence_file = sequence_file - self._reset_counters() - - def _reset_counters(self): - self.processed_genes = 0 - self.updated_genes = 0 - self.skipped_genes = 0 - - def update_sequences(self, sequences): - genes = Gene.objects.filter(gene_id__in=sequences.keys()) - genes_retrieved = genes.count() - for gene in genes: - gene.sequence = sequences[gene.gene_id] - try: - Gene.objects.bulk_update(genes, ['sequence']) - self.updated_genes += genes_retrieved - self.skipped_genes += len(sequences) - genes_retrieved - except Exception: - logger.warning("Could not update genes... 
skipped.") - self.skipped_genes += len(sequences) - - def load_all(self, test=False, chunk_size=10000, skip_n_sequences=0): - logger.info("Starting IGC Gene sequences import (update) to DB") - if skip_n_sequences > 0: - logger.info("Skipping first %s sequences", skip_n_sequences) - current_sequences = {} - for name, seq in pyfastx.Fasta(self.sequence_file, build_index=False): - if self.processed_genes < skip_n_sequences: - self.processed_genes += 1 - self.skipped_genes += 1 - continue - current_sequences[slugify(name.split()[0])] = seq - self.processed_genes += 1 - if self.processed_genes % chunk_size == 0: - self.update_sequences(current_sequences) - logger.info("%s Gene sequences processed so far...", self.processed_genes) - current_sequences = {} - if test is True: - break - if len(current_sequences) > 0: - self.update_sequences(current_sequences) - logger.info("[DONE] %s/%s Gene sequences updated.", self.updated_genes, self.processed_genes) - logger.info("[DONE] %s/%s Genes skipped.", self.skipped_genes, self.processed_genes) +class ImportIGCGeneSequences(ImportGeneSequences): + CATALOG = "IGC" class Command(BaseCommand): diff --git a/backend/metagenedb/apps/catalog/management/commands/import_virgo_sequences.py b/backend/metagenedb/apps/catalog/management/commands/import_virgo_sequences.py new file mode 100644 index 0000000..9e96ca8 --- /dev/null +++ b/backend/metagenedb/apps/catalog/management/commands/import_virgo_sequences.py @@ -0,0 +1,35 @@ +import logging + +from django.core.management.base import BaseCommand + +from metagenedb.apps.catalog.management.commands.commons.import_gene_sequences import ImportGeneSequences + +logging.basicConfig(format='[%(asctime)s] %(levelname)s:%(name)s:%(message)s') +logger = logging.getLogger(__name__) + + +class ImportVirgoGeneSequences(ImportGeneSequences): + CATALOG = "Virgo" + + +class Command(BaseCommand): + help = 'Create or update all Virgo gene sequences (from `NT.fasta` file).' 
+ + def add_arguments(self, parser): + parser.add_argument( + 'fasta', + help='NT.fasta file from Virgo. Genes need to exist in DB for this script to work.' + ) + parser.add_argument('--test', action='store_true', help='Run only on first 10000 sequences.') + parser.add_argument('--skip_n', type=int, default=0, help='Number of sequence to skip') + + def set_logger_level(self, verbosity): + if verbosity > 2: + logger.setLevel(logging.DEBUG) + elif verbosity > 1: + logger.setLevel(logging.INFO) + + def handle(self, *args, **options): + self.set_logger_level(int(options['verbosity'])) + import_igc = ImportVirgoGeneSequences(options['fasta']) + import_igc.load_all(test=options['test'], skip_n_sequences=options['skip_n']) diff --git a/backend/metagenedb/apps/catalog/management/commands/tests/test_import_igc_sequences.py b/backend/metagenedb/apps/catalog/management/commands/tests/test_import_igc_sequences.py deleted file mode 100644 index 5a1608e..0000000 --- a/backend/metagenedb/apps/catalog/management/commands/tests/test_import_igc_sequences.py +++ /dev/null @@ -1,26 +0,0 @@ -from rest_framework.test import APITestCase - -from metagenedb.apps.catalog.models import Gene -from metagenedb.apps.catalog.management.commands.import_igc_sequences import ImportIGCGeneSequences -from metagenedb.apps.catalog.factory import ( - GeneFactory, -) - - -class TestUpdateSequences(APITestCase): - - @classmethod - def setUpTestData(cls): - cls.gene = GeneFactory() - - def setUp(self): - self.import_igc_seq = ImportIGCGeneSequences("test") # we never make real reference to the sequence_file - - def test_update_sequence(self): - seq = "ACTG" - sequences = { - self.gene.gene_id: seq - } - self.assertFalse(Gene.objects.get(gene_id=self.gene.gene_id).sequence) - self.import_igc_seq.update_sequences(sequences) - self.assertEqual(Gene.objects.get(gene_id=self.gene.gene_id).sequence, seq) -- GitLab From 534effc08a6c4928e22ec9cb2b63bc41cf5c0d76 Mon Sep 17 00:00:00 2001 From: Kenzo-Hugo Hillion 
Date: Tue, 5 May 2020 11:22:33 +0200 Subject: [PATCH 14/21] add parser for eggnog from Virgo --- .../common/utils/parsers/__init__.py | 2 +- .../common/utils/parsers/test_virgo.py | 56 ++++++++++++++++++- .../metagenedb/common/utils/parsers/virgo.py | 32 +++++++++++ 3 files changed, 88 insertions(+), 2 deletions(-) diff --git a/backend/metagenedb/common/utils/parsers/__init__.py b/backend/metagenedb/common/utils/parsers/__init__.py index 6e718d0..855edc3 100644 --- a/backend/metagenedb/common/utils/parsers/__init__.py +++ b/backend/metagenedb/common/utils/parsers/__init__.py @@ -2,4 +2,4 @@ from .eggnog import EggNOGAnnotationLineParser # noqa from .igc import IGCLineParser # noqa from .kegg import KEGGLineParser # noqa from .ncbi_taxonomy import NCBITaxonomyLineParser # noqa -from .virgo import VirgoGeneLengthLineParser, VirgoKEGGLineParser # noqa +from .virgo import VirgoGeneLengthLineParser, VirgoKEGGLineParser, VirgoEggNOGLineParser # noqa diff --git a/backend/metagenedb/common/utils/parsers/test_virgo.py b/backend/metagenedb/common/utils/parsers/test_virgo.py index bde7cb9..293466f 100644 --- a/backend/metagenedb/common/utils/parsers/test_virgo.py +++ b/backend/metagenedb/common/utils/parsers/test_virgo.py @@ -1,7 +1,7 @@ from unittest import TestCase from metagenedb.common.utils.parsers import ( - VirgoGeneLengthLineParser, VirgoKEGGLineParser + VirgoGeneLengthLineParser, VirgoKEGGLineParser, VirgoEggNOGLineParser ) @@ -49,3 +49,57 @@ class TestVirgoKEGGLineParser(TestCase): raw_line = "This is a wrong line format, with; information and tab" with self.assertRaises(Exception) as context: # noqa VirgoKEGGLineParser.gene(raw_line) + + +class TestVirgoEggNOGLineParser(TestCase): + + def test_gene(self): + raw_data = [ + 'cluster_id', + 'gene_id', + 'ortholog', + 'kegg_pathway', + 'funcat', + 'name', + 'eggnog' + ] + raw_line = "\t".join(raw_data) + expected_dict = { + 'cluster_id': raw_data[0], + 'gene_id': raw_data[1], + 'ortholog': raw_data[2], + 
'kegg_pathway': raw_data[3], + 'eggnog_funcat': raw_data[4], + 'function_name': raw_data[5], + 'eggnog': raw_data[6], + } + test_dict = VirgoEggNOGLineParser.gene(raw_line) + self.assertDictEqual(test_dict, expected_dict) + + def test_missing_kegg_pathway(self): + raw_data = [ + 'cluster_id', + 'gene_id', + 'ortholog', + '', + 'funcat', + 'name', + 'eggnog' + ] + raw_line = "\t".join(raw_data) + expected_dict = { + 'cluster_id': raw_data[0], + 'gene_id': raw_data[1], + 'ortholog': raw_data[2], + 'kegg_pathway': raw_data[3], + 'eggnog_funcat': raw_data[4], + 'function_name': raw_data[5], + 'eggnog': raw_data[6], + } + test_dict = VirgoEggNOGLineParser.gene(raw_line) + self.assertDictEqual(test_dict, expected_dict) + + def test_gene_wrong_format(self): + raw_line = "This is a wrong line format, with; information and tab" + with self.assertRaises(Exception) as context: # noqa + VirgoEggNOGLineParser.gene(raw_line) diff --git a/backend/metagenedb/common/utils/parsers/virgo.py b/backend/metagenedb/common/utils/parsers/virgo.py index 44cfbcc..0f3038f 100644 --- a/backend/metagenedb/common/utils/parsers/virgo.py +++ b/backend/metagenedb/common/utils/parsers/virgo.py @@ -49,3 +49,35 @@ class VirgoKEGGLineParser(object): except Exception: _LOGGER.error(f"Could not parse: {line.rstrip()}. Are you sure it comes from Virgo KEGG annotation file?") raise + + +class VirgoEggNOGLineParser(object): + + @staticmethod + def gene(line): + """ + Parse line from Virgo EggNOG annotations to return organized dict (3.eggnog.NOG.txt) + + Virgo EggNOG annotation columns: + 0: Cluster ID + 1: Gene ID + 2: Ortholog + 3: KEGG pathway? 
+ 4: EggNOG Functional category + 5: Name + 6: EggNOG annotation + """ + try: + gene_info = line.rstrip().split('\t') + return { + 'cluster_id': gene_info[0], + 'gene_id': gene_info[1], + 'ortholog': gene_info[2], + 'kegg_pathway': gene_info[3], + 'eggnog_funcat': gene_info[4], + 'function_name': gene_info[5], + 'eggnog': gene_info[6], + } + except Exception: + _LOGGER.error(f"Could not parse: {line.rstrip()}. Are you sure it comes from Virgo EggNOG annotation file?") + raise -- GitLab From e346db16d3099e519476934f2f8f6b2d6b21fc6a Mon Sep 17 00:00:00 2001 From: Kenzo-Hugo Hillion Date: Tue, 5 May 2020 11:54:28 +0200 Subject: [PATCH 15/21] add script to add EggNOG annotations from VIRGO --- .../commands/import_virgo_eggnog.py | 86 +++++++++++++++++++ .../tests/test_files/virgo_eggnog.tsv | 2 + .../tests/test_import_virgo_eggnog.py | 46 ++++++++++ .../commands/tests/test_import_virgo_kegg.py | 1 + 4 files changed, 135 insertions(+) create mode 100644 backend/metagenedb/apps/catalog/management/commands/import_virgo_eggnog.py create mode 100644 backend/metagenedb/apps/catalog/management/commands/tests/test_files/virgo_eggnog.tsv create mode 100644 backend/metagenedb/apps/catalog/management/commands/tests/test_import_virgo_eggnog.py diff --git a/backend/metagenedb/apps/catalog/management/commands/import_virgo_eggnog.py b/backend/metagenedb/apps/catalog/management/commands/import_virgo_eggnog.py new file mode 100644 index 0000000..d18c7f6 --- /dev/null +++ b/backend/metagenedb/apps/catalog/management/commands/import_virgo_eggnog.py @@ -0,0 +1,86 @@ +import logging + +from django.core.management.base import BaseCommand +from slugify import slugify + +from metagenedb.apps.catalog.management.commands.commons.handle_functions import HandleFunctions +from metagenedb.apps.catalog.management.commands.commons.import_genes import BaseImportGenes +from metagenedb.common.utils.parsers import VirgoEggNOGLineParser + +logging.basicConfig(format='[%(asctime)s] 
%(levelname)s:%(name)s:%(message)s') +logger = logging.getLogger(__name__) + + +class ImportVirgoGeneEggNOGAnnotation(BaseImportGenes, HandleFunctions): + + IMPORT_TYPE = "Virgo EggNOG annotations" # For logs + SELECTED_KEYS = ['gene_id', 'eggnog'] + UPDATED_FIELDS = ['name'] + SOURCE = 'virgo' + PARSER = VirgoEggNOGLineParser + + def _clean_functions(self, functions, unknown_val='unknown'): + """ + Get rid of functions that are not in the db or entitled unknown + """ + cleaned_functions = {} + for gene_id, all_functions in functions.items(): + new_functions = [] + eggnog_annotation = all_functions['eggnog'] + if eggnog_annotation == unknown_val: + continue + elif eggnog_annotation in self.eggnog_mapping.keys(): + new_functions.append(self.eggnog_mapping[eggnog_annotation]) + if new_functions: + cleaned_functions[gene_id] = new_functions + return cleaned_functions + + def _remove_functions(self, gene_dicts): + functions = {} + for gene_dict in gene_dicts: + functions[slugify(gene_dict['gene_id'])] = { + 'eggnog': gene_dict.pop('eggnog'), + } + return functions + + def _format_for_model(self, igc_dict): + """ + @TODO remove in the future and make the function from the parent class more modular + """ + gene_dict = {} + gene_dict['gene_id'] = slugify(igc_dict['gene_id']) + gene_dict['name'] = igc_dict['gene_id'] + gene_dict['source'] = self.SOURCE + return gene_dict + + def _handle_chunk(self, chunk_genes): + """ + Override for all different sources + """ + gene_dict_list = [self._parse_gene(i) for i in chunk_genes] + functions = self._remove_functions(gene_dict_list) + gene_clean_dict = {slugify(i['gene_id']): self._format_for_model(i) for i in gene_dict_list} + self.create_or_update_genes(gene_clean_dict) + self.link_genes_to_functions(functions) + + +class Command(BaseCommand): + help = 'Create or update all EggNOG annotation for Virgo genes (from `3.eggnog.NOG.txt` file).' 
+ + def add_arguments(self, parser): + parser.add_argument( + 'annotation', + help='3.eggnog.NOG.txt file from Virgo. Genes need to exist in DB for this script to work.' + ) + parser.add_argument('--test', action='store_true', help='Run only on first 10000 entries.') + + def set_logger_level(self, verbosity): + if verbosity > 2: + logger.setLevel(logging.DEBUG) + elif verbosity > 1: + logger.setLevel(logging.INFO) + + def handle(self, *args, **options): + self.set_logger_level(int(options['verbosity'])) + import_annotations = ImportVirgoGeneEggNOGAnnotation(options['annotation']) + import_annotations.load_all(test=options['test']) diff --git a/backend/metagenedb/apps/catalog/management/commands/tests/test_files/virgo_eggnog.tsv b/backend/metagenedb/apps/catalog/management/commands/tests/test_files/virgo_eggnog.tsv new file mode 100644 index 0000000..a79239e --- /dev/null +++ b/backend/metagenedb/apps/catalog/management/commands/tests/test_files/virgo_eggnog.tsv @@ -0,0 +1,2 @@ +Cluster_566081 V1 RPSI map03010 J 30S ribosomal protein S9 COG1234 +Cluster_308979 V2 TRUA J Formation of pseudouridine at positions 38, 39 and 40 in the anticodon stem and loop of transfer RNAs (By similarity) COG5678 diff --git a/backend/metagenedb/apps/catalog/management/commands/tests/test_import_virgo_eggnog.py b/backend/metagenedb/apps/catalog/management/commands/tests/test_import_virgo_eggnog.py new file mode 100644 index 0000000..898f5ea --- /dev/null +++ b/backend/metagenedb/apps/catalog/management/commands/tests/test_import_virgo_eggnog.py @@ -0,0 +1,46 @@ +import os + +from rest_framework.test import APITestCase + +from metagenedb.apps.catalog.models import Gene +from metagenedb.apps.catalog.management.commands.import_virgo_eggnog import ImportVirgoGeneEggNOGAnnotation +from metagenedb.apps.catalog.factory import GeneFactory +from metagenedb.apps.catalog.factory.function import generate_fake_functions_db + + +class TestEndToEnd(APITestCase): + + @classmethod + def 
setUpTestData(cls): + generate_fake_functions_db() + GeneFactory.create(gene_id="v1") + GeneFactory.create(gene_id="v2") + + def test_end_to_end(self): + test_file = os.path.join(os.path.dirname(__file__), "./test_files/virgo_eggnog.tsv") + loader = ImportVirgoGeneEggNOGAnnotation(test_file) + expected_genes = { + 'v1': { + 'name': 'V1', + 'functions': { + 'eggnog': 'COG1234', + } + }, + 'v2': { + 'name': 'V2', + 'functions': { + 'eggnog': 'COG5678', + } + }, + } + loader.load_all() + created_genes = Gene.objects.all().prefetch_related('functions') + for created_gene in created_genes: + self.assertEqual(getattr(created_gene, 'name'), expected_genes[created_gene.gene_id]['name']) + # Check functions + self.assertTrue(created_gene.functions.all()) + for function in created_gene.functions.all(): + self.assertIn(function.source, ['kegg', 'eggnog']) + self.assertEqual( + function.function_id, expected_genes[created_gene.gene_id]['functions'][function.source] + ) diff --git a/backend/metagenedb/apps/catalog/management/commands/tests/test_import_virgo_kegg.py b/backend/metagenedb/apps/catalog/management/commands/tests/test_import_virgo_kegg.py index bb37e2b..16b9bdc 100644 --- a/backend/metagenedb/apps/catalog/management/commands/tests/test_import_virgo_kegg.py +++ b/backend/metagenedb/apps/catalog/management/commands/tests/test_import_virgo_kegg.py @@ -38,6 +38,7 @@ class TestEndToEnd(APITestCase): for created_gene in created_genes: self.assertEqual(getattr(created_gene, 'name'), expected_genes[created_gene.gene_id]['name']) # Check functions + self.assertTrue(created_gene.functions.all()) for function in created_gene.functions.all(): self.assertIn(function.source, ['kegg', 'eggnog']) self.assertEqual( -- GitLab From 657d1d9e9c4622a251a63b88a2f5caa33329d17d Mon Sep 17 00:00:00 2001 From: Kenzo-Hugo Hillion Date: Wed, 6 May 2020 18:21:07 +0200 Subject: [PATCH 16/21] do not erase kegg while linking eggnog for virgo --- .../management/commands/commons/handle_functions.py | 
7 ++++++- .../catalog/management/commands/import_virgo_eggnog.py | 2 ++ .../apps/catalog/management/commands/import_virgo_kegg.py | 2 ++ 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/backend/metagenedb/apps/catalog/management/commands/commons/handle_functions.py b/backend/metagenedb/apps/catalog/management/commands/commons/handle_functions.py index 86c7264..c715b7d 100644 --- a/backend/metagenedb/apps/catalog/management/commands/commons/handle_functions.py +++ b/backend/metagenedb/apps/catalog/management/commands/commons/handle_functions.py @@ -9,6 +9,7 @@ logger = logging.getLogger(__name__) class HandleFunctions: + FUN_SOURCE_TO_DELETE = ['kegg', 'eggnog'] # function sources whose links are deleted before relinking def _build_function_mapping(self, source): logger.info("Building local mapping for %s function...", source) @@ -67,11 +68,15 @@ class HandleFunctions: mapping.append(GeneFunction(gene=genes[gene_id], function=function)) return mapping + def _delete_previous_annotations(self, genes): + for function_source in self.FUN_SOURCE_TO_DELETE: + GeneFunction.objects.filter(gene__in=genes.values(), function__source=function_source).delete() + def link_genes_to_functions(self, functions): cleaned_functions = self._clean_functions(functions) genes = Gene.objects.in_bulk(cleaned_functions.keys(), field_name='gene_id') # Get all link with corresponding genes & Delete them - GeneFunction.objects.filter(gene__in=genes.values()).delete() + self._delete_previous_annotations(genes) # Generate table for bulk_create of function <-> gene and create it GeneFunction.objects.bulk_create( self._generate_gene_function_mapping(cleaned_functions, genes) diff --git a/backend/metagenedb/apps/catalog/management/commands/import_virgo_eggnog.py b/backend/metagenedb/apps/catalog/management/commands/import_virgo_eggnog.py index d18c7f6..1f3fc9f 100644 --- a/backend/metagenedb/apps/catalog/management/commands/import_virgo_eggnog.py +++ 
b/backend/metagenedb/apps/catalog/management/commands/import_virgo_eggnog.py @@ -19,6 +19,8 @@ class ImportVirgoGeneEggNOGAnnotation(BaseImportGenes, HandleFunctions): SOURCE = 'virgo' PARSER = VirgoEggNOGLineParser + FUN_SOURCE_TO_DELETE = ['eggnog'] + def _clean_functions(self, functions, unknown_val='unknown'): """ Get rid of functions that are not in the db or entitled unknown diff --git a/backend/metagenedb/apps/catalog/management/commands/import_virgo_kegg.py b/backend/metagenedb/apps/catalog/management/commands/import_virgo_kegg.py index ced1911..8ddb863 100644 --- a/backend/metagenedb/apps/catalog/management/commands/import_virgo_kegg.py +++ b/backend/metagenedb/apps/catalog/management/commands/import_virgo_kegg.py @@ -19,6 +19,8 @@ class ImportVirgoGeneKeggAnnotation(BaseImportGenes, HandleFunctions): SOURCE = 'virgo' PARSER = VirgoKEGGLineParser + FUN_SOURCE_TO_DELETE = ['kegg'] + def _clean_functions(self, functions, unknown_val='unknown'): """ Get rid of functions that are not in the db or entitled unknown -- GitLab From a6bc8b0f615c5314a886c13791aaa11f726ef43b Mon Sep 17 00:00:00 2001 From: Kenzo-Hugo Hillion Date: Mon, 11 May 2020 10:45:48 +0200 Subject: [PATCH 17/21] refactor taxonomy handling when importing genes --- .../commands/commons/handle_taxonomy.py | 38 +++++++++++++ .../commands/commons/test_handle_taxonomy.py | 43 +++++++++++++++ .../commands/import_igc_annotation.py | 34 +++--------- .../tests/test_import_igc_annotation.py | 53 +------------------ 4 files changed, 88 insertions(+), 80 deletions(-) create mode 100644 backend/metagenedb/apps/catalog/management/commands/commons/handle_taxonomy.py create mode 100644 backend/metagenedb/apps/catalog/management/commands/commons/test_handle_taxonomy.py diff --git a/backend/metagenedb/apps/catalog/management/commands/commons/handle_taxonomy.py b/backend/metagenedb/apps/catalog/management/commands/commons/handle_taxonomy.py new file mode 100644 index 0000000..5b1378e --- /dev/null +++ 
b/backend/metagenedb/apps/catalog/management/commands/commons/handle_taxonomy.py @@ -0,0 +1,38 @@ +import logging + +from metagenedb.apps.catalog.models import Taxonomy + +logging.basicConfig(format='[%(asctime)s] %(levelname)s:%(name)s:%(message)s') +logger = logging.getLogger(__name__) + + +class HandleTaxonomy: + + def _build_taxo_mapping(self, rank): + logger.info("Building local mapping for %s level...", rank) + instances = Taxonomy.objects.filter(rank=rank) + return {instance.name: instance for instance in instances} + + @property + def phylum_mapping(self): + if getattr(self, '_phylum_mapping', None) is None: + self._phylum_mapping = self._build_taxo_mapping("phylum") + return self._phylum_mapping + + @property + def genus_mapping(self): + if getattr(self, '_genus_mapping', None) is None: + self._genus_mapping = self._build_taxo_mapping("genus") + return self._genus_mapping + + @property + def species_mapping(self): + if getattr(self, '_species_mapping', None) is None: + self._species_mapping = self._build_taxo_mapping("species") + return self._species_mapping + + def _retrieve_taxonomy(self, name, rank='species', unknown_val='unknown'): + taxonomy_instance = None + if name != unknown_val: + taxonomy_instance = getattr(self, f"{rank}_mapping", {}).get(name, None) + return taxonomy_instance diff --git a/backend/metagenedb/apps/catalog/management/commands/commons/test_handle_taxonomy.py b/backend/metagenedb/apps/catalog/management/commands/commons/test_handle_taxonomy.py new file mode 100644 index 0000000..2d44a80 --- /dev/null +++ b/backend/metagenedb/apps/catalog/management/commands/commons/test_handle_taxonomy.py @@ -0,0 +1,43 @@ +from rest_framework.test import APITestCase + +from metagenedb.apps.catalog.factory import ( + TaxonomyFactory, +) + +from metagenedb.apps.catalog.management.commands.commons.handle_taxonomy import HandleTaxonomy + + +class TestRetrieveTaxonomy(APITestCase): + + @classmethod + def setUpTestData(cls): + cls.genus = 
TaxonomyFactory(rank='genus') + cls.phylum = TaxonomyFactory(rank='phylum') + + def setUp(self): + self.unknown = 'unknown' + self.handle_taxonomy = HandleTaxonomy() + + def test_genus_only(self): + tested_taxonomy = self.handle_taxonomy._retrieve_taxonomy( + self.genus.name, rank='genus', unknown_val=self.unknown + ) + self.assertEqual(tested_taxonomy.tax_id, self.genus.tax_id) + + def test_genus_not_in_db(self): + tested_taxonomy = self.handle_taxonomy._retrieve_taxonomy("Fake Name", rank="genus", unknown_val=self.unknown) + self.assertEqual(tested_taxonomy, None) + + def test_phylum_only(self): + tested_taxonomy = self.handle_taxonomy._retrieve_taxonomy( + self.phylum.name, rank="phylum", unknown_val=self.unknown + ) + self.assertEqual(tested_taxonomy.tax_id, self.phylum.tax_id) + + def test_phylum_not_in_db(self): + tested_taxonomy = self.handle_taxonomy._retrieve_taxonomy(self.unknown, "Fake Name") + self.assertEqual(tested_taxonomy, None) + + def test_both_unknown(self): + tested_taxonomy = self.handle_taxonomy._retrieve_taxonomy(self.unknown) + self.assertEqual(tested_taxonomy, None) diff --git a/backend/metagenedb/apps/catalog/management/commands/import_igc_annotation.py b/backend/metagenedb/apps/catalog/management/commands/import_igc_annotation.py index 557eda1..e03dbbd 100644 --- a/backend/metagenedb/apps/catalog/management/commands/import_igc_annotation.py +++ b/backend/metagenedb/apps/catalog/management/commands/import_igc_annotation.py @@ -4,15 +4,15 @@ from django.core.management.base import BaseCommand from slugify import slugify from metagenedb.apps.catalog.management.commands.commons.handle_functions import HandleFunctions +from metagenedb.apps.catalog.management.commands.commons.handle_taxonomy import HandleTaxonomy from metagenedb.apps.catalog.management.commands.commons.import_genes import BaseImportGenes -from metagenedb.apps.catalog.models import Taxonomy from metagenedb.common.utils.parsers import IGCLineParser 
logging.basicConfig(format='[%(asctime)s] %(levelname)s:%(name)s:%(message)s') logger = logging.getLogger(__name__) -class ImportIGCGenes(BaseImportGenes, HandleFunctions): +class ImportIGCGenes(BaseImportGenes, HandleFunctions, HandleTaxonomy): PHYLUM_COL = 'taxo_phylum' GENUS_COL = 'taxo_genus' SELECTED_KEYS = ['gene_id', 'length', 'kegg_ko', 'eggnog', PHYLUM_COL, GENUS_COL] @@ -27,35 +27,13 @@ class ImportIGCGenes(BaseImportGenes, HandleFunctions): self.skip_tax = skip_tax self.skip_functions = skip_functions - def _build_taxo_mapping(self, rank): - logger.info("Building local mapping for %s level...", rank) - instances = Taxonomy.objects.filter(rank=rank) - return {instance.name: instance for instance in instances} - - @property - def phylum_mapping(self): - if getattr(self, '_phylum_mapping', None) is None: - self._phylum_mapping = self._build_taxo_mapping("phylum") - return self._phylum_mapping - - @property - def genus_mapping(self): - if getattr(self, '_genus_mapping', None) is None: - self._genus_mapping = self._build_taxo_mapping("genus") - return self._genus_mapping - - def _retrieve_taxonomy(self, genus_name, phylum_name, unknown_val='unknown'): - taxonomy_instance = None - if genus_name != unknown_val: - taxonomy_instance = self.genus_mapping.get(genus_name, None) - if taxonomy_instance is None and phylum_name != unknown_val: - taxonomy_instance = self.phylum_mapping.get(phylum_name, None) - return taxonomy_instance - def _format_for_model(self, igc_dict): gene_dict = super()._format_for_model(igc_dict) if not self.skip_tax: - gene_dict['taxonomy'] = self._retrieve_taxonomy(igc_dict.get('taxo_genus'), igc_dict.get('taxo_phylum')) + taxonomy = self._retrieve_taxonomy(igc_dict.get('taxo_genus'), rank="genus") + if taxonomy is None: + taxonomy = self._retrieve_taxonomy(igc_dict.get('taxo_phylum'), rank="phylum") + gene_dict['taxonomy'] = taxonomy return gene_dict def _handle_chunk(self, chunk_genes): diff --git 
a/backend/metagenedb/apps/catalog/management/commands/tests/test_import_igc_annotation.py b/backend/metagenedb/apps/catalog/management/commands/tests/test_import_igc_annotation.py index 981db09..a8538b3 100644 --- a/backend/metagenedb/apps/catalog/management/commands/tests/test_import_igc_annotation.py +++ b/backend/metagenedb/apps/catalog/management/commands/tests/test_import_igc_annotation.py @@ -1,63 +1,12 @@ import os -from unittest import TestCase - -import mock from rest_framework.test import APITestCase from metagenedb.apps.catalog.models import Gene from metagenedb.apps.catalog.management.commands.import_igc_annotation import ImportIGCGenes -from metagenedb.apps.catalog.factory import ( - TaxonomyFactory, -) from metagenedb.apps.catalog.factory.function import generate_fake_functions_db from metagenedb.apps.catalog.factory.taxonomy import generate_simple_db -class BaseTestImportIGCGenes(TestCase): - - def setUp(self): - function_to_mock = 'metagenedb.apps.catalog.management.commands.commons.import_genes.file_len' - with mock.patch(function_to_mock) as MockFileLen: - MockFileLen.return_value = 10 - self.import_igc_genes = ImportIGCGenes('test') - - -class TestRetrieveTaxonomy(APITestCase, BaseTestImportIGCGenes): - - @classmethod - def setUpTestData(cls): - cls.genus = TaxonomyFactory(rank='genus') - cls.phylum = TaxonomyFactory(rank='phylum') - - def setUp(self): - self.unknown = 'unknown' - super().setUp() - - def test_genus_only(self): - tested_taxonomy = self.import_igc_genes._retrieve_taxonomy(self.genus.name, self.unknown) - self.assertEqual(tested_taxonomy.tax_id, self.genus.tax_id) - - def test_genus_not_in_db(self): - tested_taxonomy = self.import_igc_genes._retrieve_taxonomy("Fake Name", self.unknown) - self.assertEqual(tested_taxonomy, None) - - def test_phylum_only(self): - tested_taxonomy = self.import_igc_genes._retrieve_taxonomy(self.unknown, self.phylum.name) - self.assertEqual(tested_taxonomy.tax_id, self.phylum.tax_id) - - def 
test_phylum_not_in_db(self): - tested_taxonomy = self.import_igc_genes._retrieve_taxonomy(self.unknown, "Fake Name") - self.assertEqual(tested_taxonomy, None) - - def test_genus_phylum(self): - tested_taxonomy = self.import_igc_genes._retrieve_taxonomy(self.genus.name, self.phylum.name) - self.assertEqual(tested_taxonomy.tax_id, self.genus.tax_id) - - def test_both_unknown(self): - tested_taxonomy = self.import_igc_genes._retrieve_taxonomy(self.unknown, self.unknown) - self.assertEqual(tested_taxonomy, None) - - class TestEndToEnd(APITestCase): @classmethod @@ -83,7 +32,7 @@ class TestEndToEnd(APITestCase): 'source': 'igc', 'length': 456, 'name': 'Gene_2', - 'tax_id': '1239', + 'tax_id': '1239', # Genus annotation Veillonella not in test db, but phylum yes 'functions': { 'kegg': 'K67890', 'eggnog': 'COG5678' -- GitLab From 55f6ce950190cc8b9ba6e98510a8b83fa6510d88 Mon Sep 17 00:00:00 2001 From: Kenzo-Hugo Hillion Date: Mon, 11 May 2020 15:13:24 +0200 Subject: [PATCH 18/21] add taxonomy annotations for VIRGO genes --- .../commands/commons/handle_taxonomy.py | 13 ++++ .../commands/commons/import_genes.py | 8 +-- .../commands/commons/test_handle_taxonomy.py | 9 +++ .../commands/import_igc_annotation.py | 2 +- .../commands/import_virgo_eggnog.py | 6 +- .../management/commands/import_virgo_kegg.py | 6 +- .../commands/import_virgo_taxonomy.py | 72 +++++++++++++++++++ .../tests/test_files/virgo_taxonomy.tsv | 2 + .../tests/test_import_virgo_taxonomy.py | 36 ++++++++++ .../common/utils/parsers/__init__.py | 4 +- .../common/utils/parsers/test_virgo.py | 27 ++++++- .../metagenedb/common/utils/parsers/virgo.py | 28 +++++++- 12 files changed, 199 insertions(+), 14 deletions(-) create mode 100644 backend/metagenedb/apps/catalog/management/commands/import_virgo_taxonomy.py create mode 100644 backend/metagenedb/apps/catalog/management/commands/tests/test_files/virgo_taxonomy.tsv create mode 100644 
backend/metagenedb/apps/catalog/management/commands/tests/test_import_virgo_taxonomy.py diff --git a/backend/metagenedb/apps/catalog/management/commands/commons/handle_taxonomy.py b/backend/metagenedb/apps/catalog/management/commands/commons/handle_taxonomy.py index 5b1378e..dbb44d2 100644 --- a/backend/metagenedb/apps/catalog/management/commands/commons/handle_taxonomy.py +++ b/backend/metagenedb/apps/catalog/management/commands/commons/handle_taxonomy.py @@ -7,6 +7,7 @@ logger = logging.getLogger(__name__) class HandleTaxonomy: + MANUAL_TAXO_MAPPING = {} def _build_taxo_mapping(self, rank): logger.info("Building local mapping for %s level...", rank) @@ -31,6 +32,18 @@ class HandleTaxonomy: self._species_mapping = self._build_taxo_mapping("species") return self._species_mapping + def _build_manual_mapping(self): + mapping = {} + for key, tax_id in self.MANUAL_TAXO_MAPPING.items(): + mapping[key] = Taxonomy.objects.get(tax_id=tax_id) + return mapping + + @property + def manual_mapping(self): + if getattr(self, '_manual_mapping', None) is None: + self._manual_mapping = self._build_manual_mapping() + return self._manual_mapping + def _retrieve_taxonomy(self, name, rank='species', unknown_val='unknown'): taxonomy_instance = None if name != unknown_val: diff --git a/backend/metagenedb/apps/catalog/management/commands/commons/import_genes.py b/backend/metagenedb/apps/catalog/management/commands/commons/import_genes.py index bf648f5..d332807 100644 --- a/backend/metagenedb/apps/catalog/management/commands/commons/import_genes.py +++ b/backend/metagenedb/apps/catalog/management/commands/commons/import_genes.py @@ -34,11 +34,11 @@ class BaseImportGenes(object): selected_dict = {k: v for k, v in all_dict.items() if k in self.SELECTED_KEYS} return selected_dict - def _format_for_model(self, igc_dict): + def _format_for_model(self, ori_gene_dict): gene_dict = {} - gene_dict['gene_id'] = slugify(igc_dict['gene_id']) - gene_dict['name'] = igc_dict['gene_id'] - 
gene_dict['length'] = igc_dict['length'] + gene_dict['gene_id'] = slugify(ori_gene_dict['gene_id']) + gene_dict['name'] = ori_gene_dict['gene_id'] + gene_dict['length'] = ori_gene_dict['length'] gene_dict['source'] = self.SOURCE return gene_dict diff --git a/backend/metagenedb/apps/catalog/management/commands/commons/test_handle_taxonomy.py b/backend/metagenedb/apps/catalog/management/commands/commons/test_handle_taxonomy.py index 2d44a80..ba7dcc2 100644 --- a/backend/metagenedb/apps/catalog/management/commands/commons/test_handle_taxonomy.py +++ b/backend/metagenedb/apps/catalog/management/commands/commons/test_handle_taxonomy.py @@ -41,3 +41,12 @@ class TestRetrieveTaxonomy(APITestCase): def test_both_unknown(self): tested_taxonomy = self.handle_taxonomy._retrieve_taxonomy(self.unknown) self.assertEqual(tested_taxonomy, None) + + def test_build_manual_mapping(self): + self.handle_taxonomy.MANUAL_TAXO_MAPPING = { + 'test_manual': self.genus.tax_id + } + tested_taxonomy = self.handle_taxonomy._retrieve_taxonomy( + 'test_manual', rank='manual', unknown_val=self.unknown + ) + self.assertEqual(tested_taxonomy.tax_id, self.genus.tax_id) diff --git a/backend/metagenedb/apps/catalog/management/commands/import_igc_annotation.py b/backend/metagenedb/apps/catalog/management/commands/import_igc_annotation.py index e03dbbd..351bc75 100644 --- a/backend/metagenedb/apps/catalog/management/commands/import_igc_annotation.py +++ b/backend/metagenedb/apps/catalog/management/commands/import_igc_annotation.py @@ -17,7 +17,7 @@ class ImportIGCGenes(BaseImportGenes, HandleFunctions, HandleTaxonomy): GENUS_COL = 'taxo_genus' SELECTED_KEYS = ['gene_id', 'length', 'kegg_ko', 'eggnog', PHYLUM_COL, GENUS_COL] IMPORT_TYPE = "IGC genes" # For logs - UPDATED_FIELDS = ['length', 'name', 'source'] + UPDATED_FIELDS = ['length', 'name', 'source', 'taxonomy'] SOURCE = 'igc' PARSER = IGCLineParser diff --git a/backend/metagenedb/apps/catalog/management/commands/import_virgo_eggnog.py 
b/backend/metagenedb/apps/catalog/management/commands/import_virgo_eggnog.py index 1f3fc9f..388e488 100644 --- a/backend/metagenedb/apps/catalog/management/commands/import_virgo_eggnog.py +++ b/backend/metagenedb/apps/catalog/management/commands/import_virgo_eggnog.py @@ -45,13 +45,13 @@ class ImportVirgoGeneEggNOGAnnotation(BaseImportGenes, HandleFunctions): } return functions - def _format_for_model(self, igc_dict): + def _format_for_model(self, ori_gene_dict): """ @TODO remove in the future and makes function from parent class more modulable """ gene_dict = {} - gene_dict['gene_id'] = slugify(igc_dict['gene_id']) - gene_dict['name'] = igc_dict['gene_id'] + gene_dict['gene_id'] = slugify(ori_gene_dict['gene_id']) + gene_dict['name'] = ori_gene_dict['gene_id'] gene_dict['source'] = self.SOURCE return gene_dict diff --git a/backend/metagenedb/apps/catalog/management/commands/import_virgo_kegg.py b/backend/metagenedb/apps/catalog/management/commands/import_virgo_kegg.py index 8ddb863..0b621a9 100644 --- a/backend/metagenedb/apps/catalog/management/commands/import_virgo_kegg.py +++ b/backend/metagenedb/apps/catalog/management/commands/import_virgo_kegg.py @@ -45,13 +45,13 @@ class ImportVirgoGeneKeggAnnotation(BaseImportGenes, HandleFunctions): } return functions - def _format_for_model(self, igc_dict): + def _format_for_model(self, ori_gene_dict): """ @TODO remove in the future and makes function from parent class more modulable """ gene_dict = {} - gene_dict['gene_id'] = slugify(igc_dict['gene_id']) - gene_dict['name'] = igc_dict['gene_id'] + gene_dict['gene_id'] = slugify(ori_gene_dict['gene_id']) + gene_dict['name'] = ori_gene_dict['gene_id'] gene_dict['source'] = self.SOURCE return gene_dict diff --git a/backend/metagenedb/apps/catalog/management/commands/import_virgo_taxonomy.py b/backend/metagenedb/apps/catalog/management/commands/import_virgo_taxonomy.py new file mode 100644 index 0000000..8670334 --- /dev/null +++ 
b/backend/metagenedb/apps/catalog/management/commands/import_virgo_taxonomy.py @@ -0,0 +1,72 @@ +import logging + +from django.core.management.base import BaseCommand +from slugify import slugify + +from metagenedb.apps.catalog.management.commands.commons.handle_taxonomy import HandleTaxonomy +from metagenedb.apps.catalog.management.commands.commons.import_genes import BaseImportGenes +from metagenedb.common.utils.parsers import VirgoTaxonomyLineParser + +logging.basicConfig(format='[%(asctime)s] %(levelname)s:%(name)s:%(message)s') +logger = logging.getLogger(__name__) + + +class ImportVirgoGeneTaxonomyAnnotation(BaseImportGenes, HandleTaxonomy): + SELECTED_KEYS = ['gene_id', 'taxonomy'] + IMPORT_TYPE = "Virgo taxonomy annotations" # For logs + UPDATED_FIELDS = ['name', 'taxonomy'] + SOURCE = 'virgo' + PARSER = VirgoTaxonomyLineParser + MANUAL_TAXO_MAPPING = { + 'BVAB1': '699240', + 'Clostridiales Family': '186802', + 'Chlamydophila psittaci': '83554' + } + + def _format_for_model(self, ori_gene_dict): + """ + @TODO remove in the future and makes function from parent class more modulable + """ + gene_dict = {} + gene_dict['gene_id'] = slugify(ori_gene_dict['gene_id']) + gene_dict['name'] = ori_gene_dict['gene_id'] + gene_dict['source'] = self.SOURCE + taxonomy_term = ori_gene_dict.get('taxonomy').replace('_', ' ') + taxonomy = self._retrieve_taxonomy(taxonomy_term, rank="species") + if taxonomy is None: + # Try to at least retrieve the genus from the first part of the taxonomy + taxonomy = self._retrieve_taxonomy(taxonomy_term.split(' ')[0], rank="genus") + if taxonomy is None: + # Otherwise try the whole taxonomy term as a genus name + taxonomy = self._retrieve_taxonomy(taxonomy_term, rank="genus") + # @TODO need to find a way of handling other cases + if taxonomy is None: + # Use manually created mapping dict + taxonomy = self._retrieve_taxonomy(taxonomy_term, rank="manual") + if taxonomy is None: + self.skipped_genes += 1 + logger.warning("Could not retrieve %s for %s",
ori_gene_dict.get('taxonomy'), ori_gene_dict['gene_id']) + gene_dict['taxonomy'] = taxonomy + return gene_dict + + +class Command(BaseCommand): + help = 'Create or update all Taxonomy annotations for Virgo genes (from `1.taxon.tbl.txt` file).' + + def add_arguments(self, parser): + parser.add_argument( + 'annotation', + help='1.taxon.tbl.txt file from Virgo. Genes need to exist in DB for this script to work.' + ) + parser.add_argument('--test', action='store_true', help='Run only on first 10000 entries.') + + def set_logger_level(self, verbosity): + if verbosity > 2: + logger.setLevel(logging.DEBUG) + elif verbosity > 1: + logger.setLevel(logging.INFO) + + def handle(self, *args, **options): + self.set_logger_level(int(options['verbosity'])) + import_annotations = ImportVirgoGeneTaxonomyAnnotation(options['annotation']) + import_annotations.load_all(test=options['test']) diff --git a/backend/metagenedb/apps/catalog/management/commands/tests/test_files/virgo_taxonomy.tsv b/backend/metagenedb/apps/catalog/management/commands/tests/test_files/virgo_taxonomy.tsv new file mode 100644 index 0000000..0f5d7a3 --- /dev/null +++ b/backend/metagenedb/apps/catalog/management/commands/tests/test_files/virgo_taxonomy.tsv @@ -0,0 +1,2 @@ +Cluster_566081 V1 Escherichia_coli 396 +Cluster_308979 V2 Lactobacillus_iners 783 diff --git a/backend/metagenedb/apps/catalog/management/commands/tests/test_import_virgo_taxonomy.py b/backend/metagenedb/apps/catalog/management/commands/tests/test_import_virgo_taxonomy.py new file mode 100644 index 0000000..830b537 --- /dev/null +++ b/backend/metagenedb/apps/catalog/management/commands/tests/test_import_virgo_taxonomy.py @@ -0,0 +1,36 @@ +import os + +from rest_framework.test import APITestCase + +from metagenedb.apps.catalog.models import Gene +from metagenedb.apps.catalog.management.commands.import_virgo_taxonomy import ImportVirgoGeneTaxonomyAnnotation +from metagenedb.apps.catalog.factory import GeneFactory +from 
metagenedb.apps.catalog.factory.taxonomy import generate_simple_db + + +class TestEndToEnd(APITestCase): + + @classmethod + def setUpTestData(cls): + generate_simple_db() + for gene_id in ['v1', 'v2']: + GeneFactory.create(gene_id=gene_id) + + def test_end_to_end(self): + test_file = os.path.join(os.path.dirname(__file__), "./test_files/virgo_taxonomy.tsv") + loader = ImportVirgoGeneTaxonomyAnnotation(test_file) + expected_genes = { + 'v1': { + 'name': 'V1', + 'tax_id': '562', + }, + 'v2': { + 'name': 'V2', + 'tax_id': '1578', + } + } + loader.load_all() + created_genes = Gene.objects.all().prefetch_related('functions') + for created_gene in created_genes: + self.assertEqual(getattr(created_gene, 'name'), expected_genes[created_gene.gene_id]['name']) + self.assertEqual(created_gene.taxonomy.tax_id, expected_genes[created_gene.gene_id]['tax_id']) diff --git a/backend/metagenedb/common/utils/parsers/__init__.py b/backend/metagenedb/common/utils/parsers/__init__.py index 855edc3..a425988 100644 --- a/backend/metagenedb/common/utils/parsers/__init__.py +++ b/backend/metagenedb/common/utils/parsers/__init__.py @@ -2,4 +2,6 @@ from .eggnog import EggNOGAnnotationLineParser # noqa from .igc import IGCLineParser # noqa from .kegg import KEGGLineParser # noqa from .ncbi_taxonomy import NCBITaxonomyLineParser # noqa -from .virgo import VirgoGeneLengthLineParser, VirgoKEGGLineParser, VirgoEggNOGLineParser # noqa +from .virgo import ( # noqa + VirgoGeneLengthLineParser, VirgoKEGGLineParser, VirgoEggNOGLineParser, VirgoTaxonomyLineParser +) diff --git a/backend/metagenedb/common/utils/parsers/test_virgo.py b/backend/metagenedb/common/utils/parsers/test_virgo.py index 293466f..31b1280 100644 --- a/backend/metagenedb/common/utils/parsers/test_virgo.py +++ b/backend/metagenedb/common/utils/parsers/test_virgo.py @@ -1,7 +1,7 @@ from unittest import TestCase from metagenedb.common.utils.parsers import ( - VirgoGeneLengthLineParser, VirgoKEGGLineParser, VirgoEggNOGLineParser + 
VirgoGeneLengthLineParser, VirgoKEGGLineParser, VirgoEggNOGLineParser, VirgoTaxonomyLineParser ) @@ -103,3 +103,28 @@ class TestVirgoEggNOGLineParser(TestCase): raw_line = "This is a wrong line format, with; information and tab" with self.assertRaises(Exception) as context: # noqa VirgoEggNOGLineParser.gene(raw_line) + + +class TestVirgoTaxonomyLineParser(TestCase): + + def test_gene(self): + raw_data = [ + 'cluster_id', + 'gene_id', + 'taxonomy', + '1234', + ] + raw_line = "\t".join(raw_data) + expected_dict = { + 'cluster_id': raw_data[0], + 'gene_id': raw_data[1], + 'taxonomy': raw_data[2], + 'length': raw_data[3], + } + test_dict = VirgoTaxonomyLineParser.gene(raw_line) + self.assertDictEqual(test_dict, expected_dict) + + def test_gene_wrong_format(self): + raw_line = "This is a wrong line format, with; information and tab" + with self.assertRaises(Exception) as context: # noqa + VirgoEggNOGLineParser.gene(raw_line) diff --git a/backend/metagenedb/common/utils/parsers/virgo.py b/backend/metagenedb/common/utils/parsers/virgo.py index 0f3038f..30de3e5 100644 --- a/backend/metagenedb/common/utils/parsers/virgo.py +++ b/backend/metagenedb/common/utils/parsers/virgo.py @@ -79,5 +79,31 @@ class VirgoEggNOGLineParser(object): 'eggnog': gene_info[6], } except Exception: - _LOGGER.error(f"Could not parse: {line.rstrip()}. Are you sure it comes from Virgo KEGG annotation file?") + _LOGGER.error(f"Could not parse: {line.rstrip()}. 
Are you sure it comes from Virgo EggNOG annotation file?") + raise + + +class VirgoTaxonomyLineParser(object): + + @staticmethod + def gene(line): + """ + Parse line from Virgo Taxonomy annotations to return organized dict (1.taxon.tbl.txt) + + Virgo taxonomy annotation columns: + 0: Cluster ID + 1: Gene ID + 2: Taxonomy annotation + 3: Gene length + """ + try: + gene_info = line.rstrip().split('\t') + return { + 'cluster_id': gene_info[0], + 'gene_id': gene_info[1], + 'taxonomy': gene_info[2], + 'length': gene_info[3], + } + except Exception: + _LOGGER.error(f"Could not parse: {line.rstrip()}. Are you sure it comes from Virgo taxonomy file?") raise -- GitLab From e90b0218352cd492c75c37702645f7b02121789c Mon Sep 17 00:00:00 2001 From: Kenzo-Hugo Hillion Date: Mon, 11 May 2020 15:45:34 +0200 Subject: [PATCH 19/21] use source on frontend --- backend/metagenedb/api/catalog/filters/gene.py | 2 +- .../api/catalog/qparams_validators/gene.py | 1 + backend/metagenedb/apps/catalog/serializers/gene.py | 2 +- frontend/src/views/GeneDetail.vue | 4 ++++ frontend/src/views/genes/genes.html | 11 ++++++++++- frontend/src/views/genes/genes.js | 13 ++++++++++++- 6 files changed, 29 insertions(+), 4 deletions(-) diff --git a/backend/metagenedb/api/catalog/filters/gene.py b/backend/metagenedb/api/catalog/filters/gene.py index 950cd03..bd9dbbf 100644 --- a/backend/metagenedb/api/catalog/filters/gene.py +++ b/backend/metagenedb/api/catalog/filters/gene.py @@ -33,4 +33,4 @@ class GeneFilter(filters.FilterSet): class Meta: model = Gene - fields = ['length', 'name'] + fields = ['length', 'name', 'source'] diff --git a/backend/metagenedb/api/catalog/qparams_validators/gene.py b/backend/metagenedb/api/catalog/qparams_validators/gene.py index 55f33d4..25d9ccb 100644 --- a/backend/metagenedb/api/catalog/qparams_validators/gene.py +++ b/backend/metagenedb/api/catalog/qparams_validators/gene.py @@ -13,3 +13,4 @@ class GeneQueryParams(PaginatedQueryParams): taxonomy_rank = fields.String() taxonomy_id =
fields.Integer() function = fields.String() + source = fields.String() diff --git a/backend/metagenedb/apps/catalog/serializers/gene.py b/backend/metagenedb/apps/catalog/serializers/gene.py index 1327cf4..2f034bd 100644 --- a/backend/metagenedb/apps/catalog/serializers/gene.py +++ b/backend/metagenedb/apps/catalog/serializers/gene.py @@ -69,7 +69,7 @@ class GeneSerializer(serializers.ModelSerializer): class Meta: model = Gene list_serializer_class = GeneListSerializer - fields = ('gene_id', 'name', 'length', 'functions', 'taxonomy', 'sequence') + fields = ('gene_id', 'name', 'length', 'functions', 'taxonomy', 'sequence', 'source') def _extract_many_to_many(self, validated_data, info): many_to_many = {} diff --git a/frontend/src/views/GeneDetail.vue b/frontend/src/views/GeneDetail.vue index 5a9a6f8..dcf3cdc 100644 --- a/frontend/src/views/GeneDetail.vue +++ b/frontend/src/views/GeneDetail.vue @@ -109,6 +109,10 @@ export default { title: 'Length (bp)', content: response.data.length, }, + { + title: 'Source', + content: response.data.source, + }, ]; this.sequence = '>' + response.data.gene_id + '\n' + response.data.sequence; if (response.data.functions.length > 0) { diff --git a/frontend/src/views/genes/genes.html b/frontend/src/views/genes/genes.html index bde5ee2..0849299 100644 --- a/frontend/src/views/genes/genes.html +++ b/frontend/src/views/genes/genes.html @@ -25,8 +25,16 @@ - + + + + + {{ props.item.eggnog }} + {{ props.item.source }} diff --git a/frontend/src/views/genes/genes.js b/frontend/src/views/genes/genes.js index 88298f3..4293b9b 100644 --- a/frontend/src/views/genes/genes.js +++ b/frontend/src/views/genes/genes.js @@ -11,6 +11,7 @@ export default { pagination: { rowsPerPage: 20, }, + geneSource: null, searchGeneName: null, taxonomyRank: null, functionID: null, @@ -32,13 +33,19 @@ export default { { text: 'Taxonomy', value: 'taxonomy', sortable: false }, { text: 'KEGG', value: 'kegg', sortable: false }, { text: 'EggNOG', value: 'eggnog', sortable: 
false }, + { text: 'Source', value: 'source', sortable: false }, ]; }, taxonomyRanks() { return [ - 'Phylum', 'Genus', + 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species' ]; }, + geneSources() { + return [ + 'IGC', 'Virgo' + ] + }, rowsPerPageItems() { return [this.page_size]; }, @@ -57,6 +64,9 @@ export default { if (this.functionID){ qParams.function = this.functionID } + if (this.geneSource) { + qParams.source = this.source.toLowerCase() + } return qParams; }, maxGeneLength() { @@ -110,6 +120,7 @@ export default { emptyFilters() { this.taxonomyRank = null; this.functionID = null; + this.geneSource = null; this.filterGeneLength = false; this.geneLengthFilterRange = [0, 2000]; }, -- GitLab From 95ed8a65fa65e7068d3c18a60456daca3b1ab651 Mon Sep 17 00:00:00 2001 From: Kenzo-Hugo Hillion Date: Mon, 11 May 2020 16:53:34 +0200 Subject: [PATCH 20/21] add dummy count number for paginator --- .gitignore | 3 +++ backend/metagenedb/common/django_default/pagination.py | 10 ++++++++++ 2 files changed, 13 insertions(+) diff --git a/.gitignore b/.gitignore index 5437400..b8e279c 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,9 @@ __pycache__/ # Backend static files backend/public +# Backend debugging folder for logs and profiling +debugging + # Frontend node_modules/ dist/ diff --git a/backend/metagenedb/common/django_default/pagination.py b/backend/metagenedb/common/django_default/pagination.py index e12bbd9..71d5d4d 100644 --- a/backend/metagenedb/common/django_default/pagination.py +++ b/backend/metagenedb/common/django_default/pagination.py @@ -1,6 +1,16 @@ +from django.core.paginator import Paginator +from django.utils.functional import cached_property from rest_framework.pagination import PageNumberPagination +class FastPaginator(Paginator): + + @cached_property + def count(self): + return 10000000 + + class DefaultPageNumberPagination(PageNumberPagination): page_size_query_param = 'page_size' max_page_size = 5000 + django_paginator_class = FastPaginator -- 
GitLab From c529bad7dad8dfcdcdbecf3edd9ec9a9d722a804 Mon Sep 17 00:00:00 2001 From: Kenzo-Hugo Hillion Date: Mon, 11 May 2020 17:02:09 +0200 Subject: [PATCH 21/21] remove dummy count for paginator --- backend/metagenedb/common/django_default/pagination.py | 10 ---------- frontend/src/views/genes/genes.js | 2 +- 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/backend/metagenedb/common/django_default/pagination.py b/backend/metagenedb/common/django_default/pagination.py index 71d5d4d..e12bbd9 100644 --- a/backend/metagenedb/common/django_default/pagination.py +++ b/backend/metagenedb/common/django_default/pagination.py @@ -1,16 +1,6 @@ -from django.core.paginator import Paginator -from django.utils.functional import cached_property from rest_framework.pagination import PageNumberPagination -class FastPaginator(Paginator): - - @cached_property - def count(self): - return 10000000 - - class DefaultPageNumberPagination(PageNumberPagination): page_size_query_param = 'page_size' max_page_size = 5000 - django_paginator_class = FastPaginator diff --git a/frontend/src/views/genes/genes.js b/frontend/src/views/genes/genes.js index 4293b9b..3e41328 100644 --- a/frontend/src/views/genes/genes.js +++ b/frontend/src/views/genes/genes.js @@ -65,7 +65,7 @@ export default { qParams.function = this.functionID } if (this.geneSource) { - qParams.source = this.source.toLowerCase() + qParams.source = this.geneSource.toLowerCase() } return qParams; }, -- GitLab