diff --git a/.gitignore b/.gitignore index 543740071faae3ece06627c568d09b220f1c357f..b8e279c218d70f940cecd12f84f5abfcfabdfc4d 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,9 @@ __pycache__/ # Backend static files backend/public +# Backend debugging folder for logs and profiling +debugging + # Frontend node_modules/ dist/ diff --git a/backend/metagenedb/api/catalog/filters/gene.py b/backend/metagenedb/api/catalog/filters/gene.py index 950cd03f5cd118cb317acb64a5aa77425f05d41e..bd9dbbf19ea2cebe55e3a1b1795cc17cbf02ccbf 100644 --- a/backend/metagenedb/api/catalog/filters/gene.py +++ b/backend/metagenedb/api/catalog/filters/gene.py @@ -33,4 +33,4 @@ class GeneFilter(filters.FilterSet): class Meta: model = Gene - fields = ['length', 'name'] + fields = ['length', 'name', 'source'] diff --git a/backend/metagenedb/api/catalog/qparams_validators/gene.py b/backend/metagenedb/api/catalog/qparams_validators/gene.py index 55f33d4b2bb2b1c1a112bf52b7096b475c7d2324..25d9ccb945310669f0c9991d5504897aa12866d8 100644 --- a/backend/metagenedb/api/catalog/qparams_validators/gene.py +++ b/backend/metagenedb/api/catalog/qparams_validators/gene.py @@ -13,3 +13,4 @@ class GeneQueryParams(PaginatedQueryParams): taxonomy_rank = fields.String() taxonomy_id = fields.Integer() function = fields.String() + source = fields.String() diff --git a/backend/metagenedb/apps/catalog/factory/function.py b/backend/metagenedb/apps/catalog/factory/function.py index 1ab878bd9d42815cbf8ce6da17db64a18674804d..a8b077ebc828c9b5bd0894b8fa3b3bff182cdf07 100644 --- a/backend/metagenedb/apps/catalog/factory/function.py +++ b/backend/metagenedb/apps/catalog/factory/function.py @@ -33,3 +33,18 @@ class EggNOGFactory(BaseFunctionFactory): class KeggOrthologyFactory(BaseFunctionFactory): class Meta: model = models.KeggOrthology + + +def _create_fake_kegg_db(): + KeggOrthologyFactory.create(function_id="K12345", name="Kegg1") + KeggOrthologyFactory.create(function_id="K67890", name="Kegg2") + + +def 
class DbGenerator:
    """
    Create Taxonomy entries from rank trees, creating each tax_id only once
    """

    def __init__(self):
        # tax_ids already created through this generator, used to skip duplicates
        self.created_ids = set()

    def generate_db_from_tree(self, tree):
        """
        Create one Taxonomy entry per rank of the tree.

        `tree` maps a rank to a dict with the 'tax_id' and 'name' of the entry.
        Entries whose tax_id was already created by this generator are skipped.
        """
        for rank, description in tree.items():
            tax_id = description['tax_id']
            if tax_id in self.created_ids:
                continue
            TaxonomyFactory.create(tax_id=tax_id, name=description['name'], rank=rank)
            self.created_ids.add(tax_id)
class HandleFunctions:
    """
    Mixin linking genes to their KEGG and EggNOG functions.

    The `function_id -> Function` mapping of each source is read from the db
    the first time it is needed and then kept on the instance.
    """
    FUN_SOURCE_TO_DELETE = ['kegg', 'eggnog']  # links to get rid of every time

    def _build_function_mapping(self, source):
        """
        Return {function_id: Function} for every function stored with this source
        """
        logger.info("Building local mapping for %s function...", source)
        instances = Function.objects.filter(source=source)
        return {instance.function_id: instance for instance in instances}

    @property
    def eggnog_mapping(self):
        """
        EggNOG {function_id: Function} mapping, built on first access
        """
        if getattr(self, '_eggnog_mapping', None) is None:
            self._eggnog_mapping = self._build_function_mapping("eggnog")
        return self._eggnog_mapping

    @property
    def kegg_mapping(self):
        """
        KEGG {function_id: Function} mapping, built on first access
        """
        if getattr(self, '_kegg_mapping', None) is None:
            self._kegg_mapping = self._build_function_mapping("kegg")
        return self._kegg_mapping

    def _clean_functions(self, functions, unknown_val='unknown'):
        """
        Get rid of functions that are not in the db or entitled unknown

        :param functions: {gene_id: {'kegg': [function ids], 'eggnog': [function ids]}},
            a missing or empty source counts as "no function"
        :param unknown_val: function ID used in the annotations for "no function"
        :return: {gene_id: [Function, ...]} with KEGG entries before EggNOG ones,
            genes left without any known function are dropped
        """
        cleaned_functions = {}
        for gene_id, all_functions in functions.items():
            # The comparison to unknown_val comes first so that a mapping is only
            # loaded from the db once a real function ID shows up.
            new_functions = [
                self.kegg_mapping[kegg]
                for kegg in (all_functions.get('kegg') or [])
                if kegg != unknown_val and kegg in self.kegg_mapping
            ]
            new_functions += [
                self.eggnog_mapping[eggnog]
                for eggnog in (all_functions.get('eggnog') or [])
                if eggnog != unknown_val and eggnog in self.eggnog_mapping
            ]
            if new_functions:
                cleaned_functions[gene_id] = new_functions
        return cleaned_functions

    def _remove_functions(self, gene_dicts):
        """
        Pop the 'kegg_ko' and 'eggnog' entries out of every gene dict

        The gene dicts are modified in place. Raises KeyError when a gene dict
        lacks 'gene_id', 'kegg_ko' or 'eggnog'.

        :return: {slugified gene_id: {'kegg': ..., 'eggnog': ...}}
        """
        functions = {}
        for gene_dict in gene_dicts:
            functions[slugify(gene_dict['gene_id'])] = {
                'kegg': gene_dict.pop('kegg_ko'),
                'eggnog': gene_dict.pop('eggnog')
            }
        return functions

    def _generate_gene_function_mapping(self, functions, genes):
        """
        Generate a list of GeneFunction pair to create relation between them

        :param functions: {gene_id: [Function, ...]}
        :param genes: {gene_id: Gene} for the genes found in the db
        :return: list of unsaved GeneFunction instances
        """
        mapping = []
        for gene_id, function_list in functions.items():
            gene = genes.get(gene_id)
            if gene is None:
                # Only genes present in the db can be linked; skipping the others
                # avoids a KeyError that would abort the whole import.
                logger.warning("Gene %s not found in db, its functions are not linked.", gene_id)
                continue
            mapping.extend(GeneFunction(gene=gene, function=function) for function in function_list)
        return mapping

    def _delete_previous_annotations(self, genes):
        """
        Delete the links between the given genes and functions of FUN_SOURCE_TO_DELETE
        """
        GeneFunction.objects.filter(
            gene__in=genes.values(),
            function__source__in=self.FUN_SOURCE_TO_DELETE
        ).delete()

    def link_genes_to_functions(self, functions):
        """
        Replace the function links of the given genes by the ones from `functions`

        :param functions: {gene_id: {'kegg': [function ids], 'eggnog': [function ids]}}
        """
        cleaned_functions = self._clean_functions(functions)
        genes = Gene.objects.in_bulk(cleaned_functions.keys(), field_name='gene_id')
        # Get all link with corresponding genes & Delete them
        self._delete_previous_annotations(genes)
        # Generate table for bulk_create of function <-> gene and create it
        GeneFunction.objects.bulk_create(
            self._generate_gene_function_mapping(cleaned_functions, genes)
        )
getattr(self, f"{rank}_mapping", {}).get(name, None) + return taxonomy_instance diff --git a/backend/metagenedb/apps/catalog/management/commands/commons/import_gene_sequences.py b/backend/metagenedb/apps/catalog/management/commands/commons/import_gene_sequences.py new file mode 100644 index 0000000000000000000000000000000000000000..6fdc5b7ce5e51623cab35507cad8b55fd7f2b1a0 --- /dev/null +++ b/backend/metagenedb/apps/catalog/management/commands/commons/import_gene_sequences.py @@ -0,0 +1,58 @@ +import logging + +import pyfastx +from slugify import slugify + +from metagenedb.apps.catalog.models import Gene + +logging.basicConfig(format='[%(asctime)s] %(levelname)s:%(name)s:%(message)s') +logger = logging.getLogger(__name__) + + +class ImportGeneSequences(object): + CATALOG = "CAT_NAME" + + def __init__(self, sequence_file): + self.sequence_file = sequence_file + self._reset_counters() + + def _reset_counters(self): + self.processed_genes = 0 + self.updated_genes = 0 + self.skipped_genes = 0 + + def update_sequences(self, sequences): + genes = Gene.objects.filter(gene_id__in=sequences.keys()) + genes_retrieved = genes.count() + for gene in genes: + gene.sequence = sequences[gene.gene_id] + try: + Gene.objects.bulk_update(genes, ['sequence']) + self.updated_genes += genes_retrieved + self.skipped_genes += len(sequences) - genes_retrieved + except Exception: + logger.warning("Could not update genes... 
class BaseImportGenes(object):
    """
    Base class to create or update genes from an annotation file, chunk by chunk.

    Subclasses set PARSER (an object whose `gene(raw_line)` returns a dict),
    SOURCE and, if needed, SELECTED_KEYS / UPDATED_FIELDS, and may override
    `_format_for_model` and `_handle_chunk`.
    """
    IMPORT_TYPE = "gene"  # For logs
    SELECTED_KEYS = ['gene_id', 'length']
    UPDATED_FIELDS = ['length', 'name', 'source']
    SOURCE = 'undef'
    PARSER = None

    def __init__(self, annotation_file):
        self.annotation_file = annotation_file
        self.total_genes = file_len(annotation_file)
        self._reset_counters()

    def _reset_counters(self):
        """
        Set all the import counters back to 0
        """
        self.processed_genes = 0
        self.created_genes = 0
        self.updated_genes = 0
        self.skipped_genes = 0

    def _parse_gene(self, raw_line):
        """
        Parse a raw line with PARSER and keep only the SELECTED_KEYS entries
        """
        gene_parser = self.PARSER
        all_dict = gene_parser.gene(raw_line)
        return {key: value for key, value in all_dict.items() if key in self.SELECTED_KEYS}

    def _format_for_model(self, ori_gene_dict):
        """
        Build the dict of Gene fields from a parsed gene dict

        The slugified original gene_id becomes `gene_id`, the original one is
        kept as `name`.
        """
        return {
            'gene_id': slugify(ori_gene_dict['gene_id']),
            'name': ori_gene_dict['gene_id'],
            'length': ori_gene_dict['length'],
            'source': self.SOURCE,
        }

    def _update_genes(self, gene_instances, gene_dict):
        """
        Apply the values of gene_dict on the existing instances and bulk update them

        :param gene_instances: {gene_id: Gene} of genes already in the db
        :param gene_dict: {gene_id: {field: value}}
        """
        for gene_id, gene_instance in gene_instances.items():
            for key, value in gene_dict[gene_id].items():
                setattr(gene_instance, key, value)
        try:
            Gene.objects.bulk_update(
                list(gene_instances.values()),
                self.UPDATED_FIELDS
            )
            self.updated_genes += len(gene_instances)
        except Exception as exception:
            # Best effort: a failing chunk is counted as skipped, the import goes on
            logger.warning(exception)
            self.skipped_genes += len(gene_instances)

    def _create_genes(self, gene_list):
        """
        Bulk create genes from a list of {field: value} dicts
        """
        try:
            Gene.objects.bulk_create(
                [Gene(**item) for item in gene_list]
            )
            self.created_genes += len(gene_list)
        except Exception as exception:
            # Best effort: a failing chunk is counted as skipped, the import goes on
            logger.warning(exception)
            self.skipped_genes += len(gene_list)

    def create_or_update_genes(self, gene_dict):
        """
        Update the genes of gene_dict that exist in the db, create the others

        :param gene_dict: {gene_id: {field: value}}
        """
        update_instances = Gene.objects.in_bulk(gene_dict.keys(), field_name='gene_id')
        self._update_genes(update_instances, gene_dict)
        gene_ids_to_create = gene_dict.keys() - update_instances.keys()
        if gene_ids_to_create:
            self._create_genes([gene_dict[gene_id] for gene_id in gene_ids_to_create])

    def _handle_chunk(self, chunk_genes):
        """
        Override for all different sources

        :param chunk_genes: list of raw lines from the annotation file
        """
        gene_dict_list = [self._parse_gene(i) for i in chunk_genes]
        gene_clean_dict = {slugify(i['gene_id']): self._format_for_model(i) for i in gene_dict_list}
        self.create_or_update_genes(gene_clean_dict)

    def load_all(self, test=False, chunk_size=10000):
        """
        Read the annotation file by chunks of `chunk_size` lines and import them

        :param test: stop after the first chunk when True
        :param chunk_size: number of lines handled at once
        """
        logger.info("Starting %s import (creation or update) to DB", self.IMPORT_TYPE)
        with open(self.annotation_file, 'r') as file:
            while True:
                chunk_genes = list(islice(file, chunk_size))
                if not chunk_genes:
                    break
                self._handle_chunk(chunk_genes)
                # The last chunk is usually shorter than chunk_size: count real lines
                self.processed_genes += len(chunk_genes)
                logger.info("%s Genes processed so far...", self.processed_genes)
                if test is True:
                    break
        logger.info("[DONE] %s/%s Genes created.", self.created_genes, self.total_genes)
        logger.info("[DONE] %s/%s Genes updated.", self.updated_genes, self.total_genes)
        logger.info("[DONE] %s/%s Genes skipped.", self.skipped_genes, self.total_genes)
BaseTestHandleFunctions(TestCase): + + def setUp(self): + self.handle_functions = HandleFunctions() + + +class TestRemoveFunctions(BaseTestHandleFunctions): + + def test_remove_functions(self): + input_dicts = [{ + 'gene_id': 'Test_gene', + 'kegg_ko': ['K0001'], + 'eggnog': ['COG1', 'COG2'] + }] + expected_functions = { + 'test-gene': { + 'kegg': ['K0001'], + 'eggnog': ['COG1', 'COG2'] + } + } + tested_dict = self.handle_functions._remove_functions(input_dicts) + self.assertDictEqual(tested_dict, expected_functions) + + +class TestCleanFunctions(APITestCase, BaseTestHandleFunctions): + + @classmethod + def setUpTestData(cls): + cls.kegg = FunctionFactory(source='kegg') + cls.eggnog = FunctionFactory(source='eggnog') + + def test_clean_functions_kegg_only(self): + functions = { + 'gene-kegg': { + 'kegg': [self.kegg.function_id, 'KO12345'], + 'eggnog': ['unknown'] + }, + } + expected_functions = { + 'gene-kegg': [self.kegg] + } + self.assertDictEqual(self.handle_functions._clean_functions(functions), expected_functions) + + def test_clean_functions_eggnog_only(self): + functions = { + 'gene-kegg': { + 'kegg': ['unknown'], + 'eggnog': [self.eggnog.function_id, 'COG12345'] + }, + } + expected_functions = { + 'gene-kegg': [self.eggnog] + } + self.assertDictEqual(self.handle_functions._clean_functions(functions), expected_functions) + + def test_clean_functions_kegg_eggnog(self): + functions = { + 'gene-kegg': { + 'kegg': [self.kegg.function_id, 'KO12345'], + 'eggnog': [self.eggnog.function_id, 'COG12345'] + }, + } + expected_functions = { + 'gene-kegg': [self.kegg, self.eggnog] + } + self.assertDictEqual(self.handle_functions._clean_functions(functions), expected_functions) + + def test_clean_functions_both_unknown(self): + functions = { + 'gene-kegg': { + 'kegg': ['unknown'], + 'eggnog': ['unknown'] + }, + } + expected_functions = {} + self.assertDictEqual(self.handle_functions._clean_functions(functions), expected_functions) + + +class 
class TestRetrieveTaxonomy(APITestCase):
    """
    Check HandleTaxonomy._retrieve_taxonomy on a db holding one genus and one phylum
    """

    @classmethod
    def setUpTestData(cls):
        cls.genus = TaxonomyFactory(rank='genus')
        cls.phylum = TaxonomyFactory(rank='phylum')

    def setUp(self):
        self.unknown = 'unknown'
        self.handle_taxonomy = HandleTaxonomy()

    def test_genus_only(self):
        tested_taxonomy = self.handle_taxonomy._retrieve_taxonomy(
            self.genus.name, rank='genus', unknown_val=self.unknown
        )
        self.assertEqual(tested_taxonomy.tax_id, self.genus.tax_id)

    def test_genus_not_in_db(self):
        tested_taxonomy = self.handle_taxonomy._retrieve_taxonomy("Fake Name", rank="genus", unknown_val=self.unknown)
        self.assertIsNone(tested_taxonomy)

    def test_phylum_only(self):
        tested_taxonomy = self.handle_taxonomy._retrieve_taxonomy(
            self.phylum.name, rank="phylum", unknown_val=self.unknown
        )
        self.assertEqual(tested_taxonomy.tax_id, self.phylum.tax_id)

    def test_phylum_not_in_db(self):
        # Look up a name missing from the phylum mapping, not the unknown placeholder
        tested_taxonomy = self.handle_taxonomy._retrieve_taxonomy(
            "Fake Name", rank="phylum", unknown_val=self.unknown
        )
        self.assertIsNone(tested_taxonomy)

    def test_rank_without_mapping(self):
        # A rank with no `<rank>_mapping` attribute falls back to an empty mapping
        tested_taxonomy = self.handle_taxonomy._retrieve_taxonomy(
            "Fake Name", rank="Fake Name", unknown_val=self.unknown
        )
        self.assertIsNone(tested_taxonomy)

    def test_both_unknown(self):
        # The unknown placeholder is never looked up, whatever the rank
        tested_taxonomy = self.handle_taxonomy._retrieve_taxonomy(self.unknown)
        self.assertIsNone(tested_taxonomy)

    def test_build_manual_mapping(self):
        self.handle_taxonomy.MANUAL_TAXO_MAPPING = {
            'test_manual': self.genus.tax_id
        }
        tested_taxonomy = self.handle_taxonomy._retrieve_taxonomy(
            'test_manual', rank='manual', unknown_val=self.unknown
        )
        self.assertEqual(tested_taxonomy.tax_id, self.genus.tax_id)
self.import_igc_seq.update_sequences(sequences) + self.assertEqual(Gene.objects.get(gene_id=self.gene.gene_id).sequence, seq) + + +class TestEndToEnd(APITestCase): + + @classmethod + def setUpTestData(cls): + GeneFactory.create(gene_id="gene1") + GeneFactory.create(gene_id="gene2") + + def test_end_to_end(self): + test_file = os.path.join(os.path.dirname(__file__), "./test_files/genes.fa") + loader = ImportGeneSequences(test_file) + expected_genes = { + 'gene1': { + 'sequence': 'ACGT' + }, + 'gene2': { + 'sequence': 'ATCG' + }, + } + loader.load_all() + created_genes = Gene.objects.all() + for created_gene in created_genes: + self.assertEqual(getattr(created_gene, 'sequence'), expected_genes[created_gene.gene_id]['sequence']) diff --git a/backend/metagenedb/apps/catalog/management/commands/commons/test_import_genes.py b/backend/metagenedb/apps/catalog/management/commands/commons/test_import_genes.py new file mode 100644 index 0000000000000000000000000000000000000000..751ac12afc448424bc0687a9fd12eeffb76db421 --- /dev/null +++ b/backend/metagenedb/apps/catalog/management/commands/commons/test_import_genes.py @@ -0,0 +1,87 @@ +from unittest import TestCase + +import mock +from rest_framework.test import APITestCase + +from metagenedb.apps.catalog.models import Gene +from metagenedb.apps.catalog.management.commands.commons.import_genes import BaseImportGenes +from metagenedb.apps.catalog.factory import ( + GeneFactory, +) + + +class ParserTest: + """Simple parser for test purposes""" + + @staticmethod + def gene(line): + gene_info = line.rstrip().split('\t') + return { + 'gene_id': gene_info[0], + 'length': gene_info[1], + } + + +class BaseTestImportGenes(TestCase): + + def setUp(self): + function_to_mock = 'metagenedb.apps.catalog.management.commands.commons.import_genes.file_len' + with mock.patch(function_to_mock) as MockFileLen: + MockFileLen.return_value = 10 + self.import_genes = BaseImportGenes('test') + self.import_genes.PARSER = ParserTest + + +class 
TestParseGene(BaseTestImportGenes): + + def setUp(self): + raw_data = [ + 'gene_ID', + 'length', + ] + self.raw_line = "\t".join(raw_data) + super().setUp() + + def test_parse_gene_default_selected_keys(self): + """ + This test should failed and need to be updated when SELECTED_KEYS are changed + """ + expected_dict = { + 'gene_id': 'gene_ID', + 'length': 'length', + } + tested_dict = self.import_genes._parse_gene(self.raw_line) + self.assertDictEqual(tested_dict, expected_dict) + + +class TestCreateOrUpdateGenes(APITestCase, BaseTestImportGenes): + + @classmethod + def setUpTestData(cls): + cls.gene = GeneFactory() + + def test_create_1_update_1(self): + gene_to_update = { + 'gene_id': self.gene.gene_id, + 'name': 'Updated Gene', + 'length': 2235, + } + gene_to_create = { + 'gene_id': 'gene-create-123', + 'name': 'Created Gene', + 'length': 5629, + } + gene_dict = { + gene_to_update['gene_id']: gene_to_update, + gene_to_create['gene_id']: gene_to_create + } + self.import_genes.create_or_update_genes(gene_dict) + self.assertEqual(Gene.objects.all().count(), 2) + # Check updated gene + updated_gene = Gene.objects.get(gene_id=gene_to_update['gene_id']) + for key, value in gene_to_update.items(): + self.assertEqual(getattr(updated_gene, key), value) + # Check created gene + created_gene = Gene.objects.get(gene_id=gene_to_create['gene_id']) + for key, value in gene_to_create.items(): + self.assertEqual(getattr(created_gene, key), value) diff --git a/backend/metagenedb/apps/catalog/management/commands/import_igc_annotation.py b/backend/metagenedb/apps/catalog/management/commands/import_igc_annotation.py index 20c38a70ea58820d0ba2e9c24322c111890a891f..351bc75f692e72b0391bdd0e517b136a5e33e602 100644 --- a/backend/metagenedb/apps/catalog/management/commands/import_igc_annotation.py +++ b/backend/metagenedb/apps/catalog/management/commands/import_igc_annotation.py @@ -1,202 +1,52 @@ import logging -from itertools import islice from django.core.management.base import 
BaseCommand from slugify import slugify -from metagenedb.apps.catalog.models import Function, Gene, GeneFunction, Taxonomy -from metagenedb.common.utils.chunks import file_len +from metagenedb.apps.catalog.management.commands.commons.handle_functions import HandleFunctions +from metagenedb.apps.catalog.management.commands.commons.handle_taxonomy import HandleTaxonomy +from metagenedb.apps.catalog.management.commands.commons.import_genes import BaseImportGenes from metagenedb.common.utils.parsers import IGCLineParser logging.basicConfig(format='[%(asctime)s] %(levelname)s:%(name)s:%(message)s') logger = logging.getLogger(__name__) -class ImportIGCGenes(object): - +class ImportIGCGenes(BaseImportGenes, HandleFunctions, HandleTaxonomy): PHYLUM_COL = 'taxo_phylum' GENUS_COL = 'taxo_genus' SELECTED_KEYS = ['gene_id', 'length', 'kegg_ko', 'eggnog', PHYLUM_COL, GENUS_COL] + IMPORT_TYPE = "IGC genes" # For logs + UPDATED_FIELDS = ['length', 'name', 'source', 'taxonomy'] + SOURCE = 'igc' + PARSER = IGCLineParser def __init__(self, annotation_file, skip_tax=False, skip_functions=False): - self.annotation_file = annotation_file - self.total_genes = file_len(annotation_file) - self._reset_counters() + super().__init__(annotation_file) # Skip some insertion if specified in script options self.skip_tax = skip_tax self.skip_functions = skip_functions - def _reset_counters(self): - self.processed_genes = 0 - self.created_genes = 0 - self.updated_genes = 0 - self.skipped_genes = 0 - - def _build_taxo_mapping(self, rank): - logger.info("Building local mapping for %s level...", rank) - instances = Taxonomy.objects.filter(rank=rank) - return {instance.name: instance for instance in instances} - - @property - def phylum_mapping(self): - if getattr(self, '_phylum_mapping', None) is None: - self._phylum_mapping = self._build_taxo_mapping("phylum") - return self._phylum_mapping - - @property - def genus_mapping(self): - if getattr(self, '_genus_mapping', None) is None: - 
self._genus_mapping = self._build_taxo_mapping("genus") - return self._genus_mapping - - def _build_function_mapping(self, source): - logger.info("Building local mapping for %s function...", source) - instances = Function.objects.filter(source=source) - return {instance.function_id: instance for instance in instances} - - @property - def eggnog_mapping(self): - if getattr(self, '_eggnog_mapping', None) is None: - self._eggnog_mapping = self._build_function_mapping("eggnog") - return self._eggnog_mapping - - @property - def kegg_mapping(self): - if getattr(self, '_kegg_mapping', None) is None: - self._kegg_mapping = self._build_function_mapping("kegg") - return self._kegg_mapping - - def _parse_gene(self, raw_line, selected_keys=SELECTED_KEYS): - """ - Use IGCLineParser and return selected keys - """ - gene_parser = IGCLineParser() - all_dict = gene_parser.gene(raw_line) - selected_dict = {k: v for k, v in all_dict.items() if k in selected_keys} - return selected_dict - - def _retrieve_taxonomy(self, genus_name, phylum_name, unknown_val='unknown'): - taxonomy_instance = None - if genus_name != unknown_val: - taxonomy_instance = self.genus_mapping.get(genus_name, None) - if taxonomy_instance is None and phylum_name != unknown_val: - taxonomy_instance = self.phylum_mapping.get(phylum_name, None) - return taxonomy_instance - - def _remove_functions(self, gene_dicts): - functions = {} - for gene_dict in gene_dicts: - functions[slugify(gene_dict['gene_id'])] = { - 'kegg': gene_dict.pop('kegg_ko'), - 'eggnog': gene_dict.pop('eggnog') - } - return functions - def _format_for_model(self, igc_dict): - gene_dict = {} - gene_dict['name'] = igc_dict['gene_id'] - gene_dict['gene_id'] = slugify(igc_dict['gene_id']) - gene_dict['length'] = igc_dict['length'] + gene_dict = super()._format_for_model(igc_dict) if not self.skip_tax: - gene_dict['taxonomy'] = self._retrieve_taxonomy(igc_dict.get('taxo_genus'), igc_dict.get('taxo_phylum')) + taxonomy = 
self._retrieve_taxonomy(igc_dict.get('taxo_genus'), rank="genus") + if taxonomy is None: + taxonomy = self._retrieve_taxonomy(igc_dict.get('taxo_phylum'), rank="phylum") + gene_dict['taxonomy'] = taxonomy return gene_dict - def _update_genes(self, gene_instances, gene_dict): - for gene_id, gene_instance in gene_instances.items(): - for key, value in gene_dict[gene_id].items(): - setattr(gene_instance, key, value) - try: - Gene.objects.bulk_update( - list(gene_instances.values()), - ['name', 'taxonomy', 'length'] - ) - self.updated_genes += len(gene_instances.keys()) - except Exception as exception: - logger.warning(exception) - self.skipped_genes += len(gene_instances.keys()) - - def _create_genes(self, gene_list): - try: - Gene.objects.bulk_create( - [Gene(**item) for item in gene_list] - ) - self.created_genes += len(gene_list) - except Exception as exception: - logger.warning(exception) - self.skipped_genes += len(gene_list) - - def create_or_update_genes(self, gene_dict): - update_instances = Gene.objects.in_bulk(gene_dict.keys(), field_name='gene_id') - self._update_genes(update_instances, gene_dict) - gene_ids_to_create = set(gene_dict.keys()) - set(update_instances.keys()) - if gene_ids_to_create: - self._create_genes([gene_dict[gene_id] for gene_id in gene_ids_to_create]) - - def _clean_functions(self, functions, unknown_val='unknown'): - """ - Get rid of functions that are not in the db or entitled unknown - """ - cleaned_functions = {} - for gene_id, all_functions in functions.items(): - new_functions = [] - for kegg in all_functions['kegg']: - if kegg == unknown_val: - continue - elif kegg in self.kegg_mapping.keys(): - new_functions.append(self.kegg_mapping[kegg]) - for eggnog in all_functions['eggnog']: - if eggnog == unknown_val: - continue - elif eggnog in self.eggnog_mapping.keys(): - new_functions.append(self.eggnog_mapping[eggnog]) - if new_functions: - cleaned_functions[gene_id] = new_functions - return cleaned_functions - - def 
_generate_gene_function_mapping(self, functions, genes): - """ - Generate a list of GeneFunction pair to create relation between them - """ - mapping = [] - for gene_id, function_list in functions.items(): - for function in function_list: - mapping.append(GeneFunction(gene=genes[gene_id], function=function)) - return mapping - - def link_genes_to_functions(self, functions): - cleaned_functions = self._clean_functions(functions) - genes = Gene.objects.in_bulk(cleaned_functions.keys(), field_name='gene_id') - # Get all link with corresponding genes & Delete them - GeneFunction.objects.filter(gene__in=genes.values()).delete() - # Generate table for bulk_create of function <-> gene and create it - GeneFunction.objects.bulk_create( - self._generate_gene_function_mapping(cleaned_functions, genes) - ) - - def load_all(self, test=False, chunk_size=10000): - logger.info("Starting IGC genes import (creation or update) to DB") - with open(self.annotation_file, 'r') as file: - while True: - chunk_genes = list(islice(file, chunk_size)) - if not chunk_genes: - break - igc_dict_list = [self._parse_gene(i) for i in chunk_genes] - functions = self._remove_functions(igc_dict_list) - igc_clean_dict = {slugify(i['gene_id']): self._format_for_model(i) for i in igc_dict_list} - self.processed_genes += chunk_size - self.create_or_update_genes(igc_clean_dict) - if not self.skip_functions: - self.link_genes_to_functions(functions) - logger.info("%s Genes processed so far...", self.processed_genes) - if test is True: - break - logger.info("[DONE] %s/%s Genes created.", self.created_genes, self.total_genes) - logger.info("[DONE] %s/%s Genes updated.", self.updated_genes, self.total_genes) - logger.info("[DONE] %s/%s Genes skipped.", self.skipped_genes, self.total_genes) + def _handle_chunk(self, chunk_genes): + gene_dict_list = [self._parse_gene(i) for i in chunk_genes] + functions = self._remove_functions(gene_dict_list) + gene_clean_dict = {slugify(i['gene_id']): self._format_for_model(i) 
for i in gene_dict_list} + self.create_or_update_genes(gene_clean_dict) + if not self.skip_functions: + self.link_genes_to_functions(functions) class Command(BaseCommand): - help = 'Create or update all EggNOG entries from annotations.tsv file.' + help = 'Create or update IGC genes from IGC annotations file.' def add_arguments(self, parser): parser.add_argument('annotation', help='IGC.annotation_OF.summary file from IGC') diff --git a/backend/metagenedb/apps/catalog/management/commands/import_igc_sequences.py b/backend/metagenedb/apps/catalog/management/commands/import_igc_sequences.py index 4d751b3beed3b4ac9e141dcde1f7b775d7c056c5..0988c2c146fc045ff3f89fee3c1f362a5a9100aa 100644 --- a/backend/metagenedb/apps/catalog/management/commands/import_igc_sequences.py +++ b/backend/metagenedb/apps/catalog/management/commands/import_igc_sequences.py @@ -1,61 +1,15 @@ import logging -import pyfastx from django.core.management.base import BaseCommand -from slugify import slugify -from metagenedb.apps.catalog.models import Gene +from metagenedb.apps.catalog.management.commands.commons.import_gene_sequences import ImportGeneSequences logging.basicConfig(format='[%(asctime)s] %(levelname)s:%(name)s:%(message)s') logger = logging.getLogger(__name__) -class ImportIGCGeneSequences(object): - - def __init__(self, sequence_file): - self.sequence_file = sequence_file - self._reset_counters() - - def _reset_counters(self): - self.processed_genes = 0 - self.updated_genes = 0 - self.skipped_genes = 0 - - def update_sequences(self, sequences): - genes = Gene.objects.filter(gene_id__in=sequences.keys()) - genes_retrieved = genes.count() - for gene in genes: - gene.sequence = sequences[gene.gene_id] - try: - Gene.objects.bulk_update(genes, ['sequence']) - self.updated_genes += genes_retrieved - self.skipped_genes += len(sequences) - genes_retrieved - except Exception: - logger.warning("Could not update genes... 
skipped.") - self.skipped_genes += len(sequences) - - def load_all(self, test=False, chunk_size=10000, skip_n_sequences=0): - logger.info("Starting IGC Gene sequences import (update) to DB") - if skip_n_sequences > 0: - logger.info("Skipping first %s sequences", skip_n_sequences) - current_sequences = {} - for name, seq in pyfastx.Fasta(self.sequence_file, build_index=False): - if self.processed_genes < skip_n_sequences: - self.processed_genes += 1 - self.skipped_genes += 1 - continue - current_sequences[slugify(name.split()[0])] = seq - self.processed_genes += 1 - if self.processed_genes % chunk_size == 0: - self.update_sequences(current_sequences) - logger.info("%s Gene sequences processed so far...", self.processed_genes) - current_sequences = {} - if test is True: - break - if len(current_sequences) > 0: - self.update_sequences(current_sequences) - logger.info("[DONE] %s/%s Gene sequences updated.", self.updated_genes, self.processed_genes) - logger.info("[DONE] %s/%s Genes skipped.", self.skipped_genes, self.processed_genes) +class ImportIGCGeneSequences(ImportGeneSequences): + CATALOG = "IGC" class Command(BaseCommand): diff --git a/backend/metagenedb/apps/catalog/management/commands/import_virgo_eggnog.py b/backend/metagenedb/apps/catalog/management/commands/import_virgo_eggnog.py new file mode 100644 index 0000000000000000000000000000000000000000..388e488b6a7e9f9acf647fa526d711599a8049fc --- /dev/null +++ b/backend/metagenedb/apps/catalog/management/commands/import_virgo_eggnog.py @@ -0,0 +1,88 @@ +import logging + +from django.core.management.base import BaseCommand +from slugify import slugify + +from metagenedb.apps.catalog.management.commands.commons.handle_functions import HandleFunctions +from metagenedb.apps.catalog.management.commands.commons.import_genes import BaseImportGenes +from metagenedb.common.utils.parsers import VirgoEggNOGLineParser + +logging.basicConfig(format='[%(asctime)s] %(levelname)s:%(name)s:%(message)s') +logger = 
logging.getLogger(__name__) + + +class ImportVirgoGeneEggNOGAnnotation(BaseImportGenes, HandleFunctions): + + IMPORT_TYPE = "Virgo EggNOG annotations" # For logs + SELECTED_KEYS = ['gene_id', 'eggnog'] + UPDATED_FIELDS = ['name'] + SOURCE = 'virgo' + PARSER = VirgoEggNOGLineParser + + FUN_SOURCE_TO_DELETE = ['eggnog'] + + def _clean_functions(self, functions, unknown_val='unknown'): + """ + Get rid of functions that are not in the db or entitled unknown + """ + cleaned_functions = {} + for gene_id, all_functions in functions.items(): + new_functions = [] + eggnog_annotation = all_functions['eggnog'] + if eggnog_annotation == unknown_val: + continue + elif eggnog_annotation in self.eggnog_mapping.keys(): + new_functions.append(self.eggnog_mapping[eggnog_annotation]) + if new_functions: + cleaned_functions[gene_id] = new_functions + return cleaned_functions + + def _remove_functions(self, gene_dicts): + functions = {} + for gene_dict in gene_dicts: + functions[slugify(gene_dict['gene_id'])] = { + 'eggnog': gene_dict.pop('eggnog'), + } + return functions + + def _format_for_model(self, ori_gene_dict): + """ + @TODO remove in the future and makes function from parent class more modulable + """ + gene_dict = {} + gene_dict['gene_id'] = slugify(ori_gene_dict['gene_id']) + gene_dict['name'] = ori_gene_dict['gene_id'] + gene_dict['source'] = self.SOURCE + return gene_dict + + def _handle_chunk(self, chunk_genes): + """ + Overide for all different sources + """ + gene_dict_list = [self._parse_gene(i) for i in chunk_genes] + functions = self._remove_functions(gene_dict_list) + gene_clean_dict = {slugify(i['gene_id']): self._format_for_model(i) for i in gene_dict_list} + self.create_or_update_genes(gene_clean_dict) + self.link_genes_to_functions(functions) + + +class Command(BaseCommand): + help = 'Create or update all EggNOG annotation for Virgo genes (from `3.eggnog.NOG.txt` file).' 
+ + def add_arguments(self, parser): + parser.add_argument( + 'annotation', + help='3.eggnog.NOG.txt file from Virgo. Genes need to exist in DB for this script to work.' + ) + parser.add_argument('--test', action='store_true', help='Run only on first 10000 entries.') + + def set_logger_level(self, verbosity): + if verbosity > 2: + logger.setLevel(logging.DEBUG) + elif verbosity > 1: + logger.setLevel(logging.INFO) + + def handle(self, *args, **options): + self.set_logger_level(int(options['verbosity'])) + import_annotations = ImportVirgoGeneEggNOGAnnotation(options['annotation']) + import_annotations.load_all(test=options['test']) diff --git a/backend/metagenedb/apps/catalog/management/commands/import_virgo_genes.py b/backend/metagenedb/apps/catalog/management/commands/import_virgo_genes.py new file mode 100644 index 0000000000000000000000000000000000000000..d37456f74fb740c85b6f55daaafcd8f6911c5177 --- /dev/null +++ b/backend/metagenedb/apps/catalog/management/commands/import_virgo_genes.py @@ -0,0 +1,37 @@ +import logging + +from django.core.management.base import BaseCommand + +from metagenedb.apps.catalog.management.commands.commons.import_genes import BaseImportGenes +from metagenedb.common.utils.parsers import VirgoGeneLengthLineParser + +logging.basicConfig(format='[%(asctime)s] %(levelname)s:%(name)s:%(message)s') +logger = logging.getLogger(__name__) + + +class ImportVirgoGenes(BaseImportGenes): + + IMPORT_TYPE = "Virgo gene length" # For logs + SELECTED_KEYS = ['gene_id', 'length'] + UPDATED_FIELDS = ['length', 'name', 'source'] + SOURCE = 'virgo' + PARSER = VirgoGeneLengthLineParser + + +class Command(BaseCommand): + help = 'Create or update all Virgo genes (name and length from `0.geneLength.txt` file).' 
+ + def add_arguments(self, parser): + parser.add_argument('annotation', help='0.geneLength.txt file from Virgo') + parser.add_argument('--test', action='store_true', help='Run only on first 10000 entries.') + + def set_logger_level(self, verbosity): + if verbosity > 2: + logger.setLevel(logging.DEBUG) + elif verbosity > 1: + logger.setLevel(logging.INFO) + + def handle(self, *args, **options): + self.set_logger_level(int(options['verbosity'])) + import_virgo = ImportVirgoGenes(options['annotation']) + import_virgo.load_all(test=options['test']) diff --git a/backend/metagenedb/apps/catalog/management/commands/import_virgo_kegg.py b/backend/metagenedb/apps/catalog/management/commands/import_virgo_kegg.py new file mode 100644 index 0000000000000000000000000000000000000000..0b621a95259341e7fae8df6f61524b1a9faa06f6 --- /dev/null +++ b/backend/metagenedb/apps/catalog/management/commands/import_virgo_kegg.py @@ -0,0 +1,88 @@ +import logging + +from django.core.management.base import BaseCommand +from slugify import slugify + +from metagenedb.apps.catalog.management.commands.commons.handle_functions import HandleFunctions +from metagenedb.apps.catalog.management.commands.commons.import_genes import BaseImportGenes +from metagenedb.common.utils.parsers import VirgoKEGGLineParser + +logging.basicConfig(format='[%(asctime)s] %(levelname)s:%(name)s:%(message)s') +logger = logging.getLogger(__name__) + + +class ImportVirgoGeneKeggAnnotation(BaseImportGenes, HandleFunctions): + + IMPORT_TYPE = "Virgo KEGG annotations" # For logs + SELECTED_KEYS = ['gene_id', 'kegg_ko'] + UPDATED_FIELDS = ['name'] + SOURCE = 'virgo' + PARSER = VirgoKEGGLineParser + + FUN_SOURCE_TO_DELETE = ['kegg'] + + def _clean_functions(self, functions, unknown_val='unknown'): + """ + Get rid of functions that are not in the db or entitled unknown + """ + cleaned_functions = {} + for gene_id, all_functions in functions.items(): + new_functions = [] + kegg_annotation = all_functions['kegg'] + if 
kegg_annotation == unknown_val: + continue + elif kegg_annotation in self.kegg_mapping.keys(): + new_functions.append(self.kegg_mapping[kegg_annotation]) + if new_functions: + cleaned_functions[gene_id] = new_functions + return cleaned_functions + + def _remove_functions(self, gene_dicts): + functions = {} + for gene_dict in gene_dicts: + functions[slugify(gene_dict['gene_id'])] = { + 'kegg': gene_dict.pop('kegg_ko'), + } + return functions + + def _format_for_model(self, ori_gene_dict): + """ + @TODO remove in the future and makes function from parent class more modulable + """ + gene_dict = {} + gene_dict['gene_id'] = slugify(ori_gene_dict['gene_id']) + gene_dict['name'] = ori_gene_dict['gene_id'] + gene_dict['source'] = self.SOURCE + return gene_dict + + def _handle_chunk(self, chunk_genes): + """ + Overide for all different sources + """ + gene_dict_list = [self._parse_gene(i) for i in chunk_genes] + functions = self._remove_functions(gene_dict_list) + gene_clean_dict = {slugify(i['gene_id']): self._format_for_model(i) for i in gene_dict_list} + self.create_or_update_genes(gene_clean_dict) + self.link_genes_to_functions(functions) + + +class Command(BaseCommand): + help = 'Create or update all KEGG annotation for Virgo genes (from `8.A.kegg.ortholog.txt` file).' + + def add_arguments(self, parser): + parser.add_argument( + 'annotation', + help='8.A.kegg.ortholog.txt file from Virgo. Genes need to exist in DB for this script to work.' 
+ ) + parser.add_argument('--test', action='store_true', help='Run only on first 10000 entries.') + + def set_logger_level(self, verbosity): + if verbosity > 2: + logger.setLevel(logging.DEBUG) + elif verbosity > 1: + logger.setLevel(logging.INFO) + + def handle(self, *args, **options): + self.set_logger_level(int(options['verbosity'])) + import_annotations = ImportVirgoGeneKeggAnnotation(options['annotation']) + import_annotations.load_all(test=options['test']) diff --git a/backend/metagenedb/apps/catalog/management/commands/import_virgo_sequences.py b/backend/metagenedb/apps/catalog/management/commands/import_virgo_sequences.py new file mode 100644 index 0000000000000000000000000000000000000000..9e96ca80fd6591ae7483dfc47f0073d60b16821e --- /dev/null +++ b/backend/metagenedb/apps/catalog/management/commands/import_virgo_sequences.py @@ -0,0 +1,35 @@ +import logging + +from django.core.management.base import BaseCommand + +from metagenedb.apps.catalog.management.commands.commons.import_gene_sequences import ImportGeneSequences + +logging.basicConfig(format='[%(asctime)s] %(levelname)s:%(name)s:%(message)s') +logger = logging.getLogger(__name__) + + +class ImportVirgoGeneSequences(ImportGeneSequences): + CATALOG = "Virgo" + + +class Command(BaseCommand): + help = 'Create or update all Virgo gene equences (from `NT.fasta` file).' + + def add_arguments(self, parser): + parser.add_argument( + 'fasta', + help='NT.fasta file from Virgo. Genes need to exist in DB for this script to work.' 
+ ) + parser.add_argument('--test', action='store_true', help='Run only on first 10000 sequences.') + parser.add_argument('--skip_n', type=int, default=0, help='Number of sequence to skip') + + def set_logger_level(self, verbosity): + if verbosity > 2: + logger.setLevel(logging.DEBUG) + elif verbosity > 1: + logger.setLevel(logging.INFO) + + def handle(self, *args, **options): + self.set_logger_level(int(options['verbosity'])) + import_igc = ImportVirgoGeneSequences(options['fasta']) + import_igc.load_all(test=options['test'], skip_n_sequences=options['skip_n']) diff --git a/backend/metagenedb/apps/catalog/management/commands/import_virgo_taxonomy.py b/backend/metagenedb/apps/catalog/management/commands/import_virgo_taxonomy.py new file mode 100644 index 0000000000000000000000000000000000000000..8670334860f10729cd7d22fc4a3a1da28e703d9f --- /dev/null +++ b/backend/metagenedb/apps/catalog/management/commands/import_virgo_taxonomy.py @@ -0,0 +1,72 @@ +import logging + +from django.core.management.base import BaseCommand +from slugify import slugify + +from metagenedb.apps.catalog.management.commands.commons.handle_taxonomy import HandleTaxonomy +from metagenedb.apps.catalog.management.commands.commons.import_genes import BaseImportGenes +from metagenedb.common.utils.parsers import VirgoTaxonomyLineParser + +logging.basicConfig(format='[%(asctime)s] %(levelname)s:%(name)s:%(message)s') +logger = logging.getLogger(__name__) + + +class ImportVirgoGeneTaxonomyAnnotation(BaseImportGenes, HandleTaxonomy): + SELECTED_KEYS = ['gene_id', 'taxonomy'] + IMPORT_TYPE = "Virgo taxonomy annotations" # For logs + UPDATED_FIELDS = ['name', 'taxonomy'] + SOURCE = 'virgo' + PARSER = VirgoTaxonomyLineParser + MANUAL_TAXO_MAPPING = { + 'BVAB1': '699240', + 'Clostridiales Family': '186802', + 'Chlamydophila psittaci': '83554' + } + + def _format_for_model(self, ori_gene_dict): + """ + @TODO remove in the future and makes function from parent class more modulable + """ + gene_dict = {} + 
gene_dict['gene_id'] = slugify(ori_gene_dict['gene_id']) + gene_dict['name'] = ori_gene_dict['gene_id'] + gene_dict['source'] = self.SOURCE + taxonomy_term = ori_gene_dict.get('taxonomy').replace('_', ' ') + taxonomy = self._retrieve_taxonomy(taxonomy_term, rank="species") + if taxonomy is None: + # Use manually created mapping dict + taxonomy = self._retrieve_taxonomy(taxonomy_term.split(' ')[0], rank="genus") + if taxonomy is None: + # Try to at least retrieve the genus from the first part of the taxonomy + taxonomy = self._retrieve_taxonomy(taxonomy_term, rank="genus") + # @TODO need to find a way of handling other cases + if taxonomy is None: + # Use manually created mapping dict + taxonomy = self._retrieve_taxonomy(taxonomy_term, rank="manual") + if taxonomy is None: + self.skipped_genes += 1 + logger.warning("Could not retrieve %s for %s", ori_gene_dict.get('taxonomy'), ori_gene_dict['gene_id']) + gene_dict['taxonomy'] = taxonomy + return gene_dict + + +class Command(BaseCommand): + help = 'Create or update all Taxonomy annotations for Virgo genes (from `1.taxon.tbl.txt` file).' + + def add_arguments(self, parser): + parser.add_argument( + 'annotation', + help='1.taxon.tbl.txt file from Virgo. Genes need to exist in DB for this script to work.' 
+ ) + parser.add_argument('--test', action='store_true', help='Run only on first 10000 entries.') + + def set_logger_level(self, verbosity): + if verbosity > 2: + logger.setLevel(logging.DEBUG) + elif verbosity > 1: + logger.setLevel(logging.INFO) + + def handle(self, *args, **options): + self.set_logger_level(int(options['verbosity'])) + import_annotations = ImportVirgoGeneTaxonomyAnnotation(options['annotation']) + import_annotations.load_all(test=options['test']) diff --git a/backend/metagenedb/apps/catalog/management/commands/test_import_igc_annotation.py b/backend/metagenedb/apps/catalog/management/commands/test_import_igc_annotation.py deleted file mode 100644 index cea4c1d5ffb51f64523d5905f0ba8d70f90a61ad..0000000000000000000000000000000000000000 --- a/backend/metagenedb/apps/catalog/management/commands/test_import_igc_annotation.py +++ /dev/null @@ -1,250 +0,0 @@ -from unittest import TestCase - -import mock -from rest_framework.test import APITestCase - -from metagenedb.apps.catalog.models import Gene, GeneFunction -from metagenedb.apps.catalog.management.commands.import_igc_annotation import ImportIGCGenes -from metagenedb.apps.catalog.factory import ( - FunctionFactory, - GeneFactory, - TaxonomyFactory, -) - - -class BaseTestImportIGCGenes(TestCase): - - def setUp(self): - function_to_mock = 'metagenedb.apps.catalog.management.commands.import_igc_annotation.file_len' - with mock.patch(function_to_mock) as MockFileLen: - MockFileLen.return_value = 10 - self.import_igc_genes = ImportIGCGenes('test') - - -class TestParseGene(BaseTestImportIGCGenes): - - def setUp(self): - raw_data = [ - 'gene_id', - 'name', - 'length', - 'gene_completeness_status', - 'cohort_origin', - 'taxo_phylum', - 'taxo_genus', - 'kegg', - 'eggnog', - 'sample_occurence_freq', - 'ind_occurence_freq', - 'kegg_functional_cat', - 'eggnog_functional_cat', - 'cohort_assembled' - ] - self.raw_line = "\t".join(raw_data) - super().setUp() - - def test_parse_gene_default_selected_keys(self): - 
""" - This test should failed and need to be updated when SELECTED_KEYS are changed - """ - expected_dict = { - 'gene_id': 'name', - 'length': 'length', - 'kegg_ko': ['kegg'], - 'eggnog': ['eggnog'], - 'taxo_phylum': 'taxo_phylum', - 'taxo_genus': 'taxo_genus', - } - tested_dict = self.import_igc_genes._parse_gene(self.raw_line) - self.assertDictEqual(tested_dict, expected_dict) - - def test_parse_gene(self): - """ - This test should failed and need to be updated when SELECTED_KEYS are changed - """ - selected_keys = ['gene_id', 'length'] - expected_dict = { - 'gene_id': 'name', - 'length': 'length' - } - tested_dict = self.import_igc_genes._parse_gene(self.raw_line, selected_keys=selected_keys) - self.assertDictEqual(tested_dict, expected_dict) - - def test_parse_gene_unknown_key(self): - """ - Unknown key should be ignored - """ - selected_keys = ['gene_id', 'length', 'secret_code'] - expected_dict = { - 'gene_id': 'name', - 'length': 'length' - } - tested_dict = self.import_igc_genes._parse_gene(self.raw_line, selected_keys=selected_keys) - self.assertDictEqual(tested_dict, expected_dict) - - -class TestRetrieveTaxonomy(APITestCase, BaseTestImportIGCGenes): - - @classmethod - def setUpTestData(cls): - cls.genus = TaxonomyFactory(rank='genus') - cls.phylum = TaxonomyFactory(rank='phylum') - - def setUp(self): - self.unknown = 'unknown' - super().setUp() - - def test_genus_only(self): - tested_taxonomy = self.import_igc_genes._retrieve_taxonomy(self.genus.name, self.unknown) - self.assertEqual(tested_taxonomy.tax_id, self.genus.tax_id) - - def test_genus_not_in_db(self): - tested_taxonomy = self.import_igc_genes._retrieve_taxonomy("Fake Name", self.unknown) - self.assertEqual(tested_taxonomy, None) - - def test_phylum_only(self): - tested_taxonomy = self.import_igc_genes._retrieve_taxonomy(self.unknown, self.phylum.name) - self.assertEqual(tested_taxonomy.tax_id, self.phylum.tax_id) - - def test_phylum_not_in_db(self): - tested_taxonomy = 
self.import_igc_genes._retrieve_taxonomy(self.unknown, "Fake Name") - self.assertEqual(tested_taxonomy, None) - - def test_genus_phylum(self): - tested_taxonomy = self.import_igc_genes._retrieve_taxonomy(self.genus.name, self.phylum.name) - self.assertEqual(tested_taxonomy.tax_id, self.genus.tax_id) - - def test_both_unknown(self): - tested_taxonomy = self.import_igc_genes._retrieve_taxonomy(self.unknown, self.unknown) - self.assertEqual(tested_taxonomy, None) - - -class TestRemoveFunctions(BaseTestImportIGCGenes): - - def test_remove_functions(self): - input_dicts = [{ - 'gene_id': 'Test_gene', - 'kegg_ko': ['K0001'], - 'eggnog': ['COG1', 'COG2'] - }] - expected_functions = { - 'test-gene': { - 'kegg': ['K0001'], - 'eggnog': ['COG1', 'COG2'] - } - } - tested_dict = self.import_igc_genes._remove_functions(input_dicts) - self.assertDictEqual(tested_dict, expected_functions) - - -class TestCreateOrUpdateGenes(APITestCase, BaseTestImportIGCGenes): - - @classmethod - def setUpTestData(cls): - cls.gene = GeneFactory() - cls.taxo_list = TaxonomyFactory.create_batch(2) - - def test_create_1_update_1(self): - gene_to_update = { - 'gene_id': self.gene.gene_id, - 'name': 'Updated Gene', - 'length': 2235, - 'taxonomy': self.taxo_list[0] - } - gene_to_create = { - 'gene_id': 'gene-create-123', - 'name': 'Created Gene', - 'length': 5629, - 'taxonomy': self.taxo_list[1] - } - gene_dict = { - gene_to_update['gene_id']: gene_to_update, - gene_to_create['gene_id']: gene_to_create - } - self.import_igc_genes.create_or_update_genes(gene_dict) - self.assertEqual(Gene.objects.all().count(), 2) - # Check updated gene - updated_gene = Gene.objects.get(gene_id=gene_to_update['gene_id']) - for key, value in gene_to_update.items(): - self.assertEqual(getattr(updated_gene, key), value) - # Check created gene - created_gene = Gene.objects.get(gene_id=gene_to_create['gene_id']) - for key, value in gene_to_create.items(): - self.assertEqual(getattr(created_gene, key), value) - - -class 
TestRemoveUnknownFunctions(APITestCase, BaseTestImportIGCGenes): - - @classmethod - def setUpTestData(cls): - cls.kegg = FunctionFactory(source='kegg') - cls.eggnog = FunctionFactory(source='eggnog') - - def test_clean_functions_kegg_only(self): - functions = { - 'gene-kegg': { - 'kegg': [self.kegg.function_id, 'KO12345'], - 'eggnog': ['unknown'] - }, - } - expected_functions = { - 'gene-kegg': [self.kegg] - } - self.assertDictEqual(self.import_igc_genes._clean_functions(functions), expected_functions) - - def test_clean_functions_eggnog_only(self): - functions = { - 'gene-kegg': { - 'kegg': ['unknown'], - 'eggnog': [self.eggnog.function_id, 'COG12345'] - }, - } - expected_functions = { - 'gene-kegg': [self.eggnog] - } - self.assertDictEqual(self.import_igc_genes._clean_functions(functions), expected_functions) - - def test_clean_functions_kegg_eggnog(self): - functions = { - 'gene-kegg': { - 'kegg': [self.kegg.function_id, 'KO12345'], - 'eggnog': [self.eggnog.function_id, 'COG12345'] - }, - } - expected_functions = { - 'gene-kegg': [self.kegg, self.eggnog] - } - self.assertDictEqual(self.import_igc_genes._clean_functions(functions), expected_functions) - - def test_clean_functions_both_unknown(self): - functions = { - 'gene-kegg': { - 'kegg': ['unknown'], - 'eggnog': ['unknown'] - }, - } - expected_functions = {} - self.assertDictEqual(self.import_igc_genes._clean_functions(functions), expected_functions) - - -class TestLinkGenesToFunctions(APITestCase, BaseTestImportIGCGenes): - - @classmethod - def setUpTestData(cls): - cls.kegg = FunctionFactory(source='kegg') - cls.eggnog = FunctionFactory(source='eggnog') - cls.gene = GeneFactory() - - def test_link_kegg_and_eggnog(self): - self.assertEqual(GeneFunction.objects.all().count(), 0) - functions = { - self.gene.gene_id: { - 'kegg': [self.kegg.function_id], - 'eggnog': [self.eggnog.function_id] - } - } - self.import_igc_genes.link_genes_to_functions(functions) - gene_functions = GeneFunction.objects.all() - 
self.assertEqual(gene_functions.count(), 2) - for link in gene_functions: - self.assertEqual(link.gene.gene_id, self.gene.gene_id) diff --git a/backend/metagenedb/apps/catalog/management/commands/test_import_igc_sequences.py b/backend/metagenedb/apps/catalog/management/commands/test_import_igc_sequences.py deleted file mode 100644 index 5a1608ec3f63e7e98ec3d806bdaec5b8756c79e1..0000000000000000000000000000000000000000 --- a/backend/metagenedb/apps/catalog/management/commands/test_import_igc_sequences.py +++ /dev/null @@ -1,26 +0,0 @@ -from rest_framework.test import APITestCase - -from metagenedb.apps.catalog.models import Gene -from metagenedb.apps.catalog.management.commands.import_igc_sequences import ImportIGCGeneSequences -from metagenedb.apps.catalog.factory import ( - GeneFactory, -) - - -class TestUpdateSequences(APITestCase): - - @classmethod - def setUpTestData(cls): - cls.gene = GeneFactory() - - def setUp(self): - self.import_igc_seq = ImportIGCGeneSequences("test") # we never make real reference to the sequence_file - - def test_update_sequence(self): - seq = "ACTG" - sequences = { - self.gene.gene_id: seq - } - self.assertFalse(Gene.objects.get(gene_id=self.gene.gene_id).sequence) - self.import_igc_seq.update_sequences(sequences) - self.assertEqual(Gene.objects.get(gene_id=self.gene.gene_id).sequence, seq) diff --git a/backend/metagenedb/apps/catalog/management/commands/test_build_hierarchy.py b/backend/metagenedb/apps/catalog/management/commands/tests/test_build_hierarchy.py similarity index 94% rename from backend/metagenedb/apps/catalog/management/commands/test_build_hierarchy.py rename to backend/metagenedb/apps/catalog/management/commands/tests/test_build_hierarchy.py index 68a33cf0ab4259a2b131dc4acef774ee6051e0c4..81d76ce8a830d2597a95a8709fa483d9a85eed4c 100644 --- a/backend/metagenedb/apps/catalog/management/commands/test_build_hierarchy.py +++ b/backend/metagenedb/apps/catalog/management/commands/tests/test_build_hierarchy.py @@ -3,7 +3,7 @@ 
from rest_framework.test import APITestCase from metagenedb.apps.catalog.factory import TaxonomyFactory from metagenedb.apps.catalog.models import Taxonomy -from .build_hierarchy import HierarchyBuilder +from metagenedb.apps.catalog.management.commands.build_hierarchy import HierarchyBuilder class TestBuildHierarchy(APITestCase): diff --git a/backend/metagenedb/apps/catalog/management/commands/tests/test_files/igc_annotation.tsv b/backend/metagenedb/apps/catalog/management/commands/tests/test_files/igc_annotation.tsv new file mode 100644 index 0000000000000000000000000000000000000000..1f912b19ddf9eb975c383af14ecb9a53f8cad69c --- /dev/null +++ b/backend/metagenedb/apps/catalog/management/commands/tests/test_files/igc_annotation.tsv @@ -0,0 +1,2 @@ +1 Gene_1 123 Complete CHN Proteobacteria Escherichia K12345 COG1234 0.224151539068666 0.236448598130841 Lipid Metabolism Cell cycle control, cell division, chromosome partitioning;Cytoskeleton EUR;CHN;USA +2 Gene_2 456 Complete EUR Firmicutes Veillonella K67890 COG5678 0.352801894238358 0.351401869158878 Lipid Metabolism Cell cycle control, cell division, chromosome partitioning;Cytoskeleton EUR;CHN;USA diff --git a/backend/metagenedb/apps/catalog/management/commands/tests/test_files/virgo_eggnog.tsv b/backend/metagenedb/apps/catalog/management/commands/tests/test_files/virgo_eggnog.tsv new file mode 100644 index 0000000000000000000000000000000000000000..a79239e91f454debb192b88623c8c00d6e79df02 --- /dev/null +++ b/backend/metagenedb/apps/catalog/management/commands/tests/test_files/virgo_eggnog.tsv @@ -0,0 +1,2 @@ +Cluster_566081 V1 RPSI map03010 J 30S ribosomal protein S9 COG1234 +Cluster_308979 V2 TRUA J Formation of pseudouridine at positions 38, 39 and 40 in the anticodon stem and loop of transfer RNAs (By similarity) COG5678 diff --git a/backend/metagenedb/apps/catalog/management/commands/tests/test_files/virgo_gene_length.tsv 
b/backend/metagenedb/apps/catalog/management/commands/tests/test_files/virgo_gene_length.tsv new file mode 100644 index 0000000000000000000000000000000000000000..6cf83836aea3d10e7cce5e430d75b31c5d1112fe --- /dev/null +++ b/backend/metagenedb/apps/catalog/management/commands/tests/test_files/virgo_gene_length.tsv @@ -0,0 +1,2 @@ +V1 101 +V2 102 diff --git a/backend/metagenedb/apps/catalog/management/commands/tests/test_files/virgo_kegg.tsv b/backend/metagenedb/apps/catalog/management/commands/tests/test_files/virgo_kegg.tsv new file mode 100644 index 0000000000000000000000000000000000000000..1a28b2507965aab32b18ae029d5489384bf89216 --- /dev/null +++ b/backend/metagenedb/apps/catalog/management/commands/tests/test_files/virgo_kegg.tsv @@ -0,0 +1,2 @@ +V1 K12345 ljo:LJ0360 dvvi:GSVIVP00035275001 GSVIVT00035275001; assembled CDS; K02948 small subunit ribosomal protein S11 +V2 K67890 shg:Sph21_4943 dtni:5367 ; diff --git a/backend/metagenedb/apps/catalog/management/commands/tests/test_files/virgo_taxonomy.tsv b/backend/metagenedb/apps/catalog/management/commands/tests/test_files/virgo_taxonomy.tsv new file mode 100644 index 0000000000000000000000000000000000000000..0f5d7a350534621e24ffd8ff3d067f3da6ecdb95 --- /dev/null +++ b/backend/metagenedb/apps/catalog/management/commands/tests/test_files/virgo_taxonomy.tsv @@ -0,0 +1,2 @@ +Cluster_566081 V1 Escherichia_coli 396 +Cluster_308979 V2 Lactobacillus_iners 783 diff --git a/backend/metagenedb/apps/catalog/management/commands/tests/test_import_igc_annotation.py b/backend/metagenedb/apps/catalog/management/commands/tests/test_import_igc_annotation.py new file mode 100644 index 0000000000000000000000000000000000000000..a8538b3e916184839610191565e2044aae765937 --- /dev/null +++ b/backend/metagenedb/apps/catalog/management/commands/tests/test_import_igc_annotation.py @@ -0,0 +1,53 @@ +import os +from rest_framework.test import APITestCase + +from metagenedb.apps.catalog.models import Gene +from 
metagenedb.apps.catalog.management.commands.import_igc_annotation import ImportIGCGenes +from metagenedb.apps.catalog.factory.function import generate_fake_functions_db +from metagenedb.apps.catalog.factory.taxonomy import generate_simple_db + + +class TestEndToEnd(APITestCase): + + @classmethod + def setUpTestData(cls): + generate_simple_db() + generate_fake_functions_db() + + def test_end_to_end(self): + test_file = os.path.join(os.path.dirname(__file__), "./test_files/igc_annotation.tsv") + loader = ImportIGCGenes(test_file) + expected_genes = { + 'gene-1': { + 'source': 'igc', + 'length': 123, + 'name': 'Gene_1', + 'tax_id': '561', + 'functions': { + 'kegg': 'K12345', + 'eggnog': 'COG1234' + } + }, + 'gene-2': { + 'source': 'igc', + 'length': 456, + 'name': 'Gene_2', + 'tax_id': '1239', # Genus annotation Veillonella not in test db, but phylum yes + 'functions': { + 'kegg': 'K67890', + 'eggnog': 'COG5678' + } + }, + } + loader.load_all() + created_genes = Gene.objects.all().prefetch_related('functions') + for created_gene in created_genes: + for key in ['source', 'length', 'name']: + self.assertEqual(getattr(created_gene, key), expected_genes[created_gene.gene_id][key]) + self.assertEqual(created_gene.taxonomy.tax_id, expected_genes[created_gene.gene_id]['tax_id']) + # Check functions + for function in created_gene.functions.all(): + self.assertIn(function.source, ['kegg', 'eggnog']) + self.assertEqual( + function.function_id, expected_genes[created_gene.gene_id]['functions'][function.source] + ) diff --git a/backend/metagenedb/apps/catalog/management/commands/tests/test_import_virgo_eggnog.py b/backend/metagenedb/apps/catalog/management/commands/tests/test_import_virgo_eggnog.py new file mode 100644 index 0000000000000000000000000000000000000000..898f5ea18d943bb801c9d5c8344532c65215038c --- /dev/null +++ b/backend/metagenedb/apps/catalog/management/commands/tests/test_import_virgo_eggnog.py @@ -0,0 +1,46 @@ +import os + +from rest_framework.test import 
APITestCase + +from metagenedb.apps.catalog.models import Gene +from metagenedb.apps.catalog.management.commands.import_virgo_eggnog import ImportVirgoGeneEggNOGAnnotation +from metagenedb.apps.catalog.factory import GeneFactory +from metagenedb.apps.catalog.factory.function import generate_fake_functions_db + + +class TestEndToEnd(APITestCase): + + @classmethod + def setUpTestData(cls): + generate_fake_functions_db() + GeneFactory.create(gene_id="v1") + GeneFactory.create(gene_id="v2") + + def test_end_to_end(self): + test_file = os.path.join(os.path.dirname(__file__), "./test_files/virgo_eggnog.tsv") + loader = ImportVirgoGeneEggNOGAnnotation(test_file) + expected_genes = { + 'v1': { + 'name': 'V1', + 'functions': { + 'eggnog': 'COG1234', + } + }, + 'v2': { + 'name': 'V2', + 'functions': { + 'eggnog': 'COG5678', + } + }, + } + loader.load_all() + created_genes = Gene.objects.all().prefetch_related('functions') + for created_gene in created_genes: + self.assertEqual(getattr(created_gene, 'name'), expected_genes[created_gene.gene_id]['name']) + # Check functions + self.assertTrue(created_gene.functions.all()) + for function in created_gene.functions.all(): + self.assertIn(function.source, ['kegg', 'eggnog']) + self.assertEqual( + function.function_id, expected_genes[created_gene.gene_id]['functions'][function.source] + ) diff --git a/backend/metagenedb/apps/catalog/management/commands/tests/test_import_virgo_genes.py b/backend/metagenedb/apps/catalog/management/commands/tests/test_import_virgo_genes.py new file mode 100644 index 0000000000000000000000000000000000000000..6166e009b10021664edd57c76c122c8c2ac9f477 --- /dev/null +++ b/backend/metagenedb/apps/catalog/management/commands/tests/test_import_virgo_genes.py @@ -0,0 +1,30 @@ +import os + +from rest_framework.test import APITestCase + +from metagenedb.apps.catalog.models import Gene +from metagenedb.apps.catalog.management.commands.import_virgo_genes import ImportVirgoGenes + + +class TestEndToEnd(APITestCase): 
+ + def test_end_to_end(self): + test_file = os.path.join(os.path.dirname(__file__), "./test_files/virgo_gene_length.tsv") + loader = ImportVirgoGenes(test_file) + expected_genes = { + 'v1': { + 'source': 'virgo', + 'length': 101, + 'name': 'V1' + }, + 'v2': { + 'source': 'virgo', + 'length': 102, + 'name': 'V2' + }, + } + loader.load_all() + created_genes = Gene.objects.all().values() + for created_gene in created_genes: + for key in ['source', 'length', 'name']: + self.assertEqual(created_gene[key], expected_genes[created_gene['gene_id']][key]) diff --git a/backend/metagenedb/apps/catalog/management/commands/tests/test_import_virgo_kegg.py b/backend/metagenedb/apps/catalog/management/commands/tests/test_import_virgo_kegg.py new file mode 100644 index 0000000000000000000000000000000000000000..16b9bdc3829613d96010fb8782d29d6956ae8abe --- /dev/null +++ b/backend/metagenedb/apps/catalog/management/commands/tests/test_import_virgo_kegg.py @@ -0,0 +1,46 @@ +import os + +from rest_framework.test import APITestCase + +from metagenedb.apps.catalog.models import Gene +from metagenedb.apps.catalog.management.commands.import_virgo_kegg import ImportVirgoGeneKeggAnnotation +from metagenedb.apps.catalog.factory import GeneFactory +from metagenedb.apps.catalog.factory.function import generate_fake_functions_db + + +class TestEndToEnd(APITestCase): + + @classmethod + def setUpTestData(cls): + generate_fake_functions_db() + GeneFactory.create(gene_id="v1") + GeneFactory.create(gene_id="v2") + + def test_end_to_end(self): + test_file = os.path.join(os.path.dirname(__file__), "./test_files/virgo_kegg.tsv") + loader = ImportVirgoGeneKeggAnnotation(test_file) + expected_genes = { + 'v1': { + 'name': 'V1', + 'functions': { + 'kegg': 'K12345', + } + }, + 'v2': { + 'name': 'V2', + 'functions': { + 'kegg': 'K67890', + } + }, + } + loader.load_all() + created_genes = Gene.objects.all().prefetch_related('functions') + for created_gene in created_genes: + 
self.assertEqual(getattr(created_gene, 'name'), expected_genes[created_gene.gene_id]['name']) + # Check functions + self.assertTrue(created_gene.functions.all()) + for function in created_gene.functions.all(): + self.assertIn(function.source, ['kegg', 'eggnog']) + self.assertEqual( + function.function_id, expected_genes[created_gene.gene_id]['functions'][function.source] + ) diff --git a/backend/metagenedb/apps/catalog/management/commands/tests/test_import_virgo_taxonomy.py b/backend/metagenedb/apps/catalog/management/commands/tests/test_import_virgo_taxonomy.py new file mode 100644 index 0000000000000000000000000000000000000000..830b5373a64fdfb686fe46aeaeadaadfc28de8f5 --- /dev/null +++ b/backend/metagenedb/apps/catalog/management/commands/tests/test_import_virgo_taxonomy.py @@ -0,0 +1,36 @@ +import os + +from rest_framework.test import APITestCase + +from metagenedb.apps.catalog.models import Gene +from metagenedb.apps.catalog.management.commands.import_virgo_taxonomy import ImportVirgoGeneTaxonomyAnnotation +from metagenedb.apps.catalog.factory import GeneFactory +from metagenedb.apps.catalog.factory.taxonomy import generate_simple_db + + +class TestEndToEnd(APITestCase): + + @classmethod + def setUpTestData(cls): + generate_simple_db() + for gene_id in ['v1', 'v2']: + GeneFactory.create(gene_id=gene_id) + + def test_end_to_end(self): + test_file = os.path.join(os.path.dirname(__file__), "./test_files/virgo_taxonomy.tsv") + loader = ImportVirgoGeneTaxonomyAnnotation(test_file) + expected_genes = { + 'v1': { + 'name': 'V1', + 'tax_id': '562', + }, + 'v2': { + 'name': 'V2', + 'tax_id': '1578', + } + } + loader.load_all() + created_genes = Gene.objects.all().prefetch_related('functions') + for created_gene in created_genes: + self.assertEqual(getattr(created_gene, 'name'), expected_genes[created_gene.gene_id]['name']) + self.assertEqual(created_gene.taxonomy.tax_id, expected_genes[created_gene.gene_id]['tax_id']) diff --git 
a/backend/metagenedb/apps/catalog/migrations/0025_add_virgo_source.py b/backend/metagenedb/apps/catalog/migrations/0025_add_virgo_source.py new file mode 100644 index 0000000000000000000000000000000000000000..2a7df3dfe6b2a280a660077e7f400e17f1717734 --- /dev/null +++ b/backend/metagenedb/apps/catalog/migrations/0025_add_virgo_source.py @@ -0,0 +1,18 @@ +# Generated by Django 3.0.4 on 2020-04-27 13:32 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('catalog', '0024_set_undef_default_source_gene'), + ] + + operations = [ + migrations.AlterField( + model_name='gene', + name='source', + field=models.CharField(choices=[('undef', 'Undefined'), ('igc', 'IGC'), ('virgo', 'Virgo')], default='undef', max_length=10), + ), + ] diff --git a/backend/metagenedb/apps/catalog/models/gene.py b/backend/metagenedb/apps/catalog/models/gene.py index 50bc01b30524863badf75ce5a608bb6a7785952e..aee62eb8871ac9ec1232d2172d5d872d2ca45262 100644 --- a/backend/metagenedb/apps/catalog/models/gene.py +++ b/backend/metagenedb/apps/catalog/models/gene.py @@ -6,9 +6,11 @@ from .function import Function class Gene(models.Model): UNDEFINED = 'undef' IGC = 'igc' + VIRGO = 'virgo' SOURCE_CHOICES = [ (UNDEFINED, 'Undefined'), (IGC, 'IGC'), + (VIRGO, 'Virgo'), ] gene_id = models.SlugField(max_length=100, db_index=True, unique=True) diff --git a/backend/metagenedb/apps/catalog/serializers/gene.py b/backend/metagenedb/apps/catalog/serializers/gene.py index 1327cf4461ed0e4509684201df3a2bec154a276a..2f034bd35da77b1acfef7c7d16b8acb03e005e66 100644 --- a/backend/metagenedb/apps/catalog/serializers/gene.py +++ b/backend/metagenedb/apps/catalog/serializers/gene.py @@ -69,7 +69,7 @@ class GeneSerializer(serializers.ModelSerializer): class Meta: model = Gene list_serializer_class = GeneListSerializer - fields = ('gene_id', 'name', 'length', 'functions', 'taxonomy', 'sequence') + fields = ('gene_id', 'name', 'length', 'functions', 'taxonomy', 'sequence', 
'source') def _extract_many_to_many(self, validated_data, info): many_to_many = {} diff --git a/backend/metagenedb/common/utils/parsers/__init__.py b/backend/metagenedb/common/utils/parsers/__init__.py index d6aa3459cb3ebaac331771005d91ba889b729da3..a4259885124b219cc6475319f0763ed70549a65a 100644 --- a/backend/metagenedb/common/utils/parsers/__init__.py +++ b/backend/metagenedb/common/utils/parsers/__init__.py @@ -2,3 +2,6 @@ from .eggnog import EggNOGAnnotationLineParser # noqa from .igc import IGCLineParser # noqa from .kegg import KEGGLineParser # noqa from .ncbi_taxonomy import NCBITaxonomyLineParser # noqa +from .virgo import ( # noqa + VirgoGeneLengthLineParser, VirgoKEGGLineParser, VirgoEggNOGLineParser, VirgoTaxonomyLineParser +) diff --git a/backend/metagenedb/common/utils/parsers/test_virgo.py b/backend/metagenedb/common/utils/parsers/test_virgo.py new file mode 100644 index 0000000000000000000000000000000000000000..31b1280336bf6f8eb24c811fd6bb7203508ecfd2 --- /dev/null +++ b/backend/metagenedb/common/utils/parsers/test_virgo.py @@ -0,0 +1,130 @@ +from unittest import TestCase + +from metagenedb.common.utils.parsers import ( + VirgoGeneLengthLineParser, VirgoKEGGLineParser, VirgoEggNOGLineParser, VirgoTaxonomyLineParser +) + + +class TestVirgoGeneLengthLineParser(TestCase): + + def test_gene(self): + raw_data = [ + 'gene_id', + 'length', + ] + raw_line = "\t".join(raw_data) + expected_dict = { + 'gene_id': raw_data[0], + 'length': raw_data[1], + } + test_dict = VirgoGeneLengthLineParser.gene(raw_line) + self.assertDictEqual(test_dict, expected_dict) + + def test_gene_wrong_format(self): + raw_line = "This is a wrong line format, with; information and tab" + with self.assertRaises(Exception) as context: # noqa + VirgoGeneLengthLineParser.gene(raw_line) + + +class TestVirgoKEGGLineParser(TestCase): + + def test_gene(self): + raw_data = [ + 'gene_id', + 'kegg_ko', + 'kegg_gene', + 'more_information', + ] + raw_line = "\t".join(raw_data) + expected_dict = { + 
'gene_id': raw_data[0], + 'kegg_ko': raw_data[1], + 'kegg_gene': raw_data[2], + 'more_info': raw_data[3], + } + test_dict = VirgoKEGGLineParser.gene(raw_line) + self.assertDictEqual(test_dict, expected_dict) + + def test_gene_wrong_format(self): + raw_line = "This is a wrong line format, with; information and tab" + with self.assertRaises(Exception) as context: # noqa + VirgoKEGGLineParser.gene(raw_line) + + +class TestVirgoEggNOGLineParser(TestCase): + + def test_gene(self): + raw_data = [ + 'cluster_id', + 'gene_id', + 'ortholog', + 'kegg_pathway', + 'funcat', + 'name', + 'eggnog' + ] + raw_line = "\t".join(raw_data) + expected_dict = { + 'cluster_id': raw_data[0], + 'gene_id': raw_data[1], + 'ortholog': raw_data[2], + 'kegg_pathway': raw_data[3], + 'eggnog_funcat': raw_data[4], + 'function_name': raw_data[5], + 'eggnog': raw_data[6], + } + test_dict = VirgoEggNOGLineParser.gene(raw_line) + self.assertDictEqual(test_dict, expected_dict) + + def test_missing_kegg_pathway(self): + raw_data = [ + 'cluster_id', + 'gene_id', + 'ortholog', + '', + 'funcat', + 'name', + 'eggnog' + ] + raw_line = "\t".join(raw_data) + expected_dict = { + 'cluster_id': raw_data[0], + 'gene_id': raw_data[1], + 'ortholog': raw_data[2], + 'kegg_pathway': raw_data[3], + 'eggnog_funcat': raw_data[4], + 'function_name': raw_data[5], + 'eggnog': raw_data[6], + } + test_dict = VirgoEggNOGLineParser.gene(raw_line) + self.assertDictEqual(test_dict, expected_dict) + + def test_gene_wrong_format(self): + raw_line = "This is a wrong line format, with; information and tab" + with self.assertRaises(Exception) as context: # noqa + VirgoEggNOGLineParser.gene(raw_line) + + +class TestVirgoTaxonomyLineParser(TestCase): + + def test_gene(self): + raw_data = [ + 'cluster_id', + 'gene_id', + 'taxonomy', + '1234', + ] + raw_line = "\t".join(raw_data) + expected_dict = { + 'cluster_id': raw_data[0], + 'gene_id': raw_data[1], + 'taxonomy': raw_data[2], + 'length': raw_data[3], + } + test_dict = 
VirgoTaxonomyLineParser.gene(raw_line) + self.assertDictEqual(test_dict, expected_dict) + + def test_gene_wrong_format(self): + raw_line = "This is a wrong line format, with; information and tab" + with self.assertRaises(Exception) as context: # noqa + VirgoEggNOGLineParser.gene(raw_line) diff --git a/backend/metagenedb/common/utils/parsers/virgo.py b/backend/metagenedb/common/utils/parsers/virgo.py new file mode 100644 index 0000000000000000000000000000000000000000..30de3e58b4eedd058848cd6142906c9ab9dc87ef --- /dev/null +++ b/backend/metagenedb/common/utils/parsers/virgo.py @@ -0,0 +1,109 @@ +import logging + +_LOGGER = logging.getLogger(__name__) + + +class VirgoGeneLengthLineParser(object): + + @staticmethod + def gene(line): + """ + Parse line from Virgo KEGG annotations to return organized dict (8.A.kegg.ortholog.txt) + + IGC annotation columns: + 0: Gene ID Unique ID + 1: Gene Length Length of nucleotide sequence + """ + try: + gene_info = line.rstrip().split('\t') + return { + 'gene_id': gene_info[0], + 'length': gene_info[1], + } + except Exception: + _LOGGER.error(f"Could not parse: {line.rstrip()}. Are you sure it comes from Virgo KEGG annotation file?") + raise + + +class VirgoKEGGLineParser(object): + + @staticmethod + def gene(line): + """ + Parse line from Virgo KEGG annotations to return organized dict (8.A.kegg.ortholog.txt) + + IGC annotation columns: + 0: Gene ID Unique ID + 1: KEGG KO Annotation Annotated KO(s) for a gene + 2: KEGG Gene + 2: More information Information separated by ; + """ + try: + gene_info = line.rstrip().split('\t') + return { + 'gene_id': gene_info[0], + 'kegg_ko': gene_info[1], + 'kegg_gene': gene_info[2], + 'more_info': gene_info[3], + } + except Exception: + _LOGGER.error(f"Could not parse: {line.rstrip()}. 
Are you sure it comes from Virgo KEGG annotation file?") + raise + + +class VirgoEggNOGLineParser(object): + + @staticmethod + def gene(line): + """ + Parse line from Virgo EggNOG annotations to return organized dict (3.eggnog.NOG.txt) + + IGC annotation columns: + 0: Cluster ID + 1: Gene ID + 2: Ortholog + 3: KEGG pathway? + 4: EggNOG Functional category + 5: Name + 6: EggNOG annotation + """ + try: + gene_info = line.rstrip().split('\t') + return { + 'cluster_id': gene_info[0], + 'gene_id': gene_info[1], + 'ortholog': gene_info[2], + 'kegg_pathway': gene_info[3], + 'eggnog_funcat': gene_info[4], + 'function_name': gene_info[5], + 'eggnog': gene_info[6], + } + except Exception: + _LOGGER.error(f"Could not parse: {line.rstrip()}. Are you sure it comes from Virgo EggNOG annotation file?") + raise + + +class VirgoTaxonomyLineParser(object): + + @staticmethod + def gene(line): + """ + Parse line from Virgo Taxonomy annotations to return organized dict (1.taxon.tbl.txt) + + IGC annotation columns: + 0: Cluster ID + 1: Gene ID + 2: Taxonomy annotation + 3: Gene length + """ + try: + gene_info = line.rstrip().split('\t') + return { + 'cluster_id': gene_info[0], + 'gene_id': gene_info[1], + 'taxonomy': gene_info[2], + 'length': gene_info[3], + } + except Exception: + _LOGGER.error(f"Could not parse: {line.rstrip()}. 
Are you sure it comes from Virgo taxonomy file?") + raise diff --git a/frontend/src/views/GeneDetail.vue b/frontend/src/views/GeneDetail.vue index 5a9a6f89924501a549fd47a6ec4af5b25bfc41b5..dcf3cdcf4e39b9be953df0a461b8334ef720ebbc 100644 --- a/frontend/src/views/GeneDetail.vue +++ b/frontend/src/views/GeneDetail.vue @@ -109,6 +109,10 @@ export default { title: 'Length (bp)', content: response.data.length, }, + { + title: 'Source', + content: response.data.source, + }, ]; this.sequence = '>' + response.data.gene_id + '\n' + response.data.sequence; if (response.data.functions.length > 0) { diff --git a/frontend/src/views/genes/genes.html b/frontend/src/views/genes/genes.html index bde5ee240a09c1a2ffc65b07bf540af296481d27..0849299fa6bfa5559e9945a611b8aac2b8a0520a 100644 --- a/frontend/src/views/genes/genes.html +++ b/frontend/src/views/genes/genes.html @@ -25,8 +25,16 @@ - + + + + + {{ props.item.eggnog }} + {{ props.item.source }} diff --git a/frontend/src/views/genes/genes.js b/frontend/src/views/genes/genes.js index 88298f34912a66d0161f00d02c07ac34d3e73cb1..3e41328473b5e45baa8576a90fd0384ead86a3e3 100644 --- a/frontend/src/views/genes/genes.js +++ b/frontend/src/views/genes/genes.js @@ -11,6 +11,7 @@ export default { pagination: { rowsPerPage: 20, }, + geneSource: null, searchGeneName: null, taxonomyRank: null, functionID: null, @@ -32,13 +33,19 @@ export default { { text: 'Taxonomy', value: 'taxonomy', sortable: false }, { text: 'KEGG', value: 'kegg', sortable: false }, { text: 'EggNOG', value: 'eggnog', sortable: false }, + { text: 'Source', value: 'source', sortable: false }, ]; }, taxonomyRanks() { return [ - 'Phylum', 'Genus', + 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species' ]; }, + geneSources() { + return [ + 'IGC', 'Virgo' + ] + }, rowsPerPageItems() { return [this.page_size]; }, @@ -57,6 +64,9 @@ export default { if (this.functionID){ qParams.function = this.functionID } + if (this.geneSource) { + qParams.source = this.geneSource.toLowerCase() + } 
return qParams; }, maxGeneLength() { @@ -110,6 +120,7 @@ export default { emptyFilters() { this.taxonomyRank = null; this.functionID = null; + this.geneSource = null; this.filterGeneLength = false; this.geneLengthFilterRange = [0, 2000]; },