diff --git a/backend/metagenedb/apps/catalog/factory/taxonomy.py b/backend/metagenedb/apps/catalog/factory/taxonomy.py index 11f7066c0e702d1ce5e6a7ffdbd7cb3d2a111ccc..b43c4bdb1263069ba387e4a48edc467f9035c6ba 100644 --- a/backend/metagenedb/apps/catalog/factory/taxonomy.py +++ b/backend/metagenedb/apps/catalog/factory/taxonomy.py @@ -1,4 +1,4 @@ -from factory import DjangoModelFactory, fuzzy +from factory import DjangoModelFactory, fuzzy, Faker from faker import Factory from metagenedb.apps.catalog import models @@ -16,3 +16,4 @@ class TaxonomyFactory(DjangoModelFactory): rank = fuzzy.FuzzyChoice(SELECTED_RANK) tax_id = FuzzyLowerText(prefix='tax-', length=15) + name = fuzzy.FuzzyText(length=20) diff --git a/backend/scripts/populate_db/import_igc_data.py b/backend/scripts/populate_db/import_igc_data.py index af7a3c1425153400afb70ba22aa9799cb4baadea..a3a58da60363de6a02a0b5dbcf01f4ce48d6f93f 100755 --- a/backend/scripts/populate_db/import_igc_data.py +++ b/backend/scripts/populate_db/import_igc_data.py @@ -34,6 +34,23 @@ class ImportIGCGenes(object): self.skip_tax = skip_tax self.skip_functions = skip_functions + def _build_taxo_mapping(self, rank): + counter = 1 + next_page = None + mapping = {} + while counter == 1 or next_page is not None: + current_page = self.metagenedb_taxonomy_api.get_all(params={'page': counter, 'rank': rank}) + next_page = current_page['next'] + mapping.update({ + value['name']: value['tax_id'] for value in current_page['results'] + }) + counter += 1 + return mapping + + def build_mapping(self): + self.phylum_mapping = self._build_taxo_mapping("phylum") + self.genus_mapping = self._build_taxo_mapping("genus") + def _reset_counters(self): self.processed_genes = 0 self.created_genes = 0 @@ -89,6 +106,8 @@ class ImportIGCGenes(object): return gene_dict def load_annotation_file_to_db_in_chunks(self, chunk_size=1000, test=False): + # Build mapping for different phylum and genus + self.build_mapping() with open(self.annotation_file, 'r') as file: while True: chunk_genes = list(islice(file, chunk_size)) diff --git a/backend/scripts/populate_db/test_import_igc_data.py b/backend/scripts/populate_db/test_import_igc_data.py index 5d339df5427c6cc510fc61cd174b6c9a1248158c..b1db328f4eaca0450729f839b8800d44633b9756 100644 --- a/backend/scripts/populate_db/test_import_igc_data.py +++ b/backend/scripts/populate_db/test_import_igc_data.py @@ -116,12 +116,15 @@ class TestCleanGene(TestCase): class TestSelectTaxonomy(APITestCase): + @classmethod + def setUpTestData(cls): + cls.genus_name = 'Genus' + cls.phylum_name = 'Phylum' + cls.unknown_name = 'unknown' + cls.genus = TaxonomyFactory(rank="genus", name=cls.genus_name) + cls.phylum = TaxonomyFactory(rank="phylum", name=cls.phylum_name) + def setUp(self): - self.genus_name = 'Genus' - self.phylum_name = 'Phylum' - self.unknown_name = 'unknown' - self.genus = TaxonomyFactory(rank="genus", name=self.genus_name) - self.phylum = TaxonomyFactory(rank="phylum", name=self.phylum_name) self.import_igc_genes = ImportIGCGenes('test', 'test') self.api_mock = MetageneDBCatalogTaxonomyAPIMock(self.client) self.import_igc_genes.metagenedb_taxonomy_api = self.api_mock @@ -184,3 +187,27 @@ class TestSelectTaxonomy(APITestCase): } tested_dict = self.import_igc_genes._select_taxonomy(gene_dict) self.assertDictEqual(tested_dict, expected_dict) + + +class TestBuildTaxoMapping(APITestCase): + + @classmethod + def setUpTestData(cls): + cls.genus_items = TaxonomyFactory.create_batch(200, rank='genus') + cls.phylum_items = TaxonomyFactory.create_batch(20, rank='phylum') + + def setUp(self): + self.import_igc_genes = ImportIGCGenes('test', 'test') + self.api_mock = MetageneDBCatalogTaxonomyAPIMock(self.client) + self.import_igc_genes.metagenedb_taxonomy_api = self.api_mock + + def test_build_mapping(self): + expected_genus_dict = { + item.name: item.tax_id for item in self.genus_items + } + expected_phylum_dict = { + item.name: item.tax_id for item in self.phylum_items + } + self.import_igc_genes.build_mapping() + self.assertDictEqual(self.import_igc_genes.genus_mapping, expected_genus_dict) + self.assertDictEqual(self.import_igc_genes.phylum_mapping, expected_phylum_dict)