diff --git a/backend/scripts/populate_db/import_igc_data.py b/backend/scripts/populate_db/import_igc_data.py index 8fc5c755033876474b60e082b1eb99b5f3cf3e17..957b87b69d6f5db09df3e4666fd2add88e191a87 100755 --- a/backend/scripts/populate_db/import_igc_data.py +++ b/backend/scripts/populate_db/import_igc_data.py @@ -65,9 +65,7 @@ class ImportIGCGenes(object): if len(resp_dict['results']) > 1: logger.warning(f"More than 1 result found for phylum {phylum}. First result is kept.") if resp_dict.get('count', 0) > 0: - gene_dict.update( - {'taxonomy': resp_dict['results'][0]['tax_id']} - ) + gene_dict.update({'taxonomy': resp_dict['results'][0]['tax_id']}) return gene_dict def _parse_gene(self, raw_line, selected_keys=SELECTED_KEYS): @@ -83,8 +81,10 @@ class ImportIGCGenes(object): gene_dict['gene_name'] = gene_dict['gene_id'] gene_dict['gene_id'] = slugify(gene_dict['gene_id']) gene_dict['functions'] = gene_dict.pop('kegg_ko') - if self.skip_tax: + if gene_dict.get('taxonomy', None) == 'unknown' or self.skip_tax: gene_dict.pop('taxonomy') + else: + gene_dict = self._select_taxonomy(gene_dict) if self.skip_functions or 'unknown' in gene_dict['functions']: gene_dict.pop('functions') return gene_dict @@ -95,7 +95,7 @@ class ImportIGCGenes(object): chunk_genes = list(islice(file, chunk_size)) if not chunk_genes: break - genes = [self._clean_gene(self._select_taxonomy(self._parse_gene(i))) for i in chunk_genes] + genes = [self._clean_gene(self._parse_gene(i)) for i in chunk_genes] try: response = self.metagenedb_gene_api.put(genes) self.created_genes += response.get('created').get('count') diff --git a/backend/scripts/populate_db/test_import_igc_data.py b/backend/scripts/populate_db/test_import_igc_data.py index 55a5341efde35124e313b5f88dbf22663c6d8551..3c65c56e5561ed54aa1ca9b158661b4839747720 100644 --- a/backend/scripts/populate_db/test_import_igc_data.py +++ b/backend/scripts/populate_db/test_import_igc_data.py @@ -72,6 +72,7 @@ class TestCleanGene(TestCase): def setUp(self): self.import_igc_genes = ImportIGCGenes('test', 'test') + self.import_igc_genes._select_taxonomy = lambda x: x # Mock to return same dict self.gene_dict = { 'gene_id': 'gene.01', 'length': 135,