diff --git a/backend/metagenedb/common/utils/parsers/igc.py b/backend/metagenedb/common/utils/parsers/igc.py index 5790c5d38ffa188d74ed57ccf12f9736c6b7496b..7adf7a044d5b81f6a870ed1ee1e52471375d2e52 100644 --- a/backend/metagenedb/common/utils/parsers/igc.py +++ b/backend/metagenedb/common/utils/parsers/igc.py @@ -37,7 +37,7 @@ class IGCLineParser(object): 'cohort_origin': gene_info[4], 'taxo_phylum': gene_info[5], 'taxo_genus': gene_info[6], - 'kegg_ko': gene_info[7], + 'kegg_ko': gene_info[7].split(';'), 'eggnog': gene_info[8], 'sample_occurence_frequency': gene_info[9], 'individual_occurence_frequency': gene_info[10], diff --git a/backend/metagenedb/common/utils/parsers/test_igc.py b/backend/metagenedb/common/utils/parsers/test_igc.py index d71a7132e8bb6be14acb904c7773889cb1286c48..5c0044eb6376378f513f00541bf10f85bdf4893b 100644 --- a/backend/metagenedb/common/utils/parsers/test_igc.py +++ b/backend/metagenedb/common/utils/parsers/test_igc.py @@ -31,7 +31,7 @@ class TestIGCLineParser(TestCase): 'cohort_origin': raw_data[4], 'taxo_phylum': raw_data[5], 'taxo_genus': raw_data[6], - 'kegg_ko': raw_data[7], + 'kegg_ko': [raw_data[7]], 'eggnog': raw_data[8], 'sample_occurence_frequency': raw_data[9], 'individual_occurence_frequency': raw_data[10], @@ -46,3 +46,40 @@ class TestIGCLineParser(TestCase): raw_line = "This is a wrong line format, with; information and tab" with self.assertRaises(Exception) as context: # noqa IGCLineParser.gene(raw_line) + + def test_multiple_functions(self): + raw_data = [ + 'gene_id', + 'gene_name', + 'length', + 'gene_completeness_status', + 'cohort_origin', + 'taxo_phylum', + 'taxo_genus', + 'kegg;kegg2', + 'eggnog', + 'sample_occurence_freq', + 'ind_occurence_freq', + 'kegg_functional_cat', + 'eggnog_functional_cat', + 'cohort_assembled' + ] + raw_line = "\t".join(raw_data) + expected_dict = { + 'igc_id': raw_data[0], + 'gene_id': raw_data[1], + 'length': raw_data[2], + 'gene_completeness_status': raw_data[3], + 'cohort_origin': raw_data[4], + 'taxo_phylum': raw_data[5], + 'taxo_genus': raw_data[6], + 'kegg_ko': ['kegg', 'kegg2'], + 'eggnog': raw_data[8], + 'sample_occurence_frequency': raw_data[9], + 'individual_occurence_frequency': raw_data[10], + 'kegg_functional_categories': raw_data[11], + 'eggnog_functional_categories': raw_data[12], + 'cohort_assembled': raw_data[13] + } + test_dict = IGCLineParser.gene(raw_line) + self.assertDictEqual(test_dict, expected_dict) diff --git a/backend/scripts/populate_db/import_igc_data.py b/backend/scripts/populate_db/import_igc_data.py index 126078dacfe0f49bf6f945ab0d38c18fb397ce8c..8fc5c755033876474b60e082b1eb99b5f3cf3e17 100755 --- a/backend/scripts/populate_db/import_igc_data.py +++ b/backend/scripts/populate_db/import_igc_data.py @@ -82,7 +82,7 @@ class ImportIGCGenes(object): def _clean_gene(self, gene_dict): gene_dict['gene_name'] = gene_dict['gene_id'] gene_dict['gene_id'] = slugify(gene_dict['gene_id']) - gene_dict['functions'] = [gene_dict.pop('kegg_ko')] + gene_dict['functions'] = gene_dict.pop('kegg_ko') if self.skip_tax: gene_dict.pop('taxonomy') if self.skip_functions or 'unknown' in gene_dict['functions']: @@ -102,6 +102,7 @@ class ImportIGCGenes(object): self.updated_genes += response.get('updated').get('count') except HTTPError as http_error: logging.warning("%s: %s; %s", http_error, http_error.response.json(), genes) + self.skipped_genes += len(genes) self.processed_genes += len(chunk_genes) logger.info("%s Genes processed so far...", self.processed_genes) logger.info("[DONE] %s/%s Genes created.", self.created_genes, self.total_genes) diff --git a/backend/scripts/populate_db/test_import_igc_data.py b/backend/scripts/populate_db/test_import_igc_data.py index 3c1e620a8735c00f7a63b3dc93525f4315e57940..55a5341efde35124e313b5f88dbf22663c6d8551 100644 --- a/backend/scripts/populate_db/test_import_igc_data.py +++ b/backend/scripts/populate_db/test_import_igc_data.py @@ -36,7 +36,7 @@ class TestParseGene(TestCase): expected_dict = { 'gene_id': 'gene_name', 'length': 'length', - 'kegg_ko': 'kegg', + 'kegg_ko': ['kegg'], 'taxo_phylum': 'taxo_phylum', 'taxo_genus': 'taxo_genus', } @@ -76,7 +76,7 @@ class TestCleanGene(TestCase): 'gene_id': 'gene.01', 'length': 135, 'taxonomy': 'Taxo', - 'kegg_ko': 'K00001' + 'kegg_ko': ['K00001'] } def test_clean_gene(self):