[quick fix] Allow creating a gene with several KEGG KOs from IGC

c9eb07d8 · Kenzo-Hugo Hillion · db3f33e9 · c9eb07d8 · c9eb07d8 · c9eb07d8
Commit c9eb07d8 authored 5 years ago by Kenzo-Hugo Hillion
--- a/backend/metagenedb/common/utils/parsers/igc.py
+++ b/backend/metagenedb/common/utils/parsers/igc.py
@@ -37,7 +37,7 @@ class IGCLineParser(object):
                'cohort_origin': gene_info[4],
                'taxo_phylum': gene_info[5],
                'taxo_genus': gene_info[6],
-                'kegg_ko': gene_info[7],
+                'kegg_ko': gene_info[7].split(';'),
                'eggnog': gene_info[8],
                'sample_occurence_frequency': gene_info[9],
                'individual_occurence_frequency': gene_info[10],

--- a/backend/metagenedb/common/utils/parsers/test_igc.py
+++ b/backend/metagenedb/common/utils/parsers/test_igc.py
@@ -31,7 +31,7 @@ class TestIGCLineParser(TestCase):
            'cohort_origin': raw_data[4],
            'taxo_phylum': raw_data[5],
            'taxo_genus': raw_data[6],
-            'kegg_ko': raw_data[7],
+            'kegg_ko': [raw_data[7]],
            'eggnog': raw_data[8],
            'sample_occurence_frequency': raw_data[9],
            'individual_occurence_frequency': raw_data[10],
@@ -46,3 +46,40 @@ class TestIGCLineParser(TestCase):
        raw_line = "This is a wrong line format, with; information   and tab"
        with self.assertRaises(Exception) as context:  # noqa
            IGCLineParser.gene(raw_line)
+    def test_multiple_functions(self):
+        raw_data = [
+            'gene_id',
+            'gene_name',
+            'length',
+            'gene_completeness_status',
+            'cohort_origin',
+            'taxo_phylum',
+            'taxo_genus',
+            'kegg;kegg2',
+            'eggnog',
+            'sample_occurence_freq',
+            'ind_occurence_freq',
+            'kegg_functional_cat',
+            'eggnog_functional_cat',
+            'cohort_assembled'
+        ]
+        raw_line = "\t".join(raw_data)
+        expected_dict = {
+            'igc_id': raw_data[0],
+            'gene_id': raw_data[1],
+            'length': raw_data[2],
+            'gene_completeness_status': raw_data[3],
+            'cohort_origin': raw_data[4],
+            'taxo_phylum': raw_data[5],
+            'taxo_genus': raw_data[6],
+            'kegg_ko': ['kegg', 'kegg2'],
+            'eggnog': raw_data[8],
+            'sample_occurence_frequency': raw_data[9],
+            'individual_occurence_frequency': raw_data[10],
+            'kegg_functional_categories': raw_data[11],
+            'eggnog_functional_categories': raw_data[12],
+            'cohort_assembled': raw_data[13]
+        }
+        test_dict = IGCLineParser.gene(raw_line)
+        self.assertDictEqual(test_dict, expected_dict)
--- a/backend/scripts/populate_db/import_igc_data.py
+++ b/backend/scripts/populate_db/import_igc_data.py
@@ -82,7 +82,7 @@ class ImportIGCGenes(object):
    def _clean_gene(self, gene_dict):
        gene_dict['gene_name'] = gene_dict['gene_id']
        gene_dict['gene_id'] = slugify(gene_dict['gene_id'])
-        gene_dict['functions'] = [gene_dict.pop('kegg_ko')]
+        gene_dict['functions'] = gene_dict.pop('kegg_ko')
        if self.skip_tax:
            gene_dict.pop('taxonomy')
        if self.skip_functions or 'unknown' in gene_dict['functions']:
@@ -102,6 +102,7 @@ class ImportIGCGenes(object):
                    self.updated_genes += response.get('updated').get('count')
                except HTTPError as http_error:
                    logging.warning("%s: %s; %s", http_error, http_error.response.json(), genes)
+                    self.skipped_genes += len(genes)
                self.processed_genes += len(chunk_genes)
                logger.info("%s Genes processed so far...", self.processed_genes)
        logger.info("[DONE] %s/%s Genes created.", self.created_genes, self.total_genes)

--- a/backend/scripts/populate_db/test_import_igc_data.py
+++ b/backend/scripts/populate_db/test_import_igc_data.py
@@ -36,7 +36,7 @@ class TestParseGene(TestCase):
        expected_dict = {
            'gene_id': 'gene_name',
            'length': 'length',
-            'kegg_ko': 'kegg',
+            'kegg_ko': ['kegg'],
            'taxo_phylum': 'taxo_phylum',
            'taxo_genus': 'taxo_genus',
        }
@@ -76,7 +76,7 @@ class TestCleanGene(TestCase):
            'gene_id': 'gene.01',
            'length': 135,
            'taxonomy': 'Taxo',
-            'kegg_ko': 'K00001'
+            'kegg_ko': ['K00001']
        }
    def test_clean_gene(self):