From 25377fe4f9e135f7dbd2ffae586eeeff0a52f665 Mon Sep 17 00:00:00 2001
From: Kenzo-Hugo Hillion <kenzo-hugo.hillion1@pasteur.fr>
Date: Mon, 5 Aug 2019 15:05:50 +0200
Subject: [PATCH] reformat parser for IGC and add tests

---
 .../metagenedb/api/catalog/views/__init__.py  |  2 -
 .../metagenedb/apps/catalog/admin/__init__.py |  9 ++--
 .../apps/catalog/models/__init__.py           |  9 ++--
 .../apps/catalog/serializers/__init__.py      |  9 ++--
 .../common/utils/parsers/__init__.py          |  3 ++
 .../metagenedb/common/utils/parsers/igc.py    | 51 +++++++++++++++++++
 .../metagenedb/common/utils/parsers/kegg.py   | 34 +++++++++++++
 .../{parsers.py => parsers/ncbi_taxonomy.py}  | 30 -----------
 .../common/utils/parsers/test_igc.py          | 48 +++++++++++++++++
 .../common/utils/parsers/test_kegg.py         | 22 ++++++++
 .../test_ncbi_taxonomy.py}                    | 21 +-------
 .../scripts/populate_db/import_igc_data.py    | 33 ++++--------
 .../populate_db/test_import_igc_data.py       | 37 ++++++++++++--
 13 files changed, 211 insertions(+), 97 deletions(-)
 create mode 100644 backend/metagenedb/common/utils/parsers/__init__.py
 create mode 100644 backend/metagenedb/common/utils/parsers/igc.py
 create mode 100644 backend/metagenedb/common/utils/parsers/kegg.py
 rename backend/metagenedb/common/utils/{parsers.py => parsers/ncbi_taxonomy.py} (76%)
 create mode 100644 backend/metagenedb/common/utils/parsers/test_igc.py
 create mode 100644 backend/metagenedb/common/utils/parsers/test_kegg.py
 rename backend/metagenedb/common/utils/{test_parsers.py => parsers/test_ncbi_taxonomy.py} (68%)

diff --git a/backend/metagenedb/api/catalog/views/__init__.py b/backend/metagenedb/api/catalog/views/__init__.py
index 2339a71..627996c 100644
--- a/backend/metagenedb/api/catalog/views/__init__.py
+++ b/backend/metagenedb/api/catalog/views/__init__.py
@@ -1,3 +1 @@
 from .gene import GeneViewSet  # noqa
-
-__all__ = ['GeneViewSet']
diff --git a/backend/metagenedb/apps/catalog/admin/__init__.py b/backend/metagenedb/apps/catalog/admin/__init__.py
index 83c364a..f7d7aca 100644
--- a/backend/metagenedb/apps/catalog/admin/__init__.py
+++ b/backend/metagenedb/apps/catalog/admin/__init__.py
@@ -1,6 +1,3 @@
-from .gene import GeneAdmin
-from .function import FunctionAdmin, KeggOrthologyAdmin
-from .taxonomy import TaxonomyAdmin
-
-
-__all__ = ['GeneAdmin', 'FunctionAdmin', 'KeggOrthologyAdmin', 'TaxonomyAdmin']
+from .gene import GeneAdmin  # noqa
+from .function import FunctionAdmin, KeggOrthologyAdmin  # noqa
+from .taxonomy import TaxonomyAdmin  # noqa
diff --git a/backend/metagenedb/apps/catalog/models/__init__.py b/backend/metagenedb/apps/catalog/models/__init__.py
index fe34c79..6968989 100644
--- a/backend/metagenedb/apps/catalog/models/__init__.py
+++ b/backend/metagenedb/apps/catalog/models/__init__.py
@@ -1,6 +1,3 @@
-from .function import Function, KeggOrthology
-from .gene import Gene
-from .taxonomy import Taxonomy
-
-
-__all__ = ['Function', 'KeggOrthology', 'Gene', 'Taxonomy']
+from .function import Function, KeggOrthology  # noqa
+from .gene import Gene  # noqa
+from .taxonomy import Taxonomy  # noqa
diff --git a/backend/metagenedb/apps/catalog/serializers/__init__.py b/backend/metagenedb/apps/catalog/serializers/__init__.py
index 5575274..9c3ab71 100644
--- a/backend/metagenedb/apps/catalog/serializers/__init__.py
+++ b/backend/metagenedb/apps/catalog/serializers/__init__.py
@@ -1,6 +1,3 @@
-from .function import FunctionSerializer
-from .gene import GeneSerializer
-from .taxonomy import TaxonomySerializer
-
-
-__all__ = ['FunctionSerializer', 'GeneSerializer', 'TaxonomySerializer']
+from .function import FunctionSerializer  # noqa
+from .gene import GeneSerializer  # noqa
+from .taxonomy import TaxonomySerializer  # noqa
diff --git a/backend/metagenedb/common/utils/parsers/__init__.py b/backend/metagenedb/common/utils/parsers/__init__.py
new file mode 100644
index 0000000..7c8b8f5
--- /dev/null
+++ b/backend/metagenedb/common/utils/parsers/__init__.py
@@ -0,0 +1,3 @@
+from .igc import IGCLineParser  # noqa
+from .kegg import KEGGLineParser  # noqa
+from .ncbi_taxonomy import NCBITaxonomyLineParser  # noqa
diff --git a/backend/metagenedb/common/utils/parsers/igc.py b/backend/metagenedb/common/utils/parsers/igc.py
new file mode 100644
index 0000000..684b83b
--- /dev/null
+++ b/backend/metagenedb/common/utils/parsers/igc.py
@@ -0,0 +1,51 @@
+import logging
+
+logging.basicConfig(level=logging.INFO)
+_LOGGER = logging.getLogger(__name__)
+
+
+class IGCLineParser(object):
+
+    @staticmethod
+    def gene(line):
+        """
+        Parse a line from the IGC genes list and return an organized dict
+
+        IGC annotation columns:
+            0: Gene ID	                            Unique ID
+            1: Gene Name                 	        Unique name
+            2: Gene Length	                        Length of nucleotide sequence
+            3: Gene Completeness Status	            Is the gene complete or partial according to the gene predictor
+            4: Cohort Origin	                    Stating the cohort contributing the representative gene
+            5: Taxonomic Annotation(Phylum Level)	Annotated phylum for a gene
+            6: Taxonomic Annotation(Genus Level)	Annotated genus for a gene
+            7: KEGG Annotation	                    Annotated KO(s) for a gene
+            8: eggNOG Annotation	                Annotated eggNOG(s) for a gene
+            9: Sample Occurence Frequency	        Occurrence frequency in samples based on gene profile
+            10: Individual Occurence Frequency	    Occurrence frequency in individuals based on gene profile
+            11: KEGG Functional Categories	        KEGG functional category(ies) of the annotated KO(s)
+            12: eggNOG Functional Categories	    eggNOG functional category(ies) of the annotated eggNOG(s)
+            13: Cohort Assembled	                Stating the metagenomic sequencing cohort(s) contributing the
+                                                    representative gene or a redundant gene belonging to it
+        """
+        try:
+            gene_info = line.rstrip().split('\t')
+            return {
+                'igc_id': gene_info[0],
+                'gene_id': gene_info[1],
+                'gene_length': gene_info[2],
+                'gene_completeness_status': gene_info[3],
+                'cohort_origin': gene_info[4],
+                'taxo_phylum': gene_info[5],
+                'taxo_genus': gene_info[6],
+                'kegg_ko': gene_info[7],
+                'eggnog': gene_info[8],
+                'sample_occurence_frequency': gene_info[9],
+                'individual_occurence_frequency': gene_info[10],
+                'kegg_functional_categories': gene_info[11],
+                'eggnog_functional_categories': gene_info[12],
+                'cohort_assembled': gene_info[13]
+            }
+        except Exception:
+            _LOGGER.error(f"Could not parse: {line.rstrip()}. Are you sure it comes from IGC genes list?")
+            raise
diff --git a/backend/metagenedb/common/utils/parsers/kegg.py b/backend/metagenedb/common/utils/parsers/kegg.py
new file mode 100644
index 0000000..2ed3f87
--- /dev/null
+++ b/backend/metagenedb/common/utils/parsers/kegg.py
@@ -0,0 +1,34 @@
+import logging
+
+logging.basicConfig(level=logging.INFO)
+_LOGGER = logging.getLogger(__name__)
+
+
+class KEGGLineParser(object):
+
+    @staticmethod
+    def ko_list(line):
+        """
+        Parse line from kegg KO list (http://rest.kegg.jp/list/ko) to return organized dict
+        """
+        try:
+            elements = line.split('\t')
+            function_id = elements[0].split(':')[1]
+            if ';' in elements[1]:
+                names = elements[1].split(';')
+            else:
+                _LOGGER.warning(f"Parsing issue with {function_id}, corresponding line: {line}")
+                names = [elements[1], '']  # Ugly fix to handle one specific case with no name: K23479
+            if '[EC:' in names[1]:
+                ec_number = names[1].split('[EC:')[1].rstrip(']')
+            else:
+                ec_number = ''
+            return {
+                'function_id': function_id,
+                'name': names[0],
+                'long_name': names[1].lstrip(),
+                'ec_number': ec_number
+            }
+        except Exception:
+            _LOGGER.error(f"Could not parse: {line.rstrip()}. Are you sure it comes from KEGG KO list?")
+            raise
diff --git a/backend/metagenedb/common/utils/parsers.py b/backend/metagenedb/common/utils/parsers/ncbi_taxonomy.py
similarity index 76%
rename from backend/metagenedb/common/utils/parsers.py
rename to backend/metagenedb/common/utils/parsers/ncbi_taxonomy.py
index 3e91b96..d67d3d1 100644
--- a/backend/metagenedb/common/utils/parsers.py
+++ b/backend/metagenedb/common/utils/parsers/ncbi_taxonomy.py
@@ -4,36 +4,6 @@ logging.basicConfig(level=logging.INFO)
 _LOGGER = logging.getLogger(__name__)
 
 
-class KEGGLineParser(object):
-
-    @staticmethod
-    def ko_list(line):
-        """
-        Parse line from kegg KO list (http://rest.kegg.jp/list/ko) to return organized dict
-        """
-        try:
-            elements = line.split('\t')
-            function_id = elements[0].split(':')[1]
-            if ';' in elements[1]:
-                names = elements[1].split(';')
-            else:
-                _LOGGER.warning(f"Parsing issue with {function_id}, corresponding line: {line}")
-                names = [elements[1], '']  # Ugly fix to handle one specific case with no name: K23479
-            if '[EC:' in names[1]:
-                ec_number = names[1].split('[EC:')[1].rstrip(']')
-            else:
-                ec_number = ''
-            return {
-                'function_id': function_id,
-                'name': names[0],
-                'long_name': names[1].lstrip(),
-                'ec_number': ec_number
-            }
-        except Exception:
-            _LOGGER.error(f"Could not parse: {line.rstrip()}. Are you sure it comes from KEGG KO list?")
-            raise
-
-
 class NCBITaxonomyLineParser(object):
 
     @staticmethod
diff --git a/backend/metagenedb/common/utils/parsers/test_igc.py b/backend/metagenedb/common/utils/parsers/test_igc.py
new file mode 100644
index 0000000..9bfe4f4
--- /dev/null
+++ b/backend/metagenedb/common/utils/parsers/test_igc.py
@@ -0,0 +1,48 @@
+from unittest import TestCase
+
+from metagenedb.common.utils.parsers import IGCLineParser
+
+
+class TestIGCLineParser(TestCase):
+
+    def test_gene(self):
+        raw_data = [
+            'gene_id',
+            'gene_name',
+            'gene_length',
+            'gene_completeness_status',
+            'cohort_origin',
+            'taxo_phylum',
+            'taxo_genus',
+            'kegg',
+            'eggnog',
+            'sample_occurence_freq',
+            'ind_occurence_freq',
+            'kegg_functional_cat',
+            'eggnog_functional_cat',
+            'cohort_assembled'
+        ]
+        raw_line = "\t".join(raw_data)
+        expected_dict = {
+            'igc_id': raw_data[0],
+            'gene_id': raw_data[1],
+            'gene_length': raw_data[2],
+            'gene_completeness_status': raw_data[3],
+            'cohort_origin': raw_data[4],
+            'taxo_phylum': raw_data[5],
+            'taxo_genus': raw_data[6],
+            'kegg_ko': raw_data[7],
+            'eggnog': raw_data[8],
+            'sample_occurence_frequency': raw_data[9],
+            'individual_occurence_frequency': raw_data[10],
+            'kegg_functional_categories': raw_data[11],
+            'eggnog_functional_categories': raw_data[12],
+            'cohort_assembled': raw_data[13]
+        }
+        test_dict = IGCLineParser.gene(raw_line)
+        self.assertDictEqual(test_dict, expected_dict)
+
+    def test_gene_wrong_format(self):
+        raw_line = "This is a wrong line format, with; information   and tab"
+        with self.assertRaises(Exception) as context:  # noqa
+            IGCLineParser.gene(raw_line)
diff --git a/backend/metagenedb/common/utils/parsers/test_kegg.py b/backend/metagenedb/common/utils/parsers/test_kegg.py
new file mode 100644
index 0000000..e726d68
--- /dev/null
+++ b/backend/metagenedb/common/utils/parsers/test_kegg.py
@@ -0,0 +1,22 @@
+from unittest import TestCase
+
+from metagenedb.common.utils.parsers import KEGGLineParser
+
+
+class TestKEGGLineParser(TestCase):
+
+    def test_ko_list(self):
+        ko_line = "ko:K00809	DHPS, dys; deoxyhypusine synthase [EC:2.5.1.46]"
+        expected_dict = {
+                'function_id': "K00809",
+                'name': "DHPS, dys",
+                'long_name': "deoxyhypusine synthase [EC:2.5.1.46]",
+                'ec_number': "2.5.1.46"
+            }
+        test_dict = KEGGLineParser.ko_list(ko_line)
+        self.assertDictEqual(test_dict, expected_dict)
+
+    def test_ko_list_wrong_format(self):
+        ko_line = "This is a wrong line format, with; information   and tab"
+        with self.assertRaises(Exception) as context:  # noqa
+            KEGGLineParser.ko_list(ko_line)
diff --git a/backend/metagenedb/common/utils/test_parsers.py b/backend/metagenedb/common/utils/parsers/test_ncbi_taxonomy.py
similarity index 68%
rename from backend/metagenedb/common/utils/test_parsers.py
rename to backend/metagenedb/common/utils/parsers/test_ncbi_taxonomy.py
index 902ad84..1c65803 100644
--- a/backend/metagenedb/common/utils/test_parsers.py
+++ b/backend/metagenedb/common/utils/parsers/test_ncbi_taxonomy.py
@@ -1,25 +1,6 @@
 from unittest import TestCase
 
-from metagenedb.common.utils.parsers import KEGGLineParser, NCBITaxonomyLineParser
-
-
-class TestKEGGLineParser(TestCase):
-
-    def test_ko_list(self):
-        ko_line = "ko:K00809	DHPS, dys; deoxyhypusine synthase [EC:2.5.1.46]"
-        expected_dict = {
-                'function_id': "K00809",
-                'name': "DHPS, dys",
-                'long_name': "deoxyhypusine synthase [EC:2.5.1.46]",
-                'ec_number': "2.5.1.46"
-            }
-        test_dict = KEGGLineParser.ko_list(ko_line)
-        self.assertDictEqual(test_dict, expected_dict)
-
-    def test_ko_list_wrong_format(self):
-        ko_line = "This is a wrong line format, with; information   and tab"
-        with self.assertRaises(Exception) as context:  # noqa
-            KEGGLineParser.ko_list(ko_line)
+from metagenedb.common.utils.parsers import NCBITaxonomyLineParser
 
 
 class TestNCBITaxonomyLineParser(TestCase):
diff --git a/backend/scripts/populate_db/import_igc_data.py b/backend/scripts/populate_db/import_igc_data.py
index 2faae48..b169f1d 100755
--- a/backend/scripts/populate_db/import_igc_data.py
+++ b/backend/scripts/populate_db/import_igc_data.py
@@ -8,6 +8,8 @@ from itertools import islice
 import django
 from rest_framework.exceptions import ValidationError
 
+from metagenedb.common.utils.parsers import IGCLineParser
+
 # Before model import, we need to called django.setup() to Load apps
 os.environ.setdefault("DJANGO_SETTINGS_MODULE", "metagenedb.settings")
 django.setup()
@@ -18,32 +20,17 @@ from metagenedb.apps.catalog.serializers import GeneSerializer  # noqa
 logging.basicConfig(level=logging.INFO)
 _LOGGER = logging.getLogger(__name__)
 
+SELECTED_KEYS = ['gene_id', 'gene_length', 'kegg_ko']
+
 
-def parse_gene(raw_line):
+def parse_gene(raw_line, selected_keys=SELECTED_KEYS):
     """
-    IGC annotation columns:
-        0: Gene ID	                            Unique ID
-        1: Gene Name                 	        Unique name
-        2: Gene Length	                        Length of nucleotide sequence
-        3: Gene Completeness Status	            Stating a gene is complete or partial according to the gene predictor
-        4: Cohort Origin	                    Stating the cohort contributing the representative gene
-        5: Taxonomic Annotation(Phylum Level)	Annotated phylum for a gene
-        6: Taxonomic Annotation(Genus Level)	Annotated genus for a gene
-        7: KEGG Annotation	                    Annotated KO(s) for a gene
-        8: eggNOG Annotation	                Annotated eggNOG(s) for a gene
-        9: Sample Occurence Frequency	        Occurrence frequency in samples based on gene profile
-        10:Individual Occurence Frequency	    Occurrence frequency in individuals based on gene profile
-        11: KEGG Functional Categories	        KEGG functional category(ies) of the annotated KO(s)
-        12: eggNOG Functional Categories	    eggNOG functional category(ies) of the annotated eggNOG(s)
-        13: Cohort Assembled	                Stating the metagenomic sequencing cohort(s) contributing the
-                                                representative gene or a redundant gene belonging to it
+    Use IGCLineParser and return selected keys
     """
-    gene_info = raw_line.rstrip().split('\t')
-    return {
-        'gene_id': gene_info[1],
-        'gene_length': gene_info[2],
-        'kegg_ko': gene_info[7]
-    }
+    gene_parser = IGCLineParser()
+    all_dict = gene_parser.gene(raw_line)
+    selected_dict = {k: v for k, v in all_dict.items() if k in selected_keys}
+    return selected_dict
 
 
 def upsert_gene(gene_dict):
diff --git a/backend/scripts/populate_db/test_import_igc_data.py b/backend/scripts/populate_db/test_import_igc_data.py
index f06e889..de03536 100644
--- a/backend/scripts/populate_db/test_import_igc_data.py
+++ b/backend/scripts/populate_db/test_import_igc_data.py
@@ -9,7 +9,7 @@ from scripts.populate_db.import_igc_data import parse_gene, upsert_gene
 
 class TestParseGene(TestCase):
 
-    def test_parse_gene(self):
+    def setUp(self):
         raw_data = [
             'gene_id',
             'gene_name',
@@ -26,13 +26,42 @@ class TestParseGene(TestCase):
             'eggnog_functional_cat',
             'cohort_assembled'
         ]
-        raw_line = "\t".join(raw_data)
+        self.raw_line = "\t".join(raw_data)
+
+    def test_parse_gene_default_selected_keys(self):
+        """
+        This test should fail and needs to be updated when SELECTED_KEYS is changed
+        """
         expected_dict = {
-            'gene_id': 'gene_name',  # We use the gene name for our gene ID
+            'gene_id': 'gene_name',
             'gene_length': 'gene_length',
             'kegg_ko': 'kegg'
         }
-        tested_dict = parse_gene(raw_line)
+        tested_dict = parse_gene(self.raw_line)
+        self.assertDictEqual(tested_dict, expected_dict)
+
+    def test_parse_gene(self):
+        """
+        Only the keys explicitly passed through selected_keys should be returned
+        """
+        selected_keys = ['gene_id', 'gene_length']
+        expected_dict = {
+            'gene_id': 'gene_name',
+            'gene_length': 'gene_length'
+        }
+        tested_dict = parse_gene(self.raw_line, selected_keys=selected_keys)
+        self.assertDictEqual(tested_dict, expected_dict)
+
+    def test_parse_gene_unknown_key(self):
+        """
+        Unknown key should be ignored
+        """
+        selected_keys = ['gene_id', 'gene_length', 'secret_code']
+        expected_dict = {
+            'gene_id': 'gene_name',
+            'gene_length': 'gene_length'
+        }
+        tested_dict = parse_gene(self.raw_line, selected_keys=selected_keys)
         self.assertDictEqual(tested_dict, expected_dict)
 
 
-- 
GitLab