From 25377fe4f9e135f7dbd2ffae586eeeff0a52f665 Mon Sep 17 00:00:00 2001 From: Kenzo-Hugo Hillion <kenzo-hugo.hillion1@pasteur.fr> Date: Mon, 5 Aug 2019 15:05:50 +0200 Subject: [PATCH] reformat parser for IGC and add tests --- .../metagenedb/api/catalog/views/__init__.py | 2 - .../metagenedb/apps/catalog/admin/__init__.py | 9 ++-- .../apps/catalog/models/__init__.py | 9 ++-- .../apps/catalog/serializers/__init__.py | 9 ++-- .../common/utils/parsers/__init__.py | 3 ++ .../metagenedb/common/utils/parsers/igc.py | 51 +++++++++++++++++++ .../metagenedb/common/utils/parsers/kegg.py | 34 +++++++++++++ .../{parsers.py => parsers/ncbi_taxonomy.py} | 30 ----------- .../common/utils/parsers/test_igc.py | 48 +++++++++++++++++ .../common/utils/parsers/test_kegg.py | 22 ++++++++ .../test_ncbi_taxonomy.py} | 21 +------- .../scripts/populate_db/import_igc_data.py | 33 ++++-------- .../populate_db/test_import_igc_data.py | 37 ++++++++++++-- 13 files changed, 211 insertions(+), 97 deletions(-) create mode 100644 backend/metagenedb/common/utils/parsers/__init__.py create mode 100644 backend/metagenedb/common/utils/parsers/igc.py create mode 100644 backend/metagenedb/common/utils/parsers/kegg.py rename backend/metagenedb/common/utils/{parsers.py => parsers/ncbi_taxonomy.py} (76%) create mode 100644 backend/metagenedb/common/utils/parsers/test_igc.py create mode 100644 backend/metagenedb/common/utils/parsers/test_kegg.py rename backend/metagenedb/common/utils/{test_parsers.py => parsers/test_ncbi_taxonomy.py} (68%) diff --git a/backend/metagenedb/api/catalog/views/__init__.py b/backend/metagenedb/api/catalog/views/__init__.py index 2339a71..627996c 100644 --- a/backend/metagenedb/api/catalog/views/__init__.py +++ b/backend/metagenedb/api/catalog/views/__init__.py @@ -1,3 +1 @@ from .gene import GeneViewSet # noqa - -__all__ = ['GeneViewSet'] diff --git a/backend/metagenedb/apps/catalog/admin/__init__.py b/backend/metagenedb/apps/catalog/admin/__init__.py index 83c364a..f7d7aca 100644 
--- a/backend/metagenedb/apps/catalog/admin/__init__.py +++ b/backend/metagenedb/apps/catalog/admin/__init__.py @@ -1,6 +1,3 @@ -from .gene import GeneAdmin -from .function import FunctionAdmin, KeggOrthologyAdmin -from .taxonomy import TaxonomyAdmin - - -__all__ = ['GeneAdmin', 'FunctionAdmin', 'KeggOrthologyAdmin', 'TaxonomyAdmin'] +from .gene import GeneAdmin # noqa +from .function import FunctionAdmin, KeggOrthologyAdmin # noqa +from .taxonomy import TaxonomyAdmin # noqa diff --git a/backend/metagenedb/apps/catalog/models/__init__.py b/backend/metagenedb/apps/catalog/models/__init__.py index fe34c79..6968989 100644 --- a/backend/metagenedb/apps/catalog/models/__init__.py +++ b/backend/metagenedb/apps/catalog/models/__init__.py @@ -1,6 +1,3 @@ -from .function import Function, KeggOrthology -from .gene import Gene -from .taxonomy import Taxonomy - - -__all__ = ['Function', 'KeggOrthology', 'Gene', 'Taxonomy'] +from .function import Function, KeggOrthology # noqa +from .gene import Gene # noqa +from .taxonomy import Taxonomy # noqa diff --git a/backend/metagenedb/apps/catalog/serializers/__init__.py b/backend/metagenedb/apps/catalog/serializers/__init__.py index 5575274..9c3ab71 100644 --- a/backend/metagenedb/apps/catalog/serializers/__init__.py +++ b/backend/metagenedb/apps/catalog/serializers/__init__.py @@ -1,6 +1,3 @@ -from .function import FunctionSerializer -from .gene import GeneSerializer -from .taxonomy import TaxonomySerializer - - -__all__ = ['FunctionSerializer', 'GeneSerializer', 'TaxonomySerializer'] +from .function import FunctionSerializer # noqa +from .gene import GeneSerializer # noqa +from .taxonomy import TaxonomySerializer # noqa diff --git a/backend/metagenedb/common/utils/parsers/__init__.py b/backend/metagenedb/common/utils/parsers/__init__.py new file mode 100644 index 0000000..7c8b8f5 --- /dev/null +++ b/backend/metagenedb/common/utils/parsers/__init__.py @@ -0,0 +1,3 @@ +from .igc import IGCLineParser # noqa +from .kegg import 
KEGGLineParser # noqa +from .ncbi_taxonomy import NCBITaxonomyLineParser # noqa diff --git a/backend/metagenedb/common/utils/parsers/igc.py b/backend/metagenedb/common/utils/parsers/igc.py new file mode 100644 index 0000000..684b83b --- /dev/null +++ b/backend/metagenedb/common/utils/parsers/igc.py @@ -0,0 +1,51 @@ +import logging + +logging.basicConfig(level=logging.INFO) +_LOGGER = logging.getLogger(__name__) + + +class IGCLineParser(object): + + @staticmethod + def gene(line): + """ + Parse line from IGC genes list to return organized dict + + IGC annotation columns: + 0: Gene ID Unique ID + 1: Gene Name Unique name + 2: Gene Length Length of nucleotide sequence + 3: Gene Completeness Status Is the gene complete or partial according to the gene predictor + 4: Cohort Origin Stating the cohort contributing the representative gene + 5: Taxonomic Annotation(Phylum Level) Annotated phylum for a gene + 6: Taxonomic Annotation(Genus Level) Annotated genus for a gene + 7: KEGG Annotation Annotated KO(s) for a gene + 8: eggNOG Annotation Annotated eggNOG(s) for a gene + 9: Sample Occurence Frequency Occurrence frequency in samples based on gene profile + 10: Individual Occurence Frequency Occurrence frequency in individuals based on gene profile + 11: KEGG Functional Categories KEGG functional category(ies) of the annotated KO(s) + 12: eggNOG Functional Categories eggNOG functional category(ies) of the annotated eggNOG(s) + 13: Cohort Assembled Stating the metagenomic sequencing cohort(s) contributing the + representative gene or a redundant gene belonging to it + """ + try: + gene_info = line.rstrip().split('\t') + return { + 'igc_id': gene_info[0], + 'gene_id': gene_info[1], + 'gene_length': gene_info[2], + 'gene_completeness_status': gene_info[3], + 'cohort_origin': gene_info[4], + 'taxo_phylum': gene_info[5], + 'taxo_genus': gene_info[6], + 'kegg_ko': gene_info[7], + 'eggnog': gene_info[8], + 'sample_occurence_frequency': gene_info[9], + 
'individual_occurence_frequency': gene_info[10], + 'kegg_functional_categories': gene_info[11], + 'eggnog_functional_categories': gene_info[12], + 'cohort_assembled': gene_info[13] + } + except Exception: + _LOGGER.error(f"Could not parse: {line.rstrip()}. Are you sure it comes from IGC genes list?") + raise diff --git a/backend/metagenedb/common/utils/parsers/kegg.py b/backend/metagenedb/common/utils/parsers/kegg.py new file mode 100644 index 0000000..2ed3f87 --- /dev/null +++ b/backend/metagenedb/common/utils/parsers/kegg.py @@ -0,0 +1,34 @@ +import logging + +logging.basicConfig(level=logging.INFO) +_LOGGER = logging.getLogger(__name__) + + +class KEGGLineParser(object): + + @staticmethod + def ko_list(line): + """ + Parse line from kegg KO list (http://rest.kegg.jp/list/ko) to return organized dict + """ + try: + elements = line.split('\t') + function_id = elements[0].split(':')[1] + if ';' in elements[1]: + names = elements[1].split(';') + else: + _LOGGER.warning(f"Parsing issue with {function_id}, corresponding line: {line}") + names = [elements[1], ''] # Ugly fix to handle one specific case with no name: K23479 + if '[EC:' in names[1]: + ec_number = names[1].split('[EC:')[1].rstrip(']') + else: + ec_number = '' + return { + 'function_id': function_id, + 'name': names[0], + 'long_name': names[1].lstrip(), + 'ec_number': ec_number + } + except Exception: + _LOGGER.error(f"Could not parse: {line.rstrip()}. 
Are you sure it comes from KEGG KO list?") + raise diff --git a/backend/metagenedb/common/utils/parsers.py b/backend/metagenedb/common/utils/parsers/ncbi_taxonomy.py similarity index 76% rename from backend/metagenedb/common/utils/parsers.py rename to backend/metagenedb/common/utils/parsers/ncbi_taxonomy.py index 3e91b96..d67d3d1 100644 --- a/backend/metagenedb/common/utils/parsers.py +++ b/backend/metagenedb/common/utils/parsers/ncbi_taxonomy.py @@ -4,36 +4,6 @@ logging.basicConfig(level=logging.INFO) _LOGGER = logging.getLogger(__name__) -class KEGGLineParser(object): - - @staticmethod - def ko_list(line): - """ - Parse line from kegg KO list (http://rest.kegg.jp/list/ko) to return organized dict - """ - try: - elements = line.split('\t') - function_id = elements[0].split(':')[1] - if ';' in elements[1]: - names = elements[1].split(';') - else: - _LOGGER.warning(f"Parsing issue with {function_id}, corresponding line: {line}") - names = [elements[1], ''] # Ugly fix to handle one specific case with no name: K23479 - if '[EC:' in names[1]: - ec_number = names[1].split('[EC:')[1].rstrip(']') - else: - ec_number = '' - return { - 'function_id': function_id, - 'name': names[0], - 'long_name': names[1].lstrip(), - 'ec_number': ec_number - } - except Exception: - _LOGGER.error(f"Could not parse: {line.rstrip()}. 
Are you sure it comes from KEGG KO list?") - raise - - class NCBITaxonomyLineParser(object): @staticmethod diff --git a/backend/metagenedb/common/utils/parsers/test_igc.py b/backend/metagenedb/common/utils/parsers/test_igc.py new file mode 100644 index 0000000..9bfe4f4 --- /dev/null +++ b/backend/metagenedb/common/utils/parsers/test_igc.py @@ -0,0 +1,48 @@ +from unittest import TestCase + +from metagenedb.common.utils.parsers import IGCLineParser + + +class TestIGCLineParser(TestCase): + + def test_gene(self): + raw_data = [ + 'gene_id', + 'gene_name', + 'gene_length', + 'gene_completeness_status', + 'cohort_origin', + 'taxo_phylum', + 'taxo_genus', + 'kegg', + 'eggnog', + 'sample_occurence_freq', + 'ind_occurence_freq', + 'kegg_functional_cat', + 'eggnog_functional_cat', + 'cohort_assembled' + ] + raw_line = "\t".join(raw_data) + expected_dict = { + 'igc_id': raw_data[0], + 'gene_id': raw_data[1], + 'gene_length': raw_data[2], + 'gene_completeness_status': raw_data[3], + 'cohort_origin': raw_data[4], + 'taxo_phylum': raw_data[5], + 'taxo_genus': raw_data[6], + 'kegg_ko': raw_data[7], + 'eggnog': raw_data[8], + 'sample_occurence_frequency': raw_data[9], + 'individual_occurence_frequency': raw_data[10], + 'kegg_functional_categories': raw_data[11], + 'eggnog_functional_categories': raw_data[12], + 'cohort_assembled': raw_data[13] + } + test_dict = IGCLineParser.gene(raw_line) + self.assertDictEqual(test_dict, expected_dict) + + def test_gene_wrong_format(self): + raw_line = "This is a wrong line format, with; information and tab" + with self.assertRaises(Exception) as context: # noqa + IGCLineParser.gene(raw_line) diff --git a/backend/metagenedb/common/utils/parsers/test_kegg.py b/backend/metagenedb/common/utils/parsers/test_kegg.py new file mode 100644 index 0000000..e726d68 --- /dev/null +++ b/backend/metagenedb/common/utils/parsers/test_kegg.py @@ -0,0 +1,22 @@ +from unittest import TestCase + +from metagenedb.common.utils.parsers import KEGGLineParser + + +class 
TestKEGGLineParser(TestCase): + + def test_ko_list(self): + ko_line = "ko:K00809 DHPS, dys; deoxyhypusine synthase [EC:2.5.1.46]" + expected_dict = { + 'function_id': "K00809", + 'name': "DHPS, dys", + 'long_name': "deoxyhypusine synthase [EC:2.5.1.46]", + 'ec_number': "2.5.1.46" + } + test_dict = KEGGLineParser.ko_list(ko_line) + self.assertDictEqual(test_dict, expected_dict) + + def test_ko_list_wrong_format(self): + ko_line = "This is a wrong line format, with; information and tab" + with self.assertRaises(Exception) as context: # noqa + KEGGLineParser.ko_list(ko_line) diff --git a/backend/metagenedb/common/utils/test_parsers.py b/backend/metagenedb/common/utils/parsers/test_ncbi_taxonomy.py similarity index 68% rename from backend/metagenedb/common/utils/test_parsers.py rename to backend/metagenedb/common/utils/parsers/test_ncbi_taxonomy.py index 902ad84..1c65803 100644 --- a/backend/metagenedb/common/utils/test_parsers.py +++ b/backend/metagenedb/common/utils/parsers/test_ncbi_taxonomy.py @@ -1,25 +1,6 @@ from unittest import TestCase -from metagenedb.common.utils.parsers import KEGGLineParser, NCBITaxonomyLineParser - - -class TestKEGGLineParser(TestCase): - - def test_ko_list(self): - ko_line = "ko:K00809 DHPS, dys; deoxyhypusine synthase [EC:2.5.1.46]" - expected_dict = { - 'function_id': "K00809", - 'name': "DHPS, dys", - 'long_name': "deoxyhypusine synthase [EC:2.5.1.46]", - 'ec_number': "2.5.1.46" - } - test_dict = KEGGLineParser.ko_list(ko_line) - self.assertDictEqual(test_dict, expected_dict) - - def test_ko_list_wrong_format(self): - ko_line = "This is a wrong line format, with; information and tab" - with self.assertRaises(Exception) as context: # noqa - KEGGLineParser.ko_list(ko_line) +from metagenedb.common.utils.parsers import NCBITaxonomyLineParser class TestNCBITaxonomyLineParser(TestCase): diff --git a/backend/scripts/populate_db/import_igc_data.py b/backend/scripts/populate_db/import_igc_data.py index 2faae48..b169f1d 100755 --- 
a/backend/scripts/populate_db/import_igc_data.py +++ b/backend/scripts/populate_db/import_igc_data.py @@ -8,6 +8,8 @@ from itertools import islice import django from rest_framework.exceptions import ValidationError +from metagenedb.common.utils.parsers import IGCLineParser + # Before model import, we need to called django.setup() to Load apps os.environ.setdefault("DJANGO_SETTINGS_MODULE", "metagenedb.settings") django.setup() @@ -18,32 +20,17 @@ from metagenedb.apps.catalog.serializers import GeneSerializer # noqa logging.basicConfig(level=logging.INFO) _LOGGER = logging.getLogger(__name__) +SELECTED_KEYS = ['gene_id', 'gene_length', 'kegg_ko'] + -def parse_gene(raw_line): +def parse_gene(raw_line, selected_keys=SELECTED_KEYS): """ - IGC annotation columns: - 0: Gene ID Unique ID - 1: Gene Name Unique name - 2: Gene Length Length of nucleotide sequence - 3: Gene Completeness Status Stating a gene is complete or partial according to the gene predictor - 4: Cohort Origin Stating the cohort contributing the representative gene - 5: Taxonomic Annotation(Phylum Level) Annotated phylum for a gene - 6: Taxonomic Annotation(Genus Level) Annotated genus for a gene - 7: KEGG Annotation Annotated KO(s) for a gene - 8: eggNOG Annotation Annotated eggNOG(s) for a gene - 9: Sample Occurence Frequency Occurrence frequency in samples based on gene profile - 10:Individual Occurence Frequency Occurrence frequency in individuals based on gene profile - 11: KEGG Functional Categories KEGG functional category(ies) of the annotated KO(s) - 12: eggNOG Functional Categories eggNOG functional category(ies) of the annotated eggNOG(s) - 13: Cohort Assembled Stating the metagenomic sequencing cohort(s) contributing the - representative gene or a redundant gene belonging to it + Use IGCLineParser and return selected keys """ - gene_info = raw_line.rstrip().split('\t') - return { - 'gene_id': gene_info[1], - 'gene_length': gene_info[2], - 'kegg_ko': gene_info[7] - } + gene_parser = 
IGCLineParser() + all_dict = gene_parser.gene(raw_line) + selected_dict = {k: v for k, v in all_dict.items() if k in selected_keys} + return selected_dict def upsert_gene(gene_dict): diff --git a/backend/scripts/populate_db/test_import_igc_data.py b/backend/scripts/populate_db/test_import_igc_data.py index f06e889..de03536 100644 --- a/backend/scripts/populate_db/test_import_igc_data.py +++ b/backend/scripts/populate_db/test_import_igc_data.py @@ -9,7 +9,7 @@ from scripts.populate_db.import_igc_data import parse_gene, upsert_gene class TestParseGene(TestCase): - def test_parse_gene(self): + def setUp(self): raw_data = [ 'gene_id', 'gene_name', @@ -26,13 +26,42 @@ class TestParseGene(TestCase): 'eggnog_functional_cat', 'cohort_assembled' ] - raw_line = "\t".join(raw_data) + self.raw_line = "\t".join(raw_data) + + def test_parse_gene_default_selected_keys(self): + """ + This test should fail and needs to be updated when SELECTED_KEYS is changed + """ expected_dict = { - 'gene_id': 'gene_name', # We use the gene name for our gene ID + 'gene_id': 'gene_name', 'gene_length': 'gene_length', 'kegg_ko': 'kegg' } - tested_dict = parse_gene(raw_line) + tested_dict = parse_gene(self.raw_line) + self.assertDictEqual(tested_dict, expected_dict) + + def test_parse_gene(self): + """ + Only the explicitly selected keys should be returned + """ + selected_keys = ['gene_id', 'gene_length'] + expected_dict = { + 'gene_id': 'gene_name', + 'gene_length': 'gene_length' + } + tested_dict = parse_gene(self.raw_line, selected_keys=selected_keys) + self.assertDictEqual(tested_dict, expected_dict) + + def test_parse_gene_unknown_key(self): + """ + Unknown key should be ignored + """ + selected_keys = ['gene_id', 'gene_length', 'secret_code'] + expected_dict = { + 'gene_id': 'gene_name', + 'gene_length': 'gene_length' + } + tested_dict = parse_gene(self.raw_line, selected_keys=selected_keys) self.assertDictEqual(tested_dict, expected_dict) -- GitLab