diff --git a/backend/metagenedb/api/catalog/views/__init__.py b/backend/metagenedb/api/catalog/views/__init__.py index 2339a71e6ba3fe7b9e18ece2578e7a12eaf87537..627996c6dd3dcd1b6fe0ef009fb256aa41b8d6cb 100644 --- a/backend/metagenedb/api/catalog/views/__init__.py +++ b/backend/metagenedb/api/catalog/views/__init__.py @@ -1,3 +1 @@ from .gene import GeneViewSet # noqa - -__all__ = ['GeneViewSet'] diff --git a/backend/metagenedb/apps/catalog/admin/__init__.py b/backend/metagenedb/apps/catalog/admin/__init__.py index 83c364a618054e165395043777680eb0c3a8b484..f7d7aca57c0b5b0c860d3a70d7887edc6cef111a 100644 --- a/backend/metagenedb/apps/catalog/admin/__init__.py +++ b/backend/metagenedb/apps/catalog/admin/__init__.py @@ -1,6 +1,3 @@ -from .gene import GeneAdmin -from .function import FunctionAdmin, KeggOrthologyAdmin -from .taxonomy import TaxonomyAdmin - - -__all__ = ['GeneAdmin', 'FunctionAdmin', 'KeggOrthologyAdmin', 'TaxonomyAdmin'] +from .gene import GeneAdmin # noqa +from .function import FunctionAdmin, KeggOrthologyAdmin # noqa +from .taxonomy import TaxonomyAdmin # noqa diff --git a/backend/metagenedb/apps/catalog/models/__init__.py b/backend/metagenedb/apps/catalog/models/__init__.py index fe34c798936bd74802ff63cb82e5c40ba5d16cf0..696898934e377dce77652053d8169a7c5aa8a77b 100644 --- a/backend/metagenedb/apps/catalog/models/__init__.py +++ b/backend/metagenedb/apps/catalog/models/__init__.py @@ -1,6 +1,3 @@ -from .function import Function, KeggOrthology -from .gene import Gene -from .taxonomy import Taxonomy - - -__all__ = ['Function', 'KeggOrthology', 'Gene', 'Taxonomy'] +from .function import Function, KeggOrthology # noqa +from .gene import Gene # noqa +from .taxonomy import Taxonomy # noqa diff --git a/backend/metagenedb/apps/catalog/serializers/__init__.py b/backend/metagenedb/apps/catalog/serializers/__init__.py index 5575274c9db546d89c943926cbb42dcf119b4be0..9c3ab71811f0c7a901c3ba1506d24e06d51f30ab 100644 --- 
a/backend/metagenedb/apps/catalog/serializers/__init__.py +++ b/backend/metagenedb/apps/catalog/serializers/__init__.py @@ -1,6 +1,3 @@ -from .function import FunctionSerializer -from .gene import GeneSerializer -from .taxonomy import TaxonomySerializer - - -__all__ = ['FunctionSerializer', 'GeneSerializer', 'TaxonomySerializer'] +from .function import FunctionSerializer # noqa +from .gene import GeneSerializer # noqa +from .taxonomy import TaxonomySerializer # noqa diff --git a/backend/metagenedb/common/utils/parsers/__init__.py b/backend/metagenedb/common/utils/parsers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..7c8b8f5616f3ab00a535864b65bf390c1e5ac0f7 --- /dev/null +++ b/backend/metagenedb/common/utils/parsers/__init__.py @@ -0,0 +1,3 @@ +from .igc import IGCLineParser # noqa +from .kegg import KEGGLineParser # noqa +from .ncbi_taxonomy import NCBITaxonomyLineParser # noqa diff --git a/backend/metagenedb/common/utils/parsers/igc.py b/backend/metagenedb/common/utils/parsers/igc.py new file mode 100644 index 0000000000000000000000000000000000000000..684b83b2f015fe8b656c5ab991d7e3f8fceba2db --- /dev/null +++ b/backend/metagenedb/common/utils/parsers/igc.py @@ -0,0 +1,51 @@ +import logging + +logging.basicConfig(level=logging.INFO) +_LOGGER = logging.getLogger(__name__) + + +class IGCLineParser(object): + + @staticmethod + def gene(line): + """ + Parse line from IGC genes list () to return organized dict + + IGC annotation columns: + 0: Gene ID Unique ID + 1: Gene Name Unique name + 2: Gene Length Length of nucleotide sequence + 3: Gene Completeness Status Stating a gene is complete or partial according to the gene predictor + 4: Cohort Origin Stating the cohort contributing the representative gene + 5: Taxonomic Annotation(Phylum Level) Annotated phylum for a gene + 6: Taxonomic Annotation(Genus Level) Annotated genus for a gene + 7: KEGG Annotation Annotated KO(s) for a gene + 8: eggNOG Annotation Annotated eggNOG(s) for a gene + 9: 
Sample Occurence Frequency Occurrence frequency in samples based on gene profile + 10: Individual Occurence Frequency Occurrence frequency in individuals based on gene profile + 11: KEGG Functional Categories KEGG functional category(ies) of the annotated KO(s) + 12: eggNOG Functional Categories eggNOG functional category(ies) of the annotated eggNOG(s) + 13: Cohort Assembled Stating the metagenomic sequencing cohort(s) contributing the + representative gene or a redundant gene belonging to it + """ + try: + gene_info = line.rstrip().split('\t') + return { + 'igc_id': gene_info[0], + 'gene_id': gene_info[1], + 'gene_length': gene_info[2], + 'gene_completeness_status': gene_info[3], + 'cohort_origin': gene_info[4], + 'taxo_phylum': gene_info[5], + 'taxo_genus': gene_info[6], + 'kegg_ko': gene_info[7], + 'eggnog': gene_info[8], + 'sample_occurence_frequency': gene_info[9], + 'individual_occurence_frequency': gene_info[10], + 'kegg_functional_categories': gene_info[11], + 'eggnog_functional_categories': gene_info[12], + 'cohort_assembled': gene_info[13] + } + except Exception: + _LOGGER.error(f"Could not parse: {line.rstrip()}. 
Are you sure it comes from IGC genes list?") + raise diff --git a/backend/metagenedb/common/utils/parsers/kegg.py b/backend/metagenedb/common/utils/parsers/kegg.py new file mode 100644 index 0000000000000000000000000000000000000000..2ed3f8719829505668cba31a409c06e8ecf7dea7 --- /dev/null +++ b/backend/metagenedb/common/utils/parsers/kegg.py @@ -0,0 +1,34 @@ +import logging + +logging.basicConfig(level=logging.INFO) +_LOGGER = logging.getLogger(__name__) + + +class KEGGLineParser(object): + + @staticmethod + def ko_list(line): + """ + Parse line from kegg KO list (http://rest.kegg.jp/list/ko) to return organized dict + """ + try: + elements = line.split('\t') + function_id = elements[0].split(':')[1] + if ';' in elements[1]: + names = elements[1].split(';') + else: + _LOGGER.warning(f"Parsing issue with {function_id}, corresponding line: {line}") + names = [elements[1], ''] # Ugly fix to handle one specific case with no name: K23479 + if '[EC:' in names[1]: + ec_number = names[1].split('[EC:')[1].rstrip(']') + else: + ec_number = '' + return { + 'function_id': function_id, + 'name': names[0], + 'long_name': names[1].lstrip(), + 'ec_number': ec_number + } + except Exception: + _LOGGER.error(f"Could not parse: {line.rstrip()}. 
Are you sure it comes from KEGG KO list?") + raise diff --git a/backend/metagenedb/common/utils/parsers.py b/backend/metagenedb/common/utils/parsers/ncbi_taxonomy.py similarity index 76% rename from backend/metagenedb/common/utils/parsers.py rename to backend/metagenedb/common/utils/parsers/ncbi_taxonomy.py index 3e91b967c4244bfffab0003e576b4f272afab2de..d67d3d1b670e7cd6199cda735d1e35e15d94dd17 100644 --- a/backend/metagenedb/common/utils/parsers.py +++ b/backend/metagenedb/common/utils/parsers/ncbi_taxonomy.py @@ -4,36 +4,6 @@ logging.basicConfig(level=logging.INFO) _LOGGER = logging.getLogger(__name__) -class KEGGLineParser(object): - - @staticmethod - def ko_list(line): - """ - Parse line from kegg KO list (http://rest.kegg.jp/list/ko) to return organized dict - """ - try: - elements = line.split('\t') - function_id = elements[0].split(':')[1] - if ';' in elements[1]: - names = elements[1].split(';') - else: - _LOGGER.warning(f"Parsing issue with {function_id}, corresponding line: {line}") - names = [elements[1], ''] # Ugly fix to handle one specific case with no name: K23479 - if '[EC:' in names[1]: - ec_number = names[1].split('[EC:')[1].rstrip(']') - else: - ec_number = '' - return { - 'function_id': function_id, - 'name': names[0], - 'long_name': names[1].lstrip(), - 'ec_number': ec_number - } - except Exception: - _LOGGER.error(f"Could not parse: {line.rstrip()}. 
Are you sure it comes from KEGG KO list?") - raise - - class NCBITaxonomyLineParser(object): @staticmethod diff --git a/backend/metagenedb/common/utils/parsers/test_igc.py b/backend/metagenedb/common/utils/parsers/test_igc.py new file mode 100644 index 0000000000000000000000000000000000000000..9bfe4f4906925e98b465bcb203471734ea4e0a10 --- /dev/null +++ b/backend/metagenedb/common/utils/parsers/test_igc.py @@ -0,0 +1,48 @@ +from unittest import TestCase + +from metagenedb.common.utils.parsers import IGCLineParser + + +class TestIGCLineParser(TestCase): + + def test_gene(self): + raw_data = [ + 'gene_id', + 'gene_name', + 'gene_length', + 'gene_completeness_status', + 'cohort_origin', + 'taxo_phylum', + 'taxo_genus', + 'kegg', + 'eggnog', + 'sample_occurence_freq', + 'ind_occurence_freq', + 'kegg_functional_cat', + 'eggnog_functional_cat', + 'cohort_assembled' + ] + raw_line = "\t".join(raw_data) + expected_dict = { + 'igc_id': raw_data[0], + 'gene_id': raw_data[1], + 'gene_length': raw_data[2], + 'gene_completeness_status': raw_data[3], + 'cohort_origin': raw_data[4], + 'taxo_phylum': raw_data[5], + 'taxo_genus': raw_data[6], + 'kegg_ko': raw_data[7], + 'eggnog': raw_data[8], + 'sample_occurence_frequency': raw_data[9], + 'individual_occurence_frequency': raw_data[10], + 'kegg_functional_categories': raw_data[11], + 'eggnog_functional_categories': raw_data[12], + 'cohort_assembled': raw_data[13] + } + test_dict = IGCLineParser.gene(raw_line) + self.assertDictEqual(test_dict, expected_dict) + + def test_gene_wrong_format(self): + raw_line = "This is a wrong line format, with; information and tab" + with self.assertRaises(Exception) as context: # noqa + IGCLineParser.gene(raw_line) diff --git a/backend/metagenedb/common/utils/parsers/test_kegg.py b/backend/metagenedb/common/utils/parsers/test_kegg.py new file mode 100644 index 0000000000000000000000000000000000000000..e726d683047ad0aae7378be3d8a549a2daa126a7 --- /dev/null +++ 
b/backend/metagenedb/common/utils/parsers/test_kegg.py @@ -0,0 +1,22 @@ +from unittest import TestCase + +from metagenedb.common.utils.parsers import KEGGLineParser + + +class TestKEGGLineParser(TestCase): + + def test_ko_list(self): + ko_line = "ko:K00809 DHPS, dys; deoxyhypusine synthase [EC:2.5.1.46]" + expected_dict = { + 'function_id': "K00809", + 'name': "DHPS, dys", + 'long_name': "deoxyhypusine synthase [EC:2.5.1.46]", + 'ec_number': "2.5.1.46" + } + test_dict = KEGGLineParser.ko_list(ko_line) + self.assertDictEqual(test_dict, expected_dict) + + def test_ko_list_wrong_format(self): + ko_line = "This is a wrong line format, with; information and tab" + with self.assertRaises(Exception) as context: # noqa + KEGGLineParser.ko_list(ko_line) diff --git a/backend/metagenedb/common/utils/test_parsers.py b/backend/metagenedb/common/utils/parsers/test_ncbi_taxonomy.py similarity index 68% rename from backend/metagenedb/common/utils/test_parsers.py rename to backend/metagenedb/common/utils/parsers/test_ncbi_taxonomy.py index 902ad84b4bd8239b264464104ab301eeb250679c..1c65803715fef9051569219469523efb50968731 100644 --- a/backend/metagenedb/common/utils/test_parsers.py +++ b/backend/metagenedb/common/utils/parsers/test_ncbi_taxonomy.py @@ -1,25 +1,6 @@ from unittest import TestCase -from metagenedb.common.utils.parsers import KEGGLineParser, NCBITaxonomyLineParser - - -class TestKEGGLineParser(TestCase): - - def test_ko_list(self): - ko_line = "ko:K00809 DHPS, dys; deoxyhypusine synthase [EC:2.5.1.46]" - expected_dict = { - 'function_id': "K00809", - 'name': "DHPS, dys", - 'long_name': "deoxyhypusine synthase [EC:2.5.1.46]", - 'ec_number': "2.5.1.46" - } - test_dict = KEGGLineParser.ko_list(ko_line) - self.assertDictEqual(test_dict, expected_dict) - - def test_ko_list_wrong_format(self): - ko_line = "This is a wrong line format, with; information and tab" - with self.assertRaises(Exception) as context: # noqa - KEGGLineParser.ko_list(ko_line) +from 
metagenedb.common.utils.parsers import NCBITaxonomyLineParser class TestNCBITaxonomyLineParser(TestCase): diff --git a/backend/scripts/populate_db/import_igc_data.py b/backend/scripts/populate_db/import_igc_data.py index 2faae48803959ab5e8bde19f4605d42f25c46594..b169f1d2a83168c44ce3460e04e44d247b54fd8c 100755 --- a/backend/scripts/populate_db/import_igc_data.py +++ b/backend/scripts/populate_db/import_igc_data.py @@ -8,6 +8,8 @@ from itertools import islice import django from rest_framework.exceptions import ValidationError +from metagenedb.common.utils.parsers import IGCLineParser + # Before model import, we need to called django.setup() to Load apps os.environ.setdefault("DJANGO_SETTINGS_MODULE", "metagenedb.settings") django.setup() @@ -18,32 +20,17 @@ from metagenedb.apps.catalog.serializers import GeneSerializer # noqa logging.basicConfig(level=logging.INFO) _LOGGER = logging.getLogger(__name__) +SELECTED_KEYS = ['gene_id', 'gene_length', 'kegg_ko'] + -def parse_gene(raw_line): +def parse_gene(raw_line, selected_keys=SELECTED_KEYS): """ - IGC annotation columns: - 0: Gene ID Unique ID - 1: Gene Name Unique name - 2: Gene Length Length of nucleotide sequence - 3: Gene Completeness Status Stating a gene is complete or partial according to the gene predictor - 4: Cohort Origin Stating the cohort contributing the representative gene - 5: Taxonomic Annotation(Phylum Level) Annotated phylum for a gene - 6: Taxonomic Annotation(Genus Level) Annotated genus for a gene - 7: KEGG Annotation Annotated KO(s) for a gene - 8: eggNOG Annotation Annotated eggNOG(s) for a gene - 9: Sample Occurence Frequency Occurrence frequency in samples based on gene profile - 10:Individual Occurence Frequency Occurrence frequency in individuals based on gene profile - 11: KEGG Functional Categories KEGG functional category(ies) of the annotated KO(s) - 12: eggNOG Functional Categories eggNOG functional category(ies) of the annotated eggNOG(s) - 13: Cohort Assembled Stating the metagenomic 
sequencing cohort(s) contributing the - representative gene or a redundant gene belonging to it + Use IGCLineParser and return selected keys """ - gene_info = raw_line.rstrip().split('\t') - return { - 'gene_id': gene_info[1], - 'gene_length': gene_info[2], - 'kegg_ko': gene_info[7] - } + gene_parser = IGCLineParser() + all_dict = gene_parser.gene(raw_line) + selected_dict = {k: v for k, v in all_dict.items() if k in selected_keys} + return selected_dict def upsert_gene(gene_dict): diff --git a/backend/scripts/populate_db/test_import_igc_data.py b/backend/scripts/populate_db/test_import_igc_data.py index f06e88956c949a495d18d440c11d04083b999041..de03536b602f5ede6ee77b446e5ccab0e48bc88e 100644 --- a/backend/scripts/populate_db/test_import_igc_data.py +++ b/backend/scripts/populate_db/test_import_igc_data.py @@ -9,7 +9,7 @@ from scripts.populate_db.import_igc_data import parse_gene, upsert_gene class TestParseGene(TestCase): - def test_parse_gene(self): + def setUp(self): raw_data = [ 'gene_id', 'gene_name', @@ -26,13 +26,42 @@ class TestParseGene(TestCase): 'eggnog_functional_cat', 'cohort_assembled' ] - raw_line = "\t".join(raw_data) + self.raw_line = "\t".join(raw_data) + + def test_parse_gene_default_selected_keys(self): + """ + This test is expected to fail and needs to be updated when SELECTED_KEYS is changed + """ expected_dict = { - 'gene_id': 'gene_name', # We use the gene name for our gene ID + 'gene_id': 'gene_name', 'gene_length': 'gene_length', 'kegg_ko': 'kegg' } - tested_dict = parse_gene(raw_line) + tested_dict = parse_gene(self.raw_line) + self.assertDictEqual(tested_dict, expected_dict) + + def test_parse_gene(self): + """ + Only the explicitly selected keys should be returned + """ + selected_keys = ['gene_id', 'gene_length'] + expected_dict = { + 'gene_id': 'gene_name', + 'gene_length': 'gene_length' + } + tested_dict = parse_gene(self.raw_line, selected_keys=selected_keys) + self.assertDictEqual(tested_dict, expected_dict) + + 
def test_parse_gene_unknown_key(self): + """ + Unknown key should be ignored + """ + selected_keys = ['gene_id', 'gene_length', 'secret_code'] + expected_dict = { + 'gene_id': 'gene_name', + 'gene_length': 'gene_length' + } + tested_dict = parse_gene(self.raw_line, selected_keys=selected_keys) self.assertDictEqual(tested_dict, expected_dict)