import_igc_data.py 9.34 KB
Newer Older
1
#!/usr/bin/env python
Kenzo-Hugo Hillion's avatar
Kenzo-Hugo Hillion committed
2
3
import argparse
import logging
4
import os
Kenzo-Hugo Hillion's avatar
Kenzo-Hugo Hillion committed
5
6
7
import sys
from itertools import islice

8
9
10
11
12
from bioapi import (
    MetageneDBCatalogFunctionAPI,
    MetageneDBCatalogGeneAPI,
    MetageneDBCatalogTaxonomyAPI,
    MetageneDBCatalogKeggOrthologyAPI,
13
    MetageneDBCatalogEggNogAPI,
14
)
15
from requests.exceptions import HTTPError
16
from slugify import slugify
Kenzo-Hugo Hillion's avatar
Kenzo-Hugo Hillion committed
17

18
19
from metagenedb.common.utils.parsers import IGCLineParser

20
logging.basicConfig(format='[%(asctime)s] %(levelname)s:%(name)s:%(message)s')
21
logger = logging.getLogger()
Kenzo-Hugo Hillion's avatar
Kenzo-Hugo Hillion committed
22

23

24
25
class ImportIGCGenes(object):
    METAGENEDB_GENE_API = MetageneDBCatalogGeneAPI
26
    METAGENEDB_TAXONOMY_API = MetageneDBCatalogTaxonomyAPI
27
    METAGENEDB_FUNCTION_API = MetageneDBCatalogFunctionAPI
28
29
    METAGENEDB_KEGG_API = MetageneDBCatalogKeggOrthologyAPI
    METAGENEDB_EGGNOG_API = MetageneDBCatalogEggNogAPI
30
31
32

    PHYLUM_COL = 'taxo_phylum'
    GENUS_COL = 'taxo_genus'
33
    SELECTED_KEYS = ['gene_id', 'length', 'kegg_ko', 'eggnog', PHYLUM_COL, GENUS_COL]
Kenzo-Hugo Hillion's avatar
Kenzo-Hugo Hillion committed
34

35
    def __init__(self, annotation_file, url, jwt_token, skip_tax=False, skip_functions=False):
36
37
        self.annotation_file = annotation_file
        self.url = url
38
        self._open_api_endpoints(jwt_token)
39
40
        self.total_genes = self._get_number_genes()
        self._reset_counters()
41
42
43
        # Skip some insertion if specified in script options
        self.skip_tax = skip_tax
        self.skip_functions = skip_functions
Kenzo-Hugo Hillion's avatar
Kenzo-Hugo Hillion committed
44

45
46
47
48
49
50
51
52
53
54
55
56
57
    def _reset_counters(self):
        self.processed_genes = 0
        self.created_genes = 0
        self.updated_genes = 0
        self.skipped_genes = 0

    def _open_api_endpoints(self, jwt_token):
        self.metagenedb_gene_api = self.METAGENEDB_GENE_API(base_url=self.url, jwt_token=jwt_token)
        self.metagenedb_taxonomy_api = self.METAGENEDB_TAXONOMY_API(base_url=self.url, jwt_token=jwt_token)
        self.metagenedb_function_api = self.METAGENEDB_FUNCTION_API(base_url=self.url, jwt_token=jwt_token)
        self.metagenedb_kegg_api = self.METAGENEDB_KEGG_API(base_url=self.url, jwt_token=jwt_token)
        self.metagenedb_eggnog_api = self.METAGENEDB_EGGNOG_API(base_url=self.url, jwt_token=jwt_token)

58
59
    def _build_taxo_mapping(self, rank, page_size=1000):
        logger.info("Building local mapping for %s level...", rank)
60
61
62
63
        counter = 1
        next_page = None
        mapping = {}
        while counter == 1 or next_page is not None:
64
65
66
67
68
69
            params = {
                'page': counter,
                'page_size': page_size,
                'rank': rank,
            }
            current_page = self.metagenedb_taxonomy_api.get_all(params=params)
70
71
72
73
74
75
76
            next_page = current_page['next']
            mapping.update({
                value['name']: value['tax_id'] for value in current_page['results']
            })
            counter += 1
        return mapping

77
78
    def _retrieve_function_catalog(self, api, page_size=1000):
        logger.info("Building local catalog from %s...", api.ROUTE)
79
80
81
82
83
84
85
86
        counter = 1
        next_page = None
        functions = set()
        while counter == 1 or next_page is not None:
            params = {
                'page': counter,
                'page_size': page_size,
            }
87
            current_page = api.get_all(params=params)
88
89
90
91
92
            next_page = current_page['next']
            functions = functions.union(set(
                [item['function_id'] for item in current_page['results']]
            ))
            counter += 1
93
94
95
96
97
        return functions

    def build_function_mappings(self, page_size=1000):
        self.metagenedb_keggs = self._retrieve_function_catalog(self.metagenedb_kegg_api, page_size=page_size)
        self.metagenedb_eggnogs = self._retrieve_function_catalog(self.metagenedb_eggnog_api, page_size=page_size)
98

99
100
101
    def build_mapping(self, page_size=1000):
        self.phylum_mapping = self._build_taxo_mapping("phylum", page_size=page_size)
        self.genus_mapping = self._build_taxo_mapping("genus", page_size=page_size)
102

103
104
105
106
107
108
109
110
111
    def _get_number_genes(self):
        if not os.path.isfile(self.annotation_file):
            return 0
        with open(self.annotation_file) as f:
            for i, l in enumerate(f):
                pass
        return i + 1

    def _select_taxonomy(self, gene_dict, unknown_val='unknown'):
112
113
114
115
        """
        Select the taxonomy to be assigned for the gene.
        genus has priority on phylum. If both unknow, remove the taxonomy key
        """
116
117
        phylum = gene_dict.pop(self.PHYLUM_COL)
        genus = gene_dict.pop(self.GENUS_COL)
118
119
        if self.skip_tax:
            return gene_dict
120
        taxonomy_id = None
121
        if genus != unknown_val:
122
123
124
            taxonomy_id = self.genus_mapping.get(genus, None)
            if taxonomy_id is None:
                logger.warning("No tax_id found for genus %s" % genus)
125
        elif phylum != unknown_val:
126
127
128
129
130
            taxonomy_id = self.phylum_mapping.get(phylum, None)
            if taxonomy_id is None:
                logger.warning("No tax_id found for phylum %s" % genus)
        if taxonomy_id is not None:
            gene_dict.update({'taxonomy': taxonomy_id})
131
132
133
134
135
136
137
138
139
140
        return gene_dict

    def _parse_gene(self, raw_line, selected_keys=SELECTED_KEYS):
        """
        Use IGCLineParser and return selected keys
        """
        gene_parser = IGCLineParser()
        all_dict = gene_parser.gene(raw_line)
        selected_dict = {k: v for k, v in all_dict.items() if k in selected_keys}
        return selected_dict
141

142
143
144
    def _clean_functions(self, functions):
        clean_functions = []
        for function in functions:
145
146
147
148
149
            if function['function_id'] in getattr(self, f"metagenedb_{function['source']}s"):
                clean_functions.append(function['function_id'])
            else:
                logger.warning("Function %s not found from %s in metagenedb",
                               function['function_id'], function['source'])
150
151
        return clean_functions

152
    def _clean_gene(self, gene_dict):
Kenzo-Hugo Hillion's avatar
Kenzo-Hugo Hillion committed
153
        gene_dict['gene_name'] = gene_dict['gene_id']
154
        gene_dict['gene_id'] = slugify(gene_dict['gene_id'])
155
156
157
        gene_dict['functions'] = [
            {'source': 'kegg', 'function_id': v} for v in gene_dict.pop('kegg_ko') if v != 'unknown'] + \
            [{'source': 'eggnog', 'function_id': v} for v in gene_dict.pop('eggnog') if v != 'unknown']
158
        gene_dict = self._select_taxonomy(gene_dict)
159
        if self.skip_functions or not gene_dict['functions']:
160
            gene_dict.pop('functions')
161
162
        else:
            gene_dict['functions'] = self._clean_functions(gene_dict['functions'])
163
164
        return gene_dict

165
    def load_annotation_file_to_db_in_chunks(self, chunk_size=1000, test=False):
166
167
        if not self.skip_tax:
            self.build_mapping()
168
        if not self.skip_functions:
169
            self.build_function_mappings()
170
171
172
173
174
        with open(self.annotation_file, 'r') as file:
            while True:
                chunk_genes = list(islice(file, chunk_size))
                if not chunk_genes:
                    break
175
                genes = [self._clean_gene(self._parse_gene(i)) for i in chunk_genes]
176
177
178
179
180
181
                try:
                    response = self.metagenedb_gene_api.put(genes)
                    self.created_genes += response.get('created').get('count')
                    self.updated_genes += response.get('updated').get('count')
                except HTTPError as http_error:
                    logging.warning("%s: %s; %s", http_error, http_error.response.json(), genes)
182
                    self.skipped_genes += len(genes)
Kenzo-Hugo Hillion's avatar
Kenzo-Hugo Hillion committed
183
                self.processed_genes += len(chunk_genes)
184
                logger.info("%s Genes processed so far...", self.processed_genes)
185
186
                if test is True:
                    break
187
188
189
        logger.info("[DONE] %s/%s Genes created.", self.created_genes, self.total_genes)
        logger.info("[DONE] %s/%s Genes updated.", self.updated_genes, self.total_genes)
        logger.info("[DONE] %s/%s Genes skipped.", self.skipped_genes, self.total_genes)
Kenzo-Hugo Hillion's avatar
Kenzo-Hugo Hillion committed
190
191
192
193
194
195
196
197
198


def parse_arguments():
    """
    Defines parser.
    """
    parser = argparse.ArgumentParser(description='Populate database from a given IGC annotation file.')
    # Common arguments for analysis and annotations
    parser.add_argument('annotation', help='IGC annotation file')
Kenzo-Hugo Hillion's avatar
Kenzo-Hugo Hillion committed
199
    parser.add_argument('--url', help='base URL of the instance.', default='http://localhost/')
200
    parser.add_argument('-t', '--jwt_token', help='your JWT token obtain from web app', required=True)
Kenzo-Hugo Hillion's avatar
Kenzo-Hugo Hillion committed
201
202
    parser.add_argument('--chunk_size', type=int, default=1000,
                        help='How many genes to handle and create in the same time.')
203
204
    parser.add_argument('--skip_taxonomy', action='store_true', help='Skip taxonomy information from genes.')
    parser.add_argument('--skip_functions', action='store_true', help='Skip functions information from genes.')
205
    parser.add_argument('--test', action='store_true', help='Run only on first chunk.')
206
    parser.add_argument('-v', '--verbose', action='store_true')
Kenzo-Hugo Hillion's avatar
Kenzo-Hugo Hillion committed
207
208
209
210
211
212
213
214
215

    try:
        return parser.parse_args()
    except SystemExit:
        sys.exit(1)


def run():
    args = parse_arguments()
216
217
    if args.verbose:
        logger.setLevel(logging.INFO)
218
    import_igc_genes = ImportIGCGenes(args.annotation, args.url, args.jwt_token,
Kenzo-Hugo Hillion's avatar
Kenzo-Hugo Hillion committed
219
                                      skip_tax=args.skip_taxonomy, skip_functions=args.skip_functions)
220
    import_igc_genes.load_annotation_file_to_db_in_chunks(chunk_size=args.chunk_size, test=args.test)
Kenzo-Hugo Hillion's avatar
Kenzo-Hugo Hillion committed
221
222
223
224


if __name__ == "__main__":
    run()