#!/usr/bin/env python
import argparse
import logging
import sys
from itertools import islice
from requests.exceptions import HTTPError

from bioapi import MetageneDBCatalogGeneAPI, MetageneDBCatalogTaxonomyAPI
from slugify import slugify

from metagenedb.common.utils.parsers import IGCLineParser

_LOGGER = logging.getLogger(__name__)


class ImportIGCGenes(object):
    """
    Import the genes of an IGC annotation file into a MetageneDB instance.

    The annotation file is read by chunks of lines. Each line is parsed,
    linked to a taxonomy entry looked up through the taxonomy API, and then
    created or updated through the gene API.
    """

    # API client classes, kept as class attributes so they can be overridden.
    METAGENEDB_GENE_API = MetageneDBCatalogGeneAPI
    METAGENEDB_TAXONOMY_API = MetageneDBCatalogTaxonomyAPI

    # Keys of the parsed line holding the taxonomy names.
    PHYLUM_COL = 'taxo_phylum'
    GENUS_COL = 'taxo_genus'
    # Keys of the parsed line that are kept for the import.
    SELECTED_KEYS = ['gene_id', 'length', 'kegg_ko', PHYLUM_COL, GENUS_COL]

    def __init__(self, annotation_file, url, skip_tax=False, skip_functions=False):
        """
        :param annotation_file: path of the IGC annotation file to import
        :param url: base URL handed to both API clients
        :param skip_tax: if True, genes are sent without their taxonomy
        :param skip_functions: if True, genes are sent without their functions
        """
        self.annotation_file = annotation_file
        self.url = url
        self.metagenedb_gene_api = self.METAGENEDB_GENE_API(base_url=self.url)
        self.metagenedb_taxonomy_api = self.METAGENEDB_TAXONOMY_API(base_url=self.url)
        # Counters reported in the logs. processed_genes counts every line
        # read from the file, including the ones counted in skipped_genes.
        self.processed_genes = 0
        self.skipped_genes = 0
        # Skip some insertion if specified in script options
        self.skip_tax = skip_tax
        self.skip_functions = skip_functions

    def _parse_gene(self, raw_line, selected_keys=SELECTED_KEYS):
        """
        Use IGCLineParser and return selected keys

        :param raw_line: one line of the annotation file
        :param selected_keys: keys of the parsed line to keep
        :return: dict restricted to the selected keys
        """
        gene_parser = IGCLineParser()
        all_dict = gene_parser.gene(raw_line)
        return {k: v for k, v in all_dict.items() if k in selected_keys}

    def _select_taxonomy(self, taxonomy_dict, unknown_val='unknown'):
        """
        Select the taxonomy to be assigned for the gene.
        genus has priority on phylum. If both are unknown, or if the lookup
        returns no result, no 'taxonomy' key is added.

        The phylum and genus keys are always removed from the dict, which is
        modified in place and returned.

        :param taxonomy_dict: gene dict containing the phylum and genus keys
        :param unknown_val: value marking a missing phylum or genus
        :return: the same dict, with a 'taxonomy' key when one was found
        """
        phylum = taxonomy_dict.pop(self.PHYLUM_COL)
        genus = taxonomy_dict.pop(self.GENUS_COL)
        if self.skip_tax:
            # The taxonomy would be dropped from the payload anyway: do not
            # spend one API request per gene on a value that is never sent.
            return taxonomy_dict
        if genus != unknown_val:
            rank, name = 'genus', genus
        elif phylum != unknown_val:
            rank, name = 'phylum', phylum
        else:
            return taxonomy_dict
        resp_dict = self.metagenedb_taxonomy_api.get_all(params={'name': name, 'rank': rank})
        results = resp_dict['results']
        if not results:
            # Indexing the first result of an empty list would raise an
            # IndexError and abort the whole import.
            _LOGGER.warning(f"No result found for {rank} {name}. No taxonomy assigned.")
            return taxonomy_dict
        if len(results) > 1:
            _LOGGER.warning(f"More than 1 result found for {rank} {name}. First result is kept.")
        taxonomy_dict['taxonomy'] = results[0]['tax_id']
        return taxonomy_dict

    def _clean_gene(self, gene_dict):
        """
        Turn a parsed gene into the payload sent to the gene API.

        The original ID is kept as 'gene_name', 'gene_id' is slugified and
        'kegg_ko' is moved into a 'functions' list. The taxonomy and
        functions are removed when the matching skip option is set.
        The dict is modified in place and returned.
        """
        gene_dict['gene_name'] = gene_dict['gene_id']
        gene_dict['gene_id'] = slugify(gene_dict['gene_id'])
        gene_dict['functions'] = [{'function_id': gene_dict.pop('kegg_ko')}]
        if self.skip_tax:
            # 'taxonomy' is absent when no taxonomy was resolved for the gene
            gene_dict.pop('taxonomy', None)
        if self.skip_functions:
            gene_dict.pop('functions', None)
        return gene_dict

    def _upsert_gene(self, gene_dict):
        """
        Update the gene if it can be fetched from the API, create it otherwise.

        :raises HTTPError: when the update or the creation request fails
        """
        clean_gene_dict = self._clean_gene(gene_dict)
        gene_id = clean_gene_dict['gene_id']
        try:
            self.metagenedb_gene_api.get(gene_id)  # Try to get obj to check if it exists
        except HTTPError:
            # NOTE(review): any HTTP error on the lookup is read as "gene does
            # not exist"; confirm whether only a 404 should lead to a creation.
            self.metagenedb_gene_api.post(clean_gene_dict)
        else:
            # Kept out of the try block: a rejected update must be reported as
            # such instead of being retried as a creation of an existing gene.
            self.metagenedb_gene_api.put(clean_gene_dict, entry_id=gene_id)

    @staticmethod
    def _http_error_detail(error):
        """
        Return the JSON body of the response attached to an HTTPError, or the
        error message when there is no response or its body is not JSON.
        """
        try:
            return error.response.json()
        except (AttributeError, ValueError):
            return str(error)

    def _insert_gene_list(self, chunk_genes):
        """
        Parse and upsert every line of a chunk.

        A gene rejected by the gene API is logged and counted in
        skipped_genes; the import then carries on with the next line.

        :param chunk_genes: iterable of raw lines of the annotation file
        """
        for gene_line in chunk_genes:
            gene_dict = self._parse_gene(gene_line)
            gene_dict_with_taxo = self._select_taxonomy(gene_dict)
            try:
                self._upsert_gene(gene_dict_with_taxo)
            except HTTPError as e:
                self.skipped_genes += 1
                detail = self._http_error_detail(e)
                _LOGGER.warning(f"{detail} for gene_id: {gene_dict.get('gene_id')}. Insertion skipped.")

    def load_annotation_file_to_db_in_chunks(self, chunk_size=100000):
        """
        Read the annotation file chunk by chunk and send its genes to the API.

        :param chunk_size: maximum number of lines held in memory at once
        """
        with open(self.annotation_file, 'r') as file:
            while True:
                chunk_genes = list(islice(file, chunk_size))
                if not chunk_genes:
                    break
                self.processed_genes += len(chunk_genes)
                self._insert_gene_list(chunk_genes)
                _LOGGER.info(f"{self.processed_genes} genes inserted/updated so far...")
        _LOGGER.info(f"[DONE] {self.processed_genes} genes inserted/updated.")
        _LOGGER.info(f"[DONE] {self.skipped_genes} genes skipped.")


def parse_arguments():
    """
    Defines parser.

    Reads the command line and returns the parsed namespace with the
    attributes annotation, url, skip_taxonomy and skip_functions.
    Whenever argparse stops the program, the exit status is forced to 1.
    """
    parser = argparse.ArgumentParser(description='Populate database from a given IGC annotation file.')
    # Common arguments for analysis and annotations
    parser.add_argument('annotation', help='IGC annotation file')
    parser.add_argument('--url', help='base URL of the instance.', default='http://localhost/')
    parser.add_argument('--skip_taxonomy', action='store_true', help='Skip taxonomy information from genes.')
    parser.add_argument('--skip_functions', action='store_true', help='Skip functions information from genes.')

    try:
        return parser.parse_args()
    except SystemExit:
        # Whatever status argparse picked is replaced by 1.
        sys.exit(1)


def run():
    """
    Script entry point: parse the command line, then import the annotation
    file into the instance found at the given URL.
    """
    # Without a configured handler the INFO progress and summary messages are
    # dropped. basicConfig does nothing if the root logger already has handlers.
    logging.basicConfig(level=logging.INFO)
    args = parse_arguments()
    import_igc_genes = ImportIGCGenes(args.annotation, args.url,
                                      skip_tax=args.skip_taxonomy, skip_functions=args.skip_functions)
    import_igc_genes.load_annotation_file_to_db_in_chunks()


# Start the import only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    run()