import_igc_data.py 5.4 KB
Newer Older
1
#!/usr/bin/env python
Kenzo-Hugo Hillion's avatar
Kenzo-Hugo Hillion committed
2
3
import argparse
import logging
4
import os
Kenzo-Hugo Hillion's avatar
Kenzo-Hugo Hillion committed
5
6
7
import sys
from itertools import islice

8
from bioapi import MetageneDBCatalogGeneAPI, MetageneDBCatalogTaxonomyAPI
9
from slugify import slugify
Kenzo-Hugo Hillion's avatar
Kenzo-Hugo Hillion committed
10

11
12
from metagenedb.common.utils.parsers import IGCLineParser

13
14
# Install the default logging configuration; calling getLogger() without a
# name returns the root logger, so the verbosity chosen in run() applies to
# every logger that propagates to it.
logging.basicConfig()
logger = logging.getLogger()
Kenzo-Hugo Hillion's avatar
Kenzo-Hugo Hillion committed
15

16

17
18
class ImportIGCGenes(object):
    """
    Load the genes described in an IGC annotation file into a MetageneDB instance.

    The annotation file is read in chunks of lines; every line is parsed with
    IGCLineParser, reduced to SELECTED_KEYS, optionally linked to a taxonomy
    entry looked up through the taxonomy API, and finally sent to the gene API
    with a single ``put`` per chunk.
    """
    # API client classes, kept as class attributes so they can be swapped
    # (e.g. by a subclass) without touching __init__.
    METAGENEDB_GENE_API = MetageneDBCatalogGeneAPI
    METAGENEDB_TAXONOMY_API = MetageneDBCatalogTaxonomyAPI

    PHYLUM_COL = 'taxo_phylum'
    GENUS_COL = 'taxo_genus'
    # Keys of the parsed line that are kept for the import.
    SELECTED_KEYS = ['gene_id', 'length', 'kegg_ko', PHYLUM_COL, GENUS_COL]

    def __init__(self, annotation_file, url, skip_tax=False, skip_functions=False):
        """
        :param annotation_file: path to the IGC annotation file
        :param url: base URL handed to both API clients
        :param skip_tax: when True, genes are sent without a 'taxonomy' key
        :param skip_functions: when True, genes are sent without a 'functions' key
        """
        self.annotation_file = annotation_file
        self.url = url
        self.metagenedb_gene_api = self.METAGENEDB_GENE_API(base_url=self.url)
        self.metagenedb_taxonomy_api = self.METAGENEDB_TAXONOMY_API(base_url=self.url)
        self.total_genes = self._get_number_genes()
        self._reset_counters()
        # Skip some insertion if specified in script options
        self.skip_tax = skip_tax
        self.skip_functions = skip_functions

    def _reset_counters(self):
        """Set all progress counters back to zero."""
        self.processed_genes = 0
        self.created_genes = 0
        self.updated_genes = 0
        self.skipped_genes = 0

    def _get_number_genes(self):
        """
        Return the number of lines of the annotation file.

        Returns 0 when the path is not an existing file and also when the file
        is empty (counting with a generator avoids relying on a loop variable
        that is never bound for an empty file).
        """
        if not os.path.isfile(self.annotation_file):
            return 0
        with open(self.annotation_file) as f:
            return sum(1 for _ in f)

    def _select_taxonomy(self, gene_dict, unknown_val='unknown'):
        """
        Select the taxonomy to be assigned for the gene.

        The phylum and genus columns are removed from gene_dict. Genus has
        priority over phylum: the phylum is only looked up when the genus is
        unknown (there is no fallback to the phylum when a known genus yields
        no result). The 'taxonomy' key is set to the 'tax_id' of the first
        result, or to None when both ranks are unknown or nothing is found.

        :param gene_dict: parsed gene, must contain PHYLUM_COL and GENUS_COL
        :param unknown_val: value marking a rank as not assigned
        :return: the same dict, updated in place
        """
        phylum = gene_dict.pop(self.PHYLUM_COL)
        genus = gene_dict.pop(self.GENUS_COL)
        if genus != unknown_val:
            name, rank = genus, 'genus'
        elif phylum != unknown_val:
            name, rank = phylum, 'phylum'
        else:
            # Nothing to look up: no API request is made.
            gene_dict['taxonomy'] = None
            return gene_dict

        resp_dict = self.metagenedb_taxonomy_api.get_all(params={'name': name, 'rank': rank})
        if len(resp_dict['results']) > 1:
            logger.warning("More than 1 result found for %s %s. First result is kept.", rank, name)
        if resp_dict.get('count', 0) > 0:
            gene_dict['taxonomy'] = resp_dict['results'][0]['tax_id']
        else:
            gene_dict['taxonomy'] = None
        return gene_dict

    def _parse_gene(self, raw_line, selected_keys=SELECTED_KEYS):
        """
        Use IGCLineParser and return selected keys
        """
        gene_parser = IGCLineParser()
        all_dict = gene_parser.gene(raw_line)
        return {k: v for k, v in all_dict.items() if k in selected_keys}

    def _clean_gene(self, gene_dict):
        """
        Shape a parsed gene into the payload sent to the gene API.

        The original identifier is kept as 'gene_name' and 'gene_id' becomes
        its slug. 'kegg_ko' is always removed; it is turned into the
        'functions' list unless functions are skipped. 'taxonomy' is dropped
        when taxonomy is skipped (a missing key is tolerated).
        """
        gene_dict['gene_name'] = gene_dict['gene_id']
        gene_dict['gene_id'] = slugify(gene_dict['gene_id'])
        kegg_ko = gene_dict.pop('kegg_ko')
        if not self.skip_functions:
            gene_dict['functions'] = [{'function_id': kegg_ko}]
        if self.skip_tax:
            gene_dict.pop('taxonomy', None)
        return gene_dict

    def _build_gene(self, raw_line):
        """Turn one raw annotation line into the dict sent to the gene API."""
        gene_dict = self._parse_gene(raw_line)
        if self.skip_tax:
            # Taxonomy would be discarded anyway: drop the raw columns and
            # avoid one taxonomy API request per gene.
            gene_dict.pop(self.PHYLUM_COL, None)
            gene_dict.pop(self.GENUS_COL, None)
        else:
            gene_dict = self._select_taxonomy(gene_dict)
        return self._clean_gene(gene_dict)

    def load_annotation_file_to_db_in_chunks(self, chunk_size=1000):
        """
        Read the annotation file chunk_size lines at a time and send each
        chunk to the gene API, updating the created/updated/processed counters
        from the response.

        NOTE(review): skipped_genes is reported but never incremented here.

        :param chunk_size: number of lines sent per API call
        """
        with open(self.annotation_file, 'r') as file:
            while True:
                chunk_genes = list(islice(file, chunk_size))
                if not chunk_genes:
                    break
                genes = [self._build_gene(raw_line) for raw_line in chunk_genes]
                response = self.metagenedb_gene_api.put(genes)
                self.created_genes += response.get('created').get('count')
                self.updated_genes += response.get('updated').get('count')
                self.processed_genes += len(chunk_genes)
                logger.info("%s Genes processed so far...", self.processed_genes)
        logger.info("[DONE] %s/%s Genes created.", self.created_genes, self.total_genes)
        logger.info("[DONE] %s/%s Genes updated.", self.updated_genes, self.total_genes)
        logger.info("[DONE] %s/%s Genes skipped.", self.skipped_genes, self.total_genes)
Kenzo-Hugo Hillion's avatar
Kenzo-Hugo Hillion committed
108
109
110
111
112
113
114
115
116


def parse_arguments():
    """
    Build the command line parser and parse sys.argv.

    :return: argparse namespace with ``annotation``, ``url``,
        ``skip_taxonomy``, ``skip_functions`` and ``verbose``
    :raises SystemExit: always with exit code 1 whenever argparse decides to
        exit (invalid arguments, but also ``--help``)
    """
    parser = argparse.ArgumentParser(description='Populate database from a given IGC annotation file.')
    # Common arguments for analysis and annotations
    parser.add_argument('annotation', help='IGC annotation file')
    parser.add_argument('--url', help='base URL of the instance.', default='http://localhost/')
    parser.add_argument('--skip_taxonomy', action='store_true', help='Skip taxonomy information from genes.')
    parser.add_argument('--skip_functions', action='store_true', help='Skip functions information from genes.')
    parser.add_argument('-v', '--verbose', action='store_true')

    try:
        return parser.parse_args()
    except SystemExit:
        # Normalise every argparse exit to status 1.
        sys.exit(1)


def run():
    """
    Script entry point: parse the command line, raise the log level to INFO
    when ``--verbose`` is given, then import the annotation file in chunks.
    """
    args = parse_arguments()
    if args.verbose:
        logger.setLevel(logging.INFO)
    import_igc_genes = ImportIGCGenes(args.annotation, args.url,
                                      skip_tax=args.skip_taxonomy, skip_functions=args.skip_functions)
    import_igc_genes.load_annotation_file_to_db_in_chunks()
Kenzo-Hugo Hillion's avatar
Kenzo-Hugo Hillion committed
135
136
137
138


# Only start the import when the file is executed as a script, not on import.
if __name__ == "__main__":
    run()