import_igc_data.py 3.74 KB
Newer Older
1
#!/usr/bin/env python
Kenzo-Hugo Hillion's avatar
Kenzo-Hugo Hillion committed
2
3
4
5
6
7
8
import argparse
import logging
import os
import sys
from itertools import islice

import django
9
from rest_framework.exceptions import ValidationError
Kenzo-Hugo Hillion's avatar
Kenzo-Hugo Hillion committed
10

11
12
from metagenedb.common.utils.parsers import IGCLineParser

Kenzo-Hugo Hillion's avatar
Kenzo-Hugo Hillion committed
13
14
15
16
# Django has to be configured before any model can be imported: point it at the
# project settings module and call django.setup() to load the apps.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "metagenedb.settings")
django.setup()

17
from metagenedb.apps.catalog.models import Gene, Function, Taxonomy  # noqa
18
from metagenedb.apps.catalog.serializers import GeneSerializer  # noqa
Kenzo-Hugo Hillion's avatar
Kenzo-Hugo Hillion committed
19
20
21
22

# Log at INFO level so the progress messages emitted during the import are displayed.
logging.basicConfig(level=logging.INFO)
_LOGGER = logging.getLogger(__name__)

23
24
# Keys holding the taxonomy names in a parsed gene; select_taxonomy() pops both of them.
PHYLUM_COL = 'taxo_phylum'
GENUS_COL = 'taxo_genus'
# Keys of a parsed gene that parse_gene() keeps by default.
SELECTED_KEYS = ['gene_id', 'length', 'kegg_ko', PHYLUM_COL, GENUS_COL]
26

Kenzo-Hugo Hillion's avatar
Kenzo-Hugo Hillion committed
27

28
def parse_gene(raw_line, selected_keys=SELECTED_KEYS):
    """
    Parse one raw annotation line with IGCLineParser.gene() and keep only
    the entries whose key is listed in selected_keys.

    :param raw_line: line of the IGC annotation file, passed as is to the parser
    :param selected_keys: keys to keep from the parsed gene
    :return: new dict restricted to the selected keys
    """
    parsed_fields = IGCLineParser().gene(raw_line)
    return {
        key: value
        for key, value in parsed_fields.items()
        if key in selected_keys
    }
Kenzo-Hugo Hillion's avatar
Kenzo-Hugo Hillion committed
36
37


38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
def _find_taxonomy_id(name, rank):
    """
    Return the tax_id of the first Taxonomy entry matching name and rank,
    or None (after logging a warning) when no entry matches.
    """
    # Fetch at most two rows in a single query: enough to keep the first
    # match and to detect that more than one exists.
    matches = list(Taxonomy.objects.filter(name=name, rank=rank)[:2])
    if not matches:
        _LOGGER.warning(f"No result found for {rank} {name}.")
        return None
    if len(matches) > 1:
        _LOGGER.warning(f"More than 1 result found for {rank} {name}. First result is kept.")
    return matches[0].tax_id


def select_taxonomy(gene_dict, unknown_val='unknown'):
    """
    Select the taxonomy to be assigned to the gene.

    The genus has priority over the phylum. When the genus is unknown, or is
    not found in the Taxonomy table, the phylum is used instead. When neither
    gives a result, no 'taxonomy' key is added.

    :param gene_dict: parsed gene; modified in place, the phylum and genus
        keys are always removed (a missing key is treated as unknown)
    :param unknown_val: value marking a taxonomy level as not assigned
    :return: the same gene_dict, with a 'taxonomy' key (tax_id) when one was found
    """
    phylum = gene_dict.pop(PHYLUM_COL, unknown_val)
    genus = gene_dict.pop(GENUS_COL, unknown_val)
    tax_id = None
    if genus != unknown_val:
        tax_id = _find_taxonomy_id(genus, "genus")
    # Fall back on the phylum instead of failing when the genus gave nothing.
    if tax_id is None and phylum != unknown_val:
        tax_id = _find_taxonomy_id(phylum, "phylum")
    if tax_id is not None:
        gene_dict['taxonomy'] = tax_id
    return gene_dict


62
63
64
65
66
67
68
69
70
71
def upsert_gene(gene_dict):
    """
    Update the Gene having the same gene_id as gene_dict, or create it
    when no such Gene exists yet.

    The data goes through GeneSerializer; is_valid() is called with
    raise_exception=True, so invalid data raises instead of being saved.

    :param gene_dict: data of the gene, including its 'gene_id'
    """
    try:
        existing_gene = Gene.objects.get(gene_id=gene_dict.get('gene_id'))
    except Gene.DoesNotExist:
        # Unknown gene_id: serializer without instance, save() creates the entry
        serializer = GeneSerializer(data=gene_dict)
    else:
        # Known gene_id: serializer bound to the instance, save() updates it
        serializer = GeneSerializer(existing_gene, data=gene_dict)
    serializer.is_valid(raise_exception=True)
    serializer.save()


Kenzo-Hugo Hillion's avatar
Kenzo-Hugo Hillion committed
72
def insert_gene_list(chunk_genes):
    """
    Parse every raw line of the chunk, select its taxonomy and upsert the
    resulting gene in the database.

    A gene rejected with a ValidationError is logged and skipped; the
    following genes are still processed.

    :param chunk_genes: iterable of raw lines from the annotation file
    """
    for gene_dict in map(parse_gene, chunk_genes):
        gene_payload = select_taxonomy(gene_dict)
        try:
            upsert_gene(gene_payload)
        except ValidationError as e:
            _LOGGER.warning(f"{e.__dict__} for gene_id: {gene_dict.get('gene_id')}. Insertion skipped.")
Kenzo-Hugo Hillion's avatar
Kenzo-Hugo Hillion committed
80
81
82


def load_annotation_file_to_db_in_chunks(annotation_file, chunk_size=100000):
    """
    Read the annotation file chunk_size lines at a time and insert every
    chunk in the database, logging the number of processed genes as it goes.

    :param annotation_file: path of the IGC annotation file
    :param chunk_size: maximum number of lines read and inserted at once
    """
    processed_genes = 0
    with open(annotation_file, 'r') as handle:
        # iter() with a sentinel: keep pulling chunks of lines until an
        # empty list shows that the end of the file has been reached.
        for lines in iter(lambda: list(islice(handle, chunk_size)), []):
            processed_genes += len(lines)
            insert_gene_list(lines)
            _LOGGER.info(f"{processed_genes} genes processed so far...")
    _LOGGER.info(f"[DONE] {processed_genes} genes processed.")
Kenzo-Hugo Hillion's avatar
Kenzo-Hugo Hillion committed
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118


def parse_arguments():
    """
    Build the command line parser and parse the arguments of the script.

    Whenever argparse exits (SystemExit raised by parse_args), the script
    exits with status 1 instead.

    :return: namespace with 'annotation' (path) and 'delete_all' (bool)
    """
    cli = argparse.ArgumentParser(description='Populate database from a given IGC annotation file.')
    # Common arguments for analysis and annotations
    cli.add_argument('annotation', help='IGC annotation file')
    cli.add_argument('--delete_all', action='store_true', help='Empty database before insertion.')

    try:
        namespace = cli.parse_args()
    except SystemExit:
        sys.exit(1)
    return namespace


def run():
    """
    Entry point of the script: parse the command line, delete all existing
    Gene entries when --delete_all is given, then load the annotation file
    in the database.
    """
    cli_args = parse_arguments()
    if cli_args.delete_all:
        # Start from an empty Gene table
        Gene.objects.all().delete()
    load_annotation_file_to_db_in_chunks(cli_args.annotation)


# Only start the import when the file is executed as a script, not when it is imported.
if __name__ == "__main__":
    run()