import_igc_data.py 4.46 KB
Newer Older
1
#!/usr/bin/env python
Kenzo-Hugo Hillion's avatar
Kenzo-Hugo Hillion committed
2
3
4
5
6
7
8
9
10
import argparse
import logging
import os
import sys
from itertools import islice

import django
from django.core.exceptions import ValidationError

11
12
from metagenedb.utils.dict_operations import extract_dict

Kenzo-Hugo Hillion's avatar
Kenzo-Hugo Hillion committed
13
14
15
16
# Django must be configured before any model can be imported: point it at the
# project settings, then call django.setup() to load the apps.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "metagenedb.settings")
django.setup()

Kenzo-Hugo Hillion's avatar
Kenzo-Hugo Hillion committed
17
from metagenedb.apps.catalog.models import Gene, Function  # noqa
Kenzo-Hugo Hillion's avatar
Kenzo-Hugo Hillion committed
18
19
20
21
22

# Configure the root logger at INFO level so the progress messages are emitted.
logging.basicConfig(level=logging.INFO)
# Module-level logger used by the functions in this script.
_LOGGER = logging.getLogger(__name__)


23
def parse_gene(raw_line):
    """
    Extract the fields of interest from one line of an IGC annotation file.

    IGC annotation columns (tab-separated):
        0: Gene ID                              Unique ID
        1: Gene Name                            Unique name
        2: Gene Length                          Length of nucleotide sequence
        3: Gene Completeness Status             Stating a gene is complete or partial according to the gene predictor
        4: Cohort Origin                        Stating the cohort contributing the representative gene
        5: Taxonomic Annotation(Phylum Level)   Annotated phylum for a gene
        6: Taxonomic Annotation(Genus Level)    Annotated genus for a gene
        7: KEGG Annotation                      Annotated KO(s) for a gene
        8: eggNOG Annotation                    Annotated eggNOG(s) for a gene
        9: Sample Occurrence Frequency          Occurrence frequency in samples based on gene profile
        10: Individual Occurrence Frequency     Occurrence frequency in individuals based on gene profile
        11: KEGG Functional Categories          KEGG functional category(ies) of the annotated KO(s)
        12: eggNOG Functional Categories        eggNOG functional category(ies) of the annotated eggNOG(s)
        13: Cohort Assembled                    Stating the metagenomic sequencing cohort(s) contributing the
                                                representative gene or a redundant gene belonging to it

    Args:
        raw_line: one raw line of the annotation file, with or without its
            trailing line terminator.

    Returns:
        dict with the keys ``gene_id`` (column 1), ``gene_length`` (column 2)
        and ``kegg_ko`` (column 7). All values are the raw column strings.

    Raises:
        IndexError: if the line has fewer than 8 tab-separated columns.
    """
    # Strip only the line terminator. A bare rstrip() would also remove
    # trailing TABs and thereby drop empty trailing columns, shifting or
    # losing fields on rows whose last annotations are empty.
    gene_info = raw_line.rstrip('\r\n').split('\t')
    return {
        'gene_id': gene_info[1],
        'gene_length': gene_info[2],
        'kegg_ko': gene_info[7],
    }
Kenzo-Hugo Hillion's avatar
Kenzo-Hugo Hillion committed
48
49


50
51
52
53
54
55
56
57
58
59
60
61
def link_to_function(obj_gene, gene_dict):
    """
    Link a gene to the function named by ``gene_dict['kegg_ko']``.

    The function is looked up by ``function_id``. When it exists it is added
    to ``obj_gene.functions`` and the gene is validated and saved again; when
    it does not exist a warning is logged instead.

    Args:
        obj_gene: gene model instance to link.
        gene_dict: mapping holding the ``kegg_ko`` identifier.
    """
    ko_lookup = {'function_id': gene_dict.get('kegg_ko')}
    try:
        kegg_function = Function.objects.get(**ko_lookup)
        obj_gene.functions.add(kegg_function)
        obj_gene.full_clean()
        obj_gene.save()
    except Function.DoesNotExist:
        _LOGGER.warning(f"{gene_dict.get('kegg_ko')} not found in the database {gene_dict}.")


def insert_gene(gene_dict):
    """
    Create or update the Gene described by ``gene_dict`` and link its KEGG function.

    An existing gene (matched on ``gene_id``) gets every remaining item of
    ``gene_dict`` set as an attribute; otherwise a new gene is built from
    ``gene_id`` and ``gene_length``. The gene is validated and saved, then
    linked to its KEGG function unless the annotation is ``'unknown'``.

    Args:
        gene_dict: mapping as produced by ``parse_gene``.
    """
    # NOTE(review): extract_dict presumably moves the listed keys out of
    # gene_dict into the returned dict -- confirm in
    # metagenedb.utils.dict_operations.
    linked_fields = ['kegg_ko']
    linked_values = extract_dict(gene_dict, linked_fields)

    try:
        gene = Gene.objects.get(gene_id=gene_dict.get('gene_id'))
    except Gene.DoesNotExist:
        gene = Gene(gene_id=gene_dict.get('gene_id'),
                    gene_length=gene_dict.get('gene_length'))
    else:
        # Existing record: overwrite its attributes with the parsed values.
        for field_name, field_value in gene_dict.items():
            setattr(gene, field_name, field_value)

    gene.full_clean()
    gene.save()

    # Add link to KEGG, except for genes without a KEGG annotation.
    if linked_values.get('kegg_ko') == 'unknown':
        return
    link_to_function(gene, linked_values)
Kenzo-Hugo Hillion's avatar
Kenzo-Hugo Hillion committed
75
76
77
78
79


def insert_gene_list(chunk_genes):
    """
    Parse and insert every annotation line of a chunk.

    A line that cannot be parsed (too few tab-separated columns, e.g. a blank
    line) or whose gene fails validation is logged and skipped, so a single
    bad record does not abort the whole import.

    Args:
        chunk_genes: iterable of raw annotation lines.
    """
    for raw_line in chunk_genes:
        # parse_gene indexes fixed columns and raises IndexError on short
        # lines; treat those like validation failures instead of crashing.
        try:
            gene_dict = parse_gene(raw_line)
        except IndexError:
            _LOGGER.warning(f"Could not parse line {raw_line!r}. Insertion skipped.")
            continue
        try:
            insert_gene(gene_dict)
        except ValidationError as e:
            _LOGGER.warning(f"{e.__dict__} for gene_id: {gene_dict.get('gene_id')}. Insertion skipped.")
Kenzo-Hugo Hillion's avatar
Kenzo-Hugo Hillion committed
84
85
86


def load_annotation_file_to_db_in_chunks(annotation_file, chunk_size=100000):
    """
    Read an annotation file chunk by chunk and insert each chunk.

    At most ``chunk_size`` lines are held in memory at a time. The running
    number of lines handed to ``insert_gene_list`` is logged after each chunk
    and once more at the end.

    Args:
        annotation_file: path of the annotation file to read.
        chunk_size: maximum number of lines per chunk.
    """
    processed_genes = 0
    with open(annotation_file, 'r') as handle:
        # iter(callable, sentinel) keeps pulling chunks until an empty list
        # signals that the file is exhausted.
        for batch in iter(lambda: list(islice(handle, chunk_size)), []):
            processed_genes += len(batch)
            insert_gene_list(batch)
            _LOGGER.info(f"{processed_genes} genes processed so far...")
    _LOGGER.info(f"[DONE] {processed_genes} genes processed.")
Kenzo-Hugo Hillion's avatar
Kenzo-Hugo Hillion committed
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122


def parse_arguments():
    """
    Build the command-line parser and parse ``sys.argv``.

    Returns:
        The parsed namespace with ``annotation`` (positional file path) and
        ``delete_all`` (boolean flag).

    Any ``SystemExit`` raised by argparse is replaced by an exit with
    status 1.
    """
    cli = argparse.ArgumentParser(description='Populate database from a given IGC annotation file.')
    # Common arguments for analysis and annotations
    cli.add_argument('annotation', help='IGC annotation file')
    cli.add_argument('--delete_all', action='store_true', help='Empty database before insertion.')

    try:
        parsed = cli.parse_args()
    except SystemExit:
        sys.exit(1)
    return parsed


def run():
    """
    Script entry point.

    Parses the command line, deletes all existing genes when ``--delete_all``
    was given, then loads the annotation file into the database.
    """
    cli_args = parse_arguments()
    wipe_first = cli_args.delete_all
    if wipe_first:
        Gene.objects.all().delete()
    load_annotation_file_to_db_in_chunks(cli_args.annotation)


# Only start the import when the file is executed as a script.
if __name__ == "__main__":
    run()