import_igc_data.py 4.35 KB
Newer Older
1
#!/usr/bin/env python
Kenzo-Hugo Hillion's avatar
Kenzo-Hugo Hillion committed
2
3
4
5
6
7
8
9
10
11
12
13
14
import argparse
import logging
import os
import sys
from itertools import islice

import django
from django.core.exceptions import ValidationError

# Django must be configured and django.setup() called to load the apps
# before any model can be imported.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "metagenedb.settings")
django.setup()

15
from metagenedb.apps.catalog.models import Gene, Function
Kenzo-Hugo Hillion's avatar
Kenzo-Hugo Hillion committed
16
17
18
19
20

# Configure logging at INFO level and create the logger used by this module.
logging.basicConfig(level=logging.INFO)
_LOGGER = logging.getLogger(__name__)


21
def parse_gene(raw_line):
    """
    Extract the fields used for insertion from one line of an IGC annotation file.

    IGC annotation columns (tab-separated):
        0: Gene ID                               Unique ID
        1: Gene Name                             Unique name
        2: Gene Length                           Length of nucleotide sequence
        3: Gene Completeness Status              Stating a gene is complete or partial according to the gene predictor
        4: Cohort Origin                         Stating the cohort contributing the representative gene
        5: Taxonomic Annotation(Phylum Level)    Annotated phylum for a gene
        6: Taxonomic Annotation(Genus Level)     Annotated genus for a gene
        7: KEGG Annotation                       Annotated KO(s) for a gene
        8: eggNOG Annotation                     Annotated eggNOG(s) for a gene
        9: Sample Occurence Frequency            Occurrence frequency in samples based on gene profile
        10: Individual Occurence Frequency       Occurrence frequency in individuals based on gene profile
        11: KEGG Functional Categories           KEGG functional category(ies) of the annotated KO(s)
        12: eggNOG Functional Categories         eggNOG functional category(ies) of the annotated eggNOG(s)
        13: Cohort Assembled                     Stating the metagenomic sequencing cohort(s) contributing the
                                                 representative gene or a redundant gene belonging to it

    :param raw_line: one raw line of the annotation file
    :return: dict with 'gene_id' (column 1, the gene name), 'gene_length'
             (column 2, kept as a string) and 'kegg_ko' (column 7)
    :raises IndexError: if the line contains fewer than 8 columns
    """
    # Strip only the line terminator: a bare rstrip() would also remove
    # trailing tabs, dropping empty trailing columns from the split result.
    gene_info = raw_line.rstrip('\r\n').split('\t')
    return {
        'gene_id': gene_info[1],
        'gene_length': gene_info[2],
        'kegg_ko': gene_info[7]
    }
Kenzo-Hugo Hillion's avatar
Kenzo-Hugo Hillion committed
46
47


48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
def link_to_function(obj_gene, gene_dict):
    """
    Attach to a gene the Function whose function_id is the gene's 'kegg_ko'.

    The gene is validated and saved once the function has been added. When no
    Function with that ID exists, a warning is logged and nothing is linked.

    :param obj_gene: Gene instance to link
    :param gene_dict: parsed gene dict providing the 'kegg_ko' entry
    """
    kegg_id = gene_dict.get('kegg_ko')
    try:
        matching_function = Function.objects.get(function_id=kegg_id)
        obj_gene.functions.add(matching_function)
        obj_gene.full_clean()
        obj_gene.save()
    except Function.DoesNotExist:
        _LOGGER.warning(f"{kegg_id} not found in the database {gene_dict}.")


def insert_gene(gene_dict):
    """
    Create or update the Gene described by a parsed gene dict.

    An existing gene (matched on 'gene_id') gets every entry of the dict set
    as an attribute, except the many-to-many ones; otherwise a new Gene is
    built from 'gene_id' and 'gene_length'. The gene is validated and saved,
    then linked to its KEGG function unless 'kegg_ko' is 'unknown'.

    :param gene_dict: dict with 'gene_id', 'gene_length' and 'kegg_ko'
    """
    # Keys handled through a relation rather than a plain attribute
    relation_keys = ['kegg_ko']
    gene_id = gene_dict.get('gene_id')

    try:
        obj_gene = Gene.objects.get(gene_id=gene_id)
        plain_fields = {name: content for name, content in gene_dict.items()
                        if name not in relation_keys}
        for name, content in plain_fields.items():
            setattr(obj_gene, name, content)
    except Gene.DoesNotExist:
        obj_gene = Gene(gene_id=gene_id,
                        gene_length=gene_dict.get('gene_length'))
    obj_gene.full_clean()
    obj_gene.save()

    # No KEGG link to create for genes without annotation
    if gene_dict.get('kegg_ko') == 'unknown':
        return
    link_to_function(obj_gene, gene_dict)
Kenzo-Hugo Hillion's avatar
Kenzo-Hugo Hillion committed
74
75
76
77
78


def insert_gene_list(chunk_genes):
    """
    Parse and insert every annotation line of a chunk into the database.

    Lines that cannot be parsed because they have too few columns (e.g. a
    blank line) and genes rejected by validation are logged and skipped, so a
    single bad entry does not abort the whole import.

    :param chunk_genes: iterable of raw annotation lines
    """
    for raw_line in chunk_genes:
        try:
            gene_dict = parse_gene(raw_line)
        except IndexError:
            # parse_gene indexes fixed columns; a short line ends up here
            _LOGGER.warning(f"Malformed annotation line {raw_line!r}. Insertion skipped.")
            continue
        try:
            insert_gene(gene_dict)
        except ValidationError as e:
            _LOGGER.warning(f"{e.__dict__} for gene_id: {gene_dict.get('gene_id')}. Insertion skipped.")
Kenzo-Hugo Hillion's avatar
Kenzo-Hugo Hillion committed
83
84
85


def load_annotation_file_to_db_in_chunks(annotation_file, chunk_size=100000):
    """
    Read an annotation file chunk by chunk and insert its genes.

    At most ``chunk_size`` lines are held in memory at a time; progress is
    logged after each chunk and once more when the file is exhausted.

    :param annotation_file: path of the annotation file to load
    :param chunk_size: maximum number of lines read per chunk
    """
    total_lines = 0
    with open(annotation_file, 'r') as handle:
        # iter() with a sentinel stops on the first empty chunk (end of file)
        for batch in iter(lambda: list(islice(handle, chunk_size)), []):
            total_lines += len(batch)
            insert_gene_list(batch)
            _LOGGER.info(f"{total_lines} genes processed so far...")
    _LOGGER.info(f"[DONE] {total_lines} genes processed.")
Kenzo-Hugo Hillion's avatar
Kenzo-Hugo Hillion committed
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121


def parse_arguments():
    """
    Build the command line parser and parse the arguments.

    Any exit requested by argparse is turned into an exit with status 1.

    :return: namespace with 'annotation' (file path) and 'delete_all' (bool)
    """
    cli = argparse.ArgumentParser(description='Populate database from a given IGC annotation file.')
    cli.add_argument('annotation', help='IGC annotation file')
    cli.add_argument('--delete_all', action='store_true', help='Empty database before insertion.')

    try:
        namespace = cli.parse_args()
    except SystemExit:
        sys.exit(1)
    return namespace


def run():
    """
    Script entry point: optionally delete all genes, then load the annotation file.
    """
    options = parse_arguments()
    wipe_first = options.delete_all
    if wipe_first:
        # --delete_all: remove every existing Gene before inserting
        Gene.objects.all().delete()
    load_annotation_file_to_db_in_chunks(options.annotation)


# Run the import only when this file is executed as a script.
if __name__ == "__main__":
    run()