import_igc_data.py 3.6 KB
Newer Older
1
#!/usr/bin/env python
Kenzo-Hugo Hillion's avatar
Kenzo-Hugo Hillion committed
2
3
4
5
6
7
8
9
10
11
12
13
14
import argparse
import logging
import os
import sys
from itertools import islice

import django
from django.core.exceptions import ValidationError

# Django must be configured before any model is imported: django.setup()
# loads the apps, using the settings module named below unless
# DJANGO_SETTINGS_MODULE is already set in the environment.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "metagenedb.settings")
django.setup()

Kenzo-Hugo Hillion's avatar
Kenzo-Hugo Hillion committed
15
from metagenedb.apps.catalog.models import Gene, Function  # noqa
16
from metagenedb.apps.catalog.views.gene import GeneInsertion  # noqa
Kenzo-Hugo Hillion's avatar
Kenzo-Hugo Hillion committed
17
18
19
20
21

# Configure root logging at INFO level so the progress messages are emitted.
logging.basicConfig(level=logging.INFO)
# Module-level logger shared by the functions of this script.
_LOGGER = logging.getLogger(__name__)


22
def parse_gene(raw_line):
    """
    Extract the fields used for insertion from one line of the IGC annotation file.

    IGC annotation columns:
        0: Gene ID                              Unique ID
        1: Gene Name                            Unique name
        2: Gene Length                          Length of nucleotide sequence
        3: Gene Completeness Status             Stating a gene is complete or partial according to the gene predictor
        4: Cohort Origin                        Stating the cohort contributing the representative gene
        5: Taxonomic Annotation(Phylum Level)   Annotated phylum for a gene
        6: Taxonomic Annotation(Genus Level)    Annotated genus for a gene
        7: KEGG Annotation                      Annotated KO(s) for a gene
        8: eggNOG Annotation                    Annotated eggNOG(s) for a gene
        9: Sample Occurrence Frequency          Occurrence frequency in samples based on gene profile
        10: Individual Occurrence Frequency     Occurrence frequency in individuals based on gene profile
        11: KEGG Functional Categories          KEGG functional category(ies) of the annotated KO(s)
        12: eggNOG Functional Categories        eggNOG functional category(ies) of the annotated eggNOG(s)
        13: Cohort Assembled                    Stating the metagenomic sequencing cohort(s) contributing the
                                                representative gene or a redundant gene belonging to it

    :param raw_line: one tab-separated line of the annotation file, with or
        without its line terminator.
    :return: dict of strings with keys 'gene_id' (column 1, the gene name),
        'gene_length' (column 2) and 'kegg_ko' (column 7).
    :raises IndexError: if the line holds fewer than 8 tab-separated columns.
    """
    # Strip only the line terminator. A bare rstrip() would also remove
    # trailing tabs, silently dropping empty trailing columns and making a
    # well-formed line look too short.
    gene_info = raw_line.rstrip('\r\n').split('\t')
    return {
        'gene_id': gene_info[1],
        'gene_length': gene_info[2],
        'kegg_ko': gene_info[7],
    }
Kenzo-Hugo Hillion's avatar
Kenzo-Hugo Hillion committed
47
48
49


def insert_gene_list(chunk_genes):
    """
    Parse every annotation line of a chunk and upsert the gene it describes.

    Blank lines are ignored. A gene rejected with a ValidationError is logged
    and skipped, so one invalid entry does not abort the rest of the chunk.

    :param chunk_genes: iterable of raw lines read from the IGC annotation file.
    """
    for gene_line in chunk_genes:
        # A blank line (typically the last line of the file) carries no gene;
        # handing it to parse_gene would abort the whole import with IndexError.
        if not gene_line.strip():
            continue
        # Parsing is kept out of the try body: only the database upsert is
        # expected to raise ValidationError.
        gene_dict = parse_gene(gene_line)
        try:
            GeneInsertion(gene_dict).upsert_to_db()
        except ValidationError as e:
            _LOGGER.warning(f"{e.__dict__} for gene_id: {gene_dict.get('gene_id')}. Insertion skipped.")
Kenzo-Hugo Hillion's avatar
Kenzo-Hugo Hillion committed
58
59
60


def load_annotation_file_to_db_in_chunks(annotation_file, chunk_size=100000):
    """
    Read an annotation file chunk by chunk and insert each chunk in the database.

    At most `chunk_size` lines are held in memory at a time; progress is
    logged after every chunk and once more when the file is exhausted.

    :param annotation_file: path of the annotation file to read.
    :param chunk_size: maximum number of lines handed to insert_gene_list at once.
    """
    total = 0
    with open(annotation_file, 'r') as handle:
        # iter(callable, sentinel) keeps pulling chunks until islice returns
        # an empty list, i.e. until the file has been fully consumed.
        for batch in iter(lambda: list(islice(handle, chunk_size)), []):
            total += len(batch)
            insert_gene_list(batch)
            _LOGGER.info(f"{total} genes processed so far...")
    _LOGGER.info(f"[DONE] {total} genes processed.")
Kenzo-Hugo Hillion's avatar
Kenzo-Hugo Hillion committed
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96


def parse_arguments():
    """
    Build the command-line parser and parse the arguments of the script.

    :return: the argparse namespace, with `annotation` (path of the IGC
        annotation file) and `delete_all` (bool flag).

    Any exit requested by argparse is turned into an exit with status 1.
    """
    arg_parser = argparse.ArgumentParser(
        description='Populate database from a given IGC annotation file.'
    )
    arg_parser.add_argument('annotation', help='IGC annotation file')
    arg_parser.add_argument(
        '--delete_all',
        action='store_true',
        help='Empty database before insertion.',
    )

    try:
        parsed = arg_parser.parse_args()
    except SystemExit:
        # argparse exits by itself (bad usage, --help); force exit status 1.
        sys.exit(1)
    return parsed


def run():
    """
    Entry point of the script.

    Parses the command line, deletes every Gene first when --delete_all was
    given, then loads the annotation file into the database.
    """
    options = parse_arguments()
    wipe_first = options.delete_all
    if wipe_first:
        Gene.objects.all().delete()
    load_annotation_file_to_db_in_chunks(options.annotation)


# Run the import only when this file is executed as a script, not on import.
if __name__ == "__main__":
    run()