# import_igc_data.py
# Committed by Kenzo-Hugo Hillion
import argparse
import logging
import os
import sys
from itertools import islice

import django
from django.core.exceptions import ValidationError

# Before importing any model, we need to call django.setup() to load the apps.
# setdefault() keeps a DJANGO_SETTINGS_MODULE that is already set in the
# environment and only falls back to "metagenedb.settings" otherwise.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "metagenedb.settings")
django.setup()

from metagenedb.apps.catalog.models import Gene

# Configure the root logger at INFO level so progress messages are emitted,
# and create the module-level logger used by the functions below.
logging.basicConfig(level=logging.INFO)
_LOGGER = logging.getLogger(__name__)


def create_gene(raw_line):
    """
    Build an unsaved Gene from one tab-separated line of the IGC annotation file.

    Only two columns are used: column 1 (Gene Name) is stored as ``gene_id``
    and column 2 (Gene Length) as ``gene_length``. Column 0 (Gene ID) and all
    other columns are currently ignored. The gene is neither validated nor
    saved here.

    IGC annotation columns:
        0: Gene ID                              Unique ID
        1: Gene Name                            Unique name
        2: Gene Length                          Length of nucleotide sequence
        3: Gene Completeness Status             Stating a gene is complete or partial according to the gene predictor
        4: Cohort Origin                        Stating the cohort contributing the representative gene
        5: Taxonomic Annotation(Phylum Level)   Annotated phylum for a gene
        6: Taxonomic Annotation(Genus Level)    Annotated genus for a gene
        7: KEGG Annotation                      Annotated KO(s) for a gene
        8: eggNOG Annotation                    Annotated eggNOG(s) for a gene
        9: Sample Occurence Frequency           Occurrence frequency in samples based on gene profile
        10: Individual Occurence Frequency      Occurrence frequency in individuals based on gene profile
        11: KEGG Functional Categories          KEGG functional category(ies) of the annotated KO(s)
        12: eggNOG Functional Categories        eggNOG functional category(ies) of the annotated eggNOG(s)
        13: Cohort Assembled                    Stating the metagenomic sequencing cohort(s) contributing the
                                                representative gene or a redundant gene belonging to it

    :param raw_line: one raw line of the annotation file
    :return: a new, unsaved Gene instance
    :raises IndexError: if the line has fewer than three tab-separated columns
    """
    # rstrip() drops the trailing newline (and any other trailing whitespace)
    # before the line is split into its columns.
    gene_info = raw_line.rstrip().split('\t')
    gene = Gene(gene_id=gene_info[1],
                gene_length=gene_info[2])
    return gene


def insert_gene(gene):
    """
    Validate the given gene and save it.

    ``full_clean()`` runs first, so if it raises, ``save()`` is never called
    and the exception propagates to the caller.
    """
    # Validation has to precede the save: an exception here skips the save.
    gene.full_clean()
    gene.save()


def insert_gene_list(chunk_genes):
    """
    Create and insert one gene for every raw annotation line of the chunk.

    A gene rejected with a ValidationError during insertion is reported as a
    warning and skipped; the remaining lines are still processed. Any exception
    raised while creating the gene from its line propagates to the caller.

    :param chunk_genes: iterable of raw annotation lines
    """
    for raw_line in chunk_genes:
        # The gene is built outside the ``try`` so the handler below always
        # refers to the gene of the current line, never to an unbound name or
        # to the gene left over from the previous iteration.
        gene = create_gene(raw_line)
        try:
            insert_gene(gene)
        except ValidationError as e:
            _LOGGER.warning(f"{e.__dict__} for gene_id: {gene.gene_id}. Insertion skipped.")

def load_annotation_file_to_db_in_chunks(annotation_file, chunk_size=100000):
    """
    Read the annotation file chunk by chunk and insert the genes of each chunk.

    At most ``chunk_size`` lines are held in memory at a time. The running
    number of processed lines is logged after every chunk and once more when
    the whole file has been read.

    :param annotation_file: path of the IGC annotation file
    :param chunk_size: maximum number of lines handled per chunk
    """
    processed = 0
    with open(annotation_file, 'r') as handle:
        # iter() with a sentinel keeps requesting the next chunk of lines and
        # stops as soon as islice() has nothing left to return (empty list).
        for batch in iter(lambda: list(islice(handle, chunk_size)), []):
            processed += len(batch)
            insert_gene_list(batch)
            _LOGGER.info(f"{processed} genes processed so far...")
    _LOGGER.info(f"[DONE] {processed} genes processed.")


def parse_arguments():
    """
    Defines the command line parser and parses ``sys.argv``.

    :return: the parsed namespace, with ``annotation`` (path of the IGC
             annotation file) and ``delete_all`` (True when the flag is given)
    :raises SystemExit: with code 1 on a usage error, with code 0 when argparse
                        exits successfully (e.g. after printing ``--help``)
    """
    parser = argparse.ArgumentParser(description='Populate database from a given IGC annotation file.')
    # Common arguments for analysis and annotations
    parser.add_argument('annotation', help='IGC annotation file')
    parser.add_argument('--delete_all', action='store_true', help='Empty database before insertion.')

    try:
        return parser.parse_args()
    except SystemExit as exit_request:
        # argparse exits with code 0 after printing the help and with a
        # non-zero code on a usage error. Only the latter is a failure, so a
        # help request must not be reported as exit status 1.
        sys.exit(1 if exit_request.code else 0)


def run():
    """
    Entry point: parse the command line, optionally empty the Gene table,
    then load the annotation file into the database.
    """
    cli_args = parse_arguments()
    wipe_first = cli_args.delete_all
    if wipe_first:
        # --delete_all was given: remove every existing Gene before inserting.
        Gene.objects.all().delete()
    load_annotation_file_to_db_in_chunks(cli_args.annotation)


# Start the import only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    run()