import_igc_data.py 2.14 KB
Newer Older
Kenzo-Hugo Hillion's avatar
Kenzo-Hugo Hillion committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import argparse
import logging
import os
import sys
from itertools import islice

import django
from django.core.exceptions import ValidationError

# Django must be configured and its apps loaded with django.setup() before any
# model can be imported, which is why the model import sits below these calls.
# setdefault() keeps a DJANGO_SETTINGS_MODULE already present in the environment.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "metagenedb.settings")
django.setup()

from metagenedb.apps.catalog.models import Gene

# Configure the root logger at INFO so the progress messages below are emitted.
logging.basicConfig(level=logging.INFO)
_LOGGER = logging.getLogger(__name__)


def create_gene(raw_line):
    """Build a ``Gene`` instance from one tab-separated annotation line.

    Trailing whitespace is stripped and the line is split on tabs; the
    zero-based columns used are 1 (gene id), 2 (gene length), 6 (genus) and
    5 (phylum). The instance is only constructed here: this function neither
    validates nor saves it.

    Args:
        raw_line: One raw line of the annotation file.

    Returns:
        The newly constructed ``Gene``.

    Raises:
        IndexError: If the line has fewer than seven tab-separated columns.
    """
    columns = raw_line.rstrip().split('\t')
    field_values = {
        'gene_id': columns[1],
        'gene_length': columns[2],
        'taxonomic_genus': columns[6],
        'taxonomic_phylum': columns[5],
    }
    return Gene(**field_values)


def insert_gene(gene):
    """Validate *gene* with ``full_clean()`` and then ``save()`` it.

    Validation runs first, so any exception it raises (the caller in this
    file handles ``ValidationError``) prevents ``save()`` from being called.
    """
    gene.full_clean()
    gene.save()


def insert_gene_list(chunk_genes):
    """Insert each annotation line of *chunk_genes* as a gene, skipping bad ones.

    Parsing and insertion are handled separately so that the warning for a
    line that could not be parsed never refers to a gene object that was not
    built from that line.

    Args:
        chunk_genes: Iterable of raw tab-separated annotation lines.

    Lines that cannot be turned into a gene (for example a blank line or one
    with too few columns) and genes rejected by validation are logged as
    warnings and skipped, so a single bad record does not abort the import.
    """
    for raw_line in chunk_genes:
        try:
            gene = create_gene(raw_line)
        except (IndexError, ValidationError) as e:
            # No gene exists for this line, so report the raw line instead of a gene_id.
            _LOGGER.warning(f"Could not build a gene from line {raw_line!r} ({e!r}). Insertion skipped.")
            continue
        try:
            insert_gene(gene)
        except ValidationError as e:
            _LOGGER.warning(f"{e.__dict__} for gene_id: {gene.gene_id}. Insertion skipped.")


def load_annotation_file_to_db_in_chunks(annotation_file, chunk_size=100000):
    """Read *annotation_file* in batches of lines and insert each batch.

    Args:
        annotation_file: Path of the annotation file, opened in text mode.
        chunk_size: Maximum number of lines read and handed to
            ``insert_gene_list`` at a time.

    The logged count is the number of lines read so far, whether or not each
    line was actually inserted.
    """
    loaded_genes = 0
    with open(annotation_file, 'r') as handle:
        # iter(callable, sentinel) keeps requesting batches until an empty
        # list comes back, i.e. until the file is exhausted.
        for batch in iter(lambda: list(islice(handle, chunk_size)), []):
            loaded_genes = loaded_genes + len(batch)
            insert_gene_list(batch)
            _LOGGER.info(f"{loaded_genes} genes processed so far...")
    _LOGGER.info(f"[DONE] {loaded_genes} genes processed.")


def parse_arguments(argv=None):
    """
    Defines the parser and parses the command line.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse reads ``sys.argv[1:]``.

    Returns:
        The parsed namespace, with ``annotation`` (path of the IGC annotation
        file) and ``delete_all`` (bool flag).

    Raises:
        SystemExit: With status 1 when the arguments are invalid, or with
            status 0 after ``--help`` has been printed.
    """
    parser = argparse.ArgumentParser(description='Populate database from a given IGC annotation file.')
    parser.add_argument('annotation', help='IGC annotation file')
    parser.add_argument('--delete_all', action='store_true', help='Empty database before insertion.')

    try:
        return parser.parse_args(argv)
    except SystemExit as exit_request:
        # argparse exits with status 0 after printing --help; that is a
        # successful run and must not be reported as a failure.
        if exit_request.code in (0, None):
            raise
        # Any other exit is a usage error: keep the script's exit status of 1.
        sys.exit(1)


def run():
    """Script entry point: parse the CLI, optionally empty the table, then load.

    When ``--delete_all`` was given, every existing ``Gene`` row is deleted
    before the annotation file is loaded.
    """
    options = parse_arguments()
    wipe_first = options.delete_all
    if wipe_first:
        Gene.objects.all().delete()
    load_annotation_file_to_db_in_chunks(options.annotation)


# Run the import only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    run()