# import_igc_data.py

#!/usr/bin/env python
import argparse
import logging
import os
import sys
from itertools import islice

import django
from rest_framework.exceptions import ValidationError

# django.setup() must be called before importing the models so that the apps are loaded
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "metagenedb.settings")
django.setup()

from metagenedb.apps.catalog.models import Gene, Function  # noqa
from metagenedb.apps.catalog.serializers import GeneSerializer  # noqa

# Configure logging at INFO level for the whole script run.
logging.basicConfig(level=logging.INFO)
# Module-level logger shared by the import functions below.
_LOGGER = logging.getLogger(__name__)


def parse_gene(raw_line):
    """
    Extract the fields of interest from one line of an IGC annotation file.

    IGC annotation columns (tab-separated):
        0:  Gene ID                                Unique ID
        1:  Gene Name                              Unique name
        2:  Gene Length                            Length of nucleotide sequence
        3:  Gene Completeness Status               Stating a gene is complete or partial according to the gene predictor
        4:  Cohort Origin                          Stating the cohort contributing the representative gene
        5:  Taxonomic Annotation (Phylum Level)    Annotated phylum for a gene
        6:  Taxonomic Annotation (Genus Level)     Annotated genus for a gene
        7:  KEGG Annotation                        Annotated KO(s) for a gene
        8:  eggNOG Annotation                      Annotated eggNOG(s) for a gene
        9:  Sample Occurrence Frequency            Occurrence frequency in samples based on gene profile
        10: Individual Occurrence Frequency        Occurrence frequency in individuals based on gene profile
        11: KEGG Functional Categories             KEGG functional category(ies) of the annotated KO(s)
        12: eggNOG Functional Categories           eggNOG functional category(ies) of the annotated eggNOG(s)
        13: Cohort Assembled                       Stating the metagenomic sequencing cohort(s) contributing the
                                                   representative gene or a redundant gene belonging to it

    :param raw_line: one tab-separated line of the annotation file, with or
        without its trailing line terminator.
    :return: dict with 'gene_id' (column 1), 'gene_length' (column 2) and
        'kegg_ko' (column 7); all values are kept as the raw strings of the file.
    :raises IndexError: if the line has fewer than 8 columns.
    """
    # Strip only the line terminator: a bare rstrip() would also eat trailing
    # tabs, dropping empty trailing columns and shifting the column count.
    gene_info = raw_line.rstrip('\r\n').split('\t')
    return {
        'gene_id': gene_info[1],
        'gene_length': gene_info[2],
        'kegg_ko': gene_info[7]
    }


def upsert_gene(gene_dict):
    """
    Create or update the Gene described by `gene_dict`.

    A Gene with the same 'gene_id' is looked up first: when one exists the
    serializer is bound to it so that saving updates it, otherwise saving
    creates a new entry. Validation failures are raised by the serializer
    (is_valid is called with raise_exception=True).
    """
    try:
        existing_gene = Gene.objects.get(gene_id=gene_dict.get('gene_id'))
    except Gene.DoesNotExist:
        # No entry yet for this gene_id: serialize for creation.
        gene_serializer = GeneSerializer(data=gene_dict)
    else:
        # Entry found: serialize against it so save() performs an update.
        gene_serializer = GeneSerializer(existing_gene, data=gene_dict)
    gene_serializer.is_valid(raise_exception=True)
    gene_serializer.save()


def insert_gene_list(chunk_genes):
    """
    Parse every raw annotation line of `chunk_genes` and upsert it in the database.

    Genes rejected with a ValidationError are logged as warnings and skipped;
    the remaining lines are still processed.
    """
    # map() is lazy, so each line is parsed right before it is upserted.
    for gene_dict in map(parse_gene, chunk_genes):
        try:
            upsert_gene(gene_dict)
        except ValidationError as e:
            _LOGGER.warning(f"{e.__dict__} for gene_id: {gene_dict.get('gene_id')}. Insertion skipped.")


def load_annotation_file_to_db_in_chunks(annotation_file, chunk_size=100000):
    """
    Read `annotation_file` by batches of at most `chunk_size` lines and insert
    each batch in the database, logging the running number of lines handled.
    """
    processed_genes = 0
    with open(annotation_file, 'r') as annotation_handle:
        # Sentinel form of iter(): keep pulling batches until an empty one
        # signals that the file is exhausted.
        for batch in iter(lambda: list(islice(annotation_handle, chunk_size)), []):
            processed_genes += len(batch)
            insert_gene_list(batch)
            _LOGGER.info(f"{processed_genes} genes processed so far...")
    _LOGGER.info(f"[DONE] {processed_genes} genes processed.")


def parse_arguments():
    """
    Build the command-line parser and parse the arguments of the script.

    Returns the parsed namespace, holding `annotation` (path of the IGC
    annotation file) and `delete_all` (flag, False unless given). Any exit
    requested by argparse is turned into an exit with status 1.
    """
    cli = argparse.ArgumentParser(description='Populate database from a given IGC annotation file.')
    # Common arguments for analysis and annotations
    cli.add_argument('annotation', help='IGC annotation file')
    cli.add_argument('--delete_all', action='store_true', help='Empty database before insertion.')

    try:
        namespace = cli.parse_args()
    except SystemExit:
        # argparse exits by itself on errors (and on --help); force status 1.
        sys.exit(1)
    return namespace


def run():
    """
    Entry point of the script: parse the command line, optionally delete every
    Gene entry, then load the annotation file in the database.
    """
    options = parse_arguments()
    wipe_first = options.delete_all
    if wipe_first:
        # --delete_all was given: remove all Gene entries before inserting.
        Gene.objects.all().delete()
    load_annotation_file_to_db_in_chunks(options.annotation)


# Start the import only when the file is executed as a script, not when imported.
if __name__ == "__main__":
    run()