#!/usr/bin/env python
"""Populate database from a given IGC annotation file."""
import argparse
import logging
import os
import sys
from itertools import islice

import django
from rest_framework.exceptions import ValidationError

# django.setup() must be called before importing any model so that the
# Django apps are loaded.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "metagenedb.settings")
django.setup()

from metagenedb.apps.catalog.models import Gene, Function  # noqa
from metagenedb.apps.catalog.serializers import GeneSerializer  # noqa

logging.basicConfig(level=logging.INFO)
_LOGGER = logging.getLogger(__name__)


def parse_gene(raw_line):
    """
    Extract the fields kept for a gene from one line of the IGC annotation file.

    IGC annotation columns:
        0: Gene ID                              Unique ID
        1: Gene Name                            Unique name
        2: Gene Length                          Length of nucleotide sequence
        3: Gene Completeness Status             Stating a gene is complete or partial according to the gene predictor
        4: Cohort Origin                        Stating the cohort contributing the representative gene
        5: Taxonomic Annotation(Phylum Level)   Annotated phylum for a gene
        6: Taxonomic Annotation(Genus Level)    Annotated genus for a gene
        7: KEGG Annotation                      Annotated KO(s) for a gene
        8: eggNOG Annotation                    Annotated eggNOG(s) for a gene
        9: Sample Occurence Frequency           Occurrence frequency in samples based on gene profile
        10:Individual Occurence Frequency       Occurrence frequency in individuals based on gene profile
        11: KEGG Functional Categories          KEGG functional category(ies) of the annotated KO(s)
        12: eggNOG Functional Categories        eggNOG functional category(ies) of the annotated eggNOG(s)
        13: Cohort Assembled                    Stating the metagenomic sequencing cohort(s) contributing the
                                                representative gene or a redundant gene belonging to it

    :param raw_line: one tab-separated line of the annotation file
    :return: dict with the keys 'gene_id', 'gene_length' and 'kegg_ko' (all str)
    :raises IndexError: if the line has fewer than 8 tab-separated columns
    """
    # Strip only the line terminator. A bare rstrip() also removes trailing
    # tabs, which silently drops empty trailing columns and can make an
    # otherwise well-formed record look too short.
    gene_info = raw_line.rstrip('\r\n').split('\t')
    # NOTE(review): 'gene_id' is read from column 1, which the table above
    # calls "Gene Name", not from column 0 ("Gene ID") -- confirm intended.
    return {
        'gene_id': gene_info[1],
        'gene_length': gene_info[2],
        'kegg_ko': gene_info[7],
    }


def upsert_gene(gene_dict):
    """
    Create the gene described by gene_dict, or update it if it already exists.

    The existing row is looked up by 'gene_id'.

    :param gene_dict: gene fields, as returned by parse_gene
    :raises ValidationError: if the serializer rejects gene_dict
    """
    try:
        gene_obj = Gene.objects.get(gene_id=gene_dict.get('gene_id'))
    except Gene.DoesNotExist:
        # No such gene yet: the serializer will create a new one on save().
        serializer = GeneSerializer(data=gene_dict)
    else:
        # Gene already present: the serializer will update it on save().
        serializer = GeneSerializer(gene_obj, data=gene_dict)
    serializer.is_valid(raise_exception=True)
    serializer.save()


def insert_gene_list(chunk_genes):
    """
    Parse and upsert every annotation line of chunk_genes.

    Lines that cannot be parsed (too few columns, e.g. a blank line) and genes
    rejected by the serializer are logged as warnings and skipped, so that a
    single bad record does not abort the whole import.

    :param chunk_genes: iterable of raw annotation lines
    """
    for gene_line in chunk_genes:
        try:
            gene_dict = parse_gene(gene_line)
        except IndexError:
            _LOGGER.warning("Malformed annotation line %r. Insertion skipped.", gene_line)
            continue
        try:
            upsert_gene(gene_dict)
        except ValidationError as e:
            _LOGGER.warning("%s for gene_id: %s. Insertion skipped.", e.__dict__, gene_dict.get('gene_id'))


def load_annotation_file_to_db_in_chunks(annotation_file, chunk_size=100000):
    """
    Read annotation_file chunk_size lines at a time and load each chunk.

    Progress is logged after every chunk. The reported count is the number of
    lines read, including lines whose insertion was skipped.

    :param annotation_file: path of the IGC annotation file
    :param chunk_size: maximum number of lines held in memory at once
    """
    processed_genes = 0
    with open(annotation_file, 'r') as file:
        while True:
            chunk_genes = list(islice(file, chunk_size))
            if not chunk_genes:
                # End of file reached.
                break
            processed_genes += len(chunk_genes)
            insert_gene_list(chunk_genes)
            _LOGGER.info("%s genes processed so far...", processed_genes)
    _LOGGER.info("[DONE] %s genes processed.", processed_genes)


def parse_arguments():
    """
    Define the command line parser and parse the arguments.

    :return: the parsed arguments (attributes 'annotation' and 'delete_all')
    """
    parser = argparse.ArgumentParser(description='Populate database from a given IGC annotation file.')
    # Common arguments for analysis and annotations
    parser.add_argument('annotation', help='IGC annotation file')
    parser.add_argument('--delete_all', action='store_true', help='Empty database before insertion.')
    try:
        return parser.parse_args()
    except SystemExit:
        # Every exit requested by argparse (usage errors, but also --help)
        # ends the process with status 1.
        sys.exit(1)


def run():
    """Script entry point: optionally empty the Gene table, then load the annotation file."""
    args = parse_arguments()
    if args.delete_all:
        Gene.objects.all().delete()
    load_annotation_file_to_db_in_chunks(args.annotation)


if __name__ == "__main__":
    run()