Select Git revision
install_python.md
import_igc_data.py 3.82 KiB
#!/usr/bin/env python
import argparse
import logging
import os
import sys
from itertools import islice
import django
from rest_framework.exceptions import ValidationError
# Django must be configured and django.setup() called before any model is
# imported, so that the apps are loaded.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "metagenedb.settings")
django.setup()
from metagenedb.apps.catalog.models import Gene, Function # noqa
from metagenedb.apps.catalog.serializers import GeneSerializer # noqa
# Configure the root logger at INFO level and create the module-level logger
# used for the progress and warning messages below.
logging.basicConfig(level=logging.INFO)
_LOGGER = logging.getLogger(__name__)
def parse_gene(raw_line):
    """
    Extract the fields of interest from one line of an IGC annotation file.

    IGC annotation columns (tab-separated):
        0: Gene ID                              Unique ID
        1: Gene Name                            Unique name
        2: Gene Length                          Length of nucleotide sequence
        3: Gene Completeness Status             Stating a gene is complete or partial according to the gene predictor
        4: Cohort Origin                        Stating the cohort contributing the representative gene
        5: Taxonomic Annotation (Phylum Level)  Annotated phylum for a gene
        6: Taxonomic Annotation (Genus Level)   Annotated genus for a gene
        7: KEGG Annotation                      Annotated KO(s) for a gene
        8: eggNOG Annotation                    Annotated eggNOG(s) for a gene
        9: Sample Occurrence Frequency          Occurrence frequency in samples based on gene profile
        10: Individual Occurrence Frequency     Occurrence frequency in individuals based on gene profile
        11: KEGG Functional Categories          KEGG functional category(ies) of the annotated KO(s)
        12: eggNOG Functional Categories        eggNOG functional category(ies) of the annotated eggNOG(s)
        13: Cohort Assembled                    Stating the metagenomic sequencing cohort(s) contributing the
                                                representative gene or a redundant gene belonging to it

    :param raw_line: one tab-separated annotation line, with or without its line terminator.
    :return: dict with the keys 'gene_id', 'gene_length' and 'kegg_ko'; all values are strings.
    :raises IndexError: if the line holds fewer than 8 columns.
    """
    # Strip only the line terminator. A bare rstrip() would also remove
    # trailing tabs and therefore drop empty trailing columns.
    gene_info = raw_line.rstrip('\r\n').split('\t')
    return {
        # NOTE(review): 'gene_id' is filled from column 1 (Gene Name), not
        # column 0 (Gene ID) -- presumably intentional; confirm.
        'gene_id': gene_info[1],
        'gene_length': gene_info[2],
        'kegg_ko': gene_info[7]
    }
def upsert_gene(gene_dict):
    """
    Save a gene through GeneSerializer, reusing the stored Gene when one exists.

    The Gene whose gene_id matches gene_dict['gene_id'] is looked up; when it is
    found the serializer is bound to that instance, otherwise the serializer is
    built from the data alone.

    :param gene_dict: gene fields, as produced by parse_gene.
    :raises ValidationError: when the serializer rejects the data.
    """
    gene_id = gene_dict.get('gene_id')
    try:
        existing_gene = Gene.objects.get(gene_id=gene_id)
    except Gene.DoesNotExist:
        serializer = GeneSerializer(data=gene_dict)
    else:
        serializer = GeneSerializer(existing_gene, data=gene_dict)
    serializer.is_valid(raise_exception=True)
    serializer.save()
def insert_gene_list(chunk_genes):
    """
    Parse every annotation line of a chunk and upsert the resulting gene.

    Blank lines are ignored. Lines that cannot be parsed, and genes rejected by
    validation, are logged and skipped so that a single bad record does not
    abort the whole import.

    :param chunk_genes: iterable of raw annotation lines.
    """
    for gene_line in chunk_genes:
        if not gene_line.strip():
            # e.g. an empty line at the end of the file
            continue
        try:
            gene_dict = parse_gene(gene_line)
        except IndexError:
            # parse_gene indexes fixed columns; too few columns means a malformed line.
            _LOGGER.warning(f"Malformed annotation line: {gene_line.rstrip()!r}. Insertion skipped.")
            continue
        try:
            upsert_gene(gene_dict)
        except ValidationError as e:
            _LOGGER.warning(f"{e.__dict__} for gene_id: {gene_dict.get('gene_id')}. Insertion skipped.")
def load_annotation_file_to_db_in_chunks(annotation_file, chunk_size=100000):
    """
    Read the annotation file chunk by chunk and insert each chunk of lines.

    At most chunk_size lines are held in memory at a time. The running total
    of lines read is logged after every chunk and once more at the end.

    :param annotation_file: path of the IGC annotation file.
    :param chunk_size: maximum number of lines handed to insert_gene_list at once.
    """
    processed_genes = 0
    with open(annotation_file, 'r') as annotation_handle:
        # Two-argument iter(): keep pulling chunks until an empty list comes
        # back, i.e. the file is exhausted.
        next_chunk = lambda: list(islice(annotation_handle, chunk_size))  # noqa: E731
        for chunk_genes in iter(next_chunk, []):
            processed_genes += len(chunk_genes)
            insert_gene_list(chunk_genes)
            _LOGGER.info(f"{processed_genes} genes processed so far...")
    _LOGGER.info(f"[DONE] {processed_genes} genes processed.")
def parse_arguments():
    """
    Build the command-line parser and parse the arguments of the script.

    :return: namespace with 'annotation' (path of the IGC annotation file) and
             'delete_all' (True when --delete_all was given).
    :raises SystemExit: with exit code 1 whenever argparse itself exits.
    """
    cli = argparse.ArgumentParser(description='Populate database from a given IGC annotation file.')
    # Common arguments for analysis and annotations
    cli.add_argument('annotation', help='IGC annotation file')
    cli.add_argument('--delete_all', action='store_true', help='Empty database before insertion.')
    try:
        parsed = cli.parse_args()
    except SystemExit:
        # Any exit requested by argparse is turned into exit code 1.
        sys.exit(1)
    return parsed
def run():
    """
    Script entry point: parse the command line, optionally delete every Gene,
    then load the annotation file into the database.
    """
    options = parse_arguments()
    annotation_path = options.annotation
    if options.delete_all:
        # --delete_all: remove all Gene objects before inserting anything.
        Gene.objects.all().delete()
    load_annotation_file_to_db_in_chunks(annotation_path)
# Run the import only when this file is executed as a script.
if __name__ == "__main__":
    run()