Select Git revision
test_RapidPeptidesGenerator.py
import_igc_data.py 4.35 KiB
#!/usr/bin/env python
import argparse
import logging
import os
import sys
from itertools import islice
import django
from django.core.exceptions import ValidationError
# django.setup() must be called before importing the models so that the apps are loaded.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "metagenedb.settings")
django.setup()
from metagenedb.apps.catalog.models import Gene, Function
logging.basicConfig(level=logging.INFO)
_LOGGER = logging.getLogger(__name__)
def parse_gene(raw_line):
    """
    Extract the fields of interest from one line of an IGC annotation file.

    The line is stripped of trailing whitespace and split on tabs. The IGC
    annotation columns are:

         0: Gene ID                          Unique ID
         1: Gene Name                        Unique name
         2: Gene Length                      Length of nucleotide sequence
         3: Gene Completeness Status         Stating a gene is complete or partial
                                             according to the gene predictor
         4: Cohort Origin                    Stating the cohort contributing the
                                             representative gene
         5: Taxonomic Annotation (Phylum)    Annotated phylum for a gene
         6: Taxonomic Annotation (Genus)     Annotated genus for a gene
         7: KEGG Annotation                  Annotated KO(s) for a gene
         8: eggNOG Annotation                Annotated eggNOG(s) for a gene
         9: Sample Occurrence Frequency      Occurrence frequency in samples based
                                             on gene profile
        10: Individual Occurrence Frequency  Occurrence frequency in individuals
                                             based on gene profile
        11: KEGG Functional Categories       KEGG functional category(ies) of the
                                             annotated KO(s)
        12: eggNOG Functional Categories     eggNOG functional category(ies) of the
                                             annotated eggNOG(s)
        13: Cohort Assembled                 Stating the metagenomic sequencing
                                             cohort(s) contributing the
                                             representative gene or a redundant
                                             gene belonging to it

    :param raw_line: one tab-separated line of the annotation file
    :return: dict with 'gene_id' (column 1, the gene name), 'gene_length'
        (column 2) and 'kegg_ko' (column 7); all values are kept as strings
    :raises IndexError: if the line has fewer than 8 columns
    """
    columns = raw_line.rstrip().split('\t')
    # Note: the gene *name* column (1), not the numeric ID (0), is used as gene_id.
    name, length, kegg_annotation = columns[1], columns[2], columns[7]
    return dict(gene_id=name, gene_length=length, kegg_ko=kegg_annotation)
def link_to_function(obj_gene, gene_dict):
    """
    Attach the function referenced by the gene's 'kegg_ko' entry to the gene.

    The Function whose function_id equals gene_dict['kegg_ko'] is looked up,
    added to obj_gene.functions, and the gene is then validated and saved.
    When no such Function exists, a warning is logged and the gene is left
    without that link.

    :param obj_gene: gene instance to link
    :param gene_dict: parsed gene fields, as returned by parse_gene
    """
    kegg_ko = gene_dict.get('kegg_ko')
    try:
        matching_function = Function.objects.get(function_id=kegg_ko)
        obj_gene.functions.add(matching_function)
        obj_gene.full_clean()
        obj_gene.save()
    except Function.DoesNotExist:
        _LOGGER.warning(f"{gene_dict.get('kegg_ko')} not found in the database {gene_dict}.")
def insert_gene(gene_dict):
    """
    Create or update the Gene described by gene_dict, then link its KEGG function.

    An existing gene (matched on gene_id) has every non many-to-many entry of
    gene_dict copied onto it; otherwise a new Gene is built from 'gene_id' and
    'gene_length'. The gene is validated and saved, and unless 'kegg_ko' is
    'unknown' it is linked to the corresponding function.

    :param gene_dict: parsed gene fields, as returned by parse_gene
    """
    # Keys that describe relations and therefore cannot be set as plain attributes.
    relation_keys = ('kegg_ko',)
    gene_id = gene_dict.get('gene_id')
    try:
        obj_gene = Gene.objects.get(gene_id=gene_id)
        for field_name, field_value in gene_dict.items():
            if field_name in relation_keys:
                continue
            setattr(obj_gene, field_name, field_value)
    except Gene.DoesNotExist:
        obj_gene = Gene(gene_id=gene_id, gene_length=gene_dict.get('gene_length'))
    obj_gene.full_clean()
    obj_gene.save()
    # Add link to KEGG
    has_kegg_annotation = gene_dict.get('kegg_ko') != 'unknown'
    if has_kegg_annotation:
        link_to_function(obj_gene, gene_dict)
def insert_gene_list(chunk_genes):
    """
    Parse and insert every annotation line of a chunk, skipping bad entries.

    Blank lines are ignored. A line with too few columns, or a gene that fails
    validation, is reported with a warning and skipped, so that a single bad
    entry does not abort the whole import.

    :param chunk_genes: iterable of raw annotation lines
    """
    for raw_line in chunk_genes:
        if not raw_line.strip():
            # Blank lines (e.g. a trailing empty line) carry no gene.
            continue
        try:
            gene_dict = parse_gene(raw_line)
        except IndexError:
            # parse_gene indexes fixed columns; a short line cannot be parsed.
            _LOGGER.warning(f"Malformed annotation line (too few columns): {raw_line!r}. Insertion skipped.")
            continue
        try:
            insert_gene(gene_dict)
        except ValidationError as e:
            _LOGGER.warning(f"{e.__dict__} for gene_id: {gene_dict.get('gene_id')}. Insertion skipped.")
def load_annotation_file_to_db_in_chunks(annotation_file, chunk_size=100000):
    """
    Read an annotation file chunk by chunk and insert its genes.

    At most chunk_size lines are held in memory at once; each chunk is passed
    to insert_gene_list and the running line count is logged after every chunk.

    :param annotation_file: path of the IGC annotation file
    :param chunk_size: maximum number of lines read per chunk
    """
    processed_genes = 0
    with open(annotation_file, 'r') as handle:
        # iter(callable, sentinel) keeps reading chunks until an empty one comes back.
        for chunk_genes in iter(lambda: list(islice(handle, chunk_size)), []):
            processed_genes += len(chunk_genes)
            insert_gene_list(chunk_genes)
            _LOGGER.info(f"{processed_genes} genes processed so far...")
    _LOGGER.info(f"[DONE] {processed_genes} genes processed.")
def parse_arguments(args=None):
    """
    Define the command-line parser and parse the arguments.

    :param args: optional list of argument strings; when None, argparse reads
        the arguments from sys.argv
    :return: namespace with `annotation` (the IGC annotation file) and
        `delete_all` (True when --delete_all was given)
    :raises SystemExit: with status 0 when argparse exits cleanly (e.g. --help),
        with status 1 when the arguments are invalid
    """
    parser = argparse.ArgumentParser(description='Populate database from a given IGC annotation file.')
    # Common arguments for analysis and annotations
    parser.add_argument('annotation', help='IGC annotation file')
    parser.add_argument('--delete_all', action='store_true', help='Empty database before insertion.')
    try:
        return parser.parse_args(args)
    except SystemExit as exit_request:
        # Keep status 1 for usage errors, but do not report --help as a failure.
        sys.exit(1 if exit_request.code else 0)
def run():
    """
    Script entry point.

    Parses the command line, deletes every Gene first when --delete_all was
    given, then loads the annotation file into the database.
    """
    options = parse_arguments()
    wipe_first = options.delete_all
    if wipe_first:
        Gene.objects.all().delete()
    load_annotation_file_to_db_in_chunks(options.annotation)


if __name__ == "__main__":
    run()