Skip to content
Snippets Groups Projects
Select Git revision
  • 0ee9e00f8ce3e3d3ae508dce69bc3088e9afedbc
  • master default protected
2 results

install_python.md

Blame
  • import_igc_data.py 3.82 KiB
    #!/usr/bin/env python
    import argparse
    import logging
    import os
    import sys
    from itertools import islice
    
    import django
    from rest_framework.exceptions import ValidationError
    
    # Before importing the models, django.setup() must be called to load the apps
    os.environ.setdefault("DJANGO_SETTINGS_MODULE", "metagenedb.settings")
    django.setup()
    
    from metagenedb.apps.catalog.models import Gene, Function  # noqa
    from metagenedb.apps.catalog.serializers import GeneSerializer  # noqa
    
    # Configure logging so that messages of level INFO and above are emitted.
    logging.basicConfig(level=logging.INFO)
    # Module-level logger used by the functions of this script.
    _LOGGER = logging.getLogger(__name__)
    
    
    def parse_gene(raw_line):
        """
        Extract the fields kept for a gene from one line of the IGC annotation file.

        IGC annotation columns (tab-separated):
            0: Gene ID                                Unique ID
            1: Gene Name                              Unique name
            2: Gene Length                            Length of nucleotide sequence
            3: Gene Completeness Status               Stating a gene is complete or partial according to the gene predictor
            4: Cohort Origin                          Stating the cohort contributing the representative gene
            5: Taxonomic Annotation(Phylum Level)     Annotated phylum for a gene
            6: Taxonomic Annotation(Genus Level)      Annotated genus for a gene
            7: KEGG Annotation                        Annotated KO(s) for a gene
            8: eggNOG Annotation                      Annotated eggNOG(s) for a gene
            9: Sample Occurrence Frequency            Occurrence frequency in samples based on gene profile
            10: Individual Occurrence Frequency       Occurrence frequency in individuals based on gene profile
            11: KEGG Functional Categories            KEGG functional category(ies) of the annotated KO(s)
            12: eggNOG Functional Categories          eggNOG functional category(ies) of the annotated eggNOG(s)
            13: Cohort Assembled                      Stating the metagenomic sequencing cohort(s) contributing the
                                                      representative gene or a redundant gene belonging to it

        :param raw_line: one raw line of the annotation file (line terminator allowed)
        :return: dict with the keys 'gene_id' (column 1, the gene name), 'gene_length'
                 (column 2) and 'kegg_ko' (column 7); all values are unconverted strings
        :raises IndexError: if the line holds fewer than 8 tab-separated columns
        """
        # Strip only the line terminator: a bare rstrip() would also remove trailing
        # tabs, silently dropping empty trailing columns and breaking the indexing below.
        gene_info = raw_line.rstrip('\r\n').split('\t')
        return {
            'gene_id': gene_info[1],
            'gene_length': gene_info[2],
            'kegg_ko': gene_info[7],
        }
    
    
    def upsert_gene(gene_dict):
        """
        Create the gene described by gene_dict, or update it if its 'gene_id' already exists.

        :param gene_dict: mapping of gene fields, handed to GeneSerializer as its data
        :raises ValidationError: propagated from is_valid(raise_exception=True)
        """
        try:
            existing_gene = Gene.objects.get(gene_id=gene_dict.get('gene_id'))
        except Gene.DoesNotExist:
            # No gene with this gene_id yet: the serializer will create one on save()
            gene_serializer = GeneSerializer(data=gene_dict)
        else:
            # Gene found: bind the serializer to it so save() updates that instance
            gene_serializer = GeneSerializer(existing_gene, data=gene_dict)
        gene_serializer.is_valid(raise_exception=True)
        gene_serializer.save()
    
    
    def insert_gene_list(chunk_genes):
        """
        Parse every annotation line of the chunk and upsert the resulting gene.

        A gene rejected with a ValidationError is logged as a warning and skipped;
        the remaining lines of the chunk are still processed.

        :param chunk_genes: iterable of raw annotation lines
        """
        for raw_gene_line in chunk_genes:
            gene_dict = parse_gene(raw_gene_line)
            try:
                upsert_gene(gene_dict)
            except ValidationError as e:
                skip_message = f"{e.__dict__} for gene_id: {gene_dict.get('gene_id')}. Insertion skipped."
                _LOGGER.warning(skip_message)
    
    
    def load_annotation_file_to_db_in_chunks(annotation_file, chunk_size=100000):
        """
        Read the annotation file chunk by chunk and insert each chunk of genes.

        Progress is logged after every chunk and a final summary once the file is exhausted.

        :param annotation_file: path of the IGC annotation file to read
        :param chunk_size: maximum number of lines pulled from the file per chunk
        """
        processed_genes = 0
        with open(annotation_file, 'r') as annotation_handle:
            # iter() with a sentinel: keep taking chunks until islice returns an empty list
            next_chunk = lambda: list(islice(annotation_handle, chunk_size))  # noqa: E731
            for gene_lines in iter(next_chunk, []):
                processed_genes += len(gene_lines)
                insert_gene_list(gene_lines)
                _LOGGER.info(f"{processed_genes} genes processed so far...")
        _LOGGER.info(f"[DONE] {processed_genes} genes processed.")
    
    
    def parse_arguments():
        """
        Build the command-line parser and parse the arguments of the script.

        :return: the parsed namespace, with 'annotation' and 'delete_all' attributes
        :raises SystemExit: with exit code 1 whenever argparse itself exits
        """
        cli = argparse.ArgumentParser(description='Populate database from a given IGC annotation file.')
        # Common arguments for analysis and annotations
        cli.add_argument('annotation', help='IGC annotation file')
        cli.add_argument('--delete_all', action='store_true', help='Empty database before insertion.')

        try:
            namespace = cli.parse_args()
        except SystemExit:
            # Any exit requested by argparse is turned into exit code 1
            sys.exit(1)
        return namespace
    
    
    def run():
        """
        Entry point: parse the command line, optionally empty the Gene table, then load the file.
        """
        options = parse_arguments()
        wipe_first = options.delete_all
        if wipe_first:
            # --delete_all was given: remove every existing gene before inserting
            Gene.objects.all().delete()
        load_annotation_file_to_db_in_chunks(options.annotation)
    
    
    # Launch the import only when this file is executed as a script.
    if __name__ == "__main__":
        run()