Skip to content
Snippets Groups Projects
Select Git revision
  • 409509ee81337b29c9871844d2fd7f5139a7a09c
  • master default protected
  • 2.0.5
  • 2.0.4
  • 2.0.3
  • 2.0.2
  • 2.0.1
  • 2.0.0
  • 1.2.4
  • 1.2.2
  • 1.2.1
  • 1.1.0
  • v1.0.9
13 results

test_RapidPeptidesGenerator.py

Blame
  • import_igc_data.py 4.35 KiB
    #!/usr/bin/env python
    import argparse
    import logging
    import os
    import sys
    from itertools import islice
    
    import django
    from django.core.exceptions import ValidationError
    
    # Before importing the models, we need to call django.setup() to load the apps.
    # setdefault() only applies when DJANGO_SETTINGS_MODULE is not already set in the environment.
    os.environ.setdefault("DJANGO_SETTINGS_MODULE", "metagenedb.settings")
    django.setup()
    
    # Deliberately imported after django.setup() (see the comment above).
    from metagenedb.apps.catalog.models import Gene, Function
    
    # Configure the root logger at INFO level and create the module-level logger.
    logging.basicConfig(level=logging.INFO)
    _LOGGER = logging.getLogger(__name__)
    
    
    def parse_gene(raw_line):
        """
        Extract the fields of interest from one line of an IGC annotation file.

        IGC annotation columns:
            0: Gene ID                              Unique ID
            1: Gene Name                            Unique name
            2: Gene Length                          Length of nucleotide sequence
            3: Gene Completeness Status             Stating a gene is complete or partial according to the gene predictor
            4: Cohort Origin                        Stating the cohort contributing the representative gene
            5: Taxonomic Annotation(Phylum Level)   Annotated phylum for a gene
            6: Taxonomic Annotation(Genus Level)    Annotated genus for a gene
            7: KEGG Annotation                      Annotated KO(s) for a gene
            8: eggNOG Annotation                    Annotated eggNOG(s) for a gene
            9: Sample Occurence Frequency           Occurrence frequency in samples based on gene profile
            10: Individual Occurence Frequency      Occurrence frequency in individuals based on gene profile
            11: KEGG Functional Categories          KEGG functional category(ies) of the annotated KO(s)
            12: eggNOG Functional Categories        eggNOG functional category(ies) of the annotated eggNOG(s)
            13: Cohort Assembled                    Stating the metagenomic sequencing cohort(s) contributing the
                                                    representative gene or a redundant gene belonging to it

        :param raw_line: one tab-separated line of the annotation file,
                         with or without its line terminator
        :return: dict with the keys 'gene_id' (column 1, the gene name),
                 'gene_length' (column 2) and 'kegg_ko' (column 7); all values
                 are the raw, unconverted strings from the file
        :raises IndexError: if the line has fewer than 8 columns
        """
        # Strip only the line terminator: a bare rstrip() would also eat trailing
        # tabs, dropping empty trailing columns and shifting the column count.
        gene_info = raw_line.rstrip('\r\n').split('\t')
        return {
            'gene_id': gene_info[1],
            'gene_length': gene_info[2],
            'kegg_ko': gene_info[7]
        }
    
    
    def link_to_function(obj_gene, gene_dict):
        """
        Attach the KEGG function referenced by a parsed gene to its Gene object.

        The function is looked up by the 'kegg_ko' value of ``gene_dict``. When it
        exists, it is added to ``obj_gene.functions`` and the gene is validated
        and saved; when it does not, a warning is logged and nothing is changed.

        :param obj_gene: Gene instance to link
        :param gene_dict: parsed gene, as returned by parse_gene()
        """
        kegg_id = gene_dict.get('kegg_ko')
        try:
            matched_function = Function.objects.get(function_id=kegg_id)
        except Function.DoesNotExist:
            _LOGGER.warning(f"{gene_dict.get('kegg_ko')} not found in the database {gene_dict}.")
            return

        obj_gene.functions.add(matched_function)
        obj_gene.full_clean()
        obj_gene.save()
    
    
    def insert_gene(gene_dict):
        """
        Create or update the Gene described by ``gene_dict`` and link its KEGG function.

        An existing gene (matched on 'gene_id') has every non-relational key of
        ``gene_dict`` copied onto it; otherwise a new Gene is built from
        'gene_id' and 'gene_length'. The gene is validated with full_clean()
        before being saved, then linked to its KEGG function unless the
        annotation is the literal 'unknown'.

        :param gene_dict: parsed gene, as returned by parse_gene()
        """
        # Keys of gene_dict that describe relations and cannot be set as plain attributes
        relation_keys = ['kegg_ko']
        gene_id = gene_dict.get('gene_id')

        try:
            obj_gene = Gene.objects.get(gene_id=gene_id)
        except Gene.DoesNotExist:
            obj_gene = Gene(gene_id=gene_id,
                            gene_length=gene_dict.get('gene_length'))
        else:
            for field_name in gene_dict:
                if field_name in relation_keys:
                    continue
                setattr(obj_gene, field_name, gene_dict[field_name])

        obj_gene.full_clean()
        obj_gene.save()

        # Add link to KEGG
        if gene_dict.get('kegg_ko') == 'unknown':
            return
        link_to_function(obj_gene, gene_dict)
    
    
    def insert_gene_list(chunk_genes):
        """
        Parse and insert every annotation line of a chunk, skipping bad entries.

        A line that cannot be parsed (blank or with too few columns, which makes
        parse_gene() raise IndexError) and a gene that fails model validation are
        both logged as warnings and skipped, so one bad line does not abort the
        whole import.

        :param chunk_genes: iterable of raw lines from the IGC annotation file
        """
        for raw_line in chunk_genes:
            try:
                gene_dict = parse_gene(raw_line)
            except IndexError:
                _LOGGER.warning(f"Malformed annotation line {raw_line!r}. Insertion skipped.")
                continue
            try:
                insert_gene(gene_dict)
            except ValidationError as e:
                _LOGGER.warning(f"{e.__dict__} for gene_id: {gene_dict.get('gene_id')}. Insertion skipped.")
    
    
    def load_annotation_file_to_db_in_chunks(annotation_file, chunk_size=100000):
        """
        Read the annotation file chunk by chunk and insert each chunk into the database.

        At most ``chunk_size`` lines are held in memory at a time. The running
        number of lines read is logged after each chunk and once more at the end.

        :param annotation_file: path to the IGC annotation file
        :param chunk_size: maximum number of lines read per chunk
        """
        processed_genes = 0
        with open(annotation_file, 'r') as handle:
            # iter() with a sentinel keeps calling the reader until it returns
            # an empty list, i.e. until the file is exhausted.
            read_chunk = lambda: list(islice(handle, chunk_size))  # noqa: E731
            for batch in iter(read_chunk, []):
                processed_genes += len(batch)
                insert_gene_list(batch)
                _LOGGER.info(f"{processed_genes} genes processed so far...")
        _LOGGER.info(f"[DONE] {processed_genes} genes processed.")
    
    
    def parse_arguments():
        """
        Build the command line parser and parse the arguments of the script.

        :return: the parsed namespace, with the attributes ``annotation``
                 (positional) and ``delete_all`` (boolean flag)
        :raises SystemExit: with exit status 1 whenever argparse itself
                            requests an exit while parsing
        """
        cli_parser = argparse.ArgumentParser(
            description='Populate database from a given IGC annotation file.'
        )
        # Common arguments for analysis and annotations
        cli_parser.add_argument('annotation', help='IGC annotation file')
        cli_parser.add_argument(
            '--delete_all',
            action='store_true',
            help='Empty database before insertion.',
        )

        try:
            parsed = cli_parser.parse_args()
        except SystemExit:
            # Any exit requested by argparse is turned into exit status 1.
            sys.exit(1)
        return parsed
    
    
    def run():
        """
        Entry point of the script: parse the command line, then load the annotation file.

        When ``--delete_all`` is given, every Gene is deleted before the
        annotation file is loaded.
        """
        options = parse_arguments()
        wipe_first = options.delete_all
        if wipe_first:
            all_genes = Gene.objects.all()
            all_genes.delete()
        load_annotation_file_to_db_in_chunks(options.annotation)
    
    
    # Run the import only when this file is executed as a script, not when it is imported.
    if __name__ == "__main__":
        run()