# igc.py
import logging

_LOGGER = logging.getLogger(__name__)


class IGCLineParser(object):
    """Parser for tab-separated lines of the IGC genes list."""

    @staticmethod
    def gene(line):
        """
        Parse one line of the IGC genes list and return an organized dict.

        IGC annotation columns:
            0: Gene ID                              Unique ID
            1: Gene Name                            Unique name
            2: Gene Length                          Length of nucleotide sequence
            3: Gene Completeness Status             Is the gene complete or partial according to the gene predictor
            4: Cohort Origin                        Stating the cohort contributing the representative gene
            5: Taxonomic Annotation(Phylum Level)   Annotated phylum for a gene
            6: Taxonomic Annotation(Genus Level)    Annotated genus for a gene
            7: KEGG Annotation                      Annotated KO(s) for a gene
            8: eggNOG Annotation                    Annotated eggNOG(s) for a gene
            9: Sample Occurence Frequency           Occurrence frequency in samples based on gene profile
            10: Individual Occurence Frequency      Occurrence frequency in individuals based on gene profile
            11: KEGG Functional Categories          KEGG functional category(ies) of the annotated KO(s)
            12: eggNOG Functional Categories        eggNOG functional category(ies) of the annotated eggNOG(s)
            13: Cohort Assembled                    Stating the metagenomic sequencing cohort(s) contributing the
                                                    representative gene or a redundant gene belonging to it

        :param line: one tab-separated record holding at least the 14 columns above
        :return: dict mapping each column to its raw string value; ``kegg_ko``
                 and ``eggnog`` are lists obtained by splitting on ``;``
        :raises Exception: whatever error occurred while parsing (e.g.
                 ``IndexError`` when columns are missing); it is logged and
                 re-raised unchanged
        """
        try:
            # Strip only the line terminator: a bare rstrip() would also remove
            # trailing tabs, dropping empty trailing columns and shifting the
            # column count below 14.
            gene_info = line.rstrip('\r\n').split('\t')
            return {
                'igc_id': gene_info[0],
                'gene_id': gene_info[1],
                'length': gene_info[2],
                'gene_completeness_status': gene_info[3],
                'cohort_origin': gene_info[4],
                'taxo_phylum': gene_info[5],
                'taxo_genus': gene_info[6],
                # Multi-valued annotations are ';'-separated in the source file.
                'kegg_ko': gene_info[7].split(';'),
                'eggnog': gene_info[8].split(';'),
                'sample_occurence_frequency': gene_info[9],
                'individual_occurence_frequency': gene_info[10],
                'kegg_functional_categories': gene_info[11],
                'eggnog_functional_categories': gene_info[12],
                'cohort_assembled': gene_info[13],
            }
        except Exception:
            # Log the offending line for diagnosis, then let the caller decide
            # how to handle the failure.
            _LOGGER.error(
                "Could not parse: %s. Are you sure it comes from IGC genes list?",
                line.rstrip(),
            )
            raise