re-wrote parser for gene annotation PB in genome browser

6c95fb77 · Hanna JULIENNE · b036b240 · 6c95fb77 · 6c95fb77
Commit 6c95fb77 authored 3 years ago by Hanna JULIENNE
--- a/jass/models/inittable.py
+++ b/jass/models/inittable.py
@@ -7,6 +7,7 @@ Created on Tue Mar 28 09:57:33 2017
 import os
 import glob
 import logging
+import re
 from pandas import HDFStore, DataFrame, read_csv, concat, options, read_hdf

 # create (or open) an hdf5 file and opens in append mode
@@ -368,121 +369,61 @@ def add_gene_annotation(
    :return: the dataframes df_gene and df_exon
    :rtype: 2 PANDAS dataframes
    """
-    # lists containing gene data
-    gene_id_label = []
-    gene_GeneID = []
-    gene_chr = []
-    gene_start = []
-    gene_end = []
-    gene_direction = []
-    gene_biotype = []
-
-    # lists containing exon data
-    exon_id_label = []
-    exon_GeneID = []
-    exon_chr = []
-    exon_start = []
-    exon_end = []
-    exon_direction = []
-
-    # temporary list containing the data of all exons
-    TMP__exon_id_label = []
-    TMP__exon_GeneID = []
-    TMP__exon_chr = []
-    TMP__exon_start = []
-    TMP__exon_end = []
-    TMP__exon_direction = []
-
-    fichier = open(gene_data_path, "r")
-    lignes = fichier.readlines()
-    fichier.close()
-
-    for ligne in lignes:
-        elements = ligne.split("\t")
-        if elements[0].startswith("NC_"):
-            decode_chr = elements[0].strip("NC_").split(".")
-            chr = int(decode_chr[0])
-            if chr <= 22:
-                if elements[2] == "gene":
-                    gene_chr.append(chr)
-                    gene_start.append(int(elements[3]))
-                    gene_end.append(int(elements[4]))
-                    gene_direction.append(elements[6])
-                    decode_id_1 = elements[8].split(";")
-                    decode_id_2 = decode_id_1[0].split("=")
-                    gene_id_label.append(decode_id_2[1].strip("gene-"))
-                    decode_id_3 = decode_id_1[1].split(",")
-                    decode_id_4 = decode_id_3[0].split("=")
-                    decode_id_5 = decode_id_4[1].split(":")
-                    gene_GeneID.append(decode_id_5[1])
-                    decode_id_6 = decode_id_1[5].split("=")
-                    if decode_id_6[0] == "gene_biotype":
-                        The_biotype = decode_id_6[1].rstrip('\r\n')
-                        gene_biotype.append(The_biotype)
-                    else:
-                        decode_id_7 = decode_id_1[6].split("=")
-                        if decode_id_7[0] == "gene_biotype":
-                            The_biotype = decode_id_7[1].rstrip('\r\n')
-                            gene_biotype.append(The_biotype)
-                        else:
-                            decode_id_8 = decode_id_1[7].split("=")
-                            if decode_id_8[0] == "gene_biotype":
-                                The_biotype = decode_id_8[1].rstrip('\r\n')
-                                gene_biotype.append(The_biotype)
-                            else:
-                                The_biotype = "unknown"
-                                gene_biotype.append(The_biotype)
-
-                elif elements[2] == "exon":
-                    TMP__exon_chr.append(chr)
-                    TMP__exon_start.append(int(elements[3]))
-                    TMP__exon_end.append(int(elements[4]))
-                    TMP__exon_direction.append(elements[6])
-                    decode_id_1 = elements[8].split(";")
-                    decode_id_2 = decode_id_1[0].split("=")
-                    TMP__exon_id_label.append(decode_id_2[1].strip("exon-"))
-                    decode_id_3 = decode_id_1[2].split(",")
-                    decode_id_4 = decode_id_3[0].split("=")
-                    decode_id_5 = decode_id_4[1].split(":")
-                    TMP__exon_GeneID.append(decode_id_5[1])
-
-    # We only keep the exons that correspond to a gene
-    for i in range(len(TMP__exon_id_label)):
-        if TMP__exon_GeneID[i] in gene_GeneID:
-            exon_id_label.append(TMP__exon_id_label[i])
-            exon_GeneID.append(TMP__exon_GeneID[i])
-            exon_chr.append(TMP__exon_chr[i])
-            exon_start.append(TMP__exon_start[i])
-            exon_end.append(TMP__exon_end[i])
-            exon_direction.append(TMP__exon_direction[i])
-
-    # We insert genes and exons into dataframes
-    df_gene = DataFrame(
-        {
-            "Chr": gene_chr,
-            "GeneID": gene_GeneID,
-            "gene_label": gene_id_label,
-            "start": gene_start,
-            "end": gene_end,
-            "direction": gene_direction,
-            "biotype": gene_biotype,
-        }
-    )
-    df_exon = DataFrame(
-        {
-            "Chr": exon_chr,
-            "exon_label": exon_id_label,
-            "GeneID": exon_GeneID,
-            "start": exon_start,
-            "end": exon_end,
-            "direction": exon_direction,
-        }
-    )
+    # Parsing Helper
+    def get_chr(x):
+        return(int(re.split("NC_|\.", x)[1]))
+
+    def get_label(x):
+        return x[0].split("=")[1].strip("gene-")
+
+    def get_NRexon_ID(x):
+        IDex = re.search('^ID=exon-([A-Z\_0-9\.\-]+);', x)
+        if IDex:
+            return(IDex.group(1))
+
+    def get_gene_ID_split(x):
+        return re.split("=|:", x[1].split(",")[0])[2]
+
+    def get_gene_ID(x):
+        IDex = re.search('Dbxref=GeneID:([0-9]+)[,;]', x)
+        if IDex:
+            return(IDex.group(1))
+
+    bt = re.compile("gene_biotype=")
+    def get_biotype(x):
+        return(list(filter(bt.match, x))[0].split("=")[1])
+
+
+    GRC = read_csv(gene_data_path, sep="\t", skiprows=9, names=["Loci", "Annot", "biotype", "start", "end", "a", "direction", "b", "Info"])
+
+    GRC = GRC.loc[(GRC.biotype=='gene') | (GRC.biotype=="exon")]
+    GRC["Chr"] = GRC.Loci.apply(get_chr)
+    GRC["info_element"]=GRC.Info.str.split(";")
+    GRC["GeneID"] = GRC.Info.apply(get_gene_ID)
+
+    df_gene = GRC.loc[(GRC.biotype=='gene')]
+    df_exon = GRC.loc[(GRC.biotype=='exon')]
+
+    df_gene["gene_label"] = df_gene.info_element.apply(get_label)
+    df_gene["biotype"] = df_gene.info_element.apply(get_biotype)
+    df_gene["GeneID_split"] = df_gene.info_element.apply(get_gene_ID_split)
+    df_gene["GeneID"] = df_gene.Info.apply(get_gene_ID)
+
+    df_exon["exon_label"] = df_exon.Info.apply(get_NRexon_ID)
+    df_exon["GeneID"] = df_exon.Info.apply(get_gene_ID)
+    df_exon.set_index("GeneID", inplace=True)
+    shared_gene = df_exon.index.intersection(df_gene.GeneID)
+
+    df_exon = df_exon.loc[shared_gene]
+    df_exon.reset_index(inplace=True)

    # The rows of the dataframes are sorted by chromosome number and start position (and end position for exon)
    df_gene.sort_values(by=["Chr", "start"], inplace=True, ignore_index=True)
    df_exon.sort_values(by=["Chr", "start", "end"], inplace=True, ignore_index=True)

+    df_exon = df_exon[["Chr", "exon_label", "GeneID","start","end","direction"]]
+    df_gene = df_gene[["Chr", "GeneID", "gene_label","start","end","direction","biotype"]]
+
    # This information about genes and exons is stored in an hdf5 file if possible
    if initTable_path is not None:
        hdf_file = HDFStore(initTable_path)

--- a/read_gff.py
+++ b/read_gff.py
 import pandas as pd
 import re
+
 def add_gene_annotation(
    gene_data_path #initTable_path=None, df_gene_csv_path=None, df_exon_csv_path=None
 ):
@@ -137,47 +138,79 @@ def add_gene_annotation(

    return (df_gene, df_exon)

-#initTable_path=None, df_gene_csv_path=None, df_exon_csv_path=None
-gene_data_path = "/home/hjulienn/Project/jass/data/Extract_GRCh37.gff"
-dgen, dexon = add_gene_annotation(gene_data_path)
-print(dexon.head())
-print(dgen.head())

+def add_gene_annotation_Hanna(gene_data_path):
+
+    # Parsing Helper
+    def get_chr(x):
+        return(int(re.split("NC_|\.", x)[1]))
+
+    def get_label(x):
+        return x[0].split("=")[1].strip("gene-")
+
+    def get_NRexon_ID(x):
+        IDex = re.search('^ID=exon-([A-Z\_0-9\.\-]+);', x)
+        if IDex:
+            return(IDex.group(1))
+
+    def get_gene_ID_split(x):
+        return re.split("=|:", x[1].split(",")[0])[2]

-def get_chr(x):
-    return(int(re.split("NC_|\.", x)[1]))
+    def get_gene_ID(x):
+        IDex = re.search('Dbxref=GeneID:([0-9]+)[,;]', x)
+        if IDex:
+            return(IDex.group(1))

-def get_label(x):
-    return x[0].split("=")[1].strip("gene-")
+    bt = re.compile("gene_biotype=")
+    def get_biotype(x):
+        print(list(filter(bt.match, x))[0].split("=")[1])
+        return(list(filter(bt.match, x))[0].split("=")[1])

-def get_gene_ID(x):
-    return re.split("=|:", x[1].split(",")[0])[2]

-bt = re.compile("gene_biotype=")
-def get_biotype(x):
-    print(list(filter(bt.match, x))[0].split("=")[1])
-    return(list(filter(bt.match, x))[0].split("=")[1])
+    GRC = pd.read_csv(gene_data_path, sep="\t", skiprows=9, names=["Loci", "Annot", "biotype", "start", "end", "a", "direction", "b", "Info"])
+
+    GRC = GRC.loc[(GRC.biotype=='gene') | (GRC.biotype=="exon")]
+    GRC["Chr"] = GRC.Loci.apply(get_chr)
+    GRC["info_element"]=GRC.Info.str.split(";")
+    GRC["GeneID"] = GRC.Info.apply(get_gene_ID)
+
+    df_gene = GRC.loc[(GRC.biotype=='gene')]
+    df_exon = GRC.loc[(GRC.biotype=='exon')]
+
+    df_gene["gene_label"] = df_gene.info_element.apply(get_label)
+    df_gene["biotype"] = df_gene.info_element.apply(get_biotype)
+    df_gene["GeneID_split"] = df_gene.info_element.apply(get_gene_ID_split)
+    df_gene["GeneID"] = df_gene.Info.apply(get_gene_ID)
+
+    df_exon["exon_label"] = df_exon.Info.apply(get_NRexon_ID)
+    df_exon["GeneID"] = df_exon.Info.apply(get_gene_ID)
+    df_exon.set_index("GeneID", inplace=True)
+    shared_gene = df_exon.index.intersection(df_gene.GeneID)
+
+    df_exon = df_exon.loc[shared_gene]
+    df_exon.reset_index(inplace=True)
+    df_gene.sort_values(by=["Chr", "start"], inplace=True, ignore_index=True)
+    df_exon.sort_values(by=["Chr", "start", "end"], inplace=True, ignore_index=True)

-GRC = pd.read_csv(gene_data_path, sep="\t", skiprows=9, nrows=10000, names=["Loci", "Annot", "biotype", "start", "end", "a", "direction", "b", "Info"])
-#Filter out to keep only genes and exons
-GRC = GRC.loc[(GRC.biotype=='gene') | (GRC.biotype=="exon")]
-GRC["Chr"] = GRC.Loci.apply(get_chr)
-GRC["info_element"]=GRC.Info.str.split(";")
+    df_exon = df_exon[["Chr", "exon_label", "GeneID","start","end","direction"]]
+    df_gene = df_gene[["Chr", "GeneID", "gene_label","start","end","direction","biotype"]]

-GRC_gene = GRC.loc[(GRC.biotype=='gene')]
-GRC_exon = GRC.loc[(GRC.biotype=='exon')]
-GRC_gene["gene_label"] = GRC_gene.info_element.apply(get_label)
-GRC_gene["GeneID"] = GRC_gene.info_element.apply(get_gene_ID)
-GRC_gene["biotype"] = GRC_gene.info_element.apply(get_biotype)
+    return(df_gene, df_exon)

+gene_data_path = "/home/hjulienn/Project/JASS_dev/jass/data/GRCh37_latest_genomic.gff"
+dgen, dexon = add_gene_annotation(gene_data_path) #initTable_path=None, df_gene_csv_path=None, df_exon_csv_path=None)
+df_gene, df_exon = add_gene_annotation_Hanna(gene_data_path)

-print(GRC.shape)
-print(GRC.head(30))
-print(GRC.iloc[0,8])
+print("Hanna func")
+print(df_gene.head())
+print(df_exon.head())
+print(df_gene.shape)
+print(df_exon.shape)

-print(GRC_exon.head())
 print("cyril func")
 print(dgen.head())
 print(dexon.head())
-print("hanna func")
-print(GRC_gene[["Chr", "GeneID", "gene_label","start","end","direction","biotype"]].head())
+print(dgen.shape)
+print(dexon.shape)
+print(df_gene.compare(dgen))
+print(df_exon.compare(dexon))