reproduce cyril parser in pandas for gene table

b036b240 · Hanna JULIENNE · 58aa4962 · b036b240
Commit b036b240 authored 3 years ago by Hanna JULIENNE
--- a/read_gff.py
+++ b/read_gff.py
+import pandas as pd
+import re
+def add_gene_annotation(
+    gene_data_path #initTable_path=None, df_gene_csv_path=None, df_exon_csv_path=None
+):
+    """
+    add_gene_annotation
+    for the first 22 chromosomes, retrieves the label of the genes
+    and their position as well as those of the exons associated with the genes.
+    Then store this information in a hdf5 file
+
+    :param gene_data_path: path to the GFF file containing gene and exon data (for example, GRCh37_latest_genomic.gff)
+    :type gene_data_path: str
+    :param initTable_path: path to the file initTable.hdf5
+    :type initTable_path: str
+    :param df_gene_csv_path: path to the file df_gene.csv
+    :type df_gene_csv_path: str
+    :param df_exon_csv_path: path to the file df_exon.csv
+    :type df_exon_csv_path: str
+
+    :return: the dataframes df_gene and df_exon
+    :rtype: 2 PANDAS dataframes
+    """
+    # lists containing gene data
+    gene_id_label = []
+    gene_GeneID = []
+    gene_chr = []
+    gene_start = []
+    gene_end = []
+    gene_direction = []
+    gene_biotype = []
+
+    # lists containing exon data
+    exon_id_label = []
+    exon_GeneID = []
+    exon_chr = []
+    exon_start = []
+    exon_end = []
+    exon_direction = []
+
+    # temporary list containing the data of all exons
+    TMP__exon_id_label = []
+    TMP__exon_GeneID = []
+    TMP__exon_chr = []
+    TMP__exon_start = []
+    TMP__exon_end = []
+    TMP__exon_direction = []
+
+    fichier = open(gene_data_path, "r")
+    lignes = fichier.readlines()
+    fichier.close()
+
+    for ligne in lignes:
+        elements = ligne.split("\t")
+        if elements[0].startswith("NC_"):
+            decode_chr = elements[0].strip("NC_").split(".")
+            chr = int(decode_chr[0])
+            if chr <= 22:
+                if elements[2] == "gene":
+                    gene_chr.append(chr)
+                    gene_start.append(int(elements[3]))
+                    gene_end.append(int(elements[4]))
+                    gene_direction.append(elements[6])
+                    decode_id_1 = elements[8].split(";")
+                    decode_id_2 = decode_id_1[0].split("=")
+                    gene_id_label.append(decode_id_2[1].strip("gene-"))
+                    decode_id_3 = decode_id_1[1].split(",")
+                    decode_id_4 = decode_id_3[0].split("=")
+                    decode_id_5 = decode_id_4[1].split(":")
+                    gene_GeneID.append(decode_id_5[1])
+                    decode_id_6 = decode_id_1[5].split("=")
+                    if decode_id_6[0] == "gene_biotype":
+                        The_biotype = decode_id_6[1].rstrip('\r\n')
+                        gene_biotype.append(The_biotype)
+                    else:
+                        decode_id_7 = decode_id_1[6].split("=")
+                        if decode_id_7[0] == "gene_biotype":
+                            The_biotype = decode_id_7[1].rstrip('\r\n')
+                            gene_biotype.append(The_biotype)
+                        else:
+                            decode_id_8 = decode_id_1[7].split("=")
+                            if decode_id_8[0] == "gene_biotype":
+                                The_biotype = decode_id_8[1].rstrip('\r\n')
+                                gene_biotype.append(The_biotype)
+                            else:
+                                The_biotype = "unknown"
+                                gene_biotype.append(The_biotype)
+
+                elif elements[2] == "exon":
+                    TMP__exon_chr.append(chr)
+                    TMP__exon_start.append(int(elements[3]))
+                    TMP__exon_end.append(int(elements[4]))
+                    TMP__exon_direction.append(elements[6])
+                    decode_id_1 = elements[8].split(";")
+                    decode_id_2 = decode_id_1[0].split("=")
+                    TMP__exon_id_label.append(decode_id_2[1].strip("exon-"))
+                    decode_id_3 = decode_id_1[2].split(",")
+                    decode_id_4 = decode_id_3[0].split("=")
+                    decode_id_5 = decode_id_4[1].split(":")
+                    TMP__exon_GeneID.append(decode_id_5[1])
+
+    # We only keep the exons that correspond to a gene
+    for i in range(len(TMP__exon_id_label)):
+        if TMP__exon_GeneID[i] in gene_GeneID:
+            exon_id_label.append(TMP__exon_id_label[i])
+            exon_GeneID.append(TMP__exon_GeneID[i])
+            exon_chr.append(TMP__exon_chr[i])
+            exon_start.append(TMP__exon_start[i])
+            exon_end.append(TMP__exon_end[i])
+            exon_direction.append(TMP__exon_direction[i])
+
+    # We insert genes and exons into dataframes
+    df_gene = pd.DataFrame(
+        {
+            "Chr": gene_chr,
+            "GeneID": gene_GeneID,
+            "gene_label": gene_id_label,
+            "start": gene_start,
+            "end": gene_end,
+            "direction": gene_direction,
+            "biotype": gene_biotype,
+        }
+    )
+    df_exon = pd.DataFrame(
+        {
+            "Chr": exon_chr,
+            "exon_label": exon_id_label,
+            "GeneID": exon_GeneID,
+            "start": exon_start,
+            "end": exon_end,
+            "direction": exon_direction,
+        }
+    )
+    # The rows of the dataframes are sorted by chromosome number and start position (and end position for exon)
+    df_gene.sort_values(by=["Chr", "start"], inplace=True, ignore_index=True)
+    df_exon.sort_values(by=["Chr", "start", "end"], inplace=True, ignore_index=True)
+
+    return (df_gene, df_exon)
+
+#initTable_path=None, df_gene_csv_path=None, df_exon_csv_path=None
+gene_data_path = "/home/hjulienn/Project/jass/data/Extract_GRCh37.gff"
+dgen, dexon = add_gene_annotation(gene_data_path)
+print(dexon.head())
+print(dgen.head())
+
+
+def get_chr(x):
+    return(int(re.split("NC_|\.", x)[1]))
+
+def get_label(x):
+    return x[0].split("=")[1].strip("gene-")
+
+def get_gene_ID(x):
+    return re.split("=|:", x[1].split(",")[0])[2]
+
+bt = re.compile("gene_biotype=")
+def get_biotype(x):
+    print(list(filter(bt.match, x))[0].split("=")[1])
+    return(list(filter(bt.match, x))[0].split("=")[1])
+
+GRC = pd.read_csv(gene_data_path, sep="\t", skiprows=9, nrows=10000, names=["Loci", "Annot", "biotype", "start", "end", "a", "direction", "b", "Info"])
+#Filter out to keep only genes and exons
+GRC = GRC.loc[(GRC.biotype=='gene') | (GRC.biotype=="exon")]
+GRC["Chr"] = GRC.Loci.apply(get_chr)
+GRC["info_element"]=GRC.Info.str.split(";")
+
+GRC_gene = GRC.loc[(GRC.biotype=='gene')]
+GRC_exon = GRC.loc[(GRC.biotype=='exon')]
+GRC_gene["gene_label"] = GRC_gene.info_element.apply(get_label)
+GRC_gene["GeneID"] = GRC_gene.info_element.apply(get_gene_ID)
+GRC_gene["biotype"] = GRC_gene.info_element.apply(get_biotype)
+
+
+print(GRC.shape)
+print(GRC.head(30))
+print(GRC.iloc[0,8])
+
+print(GRC_exon.head())
+print("cyril func")
+print(dgen.head())
+print(dexon.head())
+print("hanna func")
+print(GRC_gene[["Chr", "GeneID", "gene_label","start","end","direction","biotype"]].head())