Skip to content
Snippets Groups Projects
Commit 6c95fb77 authored by Hanna  JULIENNE's avatar Hanna JULIENNE
Browse files

re-wrote parser for gene annotation PB in genome browser

parent b036b240
Branches dev_cnerin
No related tags found
No related merge requests found
......@@ -7,6 +7,7 @@ Created on Tue Mar 28 09:57:33 2017
import os
import glob
import logging
import re
from pandas import HDFStore, DataFrame, read_csv, concat, options, read_hdf
# create (or open) an hdf5 file and opens in append mode
......@@ -368,121 +369,61 @@ def add_gene_annotation(
:return: the dataframes df_gene and df_exon
:rtype: 2 PANDAS dataframes
"""
# lists containing gene data
gene_id_label = []
gene_GeneID = []
gene_chr = []
gene_start = []
gene_end = []
gene_direction = []
gene_biotype = []
# lists containing exon data
exon_id_label = []
exon_GeneID = []
exon_chr = []
exon_start = []
exon_end = []
exon_direction = []
# temporary list containing the data of all exons
TMP__exon_id_label = []
TMP__exon_GeneID = []
TMP__exon_chr = []
TMP__exon_start = []
TMP__exon_end = []
TMP__exon_direction = []
fichier = open(gene_data_path, "r")
lignes = fichier.readlines()
fichier.close()
for ligne in lignes:
elements = ligne.split("\t")
if elements[0].startswith("NC_"):
decode_chr = elements[0].strip("NC_").split(".")
chr = int(decode_chr[0])
if chr <= 22:
if elements[2] == "gene":
gene_chr.append(chr)
gene_start.append(int(elements[3]))
gene_end.append(int(elements[4]))
gene_direction.append(elements[6])
decode_id_1 = elements[8].split(";")
decode_id_2 = decode_id_1[0].split("=")
gene_id_label.append(decode_id_2[1].strip("gene-"))
decode_id_3 = decode_id_1[1].split(",")
decode_id_4 = decode_id_3[0].split("=")
decode_id_5 = decode_id_4[1].split(":")
gene_GeneID.append(decode_id_5[1])
decode_id_6 = decode_id_1[5].split("=")
if decode_id_6[0] == "gene_biotype":
The_biotype = decode_id_6[1].rstrip('\r\n')
gene_biotype.append(The_biotype)
else:
decode_id_7 = decode_id_1[6].split("=")
if decode_id_7[0] == "gene_biotype":
The_biotype = decode_id_7[1].rstrip('\r\n')
gene_biotype.append(The_biotype)
else:
decode_id_8 = decode_id_1[7].split("=")
if decode_id_8[0] == "gene_biotype":
The_biotype = decode_id_8[1].rstrip('\r\n')
gene_biotype.append(The_biotype)
else:
The_biotype = "unknown"
gene_biotype.append(The_biotype)
elif elements[2] == "exon":
TMP__exon_chr.append(chr)
TMP__exon_start.append(int(elements[3]))
TMP__exon_end.append(int(elements[4]))
TMP__exon_direction.append(elements[6])
decode_id_1 = elements[8].split(";")
decode_id_2 = decode_id_1[0].split("=")
TMP__exon_id_label.append(decode_id_2[1].strip("exon-"))
decode_id_3 = decode_id_1[2].split(",")
decode_id_4 = decode_id_3[0].split("=")
decode_id_5 = decode_id_4[1].split(":")
TMP__exon_GeneID.append(decode_id_5[1])
# We only keep the exons that correspond to a gene
for i in range(len(TMP__exon_id_label)):
if TMP__exon_GeneID[i] in gene_GeneID:
exon_id_label.append(TMP__exon_id_label[i])
exon_GeneID.append(TMP__exon_GeneID[i])
exon_chr.append(TMP__exon_chr[i])
exon_start.append(TMP__exon_start[i])
exon_end.append(TMP__exon_end[i])
exon_direction.append(TMP__exon_direction[i])
# We insert genes and exons into dataframes
df_gene = DataFrame(
{
"Chr": gene_chr,
"GeneID": gene_GeneID,
"gene_label": gene_id_label,
"start": gene_start,
"end": gene_end,
"direction": gene_direction,
"biotype": gene_biotype,
}
)
df_exon = DataFrame(
{
"Chr": exon_chr,
"exon_label": exon_id_label,
"GeneID": exon_GeneID,
"start": exon_start,
"end": exon_end,
"direction": exon_direction,
}
)
# Parsing Helper
def get_chr(x):
return(int(re.split("NC_|\.", x)[1]))
def get_label(x):
return x[0].split("=")[1].strip("gene-")
def get_NRexon_ID(x):
IDex = re.search('^ID=exon-([A-Z\_0-9\.\-]+);', x)
if IDex:
return(IDex.group(1))
def get_gene_ID_split(x):
return re.split("=|:", x[1].split(",")[0])[2]
def get_gene_ID(x):
IDex = re.search('Dbxref=GeneID:([0-9]+)[,;]', x)
if IDex:
return(IDex.group(1))
bt = re.compile("gene_biotype=")
def get_biotype(x):
return(list(filter(bt.match, x))[0].split("=")[1])
GRC = read_csv(gene_data_path, sep="\t", skiprows=9, names=["Loci", "Annot", "biotype", "start", "end", "a", "direction", "b", "Info"])
GRC = GRC.loc[(GRC.biotype=='gene') | (GRC.biotype=="exon")]
GRC["Chr"] = GRC.Loci.apply(get_chr)
GRC["info_element"]=GRC.Info.str.split(";")
GRC["GeneID"] = GRC.Info.apply(get_gene_ID)
df_gene = GRC.loc[(GRC.biotype=='gene')]
df_exon = GRC.loc[(GRC.biotype=='exon')]
df_gene["gene_label"] = df_gene.info_element.apply(get_label)
df_gene["biotype"] = df_gene.info_element.apply(get_biotype)
df_gene["GeneID_split"] = df_gene.info_element.apply(get_gene_ID_split)
df_gene["GeneID"] = df_gene.Info.apply(get_gene_ID)
df_exon["exon_label"] = df_exon.Info.apply(get_NRexon_ID)
df_exon["GeneID"] = df_exon.Info.apply(get_gene_ID)
df_exon.set_index("GeneID", inplace=True)
shared_gene = df_exon.index.intersection(df_gene.GeneID)
df_exon = df_exon.loc[shared_gene]
df_exon.reset_index(inplace=True)
# The rows of the dataframes are sorted by chromosome number and start position (and end position for exon)
df_gene.sort_values(by=["Chr", "start"], inplace=True, ignore_index=True)
df_exon.sort_values(by=["Chr", "start", "end"], inplace=True, ignore_index=True)
df_exon = df_exon[["Chr", "exon_label", "GeneID","start","end","direction"]]
df_gene = df_gene[["Chr", "GeneID", "gene_label","start","end","direction","biotype"]]
# This information about genes and exons is stored in an hdf5 file if possible
if initTable_path is not None:
hdf_file = HDFStore(initTable_path)
......
import pandas as pd
import re
def add_gene_annotation(
gene_data_path #initTable_path=None, df_gene_csv_path=None, df_exon_csv_path=None
):
......@@ -137,47 +138,79 @@ def add_gene_annotation(
return (df_gene, df_exon)
#initTable_path=None, df_gene_csv_path=None, df_exon_csv_path=None
gene_data_path = "/home/hjulienn/Project/jass/data/Extract_GRCh37.gff"
dgen, dexon = add_gene_annotation(gene_data_path)
print(dexon.head())
print(dgen.head())
def add_gene_annotation_Hanna(gene_data_path):
# Parsing Helper
def get_chr(x):
return(int(re.split("NC_|\.", x)[1]))
def get_label(x):
return x[0].split("=")[1].strip("gene-")
def get_NRexon_ID(x):
IDex = re.search('^ID=exon-([A-Z\_0-9\.\-]+);', x)
if IDex:
return(IDex.group(1))
def get_gene_ID_split(x):
return re.split("=|:", x[1].split(",")[0])[2]
def get_chr(x):
return(int(re.split("NC_|\.", x)[1]))
def get_gene_ID(x):
IDex = re.search('Dbxref=GeneID:([0-9]+)[,;]', x)
if IDex:
return(IDex.group(1))
def get_label(x):
return x[0].split("=")[1].strip("gene-")
bt = re.compile("gene_biotype=")
def get_biotype(x):
print(list(filter(bt.match, x))[0].split("=")[1])
return(list(filter(bt.match, x))[0].split("=")[1])
def get_gene_ID(x):
return re.split("=|:", x[1].split(",")[0])[2]
bt = re.compile("gene_biotype=")
def get_biotype(x):
print(list(filter(bt.match, x))[0].split("=")[1])
return(list(filter(bt.match, x))[0].split("=")[1])
GRC = pd.read_csv(gene_data_path, sep="\t", skiprows=9, names=["Loci", "Annot", "biotype", "start", "end", "a", "direction", "b", "Info"])
GRC = GRC.loc[(GRC.biotype=='gene') | (GRC.biotype=="exon")]
GRC["Chr"] = GRC.Loci.apply(get_chr)
GRC["info_element"]=GRC.Info.str.split(";")
GRC["GeneID"] = GRC.Info.apply(get_gene_ID)
df_gene = GRC.loc[(GRC.biotype=='gene')]
df_exon = GRC.loc[(GRC.biotype=='exon')]
df_gene["gene_label"] = df_gene.info_element.apply(get_label)
df_gene["biotype"] = df_gene.info_element.apply(get_biotype)
df_gene["GeneID_split"] = df_gene.info_element.apply(get_gene_ID_split)
df_gene["GeneID"] = df_gene.Info.apply(get_gene_ID)
df_exon["exon_label"] = df_exon.Info.apply(get_NRexon_ID)
df_exon["GeneID"] = df_exon.Info.apply(get_gene_ID)
df_exon.set_index("GeneID", inplace=True)
shared_gene = df_exon.index.intersection(df_gene.GeneID)
df_exon = df_exon.loc[shared_gene]
df_exon.reset_index(inplace=True)
df_gene.sort_values(by=["Chr", "start"], inplace=True, ignore_index=True)
df_exon.sort_values(by=["Chr", "start", "end"], inplace=True, ignore_index=True)
GRC = pd.read_csv(gene_data_path, sep="\t", skiprows=9, nrows=10000, names=["Loci", "Annot", "biotype", "start", "end", "a", "direction", "b", "Info"])
#Filter out to keep only genes and exons
GRC = GRC.loc[(GRC.biotype=='gene') | (GRC.biotype=="exon")]
GRC["Chr"] = GRC.Loci.apply(get_chr)
GRC["info_element"]=GRC.Info.str.split(";")
df_exon = df_exon[["Chr", "exon_label", "GeneID","start","end","direction"]]
df_gene = df_gene[["Chr", "GeneID", "gene_label","start","end","direction","biotype"]]
GRC_gene = GRC.loc[(GRC.biotype=='gene')]
GRC_exon = GRC.loc[(GRC.biotype=='exon')]
GRC_gene["gene_label"] = GRC_gene.info_element.apply(get_label)
GRC_gene["GeneID"] = GRC_gene.info_element.apply(get_gene_ID)
GRC_gene["biotype"] = GRC_gene.info_element.apply(get_biotype)
return(df_gene, df_exon)
gene_data_path = "/home/hjulienn/Project/JASS_dev/jass/data/GRCh37_latest_genomic.gff"
dgen, dexon = add_gene_annotation(gene_data_path) #initTable_path=None, df_gene_csv_path=None, df_exon_csv_path=None)
df_gene, df_exon = add_gene_annotation_Hanna(gene_data_path)
print(GRC.shape)
print(GRC.head(30))
print(GRC.iloc[0,8])
print("Hanna func")
print(df_gene.head())
print(df_exon.head())
print(df_gene.shape)
print(df_exon.shape)
print(GRC_exon.head())
print("cyril func")
print(dgen.head())
print(dexon.head())
print("hanna func")
print(GRC_gene[["Chr", "GeneID", "gene_label","start","end","direction","biotype"]].head())
print(dgen.shape)
print(dexon.shape)
print(df_gene.compare(dgen))
print(df_exon.compare(dexon))
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment