Skip to content
Snippets Groups Projects
Commit b036b240 authored by Hanna  JULIENNE's avatar Hanna JULIENNE
Browse files

reproduce cyril parser in pandas for gene table

parent 58aa4962
No related branches found
No related tags found
1 merge request!37Draft: Dev cnerin
import pandas as pd
import re
def add_gene_annotation(
gene_data_path #initTable_path=None, df_gene_csv_path=None, df_exon_csv_path=None
):
"""
add_gene_annotation
for the first 22 chromosomes, retrieves the label of the genes
and their position as well as those of the exons associated with the genes.
Then store this information in a hdf5 file
:param gene_data_path: path to the GFF file containing gene and exon data (for example, GRCh37_latest_genomic.gff)
:type gene_data_path: str
:param initTable_path: path to the file initTable.hdf5
:type initTable_path: str
:param df_gene_csv_path: path to the file df_gene.csv
:type df_gene_csv_path: str
:param df_exon_csv_path: path to the file df_exon.csv
:type df_exon_csv_path: str
:return: the dataframes df_gene and df_exon
:rtype: 2 PANDAS dataframes
"""
# lists containing gene data
gene_id_label = []
gene_GeneID = []
gene_chr = []
gene_start = []
gene_end = []
gene_direction = []
gene_biotype = []
# lists containing exon data
exon_id_label = []
exon_GeneID = []
exon_chr = []
exon_start = []
exon_end = []
exon_direction = []
# temporary list containing the data of all exons
TMP__exon_id_label = []
TMP__exon_GeneID = []
TMP__exon_chr = []
TMP__exon_start = []
TMP__exon_end = []
TMP__exon_direction = []
fichier = open(gene_data_path, "r")
lignes = fichier.readlines()
fichier.close()
for ligne in lignes:
elements = ligne.split("\t")
if elements[0].startswith("NC_"):
decode_chr = elements[0].strip("NC_").split(".")
chr = int(decode_chr[0])
if chr <= 22:
if elements[2] == "gene":
gene_chr.append(chr)
gene_start.append(int(elements[3]))
gene_end.append(int(elements[4]))
gene_direction.append(elements[6])
decode_id_1 = elements[8].split(";")
decode_id_2 = decode_id_1[0].split("=")
gene_id_label.append(decode_id_2[1].strip("gene-"))
decode_id_3 = decode_id_1[1].split(",")
decode_id_4 = decode_id_3[0].split("=")
decode_id_5 = decode_id_4[1].split(":")
gene_GeneID.append(decode_id_5[1])
decode_id_6 = decode_id_1[5].split("=")
if decode_id_6[0] == "gene_biotype":
The_biotype = decode_id_6[1].rstrip('\r\n')
gene_biotype.append(The_biotype)
else:
decode_id_7 = decode_id_1[6].split("=")
if decode_id_7[0] == "gene_biotype":
The_biotype = decode_id_7[1].rstrip('\r\n')
gene_biotype.append(The_biotype)
else:
decode_id_8 = decode_id_1[7].split("=")
if decode_id_8[0] == "gene_biotype":
The_biotype = decode_id_8[1].rstrip('\r\n')
gene_biotype.append(The_biotype)
else:
The_biotype = "unknown"
gene_biotype.append(The_biotype)
elif elements[2] == "exon":
TMP__exon_chr.append(chr)
TMP__exon_start.append(int(elements[3]))
TMP__exon_end.append(int(elements[4]))
TMP__exon_direction.append(elements[6])
decode_id_1 = elements[8].split(";")
decode_id_2 = decode_id_1[0].split("=")
TMP__exon_id_label.append(decode_id_2[1].strip("exon-"))
decode_id_3 = decode_id_1[2].split(",")
decode_id_4 = decode_id_3[0].split("=")
decode_id_5 = decode_id_4[1].split(":")
TMP__exon_GeneID.append(decode_id_5[1])
# We only keep the exons that correspond to a gene
for i in range(len(TMP__exon_id_label)):
if TMP__exon_GeneID[i] in gene_GeneID:
exon_id_label.append(TMP__exon_id_label[i])
exon_GeneID.append(TMP__exon_GeneID[i])
exon_chr.append(TMP__exon_chr[i])
exon_start.append(TMP__exon_start[i])
exon_end.append(TMP__exon_end[i])
exon_direction.append(TMP__exon_direction[i])
# We insert genes and exons into dataframes
df_gene = pd.DataFrame(
{
"Chr": gene_chr,
"GeneID": gene_GeneID,
"gene_label": gene_id_label,
"start": gene_start,
"end": gene_end,
"direction": gene_direction,
"biotype": gene_biotype,
}
)
df_exon = pd.DataFrame(
{
"Chr": exon_chr,
"exon_label": exon_id_label,
"GeneID": exon_GeneID,
"start": exon_start,
"end": exon_end,
"direction": exon_direction,
}
)
# The rows of the dataframes are sorted by chromosome number and start position (and end position for exon)
df_gene.sort_values(by=["Chr", "start"], inplace=True, ignore_index=True)
df_exon.sort_values(by=["Chr", "start", "end"], inplace=True, ignore_index=True)
return (df_gene, df_exon)
#initTable_path=None, df_gene_csv_path=None, df_exon_csv_path=None
gene_data_path = "/home/hjulienn/Project/jass/data/Extract_GRCh37.gff"
dgen, dexon = add_gene_annotation(gene_data_path)
print(dexon.head())
print(dgen.head())
def get_chr(x):
return(int(re.split("NC_|\.", x)[1]))
def get_label(x):
return x[0].split("=")[1].strip("gene-")
def get_gene_ID(x):
return re.split("=|:", x[1].split(",")[0])[2]
bt = re.compile("gene_biotype=")
def get_biotype(x):
print(list(filter(bt.match, x))[0].split("=")[1])
return(list(filter(bt.match, x))[0].split("=")[1])
GRC = pd.read_csv(gene_data_path, sep="\t", skiprows=9, nrows=10000, names=["Loci", "Annot", "biotype", "start", "end", "a", "direction", "b", "Info"])
#Filter out to keep only genes and exons
GRC = GRC.loc[(GRC.biotype=='gene') | (GRC.biotype=="exon")]
GRC["Chr"] = GRC.Loci.apply(get_chr)
GRC["info_element"]=GRC.Info.str.split(";")
GRC_gene = GRC.loc[(GRC.biotype=='gene')]
GRC_exon = GRC.loc[(GRC.biotype=='exon')]
GRC_gene["gene_label"] = GRC_gene.info_element.apply(get_label)
GRC_gene["GeneID"] = GRC_gene.info_element.apply(get_gene_ID)
GRC_gene["biotype"] = GRC_gene.info_element.apply(get_biotype)
print(GRC.shape)
print(GRC.head(30))
print(GRC.iloc[0,8])
print(GRC_exon.head())
print("cyril func")
print(dgen.head())
print(dexon.head())
print("hanna func")
print(GRC_gene[["Chr", "GeneID", "gene_label","start","end","direction","biotype"]].head())
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment