diff --git a/jass/models/inittable.py b/jass/models/inittable.py index b6cde3dd69f879a63287ff392680a3cde58631c7..2efe9213e18ddebabed7261d0348892ab6fc1197 100644 --- a/jass/models/inittable.py +++ b/jass/models/inittable.py @@ -7,6 +7,7 @@ Created on Tue Mar 28 09:57:33 2017 import os import glob import logging +import re from pandas import HDFStore, DataFrame, read_csv, concat, options, read_hdf # create (or open) an hdf5 file and opens in append mode @@ -368,121 +369,61 @@ def add_gene_annotation( :return: the dataframes df_gene and df_exon :rtype: 2 PANDAS dataframes """ - # lists containing gene data - gene_id_label = [] - gene_GeneID = [] - gene_chr = [] - gene_start = [] - gene_end = [] - gene_direction = [] - gene_biotype = [] - - # lists containing exon data - exon_id_label = [] - exon_GeneID = [] - exon_chr = [] - exon_start = [] - exon_end = [] - exon_direction = [] - - # temporary list containing the data of all exons - TMP__exon_id_label = [] - TMP__exon_GeneID = [] - TMP__exon_chr = [] - TMP__exon_start = [] - TMP__exon_end = [] - TMP__exon_direction = [] - - fichier = open(gene_data_path, "r") - lignes = fichier.readlines() - fichier.close() - - for ligne in lignes: - elements = ligne.split("\t") - if elements[0].startswith("NC_"): - decode_chr = elements[0].strip("NC_").split(".") - chr = int(decode_chr[0]) - if chr <= 22: - if elements[2] == "gene": - gene_chr.append(chr) - gene_start.append(int(elements[3])) - gene_end.append(int(elements[4])) - gene_direction.append(elements[6]) - decode_id_1 = elements[8].split(";") - decode_id_2 = decode_id_1[0].split("=") - gene_id_label.append(decode_id_2[1].strip("gene-")) - decode_id_3 = decode_id_1[1].split(",") - decode_id_4 = decode_id_3[0].split("=") - decode_id_5 = decode_id_4[1].split(":") - gene_GeneID.append(decode_id_5[1]) - decode_id_6 = decode_id_1[5].split("=") - if decode_id_6[0] == "gene_biotype": - The_biotype = decode_id_6[1].rstrip('\r\n') - gene_biotype.append(The_biotype) - else: - decode_id_7 = decode_id_1[6].split("=") - if decode_id_7[0] == "gene_biotype": - The_biotype = decode_id_7[1].rstrip('\r\n') - gene_biotype.append(The_biotype) - else: - decode_id_8 = decode_id_1[7].split("=") - if decode_id_8[0] == "gene_biotype": - The_biotype = decode_id_8[1].rstrip('\r\n') - gene_biotype.append(The_biotype) - else: - The_biotype = "unknown" - gene_biotype.append(The_biotype) - - elif elements[2] == "exon": - TMP__exon_chr.append(chr) - TMP__exon_start.append(int(elements[3])) - TMP__exon_end.append(int(elements[4])) - TMP__exon_direction.append(elements[6]) - decode_id_1 = elements[8].split(";") - decode_id_2 = decode_id_1[0].split("=") - TMP__exon_id_label.append(decode_id_2[1].strip("exon-")) - decode_id_3 = decode_id_1[2].split(",") - decode_id_4 = decode_id_3[0].split("=") - decode_id_5 = decode_id_4[1].split(":") - TMP__exon_GeneID.append(decode_id_5[1]) - - # We only keep the exons that correspond to a gene - for i in range(len(TMP__exon_id_label)): - if TMP__exon_GeneID[i] in gene_GeneID: - exon_id_label.append(TMP__exon_id_label[i]) - exon_GeneID.append(TMP__exon_GeneID[i]) - exon_chr.append(TMP__exon_chr[i]) - exon_start.append(TMP__exon_start[i]) - exon_end.append(TMP__exon_end[i]) - exon_direction.append(TMP__exon_direction[i]) - - # We insert genes and exons into dataframes - df_gene = DataFrame( - { - "Chr": gene_chr, - "GeneID": gene_GeneID, - "gene_label": gene_id_label, - "start": gene_start, - "end": gene_end, - "direction": gene_direction, - "biotype": gene_biotype, - } - ) - df_exon = DataFrame( - { - "Chr": exon_chr, - "exon_label": exon_id_label, - "GeneID": exon_GeneID, - "start": exon_start, - "end": exon_end, - "direction": exon_direction, - } - ) + # Parsing Helper + def get_chr(x): + return(int(re.split("NC_|\.", x)[1])) + + def get_label(x): + return x[0].split("=")[1].strip("gene-") + + def get_NRexon_ID(x): + IDex = re.search('^ID=exon-([A-Z\_0-9\.\-]+);', x) + if IDex: + return(IDex.group(1)) + + def get_gene_ID_split(x): + return re.split("=|:", x[1].split(",")[0])[2] + + def get_gene_ID(x): + IDex = re.search('Dbxref=GeneID:([0-9]+)[,;]', x) + if IDex: + return(IDex.group(1)) + + bt = re.compile("gene_biotype=") + def get_biotype(x): + return(list(filter(bt.match, x))[0].split("=")[1]) + + + GRC = read_csv(gene_data_path, sep="\t", skiprows=9, names=["Loci", "Annot", "biotype", "start", "end", "a", "direction", "b", "Info"]) + + GRC = GRC.loc[(GRC.biotype=='gene') | (GRC.biotype=="exon")] + GRC["Chr"] = GRC.Loci.apply(get_chr) + GRC["info_element"]=GRC.Info.str.split(";") + GRC["GeneID"] = GRC.Info.apply(get_gene_ID) + + df_gene = GRC.loc[(GRC.biotype=='gene')] + df_exon = GRC.loc[(GRC.biotype=='exon')] + + df_gene["gene_label"] = df_gene.info_element.apply(get_label) + df_gene["biotype"] = df_gene.info_element.apply(get_biotype) + df_gene["GeneID_split"] = df_gene.info_element.apply(get_gene_ID_split) + df_gene["GeneID"] = df_gene.Info.apply(get_gene_ID) + + df_exon["exon_label"] = df_exon.Info.apply(get_NRexon_ID) + df_exon["GeneID"] = df_exon.Info.apply(get_gene_ID) + df_exon.set_index("GeneID", inplace=True) + shared_gene = df_exon.index.intersection(df_gene.GeneID) + + df_exon = df_exon.loc[shared_gene] + df_exon.reset_index(inplace=True) # The rows of the dataframes are sorted by chromosome number and start position (and end position for exon) df_gene.sort_values(by=["Chr", "start"], inplace=True, ignore_index=True) df_exon.sort_values(by=["Chr", "start", "end"], inplace=True, ignore_index=True) + df_exon = df_exon[["Chr", "exon_label", "GeneID","start","end","direction"]] + df_gene = df_gene[["Chr", "GeneID", "gene_label","start","end","direction","biotype"]] + # This information about genes and exons is stored in an hdf5 file if possible if initTable_path is not None: hdf_file = HDFStore(initTable_path) diff --git a/read_gff.py b/read_gff.py index d9b827d03962ca64e0c959f44e04eb67c4ade34d..10b8659e68daf9a07aad4dc7dc1eb95760816dc5 100644 --- a/read_gff.py +++ b/read_gff.py @@ -1,5 +1,6 @@ import pandas as pd import re + def add_gene_annotation( gene_data_path #initTable_path=None, df_gene_csv_path=None, df_exon_csv_path=None ): @@ -137,47 +138,79 @@ def add_gene_annotation( return (df_gene, df_exon) -#initTable_path=None, df_gene_csv_path=None, df_exon_csv_path=None -gene_data_path = "/home/hjulienn/Project/jass/data/Extract_GRCh37.gff" -dgen, dexon = add_gene_annotation(gene_data_path) -print(dexon.head()) -print(dgen.head()) +def add_gene_annotation_Hanna(gene_data_path): + + # Parsing Helper + def get_chr(x): + return(int(re.split("NC_|\.", x)[1])) + + def get_label(x): + return x[0].split("=")[1].strip("gene-") + + def get_NRexon_ID(x): + IDex = re.search('^ID=exon-([A-Z\_0-9\.\-]+);', x) + if IDex: + return(IDex.group(1)) + + def get_gene_ID_split(x): + return re.split("=|:", x[1].split(",")[0])[2] -def get_chr(x): - return(int(re.split("NC_|\.", x)[1])) + def get_gene_ID(x): + IDex = re.search('Dbxref=GeneID:([0-9]+)[,;]', x) + if IDex: + return(IDex.group(1)) -def get_label(x): - return x[0].split("=")[1].strip("gene-") + bt = re.compile("gene_biotype=") + def get_biotype(x): + print(list(filter(bt.match, x))[0].split("=")[1]) + return(list(filter(bt.match, x))[0].split("=")[1]) -def get_gene_ID(x): - return re.split("=|:", x[1].split(",")[0])[2] -bt = re.compile("gene_biotype=") -def get_biotype(x): - print(list(filter(bt.match, x))[0].split("=")[1]) - return(list(filter(bt.match, x))[0].split("=")[1]) + GRC = pd.read_csv(gene_data_path, sep="\t", skiprows=9, names=["Loci", "Annot", "biotype", "start", "end", "a", "direction", "b", "Info"]) + + GRC = GRC.loc[(GRC.biotype=='gene') | (GRC.biotype=="exon")] + GRC["Chr"] = GRC.Loci.apply(get_chr) + GRC["info_element"]=GRC.Info.str.split(";") + GRC["GeneID"] = GRC.Info.apply(get_gene_ID) + + df_gene = GRC.loc[(GRC.biotype=='gene')] + df_exon = GRC.loc[(GRC.biotype=='exon')] + + df_gene["gene_label"] = df_gene.info_element.apply(get_label) + df_gene["biotype"] = df_gene.info_element.apply(get_biotype) + df_gene["GeneID_split"] = df_gene.info_element.apply(get_gene_ID_split) + df_gene["GeneID"] = df_gene.Info.apply(get_gene_ID) + + df_exon["exon_label"] = df_exon.Info.apply(get_NRexon_ID) + df_exon["GeneID"] = df_exon.Info.apply(get_gene_ID) + df_exon.set_index("GeneID", inplace=True) + shared_gene = df_exon.index.intersection(df_gene.GeneID) + + df_exon = df_exon.loc[shared_gene] + df_exon.reset_index(inplace=True) + df_gene.sort_values(by=["Chr", "start"], inplace=True, ignore_index=True) + df_exon.sort_values(by=["Chr", "start", "end"], inplace=True, ignore_index=True) -GRC = pd.read_csv(gene_data_path, sep="\t", skiprows=9, nrows=10000, names=["Loci", "Annot", "biotype", "start", "end", "a", "direction", "b", "Info"]) -#Filter out to keep only genes and exons -GRC = GRC.loc[(GRC.biotype=='gene') | (GRC.biotype=="exon")] -GRC["Chr"] = GRC.Loci.apply(get_chr) -GRC["info_element"]=GRC.Info.str.split(";") + df_exon = df_exon[["Chr", "exon_label", "GeneID","start","end","direction"]] + df_gene = df_gene[["Chr", "GeneID", "gene_label","start","end","direction","biotype"]] -GRC_gene = GRC.loc[(GRC.biotype=='gene')] -GRC_exon = GRC.loc[(GRC.biotype=='exon')] -GRC_gene["gene_label"] = GRC_gene.info_element.apply(get_label) -GRC_gene["GeneID"] = GRC_gene.info_element.apply(get_gene_ID) -GRC_gene["biotype"] = GRC_gene.info_element.apply(get_biotype) + return(df_gene, df_exon) +gene_data_path = "/home/hjulienn/Project/JASS_dev/jass/data/GRCh37_latest_genomic.gff" +dgen, dexon = add_gene_annotation(gene_data_path) #initTable_path=None, df_gene_csv_path=None, df_exon_csv_path=None) +df_gene, df_exon = add_gene_annotation_Hanna(gene_data_path) -print(GRC.shape) -print(GRC.head(30)) -print(GRC.iloc[0,8]) +print("Hanna func") +print(df_gene.head()) +print(df_exon.head()) +print(df_gene.shape) +print(df_exon.shape) -print(GRC_exon.head()) print("cyril func") print(dgen.head()) print(dexon.head()) -print("hanna func") -print(GRC_gene[["Chr", "GeneID", "gene_label","start","end","direction","biotype"]].head()) +print(dgen.shape) +print(dexon.shape) +print(df_gene.compare(dgen)) +print(df_exon.compare(dexon))