diff --git a/read_gff.py b/read_gff.py new file mode 100644 index 0000000000000000000000000000000000000000..d9b827d03962ca64e0c959f44e04eb67c4ade34d --- /dev/null +++ b/read_gff.py @@ -0,0 +1,183 @@ +import pandas as pd +import re +def add_gene_annotation( + gene_data_path #initTable_path=None, df_gene_csv_path=None, df_exon_csv_path=None +): + """ + add_gene_annotation + for the first 22 chromosomes, retrieves the label of the genes + and their position as well as those of the exons associated with the genes. + Then store this information in a hdf5 file + + :param gene_data_path: path to the GFF file containing gene and exon data (for example, GRCh37_latest_genomic.gff) + :type gene_data_path: str + :param initTable_path: path to the file initTable.hdf5 + :type initTable_path: str + :param df_gene_csv_path: path to the file df_gene.csv + :type df_gene_csv_path: str + :param df_exon_csv_path: path to the file df_exon.csv + :type df_exon_csv_path: str + + :return: the dataframes df_gene and df_exon + :rtype: 2 PANDAS dataframes + """ + # lists containing gene data + gene_id_label = [] + gene_GeneID = [] + gene_chr = [] + gene_start = [] + gene_end = [] + gene_direction = [] + gene_biotype = [] + + # lists containing exon data + exon_id_label = [] + exon_GeneID = [] + exon_chr = [] + exon_start = [] + exon_end = [] + exon_direction = [] + + # temporary list containing the data of all exons + TMP__exon_id_label = [] + TMP__exon_GeneID = [] + TMP__exon_chr = [] + TMP__exon_start = [] + TMP__exon_end = [] + TMP__exon_direction = [] + + fichier = open(gene_data_path, "r") + lignes = fichier.readlines() + fichier.close() + + for ligne in lignes: + elements = ligne.split("\t") + if elements[0].startswith("NC_"): + decode_chr = elements[0].strip("NC_").split(".") + chr = int(decode_chr[0]) + if chr <= 22: + if elements[2] == "gene": + gene_chr.append(chr) + gene_start.append(int(elements[3])) + gene_end.append(int(elements[4])) + gene_direction.append(elements[6]) + decode_id_1 = elements[8].split(";") + decode_id_2 = decode_id_1[0].split("=") + gene_id_label.append(decode_id_2[1].strip("gene-")) + decode_id_3 = decode_id_1[1].split(",") + decode_id_4 = decode_id_3[0].split("=") + decode_id_5 = decode_id_4[1].split(":") + gene_GeneID.append(decode_id_5[1]) + decode_id_6 = decode_id_1[5].split("=") + if decode_id_6[0] == "gene_biotype": + The_biotype = decode_id_6[1].rstrip('\r\n') + gene_biotype.append(The_biotype) + else: + decode_id_7 = decode_id_1[6].split("=") + if decode_id_7[0] == "gene_biotype": + The_biotype = decode_id_7[1].rstrip('\r\n') + gene_biotype.append(The_biotype) + else: + decode_id_8 = decode_id_1[7].split("=") + if decode_id_8[0] == "gene_biotype": + The_biotype = decode_id_8[1].rstrip('\r\n') + gene_biotype.append(The_biotype) + else: + The_biotype = "unknown" + gene_biotype.append(The_biotype) + + elif elements[2] == "exon": + TMP__exon_chr.append(chr) + TMP__exon_start.append(int(elements[3])) + TMP__exon_end.append(int(elements[4])) + TMP__exon_direction.append(elements[6]) + decode_id_1 = elements[8].split(";") + decode_id_2 = decode_id_1[0].split("=") + TMP__exon_id_label.append(decode_id_2[1].strip("exon-")) + decode_id_3 = decode_id_1[2].split(",") + decode_id_4 = decode_id_3[0].split("=") + decode_id_5 = decode_id_4[1].split(":") + TMP__exon_GeneID.append(decode_id_5[1]) + + # We only keep the exons that correspond to a gene + for i in range(len(TMP__exon_id_label)): + if TMP__exon_GeneID[i] in gene_GeneID: + exon_id_label.append(TMP__exon_id_label[i]) + exon_GeneID.append(TMP__exon_GeneID[i]) + exon_chr.append(TMP__exon_chr[i]) + exon_start.append(TMP__exon_start[i]) + exon_end.append(TMP__exon_end[i]) + exon_direction.append(TMP__exon_direction[i]) + + # We insert genes and exons into dataframes + df_gene = pd.DataFrame( + { + "Chr": gene_chr, + "GeneID": gene_GeneID, + "gene_label": gene_id_label, + "start": gene_start, + "end": gene_end, + "direction": gene_direction, + "biotype": gene_biotype, + } + ) + df_exon = pd.DataFrame( + { + "Chr": exon_chr, + "exon_label": exon_id_label, + "GeneID": exon_GeneID, + "start": exon_start, + "end": exon_end, + "direction": exon_direction, + } + ) + # The rows of the dataframes are sorted by chromosome number and start position (and end position for exon) + df_gene.sort_values(by=["Chr", "start"], inplace=True, ignore_index=True) + df_exon.sort_values(by=["Chr", "start", "end"], inplace=True, ignore_index=True) + + return (df_gene, df_exon) + +#initTable_path=None, df_gene_csv_path=None, df_exon_csv_path=None +gene_data_path = "/home/hjulienn/Project/jass/data/Extract_GRCh37.gff" +dgen, dexon = add_gene_annotation(gene_data_path) +print(dexon.head()) +print(dgen.head()) + + +def get_chr(x): + return(int(re.split("NC_|\.", x)[1])) + +def get_label(x): + return x[0].split("=")[1].strip("gene-") + +def get_gene_ID(x): + return re.split("=|:", x[1].split(",")[0])[2] + +bt = re.compile("gene_biotype=") +def get_biotype(x): + print(list(filter(bt.match, x))[0].split("=")[1]) + return(list(filter(bt.match, x))[0].split("=")[1]) + +GRC = pd.read_csv(gene_data_path, sep="\t", skiprows=9, nrows=10000, names=["Loci", "Annot", "biotype", "start", "end", "a", "direction", "b", "Info"]) +#Filter out to keep only genes and exons +GRC = GRC.loc[(GRC.biotype=='gene') | (GRC.biotype=="exon")] +GRC["Chr"] = GRC.Loci.apply(get_chr) +GRC["info_element"]=GRC.Info.str.split(";") + +GRC_gene = GRC.loc[(GRC.biotype=='gene')] +GRC_exon = GRC.loc[(GRC.biotype=='exon')] +GRC_gene["gene_label"] = GRC_gene.info_element.apply(get_label) +GRC_gene["GeneID"] = GRC_gene.info_element.apply(get_gene_ID) +GRC_gene["biotype"] = GRC_gene.info_element.apply(get_biotype) + + +print(GRC.shape) +print(GRC.head(30)) +print(GRC.iloc[0,8]) + +print(GRC_exon.head()) +print("cyril func") +print(dgen.head()) +print(dexon.head()) +print("hanna func") +print(GRC_gene[["Chr", "GeneID", "gene_label","start","end","direction","biotype"]].head())