Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
J
jass
Manage
Activity
Members
Labels
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Deploy
Releases
Container Registry
Model registry
Analyze
Value stream analytics
Contributor analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Statistical-Genetics
jass
Commits
b036b240
Commit
b036b240
authored
3 years ago
by
Hanna JULIENNE
Browse files
Options
Downloads
Patches
Plain Diff
reproduce cyril parser in pandas for gene table
parent
58aa4962
No related branches found
No related tags found
1 merge request
!37
Draft: Dev cnerin
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
read_gff.py
+183
-0
183 additions, 0 deletions
read_gff.py
with
183 additions
and
0 deletions
read_gff.py
0 → 100644
+
183
−
0
View file @
b036b240
import
pandas
as
pd
import
re
def
add_gene_annotation
(
gene_data_path
#initTable_path=None, df_gene_csv_path=None, df_exon_csv_path=None
):
"""
add_gene_annotation
for the first 22 chromosomes, retrieves the label of the genes
and their position as well as those of the exons associated with the genes.
Then store this information in a hdf5 file
:param gene_data_path: path to the GFF file containing gene and exon data (for example, GRCh37_latest_genomic.gff)
:type gene_data_path: str
:param initTable_path: path to the file initTable.hdf5
:type initTable_path: str
:param df_gene_csv_path: path to the file df_gene.csv
:type df_gene_csv_path: str
:param df_exon_csv_path: path to the file df_exon.csv
:type df_exon_csv_path: str
:return: the dataframes df_gene and df_exon
:rtype: 2 PANDAS dataframes
"""
# lists containing gene data
gene_id_label
=
[]
gene_GeneID
=
[]
gene_chr
=
[]
gene_start
=
[]
gene_end
=
[]
gene_direction
=
[]
gene_biotype
=
[]
# lists containing exon data
exon_id_label
=
[]
exon_GeneID
=
[]
exon_chr
=
[]
exon_start
=
[]
exon_end
=
[]
exon_direction
=
[]
# temporary list containing the data of all exons
TMP__exon_id_label
=
[]
TMP__exon_GeneID
=
[]
TMP__exon_chr
=
[]
TMP__exon_start
=
[]
TMP__exon_end
=
[]
TMP__exon_direction
=
[]
fichier
=
open
(
gene_data_path
,
"
r
"
)
lignes
=
fichier
.
readlines
()
fichier
.
close
()
for
ligne
in
lignes
:
elements
=
ligne
.
split
(
"
\t
"
)
if
elements
[
0
].
startswith
(
"
NC_
"
):
decode_chr
=
elements
[
0
].
strip
(
"
NC_
"
).
split
(
"
.
"
)
chr
=
int
(
decode_chr
[
0
])
if
chr
<=
22
:
if
elements
[
2
]
==
"
gene
"
:
gene_chr
.
append
(
chr
)
gene_start
.
append
(
int
(
elements
[
3
]))
gene_end
.
append
(
int
(
elements
[
4
]))
gene_direction
.
append
(
elements
[
6
])
decode_id_1
=
elements
[
8
].
split
(
"
;
"
)
decode_id_2
=
decode_id_1
[
0
].
split
(
"
=
"
)
gene_id_label
.
append
(
decode_id_2
[
1
].
strip
(
"
gene-
"
))
decode_id_3
=
decode_id_1
[
1
].
split
(
"
,
"
)
decode_id_4
=
decode_id_3
[
0
].
split
(
"
=
"
)
decode_id_5
=
decode_id_4
[
1
].
split
(
"
:
"
)
gene_GeneID
.
append
(
decode_id_5
[
1
])
decode_id_6
=
decode_id_1
[
5
].
split
(
"
=
"
)
if
decode_id_6
[
0
]
==
"
gene_biotype
"
:
The_biotype
=
decode_id_6
[
1
].
rstrip
(
'
\r\n
'
)
gene_biotype
.
append
(
The_biotype
)
else
:
decode_id_7
=
decode_id_1
[
6
].
split
(
"
=
"
)
if
decode_id_7
[
0
]
==
"
gene_biotype
"
:
The_biotype
=
decode_id_7
[
1
].
rstrip
(
'
\r\n
'
)
gene_biotype
.
append
(
The_biotype
)
else
:
decode_id_8
=
decode_id_1
[
7
].
split
(
"
=
"
)
if
decode_id_8
[
0
]
==
"
gene_biotype
"
:
The_biotype
=
decode_id_8
[
1
].
rstrip
(
'
\r\n
'
)
gene_biotype
.
append
(
The_biotype
)
else
:
The_biotype
=
"
unknown
"
gene_biotype
.
append
(
The_biotype
)
elif
elements
[
2
]
==
"
exon
"
:
TMP__exon_chr
.
append
(
chr
)
TMP__exon_start
.
append
(
int
(
elements
[
3
]))
TMP__exon_end
.
append
(
int
(
elements
[
4
]))
TMP__exon_direction
.
append
(
elements
[
6
])
decode_id_1
=
elements
[
8
].
split
(
"
;
"
)
decode_id_2
=
decode_id_1
[
0
].
split
(
"
=
"
)
TMP__exon_id_label
.
append
(
decode_id_2
[
1
].
strip
(
"
exon-
"
))
decode_id_3
=
decode_id_1
[
2
].
split
(
"
,
"
)
decode_id_4
=
decode_id_3
[
0
].
split
(
"
=
"
)
decode_id_5
=
decode_id_4
[
1
].
split
(
"
:
"
)
TMP__exon_GeneID
.
append
(
decode_id_5
[
1
])
# We only keep the exons that correspond to a gene
for
i
in
range
(
len
(
TMP__exon_id_label
)):
if
TMP__exon_GeneID
[
i
]
in
gene_GeneID
:
exon_id_label
.
append
(
TMP__exon_id_label
[
i
])
exon_GeneID
.
append
(
TMP__exon_GeneID
[
i
])
exon_chr
.
append
(
TMP__exon_chr
[
i
])
exon_start
.
append
(
TMP__exon_start
[
i
])
exon_end
.
append
(
TMP__exon_end
[
i
])
exon_direction
.
append
(
TMP__exon_direction
[
i
])
# We insert genes and exons into dataframes
df_gene
=
pd
.
DataFrame
(
{
"
Chr
"
:
gene_chr
,
"
GeneID
"
:
gene_GeneID
,
"
gene_label
"
:
gene_id_label
,
"
start
"
:
gene_start
,
"
end
"
:
gene_end
,
"
direction
"
:
gene_direction
,
"
biotype
"
:
gene_biotype
,
}
)
df_exon
=
pd
.
DataFrame
(
{
"
Chr
"
:
exon_chr
,
"
exon_label
"
:
exon_id_label
,
"
GeneID
"
:
exon_GeneID
,
"
start
"
:
exon_start
,
"
end
"
:
exon_end
,
"
direction
"
:
exon_direction
,
}
)
# The rows of the dataframes are sorted by chromosome number and start position (and end position for exon)
df_gene
.
sort_values
(
by
=
[
"
Chr
"
,
"
start
"
],
inplace
=
True
,
ignore_index
=
True
)
df_exon
.
sort_values
(
by
=
[
"
Chr
"
,
"
start
"
,
"
end
"
],
inplace
=
True
,
ignore_index
=
True
)
return
(
df_gene
,
df_exon
)
#initTable_path=None, df_gene_csv_path=None, df_exon_csv_path=None
gene_data_path
=
"
/home/hjulienn/Project/jass/data/Extract_GRCh37.gff
"
dgen
,
dexon
=
add_gene_annotation
(
gene_data_path
)
print
(
dexon
.
head
())
print
(
dgen
.
head
())
def
get_chr
(
x
):
return
(
int
(
re
.
split
(
"
NC_|\.
"
,
x
)[
1
]))
def
get_label
(
x
):
return
x
[
0
].
split
(
"
=
"
)[
1
].
strip
(
"
gene-
"
)
def
get_gene_ID
(
x
):
return
re
.
split
(
"
=|:
"
,
x
[
1
].
split
(
"
,
"
)[
0
])[
2
]
bt
=
re
.
compile
(
"
gene_biotype=
"
)
def
get_biotype
(
x
):
print
(
list
(
filter
(
bt
.
match
,
x
))[
0
].
split
(
"
=
"
)[
1
])
return
(
list
(
filter
(
bt
.
match
,
x
))[
0
].
split
(
"
=
"
)[
1
])
GRC
=
pd
.
read_csv
(
gene_data_path
,
sep
=
"
\t
"
,
skiprows
=
9
,
nrows
=
10000
,
names
=
[
"
Loci
"
,
"
Annot
"
,
"
biotype
"
,
"
start
"
,
"
end
"
,
"
a
"
,
"
direction
"
,
"
b
"
,
"
Info
"
])
#Filter out to keep only genes and exons
GRC
=
GRC
.
loc
[(
GRC
.
biotype
==
'
gene
'
)
|
(
GRC
.
biotype
==
"
exon
"
)]
GRC
[
"
Chr
"
]
=
GRC
.
Loci
.
apply
(
get_chr
)
GRC
[
"
info_element
"
]
=
GRC
.
Info
.
str
.
split
(
"
;
"
)
GRC_gene
=
GRC
.
loc
[(
GRC
.
biotype
==
'
gene
'
)]
GRC_exon
=
GRC
.
loc
[(
GRC
.
biotype
==
'
exon
'
)]
GRC_gene
[
"
gene_label
"
]
=
GRC_gene
.
info_element
.
apply
(
get_label
)
GRC_gene
[
"
GeneID
"
]
=
GRC_gene
.
info_element
.
apply
(
get_gene_ID
)
GRC_gene
[
"
biotype
"
]
=
GRC_gene
.
info_element
.
apply
(
get_biotype
)
print
(
GRC
.
shape
)
print
(
GRC
.
head
(
30
))
print
(
GRC
.
iloc
[
0
,
8
])
print
(
GRC_exon
.
head
())
print
(
"
cyril func
"
)
print
(
dgen
.
head
())
print
(
dexon
.
head
())
print
(
"
hanna func
"
)
print
(
GRC_gene
[[
"
Chr
"
,
"
GeneID
"
,
"
gene_label
"
,
"
start
"
,
"
end
"
,
"
direction
"
,
"
biotype
"
]].
head
())
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment