Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
J
jass
Manage
Activity
Members
Labels
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Deploy
Releases
Package Registry
Container Registry
Model registry
Operate
Terraform modules
Analyze
Value stream analytics
Contributor analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Léo HENCHES
jass
Commits
6c95fb77
Commit
6c95fb77
authored
3 years ago
by
Hanna JULIENNE
Browse files
Options
Downloads
Patches
Plain Diff
re-wrote parser for gene annotation PB in genome browser
parent
b036b240
Branches
dev_cnerin
No related tags found
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
jass/models/inittable.py
+51
-110
51 additions, 110 deletions
jass/models/inittable.py
read_gff.py
+64
-31
64 additions, 31 deletions
read_gff.py
with
115 additions
and
141 deletions
jass/models/inittable.py
+
51
−
110
View file @
6c95fb77
...
...
@@ -7,6 +7,7 @@ Created on Tue Mar 28 09:57:33 2017
import
os
import
glob
import
logging
import
re
from
pandas
import
HDFStore
,
DataFrame
,
read_csv
,
concat
,
options
,
read_hdf
# create (or open) an hdf5 file and opens in append mode
...
...
@@ -368,121 +369,61 @@ def add_gene_annotation(
:return: the dataframes df_gene and df_exon
:rtype: 2 PANDAS dataframes
"""
# lists containing gene data
gene_id_label
=
[]
gene_GeneID
=
[]
gene_chr
=
[]
gene_start
=
[]
gene_end
=
[]
gene_direction
=
[]
gene_biotype
=
[]
# lists containing exon data
exon_id_label
=
[]
exon_GeneID
=
[]
exon_chr
=
[]
exon_start
=
[]
exon_end
=
[]
exon_direction
=
[]
# temporary list containing the data of all exons
TMP__exon_id_label
=
[]
TMP__exon_GeneID
=
[]
TMP__exon_chr
=
[]
TMP__exon_start
=
[]
TMP__exon_end
=
[]
TMP__exon_direction
=
[]
fichier
=
open
(
gene_data_path
,
"
r
"
)
lignes
=
fichier
.
readlines
()
fichier
.
close
()
for
ligne
in
lignes
:
elements
=
ligne
.
split
(
"
\t
"
)
if
elements
[
0
].
startswith
(
"
NC_
"
):
decode_chr
=
elements
[
0
].
strip
(
"
NC_
"
).
split
(
"
.
"
)
chr
=
int
(
decode_chr
[
0
])
if
chr
<=
22
:
if
elements
[
2
]
==
"
gene
"
:
gene_chr
.
append
(
chr
)
gene_start
.
append
(
int
(
elements
[
3
]))
gene_end
.
append
(
int
(
elements
[
4
]))
gene_direction
.
append
(
elements
[
6
])
decode_id_1
=
elements
[
8
].
split
(
"
;
"
)
decode_id_2
=
decode_id_1
[
0
].
split
(
"
=
"
)
gene_id_label
.
append
(
decode_id_2
[
1
].
strip
(
"
gene-
"
))
decode_id_3
=
decode_id_1
[
1
].
split
(
"
,
"
)
decode_id_4
=
decode_id_3
[
0
].
split
(
"
=
"
)
decode_id_5
=
decode_id_4
[
1
].
split
(
"
:
"
)
gene_GeneID
.
append
(
decode_id_5
[
1
])
decode_id_6
=
decode_id_1
[
5
].
split
(
"
=
"
)
if
decode_id_6
[
0
]
==
"
gene_biotype
"
:
The_biotype
=
decode_id_6
[
1
].
rstrip
(
'
\r\n
'
)
gene_biotype
.
append
(
The_biotype
)
else
:
decode_id_7
=
decode_id_1
[
6
].
split
(
"
=
"
)
if
decode_id_7
[
0
]
==
"
gene_biotype
"
:
The_biotype
=
decode_id_7
[
1
].
rstrip
(
'
\r\n
'
)
gene_biotype
.
append
(
The_biotype
)
else
:
decode_id_8
=
decode_id_1
[
7
].
split
(
"
=
"
)
if
decode_id_8
[
0
]
==
"
gene_biotype
"
:
The_biotype
=
decode_id_8
[
1
].
rstrip
(
'
\r\n
'
)
gene_biotype
.
append
(
The_biotype
)
else
:
The_biotype
=
"
unknown
"
gene_biotype
.
append
(
The_biotype
)
elif
elements
[
2
]
==
"
exon
"
:
TMP__exon_chr
.
append
(
chr
)
TMP__exon_start
.
append
(
int
(
elements
[
3
]))
TMP__exon_end
.
append
(
int
(
elements
[
4
]))
TMP__exon_direction
.
append
(
elements
[
6
])
decode_id_1
=
elements
[
8
].
split
(
"
;
"
)
decode_id_2
=
decode_id_1
[
0
].
split
(
"
=
"
)
TMP__exon_id_label
.
append
(
decode_id_2
[
1
].
strip
(
"
exon-
"
))
decode_id_3
=
decode_id_1
[
2
].
split
(
"
,
"
)
decode_id_4
=
decode_id_3
[
0
].
split
(
"
=
"
)
decode_id_5
=
decode_id_4
[
1
].
split
(
"
:
"
)
TMP__exon_GeneID
.
append
(
decode_id_5
[
1
])
# We only keep the exons that correspond to a gene
for
i
in
range
(
len
(
TMP__exon_id_label
)):
if
TMP__exon_GeneID
[
i
]
in
gene_GeneID
:
exon_id_label
.
append
(
TMP__exon_id_label
[
i
])
exon_GeneID
.
append
(
TMP__exon_GeneID
[
i
])
exon_chr
.
append
(
TMP__exon_chr
[
i
])
exon_start
.
append
(
TMP__exon_start
[
i
])
exon_end
.
append
(
TMP__exon_end
[
i
])
exon_direction
.
append
(
TMP__exon_direction
[
i
])
# We insert genes and exons into dataframes
df_gene
=
DataFrame
(
{
"
Chr
"
:
gene_chr
,
"
GeneID
"
:
gene_GeneID
,
"
gene_label
"
:
gene_id_label
,
"
start
"
:
gene_start
,
"
end
"
:
gene_end
,
"
direction
"
:
gene_direction
,
"
biotype
"
:
gene_biotype
,
}
)
df_exon
=
DataFrame
(
{
"
Chr
"
:
exon_chr
,
"
exon_label
"
:
exon_id_label
,
"
GeneID
"
:
exon_GeneID
,
"
start
"
:
exon_start
,
"
end
"
:
exon_end
,
"
direction
"
:
exon_direction
,
}
)
# Parsing Helper
def
get_chr
(
x
):
return
(
int
(
re
.
split
(
"
NC_|\.
"
,
x
)[
1
]))
def
get_label
(
x
):
return
x
[
0
].
split
(
"
=
"
)[
1
].
strip
(
"
gene-
"
)
def
get_NRexon_ID
(
x
):
IDex
=
re
.
search
(
'
^ID=exon-([A-Z\_0-9\.\-]+);
'
,
x
)
if
IDex
:
return
(
IDex
.
group
(
1
))
def
get_gene_ID_split
(
x
):
return
re
.
split
(
"
=|:
"
,
x
[
1
].
split
(
"
,
"
)[
0
])[
2
]
def
get_gene_ID
(
x
):
IDex
=
re
.
search
(
'
Dbxref=GeneID:([0-9]+)[,;]
'
,
x
)
if
IDex
:
return
(
IDex
.
group
(
1
))
bt
=
re
.
compile
(
"
gene_biotype=
"
)
def
get_biotype
(
x
):
return
(
list
(
filter
(
bt
.
match
,
x
))[
0
].
split
(
"
=
"
)[
1
])
GRC
=
read_csv
(
gene_data_path
,
sep
=
"
\t
"
,
skiprows
=
9
,
names
=
[
"
Loci
"
,
"
Annot
"
,
"
biotype
"
,
"
start
"
,
"
end
"
,
"
a
"
,
"
direction
"
,
"
b
"
,
"
Info
"
])
GRC
=
GRC
.
loc
[(
GRC
.
biotype
==
'
gene
'
)
|
(
GRC
.
biotype
==
"
exon
"
)]
GRC
[
"
Chr
"
]
=
GRC
.
Loci
.
apply
(
get_chr
)
GRC
[
"
info_element
"
]
=
GRC
.
Info
.
str
.
split
(
"
;
"
)
GRC
[
"
GeneID
"
]
=
GRC
.
Info
.
apply
(
get_gene_ID
)
df_gene
=
GRC
.
loc
[(
GRC
.
biotype
==
'
gene
'
)]
df_exon
=
GRC
.
loc
[(
GRC
.
biotype
==
'
exon
'
)]
df_gene
[
"
gene_label
"
]
=
df_gene
.
info_element
.
apply
(
get_label
)
df_gene
[
"
biotype
"
]
=
df_gene
.
info_element
.
apply
(
get_biotype
)
df_gene
[
"
GeneID_split
"
]
=
df_gene
.
info_element
.
apply
(
get_gene_ID_split
)
df_gene
[
"
GeneID
"
]
=
df_gene
.
Info
.
apply
(
get_gene_ID
)
df_exon
[
"
exon_label
"
]
=
df_exon
.
Info
.
apply
(
get_NRexon_ID
)
df_exon
[
"
GeneID
"
]
=
df_exon
.
Info
.
apply
(
get_gene_ID
)
df_exon
.
set_index
(
"
GeneID
"
,
inplace
=
True
)
shared_gene
=
df_exon
.
index
.
intersection
(
df_gene
.
GeneID
)
df_exon
=
df_exon
.
loc
[
shared_gene
]
df_exon
.
reset_index
(
inplace
=
True
)
# The rows of the dataframes are sorted by chromosome number and start position (and end position for exon)
df_gene
.
sort_values
(
by
=
[
"
Chr
"
,
"
start
"
],
inplace
=
True
,
ignore_index
=
True
)
df_exon
.
sort_values
(
by
=
[
"
Chr
"
,
"
start
"
,
"
end
"
],
inplace
=
True
,
ignore_index
=
True
)
df_exon
=
df_exon
[[
"
Chr
"
,
"
exon_label
"
,
"
GeneID
"
,
"
start
"
,
"
end
"
,
"
direction
"
]]
df_gene
=
df_gene
[[
"
Chr
"
,
"
GeneID
"
,
"
gene_label
"
,
"
start
"
,
"
end
"
,
"
direction
"
,
"
biotype
"
]]
# This information about genes and exons is stored in an hdf5 file if possible
if
initTable_path
is
not
None
:
hdf_file
=
HDFStore
(
initTable_path
)
...
...
This diff is collapsed.
Click to expand it.
read_gff.py
+
64
−
31
View file @
6c95fb77
import
pandas
as
pd
import
re
def
add_gene_annotation
(
gene_data_path
#initTable_path=None, df_gene_csv_path=None, df_exon_csv_path=None
):
...
...
@@ -137,47 +138,79 @@ def add_gene_annotation(
return
(
df_gene
,
df_exon
)
#initTable_path=None, df_gene_csv_path=None, df_exon_csv_path=None
gene_data_path
=
"
/home/hjulienn/Project/jass/data/Extract_GRCh37.gff
"
dgen
,
dexon
=
add_gene_annotation
(
gene_data_path
)
print
(
dexon
.
head
())
print
(
dgen
.
head
())
def
add_gene_annotation_Hanna
(
gene_data_path
):
# Parsing Helper
def
get_chr
(
x
):
return
(
int
(
re
.
split
(
"
NC_|\.
"
,
x
)[
1
]))
def
get_label
(
x
):
return
x
[
0
].
split
(
"
=
"
)[
1
].
strip
(
"
gene-
"
)
def
get_NRexon_ID
(
x
):
IDex
=
re
.
search
(
'
^ID=exon-([A-Z\_0-9\.\-]+);
'
,
x
)
if
IDex
:
return
(
IDex
.
group
(
1
))
def
get_gene_ID_split
(
x
):
return
re
.
split
(
"
=|:
"
,
x
[
1
].
split
(
"
,
"
)[
0
])[
2
]
def
get_chr
(
x
):
return
(
int
(
re
.
split
(
"
NC_|\.
"
,
x
)[
1
]))
def
get_gene_ID
(
x
):
IDex
=
re
.
search
(
'
Dbxref=GeneID:([0-9]+)[,;]
'
,
x
)
if
IDex
:
return
(
IDex
.
group
(
1
))
def
get_label
(
x
):
return
x
[
0
].
split
(
"
=
"
)[
1
].
strip
(
"
gene-
"
)
bt
=
re
.
compile
(
"
gene_biotype=
"
)
def
get_biotype
(
x
):
print
(
list
(
filter
(
bt
.
match
,
x
))[
0
].
split
(
"
=
"
)[
1
])
return
(
list
(
filter
(
bt
.
match
,
x
))[
0
].
split
(
"
=
"
)[
1
])
def
get_gene_ID
(
x
):
return
re
.
split
(
"
=|:
"
,
x
[
1
].
split
(
"
,
"
)[
0
])[
2
]
bt
=
re
.
compile
(
"
gene_biotype=
"
)
def
get_biotype
(
x
):
print
(
list
(
filter
(
bt
.
match
,
x
))[
0
].
split
(
"
=
"
)[
1
])
return
(
list
(
filter
(
bt
.
match
,
x
))[
0
].
split
(
"
=
"
)[
1
])
GRC
=
pd
.
read_csv
(
gene_data_path
,
sep
=
"
\t
"
,
skiprows
=
9
,
names
=
[
"
Loci
"
,
"
Annot
"
,
"
biotype
"
,
"
start
"
,
"
end
"
,
"
a
"
,
"
direction
"
,
"
b
"
,
"
Info
"
])
GRC
=
GRC
.
loc
[(
GRC
.
biotype
==
'
gene
'
)
|
(
GRC
.
biotype
==
"
exon
"
)]
GRC
[
"
Chr
"
]
=
GRC
.
Loci
.
apply
(
get_chr
)
GRC
[
"
info_element
"
]
=
GRC
.
Info
.
str
.
split
(
"
;
"
)
GRC
[
"
GeneID
"
]
=
GRC
.
Info
.
apply
(
get_gene_ID
)
df_gene
=
GRC
.
loc
[(
GRC
.
biotype
==
'
gene
'
)]
df_exon
=
GRC
.
loc
[(
GRC
.
biotype
==
'
exon
'
)]
df_gene
[
"
gene_label
"
]
=
df_gene
.
info_element
.
apply
(
get_label
)
df_gene
[
"
biotype
"
]
=
df_gene
.
info_element
.
apply
(
get_biotype
)
df_gene
[
"
GeneID_split
"
]
=
df_gene
.
info_element
.
apply
(
get_gene_ID_split
)
df_gene
[
"
GeneID
"
]
=
df_gene
.
Info
.
apply
(
get_gene_ID
)
df_exon
[
"
exon_label
"
]
=
df_exon
.
Info
.
apply
(
get_NRexon_ID
)
df_exon
[
"
GeneID
"
]
=
df_exon
.
Info
.
apply
(
get_gene_ID
)
df_exon
.
set_index
(
"
GeneID
"
,
inplace
=
True
)
shared_gene
=
df_exon
.
index
.
intersection
(
df_gene
.
GeneID
)
df_exon
=
df_exon
.
loc
[
shared_gene
]
df_exon
.
reset_index
(
inplace
=
True
)
df_gene
.
sort_values
(
by
=
[
"
Chr
"
,
"
start
"
],
inplace
=
True
,
ignore_index
=
True
)
df_exon
.
sort_values
(
by
=
[
"
Chr
"
,
"
start
"
,
"
end
"
],
inplace
=
True
,
ignore_index
=
True
)
GRC
=
pd
.
read_csv
(
gene_data_path
,
sep
=
"
\t
"
,
skiprows
=
9
,
nrows
=
10000
,
names
=
[
"
Loci
"
,
"
Annot
"
,
"
biotype
"
,
"
start
"
,
"
end
"
,
"
a
"
,
"
direction
"
,
"
b
"
,
"
Info
"
])
#Filter out to keep only genes and exons
GRC
=
GRC
.
loc
[(
GRC
.
biotype
==
'
gene
'
)
|
(
GRC
.
biotype
==
"
exon
"
)]
GRC
[
"
Chr
"
]
=
GRC
.
Loci
.
apply
(
get_chr
)
GRC
[
"
info_element
"
]
=
GRC
.
Info
.
str
.
split
(
"
;
"
)
df_exon
=
df_exon
[[
"
Chr
"
,
"
exon_label
"
,
"
GeneID
"
,
"
start
"
,
"
end
"
,
"
direction
"
]]
df_gene
=
df_gene
[[
"
Chr
"
,
"
GeneID
"
,
"
gene_label
"
,
"
start
"
,
"
end
"
,
"
direction
"
,
"
biotype
"
]]
GRC_gene
=
GRC
.
loc
[(
GRC
.
biotype
==
'
gene
'
)]
GRC_exon
=
GRC
.
loc
[(
GRC
.
biotype
==
'
exon
'
)]
GRC_gene
[
"
gene_label
"
]
=
GRC_gene
.
info_element
.
apply
(
get_label
)
GRC_gene
[
"
GeneID
"
]
=
GRC_gene
.
info_element
.
apply
(
get_gene_ID
)
GRC_gene
[
"
biotype
"
]
=
GRC_gene
.
info_element
.
apply
(
get_biotype
)
return
(
df_gene
,
df_exon
)
gene_data_path
=
"
/home/hjulienn/Project/JASS_dev/jass/data/GRCh37_latest_genomic.gff
"
dgen
,
dexon
=
add_gene_annotation
(
gene_data_path
)
#initTable_path=None, df_gene_csv_path=None, df_exon_csv_path=None)
df_gene
,
df_exon
=
add_gene_annotation_Hanna
(
gene_data_path
)
print
(
GRC
.
shape
)
print
(
GRC
.
head
(
30
))
print
(
GRC
.
iloc
[
0
,
8
])
print
(
"
Hanna func
"
)
print
(
df_gene
.
head
())
print
(
df_exon
.
head
())
print
(
df_gene
.
shape
)
print
(
df_exon
.
shape
)
print
(
GRC_exon
.
head
())
print
(
"
cyril func
"
)
print
(
dgen
.
head
())
print
(
dexon
.
head
())
print
(
"
hanna func
"
)
print
(
GRC_gene
[[
"
Chr
"
,
"
GeneID
"
,
"
gene_label
"
,
"
start
"
,
"
end
"
,
"
direction
"
,
"
biotype
"
]].
head
())
print
(
dgen
.
shape
)
print
(
dexon
.
shape
)
print
(
df_gene
.
compare
(
dgen
))
print
(
df_exon
.
compare
(
dexon
))
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment