Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Yoann DUFRESNE
linked reads molecule ordering
Commits
86fce860
Commit
86fce860
authored
May 12, 2020
by
Rayan Chikhi
Browse files
fix for accuracy of lcp-graph path detection algo
parent
01d8b380
Changes
1
Hide whitespace changes
Inline
Side-by-side
deconvolution/main/d2_path_evaluation.py
→
deconvolution/main/d2_
random_
path_evaluation.py
View file @
86fce860
...
...
@@ -33,7 +33,7 @@ import random
def
findRandomPath
(
G
,
u
,
n
,
previous_path_nodes
=
set
()):
if
n
==
0
:
return
[
u
]
poss_neigh
=
list
(
set
(
G
.
neighbors
(
u
))
-
previous_path_nodes
)
poss_neigh
=
list
(
set
(
G
.
neighbors
(
u
))
-
previous_path_nodes
-
set
([
u
])
)
if
len
(
poss_neigh
)
==
0
:
return
None
neighbor
=
random
.
choice
(
poss_neigh
)
new_previous_path_nodes
=
previous_path_nodes
|
set
([
u
])
...
...
@@ -42,29 +42,46 @@ def findRandomPath(G,u,n,previous_path_nodes=set()):
if
len
(
path
)
!=
n
:
return
None
return
[
u
]
+
path
"""
given as input a list of molecules in central nodes,
return a stripped-down list that only includes molecules that overlap their neighbor
"""
def
extract_likely_molecule_path_wrapper
(
path
,
starter_mol
):
debug
=
False
res
=
[]
direction
=
None
previous_mol
=
starter_mol
for
i
,
mols
in
enumerate
(
path
):
good_mol
=
None
for
mol
in
mols
:
if
(
abs
(
previous_mol
[
0
]
-
mol
[
0
])
<
9000
):
# the max distance between two starting points, if min overlap is 6000 and mols at 15kbp of length
""" shouldn't enforce directionality. why? because a->b->c can sometimes be a->c->b and still be correct, because a,b,c are a clique """
#if i == 0:
#direction = "decreasing" if previous_mol[0] > mol[0] else "increasing"
good_mol
=
mol
"""
else:
if direction == "increasing":
if previous_mol[0] <= mol[0]:
good_mol = mol
else:
if mol[0] <= previous_mol[0]:
good_mol = mol
"""
if
good_mol
is
None
:
if
debug
:
print
(
"starting"
,
starter_mol
,
"bad path"
,
path
,
res
)
return
None
previous_mol
=
good_mol
res
+=
[
good_mol
]
if
debug
:
print
(
"starting"
,
starter_mol
,
"good path"
,
path
,
res
)
return
res
import
itertools
""" determine, given an ordered list of central nodes, whether there exists a coherent paths of overlapping molecules in them """
def
is_there_path_acc
(
mols_in_central_nodes
,
min_overlap_length
=
5000
):
# don't consider overlaps smaller than 5kbp
for
mols
in
itertools
.
product
(
*
mols_in_central_nodes
):
#print(mols)
last_end
=
None
last_start
=
None
good_path
=
True
for
mol
in
sorted
(
mols
):
start
,
end
=
mol
if
last_end
is
None
:
last_end
=
end
if
last_start
is
None
:
last_start
=
start
if
not
(
start
>=
last_start
and
start
<=
last_end
-
min_overlap_length
):
#print("bad path",mols)
good_path
=
False
break
last_end
=
end
last_start
=
start
if
good_path
:
return
True
return
False
def
is_there_path_acc
(
path
):
# don't consider overlaps smaller than 5kbp
# try with all possible starting molecules
return
any
(
extract_likely_molecule_path_wrapper
(
path
[
1
:],
mol
)
is
not
None
for
mol
in
path
[
0
])
""" converts a central node of a d-graph into its list of molecules (given the ground truth) """
def
central_node_to_molecules
(
nodestr
):
...
...
@@ -80,20 +97,22 @@ def central_node_to_molecules(nodestr):
def
is_coherent_path
(
central_nodes
,
path_len
):
mols_in_central_nodes
=
[]
for
node
in
central_nodes
:
#print("central node",node)
cur_node_mols
=
central_node_to_molecules
(
node
)
mols_in_central_nodes
+=
[
cur_node_mols
]
assert
(
len
(
mols_in_central_nodes
)
==
path_len
+
1
)
return
is_there_path_acc
(
mols_in_central_nodes
)
#return True
def
get_barcode
(
x
):
if
'udg'
in
graph
.
nodes
[
x
]:
return
graph
.
nodes
[
x
][
'udg'
].
split
()[
0
]
return
graph
.
nodes
[
x
][
'udg'
].
split
(
']'
)[
0
]
# here the format may change..
else
:
return
graph
.
nodes
[
x
][
'label'
]
""" the main function that tests for accuracy of random paths """
graph
=
None
def
evaluate_accuracy_paths
(
path_len
,
max_paths_per_node
=
100
):
def
evaluate_accuracy_paths
(
path_len
,
max_paths_per_node
=
5
):
global
graph
nb_bad_paths
=
0
nb_good_paths
=
0
...
...
@@ -114,7 +133,8 @@ def evaluate_accuracy_paths(path_len,max_paths_per_node=100):
nb_good_paths
+=
1
else
:
nb_bad_paths
+=
1
print
(
"accuracy for l=%d:"
%
path_len
,
nb_good_paths
/
(
nb_good_paths
+
nb_bad_paths
))
nb_paths
=
nb_bad_paths
+
nb_good_paths
print
(
"accuracy for l=%d (%d paths, %d nodes explored):"
%
(
path_len
,
nb_paths
,
nb_paths
*
path_len
),
nb_good_paths
/
(
nb_good_paths
+
nb_bad_paths
))
# ---- sensitivity evaluation
...
...
@@ -160,9 +180,12 @@ def main():
args
=
parse_args
()
graph
=
load_graph
(
args
.
filename
)
p
=
Pool
(
4
)
p
.
map
(
evaluate_accuracy_paths
,
[
1
,
2
,
3
,
4
])
p
.
map
(
evaluate_sensitivity_paths
,
[
1
,
2
,
3
,
4
])
p
=
Pool
(
12
)
#p.map(evaluate_accuracy_paths, [1,2,4,6,8,10,15,20,50,100,200,500])
p
.
map
(
evaluate_accuracy_paths
,
[
2
,
4
,
10
,
100
])
p
.
join
()
#p.map(evaluate_sensitivity_paths, [1,2,3,4])
#evaluate_accuracy_paths(100)
if
__name__
==
"__main__"
:
main
()
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment