Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
linked reads molecule ordering
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Yoann DUFRESNE
linked reads molecule ordering
Commits
80b70c7a
Commit
80b70c7a
authored
5 years ago
by
Rayan Chikhi
Browse files
Options
Downloads
Patches
Plain Diff
another wave of bugfixes on the eval script
parent
31a82afc
No related branches found
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
deconvolution/main/d2_path_evaluation.py
+28
-17
28 additions, 17 deletions
deconvolution/main/d2_path_evaluation.py
with
28 additions
and
17 deletions
deconvolution/main/d2_path_evaluation.py
+
28
−
17
View file @
80b70c7a
...
...
@@ -28,40 +28,45 @@ def load_graph(filename):
print
(
"
Wrong file format. Require graphml or gefx format
"
,
file
=
sys
.
stderr
)
exit
()
"""
return a random path in G starting in u and having n
nod
es
"""
"""
return a random path in G starting in u and having n
_edg
es
_
"""
import
random
def findRandomPath(G, u, n, previous_path_nodes=None):
    """Return a random path in G starting at u and made of n edges.

    The walk is greedy: at every step one neighbor of the current node is
    picked uniformly at random among those not yet excluded, and there is no
    backtracking when the walk gets stuck.

    Args:
        G: graph-like object exposing ``neighbors(node)`` returning an
            iterable of hashable nodes.
        u: node the path starts from.
        n: number of edges of the path; the result holds ``n + 1`` nodes.
        previous_path_nodes: optional collection of nodes the path must not
            step onto. It is copied, never modified.

    Returns:
        The list of nodes of the path, beginning with ``u``, or ``None`` when
        the walk reaches a node with no usable neighbor before n edges.
    """
    # Work on a private copy so neither the caller's collection nor a shared
    # default object can ever be modified.
    if previous_path_nodes is None:
        excluded = set()
    else:
        excluded = set(previous_path_nodes)

    path = [u]
    current = u
    # Iterating (rather than recursing once per edge) keeps long paths clear
    # of the interpreter recursion limit; the random choices are drawn in the
    # same order as a node-by-node recursive walk would draw them.
    for _ in range(n):
        candidates = list(set(G.neighbors(current)) - excluded)
        if not candidates:
            return None
        # The current node becomes forbidden only after its own candidates
        # have been computed.
        excluded.add(current)
        current = random.choice(candidates)
        path.append(current)
    return path
import
itertools
def
is_there_path_acc
(
central_nodes
,
overlap_length
):
for
mols
in
itertools
.
product
(
*
central_nodes
):
"""
determine, given an ordered list of central nodes, whether there exists a coherent path of overlapping molecules in them
"""
def
is_there_path_acc
(
mols_in_
central_nodes
,
min_
overlap_length
=
5000
):
# don't consider overlaps smaller than 5kbp
for
mols
in
itertools
.
product
(
*
mols_in_
central_nodes
):
#print(mols)
last_end
=
None
last_start
=
None
good_path
=
True
for
mol
in
mols
:
for
mol
in
sorted
(
mols
)
:
start
,
end
=
mol
if
last_end
is
None
:
last_end
=
end
else
:
if
start
>
last_end
-
overlap_length
:
#print("bad path",mols)
good_path
=
False
break
if
last_start
is
None
:
last_start
=
start
if
not
(
start
>=
last_start
and
start
<=
last_end
-
min_overlap_length
):
#print("bad path",mols)
good_path
=
False
break
last_end
=
end
last_start
=
start
if
good_path
:
return
True
return
False
"""
converts a central node of a d-graph into its list of molecules (given the ground truth)
"""
def
central_node_to_molecules
(
nodestr
):
# format for a 2-merge: 1:NC_000913.3_298281_313280_0:0:0_0:0:0_2fb/1_NC_000913.3_338611_353610_0:0:0_0:0:0_37b/1
cur_node_mols
=
[]
...
...
@@ -72,15 +77,17 @@ def central_node_to_molecules(nodestr):
cur_node_mols
+=
[(
start
,
end
)]
return
cur_node_mols
def
is_coherent_path
(
central_nodes
,
overlap
_len
gth
):
mols
=
[]
def
is_coherent_path
(
central_nodes
,
path
_len
):
mols
_in_central_nodes
=
[]
for
node
in
central_nodes
:
cur_node_mols
=
central_node_to_molecules
(
node
)
mols
+=
[
cur_node_mols
]
return
is_there_path_acc
(
mols
,
overlap_length
)
mols_in_central_nodes
+=
[
cur_node_mols
]
assert
(
len
(
mols_in_central_nodes
)
==
path_len
+
1
)
return
is_there_path_acc
(
mols_in_central_nodes
)
"""
the main function that tests for accuracy of random paths
"""
graph
=
None
def
evaluate_accuracy_paths
(
path_len
,
overlap_length
=
7000
,
max_paths_per_node
=
100
):
def
evaluate_accuracy_paths
(
path_len
,
max_paths_per_node
=
100
):
global
graph
nb_bad_paths
=
0
nb_good_paths
=
0
...
...
@@ -92,10 +99,12 @@ def evaluate_accuracy_paths(path_len,overlap_length=7000,max_paths_per_node=100)
if
path
is
None
:
continue
if
tuple
(
sorted
(
path
))
in
seen_paths
:
continue
# avoids looking at the same path twice
seen_paths
.
add
(
tuple
(
sorted
(
path
)))
assert
(
len
(
path
)
==
path_len
+
1
)
#print("path",path)
central_nodes
=
[
graph
.
nodes
[
x
][
'
udg
'
].
split
()[
0
]
for
x
in
path
]
assert
(
len
(
central_nodes
)
==
path_len
+
1
)
#print(path,central_nodes)
if
is_coherent_path
(
central_nodes
,
overlap
_len
gth
):
if
is_coherent_path
(
central_nodes
,
path
_len
):
nb_good_paths
+=
1
else
:
nb_bad_paths
+=
1
...
...
@@ -103,6 +112,8 @@ def evaluate_accuracy_paths(path_len,overlap_length=7000,max_paths_per_node=100)
# ---- sensitivity evaluation
"""
Given an ordered list of molecules, determine if the graph contains a path of central nodes which have these molecules.
it does that by testing all possible combinations of d-graphs having those molecules in their central nodes
"""
def
is_there_path
(
graph
,
molecules_to_nodes
,
sought_path
):
possible_central_nodes
=
[]
for
mol
in
sought_path
:
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment