Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Yoann DUFRESNE
linked reads molecule ordering
Commits
264169f4
Commit
264169f4
authored
Aug 07, 2019
by
rchikhi
Browse files
annotation of d2-graphs given ground truth
parent
4e175f2c
Changes
1
Hide whitespace changes
Inline
Side-by-side
deconvolution/evaluate.py
View file @
264169f4
...
...
@@ -13,8 +13,8 @@ def parse_args():
parser
=
argparse
.
ArgumentParser
(
description
=
'Process some integers.'
)
parser
.
add_argument
(
'filename'
,
type
=
str
,
help
=
'The file to evalute'
)
parser
.
add_argument
(
'--type'
,
'-t'
,
choices
=
[
"d2"
,
"path"
],
default
=
"path"
,
required
=
True
,
help
=
"Define the data type to evaluate. Must be 'd2' or 'path'."
)
parser
.
add_argument
(
'--type'
,
'-t'
,
choices
=
[
"d2"
,
"path"
,
"d2-2annotate"
],
default
=
"path"
,
required
=
True
,
help
=
"Define the data type to evaluate. Must be 'd2' or 'path'
or 'd2-2annotate' (Rayan's hack)
."
)
parser
.
add_argument
(
'--light-print'
,
'-l'
,
action
=
'store_true'
,
help
=
'Print only wrong nodes and paths'
)
parser
.
add_argument
(
'--optimization_file'
,
'-o'
,
...
...
@@ -33,6 +33,17 @@ def load_graph(filename):
print
(
"Wrong file format. Require graphml or gefx format"
,
file
=
sys
.
stderr
)
exit
()
def
save_graph
(
g
,
filename
):
if
filename
.
endswith
(
'.graphml'
):
return
nx
.
write_graphml
(
g
,
filename
)
elif
filename
.
endswith
(
'.gexf'
):
return
nx
.
write_gexf
(
g
,
filename
)
else
:
print
(
"Wrong file format. Require graphml or gefx format"
,
file
=
sys
.
stderr
)
exit
()
# ------------- Path Graph -------------
...
...
@@ -378,6 +389,79 @@ def compute_covered_variables(optimization_file, path):
return
vars
# returns True iff there exist x in mol1 such that there exists y in mol2 and |x-y| <= some_value
def
nearby_udg_molecules
(
mols1
,
mols2
):
for
x
in
mols1
:
for
y
in
mols2
:
if
abs
(
x
-
y
)
<=
5
:
return
True
return
False
def
verify_graph_edges
(
d2_component
):
udg_molecules_dict
=
dict
()
for
node
in
d2_component
.
nodes
():
# Parse the current node name
head
,
c1
,
c2
=
parse_dg_name
(
node
)
# Construct the molecule(s) that this udg really 'reflects'
# i.e. the udg has a central node and two cliques
# that central nodes is the result of merging of several molecules
# ideally, only one of them is connected to the cliques
# (there could be more than one though; in that case the udg is 'ambiguous')
# udg_molecules aims to reflect the molecule(s) underlying this udg
udg_molecules
=
set
()
# Get the current molecule idxs of central node
molecule_idxs
=
mols_from_node
(
head
[
1
])
#print("mol idxs", molecule_idxs)
# Examine molecule idx's of cliques to see which are close to the central node
# rationale: c1/c2 contain nearby molecule id's
for
mol_idx
in
molecule_idxs
:
nexts
=
[]
for
c
in
[
c1
,
c2
]:
for
c_node
in
c
:
nei_mols
=
mols_from_node
(
c_node
.
split
()[
0
])
nei_mols
=
[
x
for
x
in
nei_mols
if
x
>
mol_idx
]
# fixme: also look at the <= molecules for more robustness
# If there are molecule next
if
len
(
nei_mols
)
>
0
:
next_nei_mol
=
min
(
nei_mols
)
nexts
.
append
((
next_nei_mol
))
nexts
.
sort
(
key
=
lambda
x
:
x
)
quality
=
sum
([
1.0
/
x
if
mol_idx
+
x
in
nexts
else
0
for
x
in
range
(
1
,
6
)])
/
sum
([
1.0
/
x
for
x
in
range
(
1
,
6
)])
if
quality
>
0.6
:
udg_molecules
.
add
(
mol_idx
)
#print("mol",mol_idx,molecule_idxs,"quality",quality,"nexts",nexts)
udg_molecules_dict
[
head
[
0
]]
=
udg_molecules
# Then: annotate edges as to whether they're real (their udg_molecule(s) are nearby) or fake
for
n1
,
n2
,
data
in
d2_component
.
edges
(
data
=
True
):
# Parse the current node name
head
,
c1
,
c2
=
parse_dg_name
(
n1
)
node_udg_molecules
=
udg_molecules_dict
[
head
[
0
]]
n_head
,
n_c1
,
n_c2
=
parse_dg_name
(
n2
)
neighbor_udg_molecules
=
udg_molecules_dict
[
n_head
[
0
]]
if
nearby_udg_molecules
(
node_udg_molecules
,
neighbor_udg_molecules
):
color
=
'green'
else
:
color
=
'red'
data
[
'color'
]
=
color
print
(
"edge"
,
node_udg_molecules
,
neighbor_udg_molecules
,
color
)
# also, annotate nodes by their putative molecule found
for
n
,
data
in
d2_component
.
nodes
(
data
=
True
):
# Parse the current node name
head
,
c1
,
c2
=
parse_dg_name
(
n
)
node_udg_molecules
=
udg_molecules_dict
[
head
[
0
]]
data
[
'udg_molecule'
]
=
'_'
.
join
(
list
(
map
(
str
,
node_udg_molecules
)))
def
main
():
args
=
parse_args
()
...
...
@@ -398,7 +482,20 @@ def main():
if
args
.
optimization_file
and
len
(
args
.
optimization_file
)
>
0
:
covered_vars
=
compute_covered_variables
(
args
.
optimization_file
,
longest_path
)
print_d2_summary
(
components
,
longest_path
,
covered_vars
=
covered_vars
,
light_print
=
args
.
light_print
)
# added by Rayan
# example:
# python evaluate.py --type d2-2annotate ~/Dropbox/cnrs/projects/10x-barcodes/yoann_to_cedric_ilp/d2_graph.gexf
elif
args
.
type
==
"d2-2annotate"
:
components
=
list
(
nx
.
connected_components
(
graph
))
components
.
sort
(
key
=
lambda
x
:
-
len
(
x
))
component
=
graph
.
subgraph
(
components
[
0
])
verify_graph_edges
(
component
)
extension
=
args
.
filename
.
split
(
'.'
)[
-
1
]
base_filename
=
'.'
.
join
(
args
.
filename
.
split
(
'.'
)[:
-
1
])
save_graph
(
component
,
base_filename
+
".verified."
+
extension
)
if
__name__
==
"__main__"
:
main
()
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment