Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Yoann DUFRESNE
linked reads molecule ordering
Commits
96a90ed9
Commit
96a90ed9
authored
Aug 08, 2019
by
Yoann Dufresne
Browse files
Merge branch 'master' of gitlab.pasteur.fr:ydufresne/10x-deconvolve
parents
d7e6c8cf
c880bb1d
Changes
1
Show whitespace changes
Inline
Side-by-side
deconvolution/evaluate.py
View file @
96a90ed9
...
...
@@ -13,8 +13,8 @@ def parse_args():
parser
=
argparse
.
ArgumentParser
(
description
=
'Process some integers.'
)
parser
.
add_argument
(
'filename'
,
type
=
str
,
help
=
'The file to evalute'
)
parser
.
add_argument
(
'--type'
,
'-t'
,
choices
=
[
"d2"
,
"path"
],
default
=
"path"
,
required
=
True
,
help
=
"Define the data type to evaluate. Must be 'd2' or 'path'."
)
parser
.
add_argument
(
'--type'
,
'-t'
,
choices
=
[
"d2"
,
"path"
,
"d2-2annotate"
],
default
=
"path"
,
required
=
True
,
help
=
"Define the data type to evaluate. Must be 'd2' or 'path'
or 'd2-2annotate' (Rayan's hack)
."
)
parser
.
add_argument
(
'--light-print'
,
'-l'
,
action
=
'store_true'
,
help
=
'Print only wrong nodes and paths'
)
parser
.
add_argument
(
'--optimization_file'
,
'-o'
,
...
...
@@ -33,6 +33,17 @@ def load_graph(filename):
print
(
"Wrong file format. Require graphml or gefx format"
,
file
=
sys
.
stderr
)
exit
()
def
save_graph
(
g
,
filename
):
if
filename
.
endswith
(
'.graphml'
):
return
nx
.
write_graphml
(
g
,
filename
)
elif
filename
.
endswith
(
'.gexf'
):
return
nx
.
write_gexf
(
g
,
filename
)
else
:
print
(
"Wrong file format. Require graphml or gefx format"
,
file
=
sys
.
stderr
)
exit
()
# ------------- Path Graph -------------
...
...
@@ -378,6 +389,106 @@ def compute_covered_variables(optimization_file, path):
return
vars
# returns True iff there exist x in mol1 such that there exists y in mol2 and |x-y| <= some_value
def
nearby_udg_molecules
(
mols1
,
mols2
):
for
x
in
mols1
:
for
y
in
mols2
:
if
abs
(
x
-
y
)
<=
5
:
return
True
return
False
def
verify_graph_edges
(
d2_component
):
udg_molecules_dict
=
dict
()
for
node
in
d2_component
.
nodes
():
# Parse the current node name
head
,
c1
,
c2
=
parse_dg_name
(
node
)
# Construct the molecule(s) that this udg really 'reflects'
# i.e. the udg has a central node and two cliques
# that central nodes is the result of merging of several molecules
# ideally, only one of them is connected to the cliques
# (there could be more than one though; in that case the udg is 'ambiguous')
# udg_molecules aims to reflect the molecule(s) underlying this udg
udg_molecules
=
set
()
# Get the current molecule idxs of central node
molecule_idxs
=
mols_from_node
(
head
[
1
])
#print("mol idxs", molecule_idxs)
# Examine molecule idx's of cliques to see which are close to the central node
# rationale: c1/c2 contain nearby molecule id's
for
mol_idx
in
molecule_idxs
:
nexts
=
[]
for
c
in
[
c1
,
c2
]:
for
c_node
in
c
:
nei_mols
=
mols_from_node
(
c_node
.
split
()[
0
])
nei_mols
=
[
x
for
x
in
nei_mols
if
x
>
mol_idx
]
# fixme: also look at the <= molecules for more robustness
# If there are molecule next
if
len
(
nei_mols
)
>
0
:
next_nei_mol
=
min
(
nei_mols
)
nexts
.
append
((
next_nei_mol
))
nexts
.
sort
(
key
=
lambda
x
:
x
)
quality
=
sum
([
1.0
/
x
if
mol_idx
+
x
in
nexts
else
0
for
x
in
range
(
1
,
6
)])
/
sum
([
1.0
/
x
for
x
in
range
(
1
,
6
)])
if
quality
>
0.6
:
udg_molecules
.
add
(
mol_idx
)
#print("mol",mol_idx,molecule_idxs,"quality",quality,"nexts",nexts)
udg_molecules_dict
[
head
[
0
]]
=
udg_molecules
# Then: annotate edges as to whether they're real (their udg_molecule(s) are nearby) or fake
for
n1
,
n2
,
data
in
d2_component
.
edges
(
data
=
True
):
# Parse the current node name
head
,
c1
,
c2
=
parse_dg_name
(
n1
)
node_udg_molecules
=
udg_molecules_dict
[
head
[
0
]]
n_head
,
n_c1
,
n_c2
=
parse_dg_name
(
n2
)
neighbor_udg_molecules
=
udg_molecules_dict
[
n_head
[
0
]]
if
nearby_udg_molecules
(
node_udg_molecules
,
neighbor_udg_molecules
):
color
=
'green'
else
:
color
=
'red'
data
[
'color'
]
=
color
#print("edge",node_udg_molecules,neighbor_udg_molecules,color)
# also, annotate nodes by their putative molecule found
for
n
,
data
in
d2_component
.
nodes
(
data
=
True
):
# Parse the current node name
head
,
c1
,
c2
=
parse_dg_name
(
n
)
node_udg_molecules
=
udg_molecules_dict
[
head
[
0
]]
data
[
'udg_molecule'
]
=
'_'
.
join
(
list
(
map
(
str
,
node_udg_molecules
)))
# aggressive: delete nodes which have either no found udg_molecule, or two udg_molecules
# turns out it's not a good strategy as the nodes with two udg_molecules are important to connect portions of graph
# but what if we magically keep those where the two adjacent molecules are close together
if
True
:
d2_component
=
d2_component
.
copy
()
nodes_to_remove
=
[]
for
n
,
data
in
d2_component
.
nodes
(
data
=
True
):
# Parse the current node name
head
,
c1
,
c2
=
parse_dg_name
(
n
)
if
"_"
in
data
[
'udg_molecule'
]
or
data
[
'udg_molecule'
]
==
''
:
if
"_"
in
data
[
'udg_molecule'
]:
m1
,
m2
=
list
(
map
(
int
,
data
[
'udg_molecule'
].
split
(
"_"
)))
if
abs
(
m2
-
m1
)
<
30
:
continue
# don't remove that kind of nodes
nodes_to_remove
+=
[
n
]
d2_component
.
remove_nodes_from
(
nodes_to_remove
)
print
(
"removed"
,
len
(
nodes_to_remove
),
"bad nodes"
)
# aggressive: delete red edges
if
True
:
d2_component
=
d2_component
.
copy
()
edges_to_remove
=
[]
for
n1
,
n2
,
data
in
d2_component
.
edges
(
data
=
True
):
if
data
[
'color'
]
==
'red'
:
edges_to_remove
+=
[(
n1
,
n2
)]
d2_component
.
remove_edges_from
(
edges_to_remove
)
print
(
"removed"
,
len
(
edges_to_remove
),
"bad edges"
)
return
d2_component
def
main
():
args
=
parse_args
()
...
...
@@ -399,6 +510,19 @@ def main():
covered_vars
=
compute_covered_variables
(
args
.
optimization_file
,
longest_path
)
print_d2_summary
(
components
,
longest_path
,
covered_vars
=
covered_vars
,
light_print
=
args
.
light_print
)
# added by Rayan
# example:
# python evaluate.py --type d2-2annotate ~/Dropbox/cnrs/projects/10x-barcodes/yoann_to_cedric_ilp/d2_graph.gexf
elif
args
.
type
==
"d2-2annotate"
:
components
=
list
(
nx
.
connected_components
(
graph
))
components
.
sort
(
key
=
lambda
x
:
-
len
(
x
))
component
=
graph
.
subgraph
(
components
[
0
])
component
=
verify_graph_edges
(
component
)
extension
=
args
.
filename
.
split
(
'.'
)[
-
1
]
base_filename
=
'.'
.
join
(
args
.
filename
.
split
(
'.'
)[:
-
1
])
save_graph
(
component
,
base_filename
+
".verified."
+
extension
)
if
__name__
==
"__main__"
:
main
()
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment