Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Yoann DUFRESNE
linked reads molecule ordering
Commits
c475c1d3
Commit
c475c1d3
authored
Mar 14, 2019
by
Yoann Dufresne
Browse files
evaluation
parent
398b47ac
Changes
1
Hide whitespace changes
Inline
Side-by-side
evaluate.py
0 → 100755
View file @
c475c1d3
#!/usr/bin/env python3
import
sys
import
argparse
from
termcolor
import
colored
import
networkx
as
nx
def
parse_args
():
parser
=
argparse
.
ArgumentParser
(
description
=
'Process some integers.'
)
parser
.
add_argument
(
'filename'
,
type
=
str
,
help
=
'The output file to evalute'
)
parser
.
add_argument
(
'--light-print'
,
'-l'
,
action
=
'store_true'
,
help
=
'Print only wrong nodes and paths'
)
args
=
parser
.
parse_args
()
return
args
def
load_graph
(
filename
):
if
filename
.
endswith
(
'.graphml'
):
return
nx
.
read_graphml
(
filename
)
elif
filename
.
endswith
(
'.gexf'
):
return
nx
.
read_gexf
(
filename
)
else
:
print
(
"Wrong file format. Require graphml or gefx format"
,
file
=
sys
.
stderr
)
exit
()
def
mols_from_node
(
node_name
):
return
[
int
(
idx
)
for
idx
in
node_name
.
split
(
":"
)[
1
].
split
(
"."
)[
0
].
split
(
"_"
)]
""" Compute appearance frequencies from node names.
All the node names must be under the format :
{idx}:{mol1_id}_{mol2_id}_...{molx_id}.other_things_here
@param graph The networkx graph representinf the deconvolved graph
@param only_wong If True, don't print correct nodes
@param file_pointer Where to print the output. If set to stdout, then pretty print. If set to None, don't print anything.
@return A tuple containing two dictionaries. The first one with theoritical frequencies of each node, the second one with observed frequencies.
"""
def
parse_graph_frequencies
(
graph
):
# Compute origin nodes formated as `{idx}:{mol1_id}_{mol2_id}_...`
observed_frequencies
=
{}
origin_node_names
=
[]
node_per_barcode
=
{}
for
node
in
graph
.
nodes
():
origin_name
=
node
.
split
(
"."
)[
0
]
if
not
origin_name
in
node_per_barcode
:
node_per_barcode
[
origin_name
]
=
[]
node_per_barcode
[
origin_name
].
append
(
node
)
# Count frequency
if
not
origin_name
in
observed_frequencies
:
observed_frequencies
[
origin_name
]
=
0
origin_node_names
.
append
(
origin_name
)
observed_frequencies
[
origin_name
]
+=
1
# Compute wanted frequencies
theoritical_frequencies
=
{}
for
node_name
in
origin_node_names
:
_
,
composition
=
node_name
.
split
(
':'
)
mol_ids
=
composition
.
split
(
'_'
)
# The node should be splited into the number of molecules inside itself
theoritical_frequencies
[
node_name
]
=
len
(
mol_ids
)
return
theoritical_frequencies
,
observed_frequencies
,
node_per_barcode
""" This function aims to look for direct molecule neighbors.
If a node has more than 2 direct neighbors, it's not rightly splitted
"""
def
parse_graph_path
(
graph
):
neighborhood
=
{}
for
node
in
graph
.
nodes
():
molecules
=
mols_from_node
(
node
)
neighbors
=
list
(
graph
.
neighbors
(
node
))
neighborhood
[
node
]
=
[]
for
mol
in
molecules
:
for
nei
in
neighbors
:
nei_mols
=
mols_from_node
(
nei
)
if
mol
-
1
in
nei_mols
:
neighborhood
[
node
].
append
(
nei
)
if
mol
+
1
in
nei_mols
:
neighborhood
[
node
].
append
(
nei
)
return
neighborhood
def
print_summary
(
frequencies
,
neighborhood
,
light_print
=
False
,
file_pointer
=
sys
.
stdout
):
if
file_pointer
==
None
:
return
print
(
"--- Nodes analysis ---"
,
file
=
file_pointer
)
theoritical_frequencies
,
observed_frequencies
,
node_per_barcode
=
frequencies
for
key
in
theoritical_frequencies
:
obs
,
the
=
observed_frequencies
[
key
],
theoritical_frequencies
[
key
]
result
=
f
"
{
key
}
:
{
obs
}
/
{
the
}
"
if
file_pointer
==
sys
.
stdout
:
result
=
colored
(
result
,
'green'
if
obs
==
the
else
'red'
)
# Compute neighborhood correctness
neighborhood_ok
=
True
for
node
in
node_per_barcode
[
key
]:
if
len
(
neighborhood
[
node
])
!=
2
:
neighborhood_ok
=
False
if
light_print
and
obs
==
the
and
neighborhood_ok
:
continue
print
(
result
,
file
=
file_pointer
)
for
node
in
node_per_barcode
[
key
]:
text
=
f
"
\t
{
node
}
\t
{
' '
.
join
(
neighborhood
[
node
])
}
"
if
file_pointer
==
sys
.
stdout
:
text
=
colored
(
text
,
'green'
if
len
(
neighborhood
[
node
])
==
2
else
'yellow'
)
print
(
text
,
file
=
file_pointer
)
print
(
"--- Global summary ---"
,
file
=
file_pointer
)
# --- Frequency usage ---
# Tags
distinct_theoritical_nodes
=
len
(
frequencies
[
0
])
distinct_observed_nodes
=
len
(
frequencies
[
1
])
print
(
f
"Distinct barcodes:
{
distinct_observed_nodes
}
/
{
distinct_theoritical_nodes
}
"
,
file
=
file_pointer
)
# molecules
cumulative_theoritical_nodes
=
sum
(
frequencies
[
0
].
values
())
cumulative_observed_nodes
=
sum
(
frequencies
[
1
].
values
())
print
(
f
"Molecules:
{
cumulative_observed_nodes
}
/
{
cumulative_theoritical_nodes
}
"
,
file
=
file_pointer
)
# Wrong splits
over_split
=
0
under_split
=
0
for
barcode
in
frequencies
[
0
]:
observed
=
frequencies
[
1
][
barcode
]
theoritic
=
frequencies
[
0
][
barcode
]
over_split
+=
max
(
observed
-
theoritic
,
0
)
under_split
+=
max
(
theoritic
-
observed
,
0
)
print
(
f
"Under/Over splitting:
{
under_split
}
-
{
over_split
}
"
)
def
main
():
args
=
parse_args
()
graph
=
load_graph
(
args
.
filename
)
frequencies
=
parse_graph_frequencies
(
graph
)
neighborhood
=
parse_graph_path
(
graph
)
print_summary
(
frequencies
,
neighborhood
,
light_print
=
args
.
light_print
)
if
__name__
==
"__main__"
:
main
()
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment