Commit 80b70c7a authored by Rayan Chikhi's avatar Rayan Chikhi
Browse files

another wave of bugfixes on the eval script

parent 31a82afc
......@@ -28,40 +28,45 @@ def load_graph(filename):
print("Wrong file format. Require graphml or gefx format", file=sys.stderr)
exit()
""" return a random path in G starting in u and having n nodes """
""" return a random path in G starting in u and having n _edges_ """
import random
def findRandomPath(G,u,n,previous_path_nodes=set()):
if n==0:
return [u]
path = [u]
poss_neigh = list(set(G.neighbors(u)) - previous_path_nodes)
if len(poss_neigh) == 0: return None
neighbor = random.choice(poss_neigh)
new_previous_path_nodes = previous_path_nodes | set([u])
path = findRandomPath(G,neighbor,n-1, new_previous_path_nodes)
if path is None: return None
if len(path) != n: return None
return [u]+path
import itertools
def is_there_path_acc(central_nodes,overlap_length):
for mols in itertools.product(*central_nodes):
""" determine, given an ordered list of central nodes, whether there exists a coherent paths of overlapping molecules in them """
def is_there_path_acc(mols_in_central_nodes,min_overlap_length=5000): # don't consider overlaps smaller than 5kbp
for mols in itertools.product(*mols_in_central_nodes):
#print(mols)
last_end = None
last_start = None
good_path = True
for mol in mols:
for mol in sorted(mols):
start,end = mol
if last_end is None:
last_end = end
else:
if start > last_end-overlap_length:
#print("bad path",mols)
good_path = False
break
if last_start is None:
last_start = start
if not (start >= last_start and start <= last_end-min_overlap_length):
#print("bad path",mols)
good_path = False
break
last_end = end
last_start = start
if good_path:
return True
return False
""" converts a central node of a d-graph into its list of molecules (given the ground truth) """
def central_node_to_molecules(nodestr):
# format for a 2-merge: 1:NC_000913.3_298281_313280_0:0:0_0:0:0_2fb/1_NC_000913.3_338611_353610_0:0:0_0:0:0_37b/1
cur_node_mols = []
......@@ -72,15 +77,17 @@ def central_node_to_molecules(nodestr):
cur_node_mols += [(start,end)]
return cur_node_mols
def is_coherent_path(central_nodes, overlap_length):
mols = []
def is_coherent_path(central_nodes,path_len):
mols_in_central_nodes = []
for node in central_nodes:
cur_node_mols = central_node_to_molecules(node)
mols += [cur_node_mols]
return is_there_path_acc(mols,overlap_length)
mols_in_central_nodes += [cur_node_mols]
assert(len(mols_in_central_nodes) == path_len+1)
return is_there_path_acc(mols_in_central_nodes)
""" the main function that tests for accuracy of random paths """
graph = None
def evaluate_accuracy_paths(path_len,overlap_length=7000,max_paths_per_node=100):
def evaluate_accuracy_paths(path_len,max_paths_per_node=100):
global graph
nb_bad_paths = 0
nb_good_paths = 0
......@@ -92,10 +99,12 @@ def evaluate_accuracy_paths(path_len,overlap_length=7000,max_paths_per_node=100)
if path is None: continue
if tuple(sorted(path)) in seen_paths: continue # avoids looking at the same path twice
seen_paths.add(tuple(sorted(path)))
assert(len(path) == path_len+1)
#print("path",path)
central_nodes = [graph.nodes[x]['udg'].split()[0] for x in path]
assert(len(central_nodes) == path_len+1)
#print(path,central_nodes)
if is_coherent_path(central_nodes,overlap_length):
if is_coherent_path(central_nodes,path_len):
nb_good_paths += 1
else:
nb_bad_paths += 1
......@@ -103,6 +112,8 @@ def evaluate_accuracy_paths(path_len,overlap_length=7000,max_paths_per_node=100)
# ---- sensitivity evaluation
""" given an ordered list of molecules, determine if the graph contains a path of central nodse which have these molecules.
it does that by testing all possible combinations of d-graphs having those molecules in their central nodes """
def is_there_path(graph,molecules_to_nodes,sought_path):
possible_central_nodes = []
for mol in sought_path:
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment