Commit 86fce860 authored by Rayan Chikhi's avatar Rayan Chikhi

fix for accuracy of lcp-graph path detection algo

parent 01d8b380
......@@ -33,7 +33,7 @@ import random
def findRandomPath(G,u,n,previous_path_nodes=set()):
if n==0:
return [u]
poss_neigh = list(set(G.neighbors(u)) - previous_path_nodes)
poss_neigh = list(set(G.neighbors(u)) - previous_path_nodes - set([u]))
if len(poss_neigh) == 0: return None
neighbor = random.choice(poss_neigh)
new_previous_path_nodes = previous_path_nodes | set([u])
......@@ -42,29 +42,46 @@ def findRandomPath(G,u,n,previous_path_nodes=set()):
if len(path) != n: return None
return [u]+path
"""
given as input a list of molecules in central nodes,
return a stripped-down list that only includes molecules that overlap their neighbor
"""
def extract_likely_molecule_path_wrapper(path,starter_mol):
debug = False
res = []
direction = None
previous_mol = starter_mol
for i,mols in enumerate(path):
good_mol = None
for mol in mols:
if (abs(previous_mol[0] - mol[0])< 9000): # the max distance between two starting points, if min overlap is 6000 and mols at 15kbp of length
""" shouldn't enforce directionality. why? because a->b->c can sometimes be a->c->b and still be correct, because a,b,c are a clique """
#if i == 0:
#direction = "decreasing" if previous_mol[0] > mol[0] else "increasing"
good_mol = mol
"""
else:
if direction == "increasing":
if previous_mol[0] <= mol[0]:
good_mol = mol
else:
if mol[0] <= previous_mol[0]:
good_mol = mol
"""
if good_mol is None:
if debug: print("starting",starter_mol,"bad path",path,res)
return None
previous_mol = good_mol
res += [good_mol]
if debug: print("starting",starter_mol,"good path",path,res)
return res
import itertools
""" determine, given an ordered list of central nodes, whether there exists a coherent paths of overlapping molecules in them """
def is_there_path_acc(mols_in_central_nodes,min_overlap_length=5000): # don't consider overlaps smaller than 5kbp
for mols in itertools.product(*mols_in_central_nodes):
#print(mols)
last_end = None
last_start = None
good_path = True
for mol in sorted(mols):
start,end = mol
if last_end is None:
last_end = end
if last_start is None:
last_start = start
if not (start >= last_start and start <= last_end-min_overlap_length):
#print("bad path",mols)
good_path = False
break
last_end = end
last_start = start
if good_path:
return True
return False
def is_there_path_acc(path): # don't consider overlaps smaller than 5kbp
# try with all possible starting molecules
return any(extract_likely_molecule_path_wrapper(path[1:],mol) is not None for mol in path[0])
""" converts a central node of a d-graph into its list of molecules (given the ground truth) """
def central_node_to_molecules(nodestr):
......@@ -80,20 +97,22 @@ def central_node_to_molecules(nodestr):
def is_coherent_path(central_nodes,path_len):
mols_in_central_nodes = []
for node in central_nodes:
#print("central node",node)
cur_node_mols = central_node_to_molecules(node)
mols_in_central_nodes += [cur_node_mols]
assert(len(mols_in_central_nodes) == path_len+1)
return is_there_path_acc(mols_in_central_nodes)
#return True
def get_barcode(x):
if 'udg' in graph.nodes[x]:
return graph.nodes[x]['udg'].split()[0]
return graph.nodes[x]['udg'].split(']')[0] # here the format may change..
else:
return graph.nodes[x]['label']
""" the main function that tests for accuracy of random paths """
graph = None
def evaluate_accuracy_paths(path_len,max_paths_per_node=100):
def evaluate_accuracy_paths(path_len,max_paths_per_node=5):
global graph
nb_bad_paths = 0
nb_good_paths = 0
......@@ -114,7 +133,8 @@ def evaluate_accuracy_paths(path_len,max_paths_per_node=100):
nb_good_paths += 1
else:
nb_bad_paths += 1
print("accuracy for l=%d:" % path_len,nb_good_paths / (nb_good_paths + nb_bad_paths))
nb_paths = nb_bad_paths + nb_good_paths
print("accuracy for l=%d (%d paths, %d nodes explored):" % (path_len, nb_paths, nb_paths*path_len),nb_good_paths / (nb_good_paths + nb_bad_paths))
# ---- sensitivity evaluation
......@@ -160,9 +180,12 @@ def main():
args = parse_args()
graph = load_graph(args.filename)
p = Pool(4)
p.map(evaluate_accuracy_paths, [1,2,3,4])
p.map(evaluate_sensitivity_paths, [1,2,3,4])
p = Pool(12)
#p.map(evaluate_accuracy_paths, [1,2,4,6,8,10,15,20,50,100,200,500])
p.map(evaluate_accuracy_paths, [2,4,10,100])
p.join()
#p.map(evaluate_sensitivity_paths, [1,2,3,4])
#evaluate_accuracy_paths(100)
if __name__ == "__main__":
main()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment