another wave of bugfixes on the eval script

80b70c7a · Rayan Chikhi · 31a82afc · 80b70c7a
Commit 80b70c7a authored 5 years ago by Rayan Chikhi
--- a/deconvolution/main/d2_path_evaluation.py
+++ b/deconvolution/main/d2_path_evaluation.py
@@ -28,40 +28,45 @@ def load_graph(filename):
        print("Wrong file format. Require graphml or gefx format", file=sys.stderr)
        exit()

-""" return a random path in G starting in u and having n nodes """
+""" return a random path in G starting in u and having n _edges_ """
 import random
 def findRandomPath(G,u,n,previous_path_nodes=set()):
    if n==0:
        return [u]
-    path = [u]
    poss_neigh = list(set(G.neighbors(u)) - previous_path_nodes)
    if len(poss_neigh) == 0: return None
    neighbor = random.choice(poss_neigh)
    new_previous_path_nodes = previous_path_nodes | set([u])
    path = findRandomPath(G,neighbor,n-1, new_previous_path_nodes)
    if path is None: return None
+    if len(path) != n: return None
    return [u]+path

 import itertools
-
-def is_there_path_acc(central_nodes,overlap_length):
-    for mols in itertools.product(*central_nodes):
+""" determine, given an ordered list of central nodes, whether there exists a coherent paths of overlapping molecules in them """
+def is_there_path_acc(mols_in_central_nodes,min_overlap_length=5000): # don't consider overlaps smaller than 5kbp
+    for mols in itertools.product(*mols_in_central_nodes):
        #print(mols)
        last_end = None
+        last_start = None
        good_path = True
-        for mol in mols:
+        for mol in sorted(mols):
            start,end = mol
            if last_end is None:
                last_end = end
-            else:
-                if start > last_end-overlap_length:
-                    #print("bad path",mols)
-                    good_path = False
-                    break
+            if last_start is None:
+                last_start = start
+            if not (start >= last_start and start <= last_end-min_overlap_length):
+                #print("bad path",mols)
+                good_path = False
+                break
+            last_end = end
+            last_start = start
        if good_path:
            return True
    return False

+""" converts a central node of a d-graph into its list of molecules (given the ground truth) """
 def central_node_to_molecules(nodestr):
    # format for a 2-merge: 1:NC_000913.3_298281_313280_0:0:0_0:0:0_2fb/1_NC_000913.3_338611_353610_0:0:0_0:0:0_37b/1
    cur_node_mols = []
@@ -72,15 +77,17 @@ def central_node_to_molecules(nodestr):
        cur_node_mols += [(start,end)]
    return cur_node_mols 

-def is_coherent_path(central_nodes, overlap_length):
-    mols =  []
+def is_coherent_path(central_nodes,path_len):
+    mols_in_central_nodes =  []
    for node in central_nodes:
        cur_node_mols = central_node_to_molecules(node)
-        mols += [cur_node_mols]
-    return is_there_path_acc(mols,overlap_length)
+        mols_in_central_nodes += [cur_node_mols]
+    assert(len(mols_in_central_nodes) == path_len+1)
+    return is_there_path_acc(mols_in_central_nodes)

+""" the main function that tests for accuracy of random paths """
 graph = None
-def evaluate_accuracy_paths(path_len,overlap_length=7000,max_paths_per_node=100):
+def evaluate_accuracy_paths(path_len,max_paths_per_node=100):
    global graph
    nb_bad_paths = 0
    nb_good_paths = 0
@@ -92,10 +99,12 @@ def evaluate_accuracy_paths(path_len,overlap_length=7000,max_paths_per_node=100)
            if path is None: continue
            if tuple(sorted(path)) in seen_paths: continue # avoids looking at the same path twice
            seen_paths.add(tuple(sorted(path)))
+            assert(len(path) == path_len+1)
            #print("path",path)
            central_nodes = [graph.nodes[x]['udg'].split()[0] for x in path]
+            assert(len(central_nodes) == path_len+1)
            #print(path,central_nodes)
-            if is_coherent_path(central_nodes,overlap_length):
+            if is_coherent_path(central_nodes,path_len):
                nb_good_paths += 1
            else:
                nb_bad_paths += 1
@@ -103,6 +112,8 @@ def evaluate_accuracy_paths(path_len,overlap_length=7000,max_paths_per_node=100)

 # ---- sensitivity evaluation

+""" given an ordered list of molecules, determine if the graph contains a path of central nodse which have these molecules.
+it does that by testing all possible combinations of d-graphs having those molecules in their central nodes """
 def is_there_path(graph,molecules_to_nodes,sought_path):
    possible_central_nodes = []
    for mol in sought_path: