diff --git a/deconvolution/main/d2_path_evaluation.py b/deconvolution/main/d2_path_evaluation.py index 5c8299e76f4dcbf8e62b73f573c6ce24ab19e9ef..e3a4a943e85c4e40f0af10a8c462fed26898b59d 100755 --- a/deconvolution/main/d2_path_evaluation.py +++ b/deconvolution/main/d2_path_evaluation.py @@ -28,40 +28,45 @@ def load_graph(filename): print("Wrong file format. Require graphml or gefx format", file=sys.stderr) exit() -""" return a random path in G starting in u and having n nodes """ +""" return a random path in G starting in u and having n _edges_ """ import random def findRandomPath(G,u,n,previous_path_nodes=set()): if n==0: return [u] - path = [u] poss_neigh = list(set(G.neighbors(u)) - previous_path_nodes) if len(poss_neigh) == 0: return None neighbor = random.choice(poss_neigh) new_previous_path_nodes = previous_path_nodes | set([u]) path = findRandomPath(G,neighbor,n-1, new_previous_path_nodes) if path is None: return None + if len(path) != n: return None return [u]+path import itertools - -def is_there_path_acc(central_nodes,overlap_length): - for mols in itertools.product(*central_nodes): +""" determine, given an ordered list of central nodes, whether there exists a coherent paths of overlapping molecules in them """ +def is_there_path_acc(mols_in_central_nodes,min_overlap_length=5000): # don't consider overlaps smaller than 5kbp + for mols in itertools.product(*mols_in_central_nodes): #print(mols) last_end = None + last_start = None good_path = True - for mol in mols: + for mol in sorted(mols): start,end = mol if last_end is None: last_end = end - else: - if start > last_end-overlap_length: - #print("bad path",mols) - good_path = False - break + if last_start is None: + last_start = start + if not (start >= last_start and start <= last_end-min_overlap_length): + #print("bad path",mols) + good_path = False + break + last_end = end + last_start = start if good_path: return True return False +""" converts a central node of a d-graph into its list of molecules (given the ground truth) """ def central_node_to_molecules(nodestr): # format for a 2-merge: 1:NC_000913.3_298281_313280_0:0:0_0:0:0_2fb/1_NC_000913.3_338611_353610_0:0:0_0:0:0_37b/1 cur_node_mols = [] @@ -72,15 +77,17 @@ def central_node_to_molecules(nodestr): cur_node_mols += [(start,end)] return cur_node_mols -def is_coherent_path(central_nodes, overlap_length): - mols = [] +def is_coherent_path(central_nodes,path_len): + mols_in_central_nodes = [] for node in central_nodes: cur_node_mols = central_node_to_molecules(node) - mols += [cur_node_mols] - return is_there_path_acc(mols,overlap_length) + mols_in_central_nodes += [cur_node_mols] + assert(len(mols_in_central_nodes) == path_len+1) + return is_there_path_acc(mols_in_central_nodes) +""" the main function that tests for accuracy of random paths """ graph = None -def evaluate_accuracy_paths(path_len,overlap_length=7000,max_paths_per_node=100): +def evaluate_accuracy_paths(path_len,max_paths_per_node=100): global graph nb_bad_paths = 0 nb_good_paths = 0 @@ -92,10 +99,12 @@ def evaluate_accuracy_paths(path_len,overlap_length=7000,max_paths_per_node=100) if path is None: continue if tuple(sorted(path)) in seen_paths: continue # avoids looking at the same path twice seen_paths.add(tuple(sorted(path))) + assert(len(path) == path_len+1) #print("path",path) central_nodes = [graph.nodes[x]['udg'].split()[0] for x in path] + assert(len(central_nodes) == path_len+1) #print(path,central_nodes) - if is_coherent_path(central_nodes,overlap_length): + if is_coherent_path(central_nodes,path_len): nb_good_paths += 1 else: nb_bad_paths += 1 @@ -103,6 +112,8 @@ def evaluate_accuracy_paths(path_len,overlap_length=7000,max_paths_per_node=100) # ---- sensitivity evaluation +""" given an ordered list of molecules, determine if the graph contains a path of central nodse which have these molecules. +it does that by testing all possible combinations of d-graphs having those molecules in their central nodes """ def is_there_path(graph,molecules_to_nodes,sought_path): possible_central_nodes = [] for mol in sought_path: