Commit 01f9f006 authored by Yoann Dufresne's avatar Yoann Dufresne

longest path evaluation

parent c1d36427
......@@ -303,7 +303,7 @@ def compute_next_nodes(d2_component, max_jumps=0):
nexts.sort(key=lambda x: x[0])
next_nodes[node][mol_idx] = nexts
#print("next nodes of node",node,"mol idx",mol_idx,":",next_nodes)
# print("next nodes of node",node,"mol idx",mol_idx,":",next_nodes)
return next_nodes
......@@ -318,10 +318,6 @@ def compute_longest_increasing_paths(d2_component, max_gap=0):
for mol_idx in next_nodes[start_node]:
recursive_longest_path(start_node, mol_idx, next_nodes, longest_paths)
test_node = '5339'
for mol in longest_paths[test_node]:
print("investigating node",test_node,"mol",mol,longest_paths[test_node][mol])
# Get the longest path size,
# across all barcode graph nodes and all molecules in these barcodes
max_len, node_val, mol_idx = 0, None, -1
......@@ -338,6 +334,109 @@ def compute_longest_increasing_paths(d2_component, max_gap=0):
return path
def compute_shortest_edit_path(path):
min_mol = float("inf")
max_mol = 0
molecules = {}
path_extremes = []
node_names = {}
# Parse molecules
for node_name, data in path.nodes(data=True):
udg = data["udg"]
central_node = udg.split(']')[0][1:]
node_names[node_name] = central_node
if len(list(path[node_name])) == 1:
path_extremes.append(node_name)
mol_names = central_node.split(":")[1].split('_')
for mol_name in mol_names:
mol_idx = int(mol_name)
if mol_idx < min_mol: min_mol = mol_idx
if mol_idx >= max_mol: max_mol = mol_idx
molecules[mol_idx] = central_node
# create barcode path from molecules
molecule_order = []
for idx in range(min_mol, max_mol+1):
if idx in molecules:
molecule_order.append(molecules[idx])
# create barcode path from d2_path
first = path_extremes[0]
last = path_extremes[1]
d2_path_order = [first, list(path[first])[0]]
while d2_path_order[-1] != last:
neighbors = list(path[d2_path_order[-1]])
if neighbors[0] == d2_path_order[-2]:
d2_path_order.append(neighbors[1])
else:
d2_path_order.append(neighbors[0])
d2_path_order = [node_names[x] for x in d2_path_order]
edit_path = edit_distance(molecule_order, d2_path_order)
filtered_edit_path = [x[0] for x in edit_path if x[0] == x[1]]
reverse_edit_path = edit_distance(molecule_order, d2_path_order[::-1])
reverse_filtered_edit_path = [x[0] for x in edit_path if x[0] == x[1]]
if len(filtered_edit_path) > len(reverse_filtered_edit_path):
return filtered_edit_path
else:
return reverse_filtered_edit_path
def edit_distance(array_vertical, array_horizontal):
dynamic = [[float("inf") for column in range(len(array_horizontal)+1)] for row in range(len(array_vertical)+1)]
# Fill init line and column
for i in range(len(array_horizontal)+1):
dynamic[0][i] = i
for i in range(len(array_vertical)+1):
dynamic[i][0] = i
# Fill the array
for row in range(1, len(array_vertical)+1):
for column in range(1, len(array_horizontal)+1):
if array_vertical[row-1] == array_horizontal[column-1]:
dynamic[row][column] = min(dynamic[row-1][column-1], dynamic[row-1][column]+1, dynamic[row][column-1]+1)
else:
dynamic[row][column] = min(dynamic[row-1][column-1]+1, dynamic[row-1][column]+1, dynamic[row][column-1]+1)
# Compute alignment
row = len(array_vertical)
column = len(array_horizontal)
path = [(array_vertical[row-1], array_horizontal[column-1])]
while row != 0 and column != 0:
score = dynamic[row][column]
# Match
if array_vertical[row-1] == array_horizontal[column-1] and dynamic[row-1][column-1] == dynamic[row][column]:
row -= 1
column -= 1
elif array_vertical[row-1] != array_horizontal[column-1] and dynamic[row-1][column-1] == dynamic[row][column] - 1:
row -= 1
column -= 1
elif dynamic[row-1][column] == dynamic[row][column] - 1:
row -= 1
elif dynamic[row][column-1] == dynamic[row][column] - 1:
column -= 1
else:
print("Huge problem in edit distance !", file=sys.stderr)
return []
path.append((array_vertical[row-1], array_horizontal[column-1]))
for finalize_row in range(row-1, -1, -1):
path.append((array_vertical[finalize_row], array_horizontal[column]))
for finalize_column in range(column-1, -1, -1):
path.append((array_vertical[row], array_horizontal[finalize_column]))
return path[::-1]
def backtrack_longest_path(node, molecule, longest_paths, path=[]):
if node is None:
return path
......@@ -531,22 +630,28 @@ def main():
graph = load_graph(args.filename)
if args.type == "path":
if args.barcode_graph is None:
print("--barcode_graph is required for path analysis", file=sys.stderr)
exit(0)
barcode_graph = load_graph(args.barcode_graph)
if len(list(nx.connected_components(graph))) != 1:
exit("when running evaluate.py --type path, the graph should have a single connected component (it's supposed to be a path, after all)")
# if len(list(nx.connected_components(graph))) != 1:
# print([len(x) for x in list(nx.connected_components(graph))])
# exit("when running evaluate.py --type path, the graph should have a single connected component (it's supposed to be a path, after all)")
# compute LIS over the path
longest_path = compute_longest_increasing_paths(graph)
# longest_path = compute_longest_increasing_paths(graph)
longest_path = compute_shortest_edit_path(graph)
print("--- Largest component analysis ---")
print(f"Size of the longest path: {len(longest_path)}")
#print("Jumps in central nodes:") # what does this do?
#print(path_to_jumps(longest_path))
return
if not args.light_print:
print(longest_path)
# get over/under counted molecules
print("--- Under/over molecule counts ---")
frequencies = parse_path_graph_frequencies(graph, barcode_graph)
print_path_summary(frequencies, light_print=args.light_print)
print(f"Size of the longest path: {len(longest_path)}")
elif args.type == "dgraphs":
udg_per_node = parse_udg_qualities(graph)
# print(udg_per_node)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment