Commit 17971a94 authored by Yoann Dufresne's avatar Yoann Dufresne
Browse files

evaluate longest path ok

parent 76239d13
......@@ -138,7 +138,7 @@ class D2Graph(object):
def to_nx_graph(self):
next_idx = 0
# next_idx = 0
nodes = {}
G = nx.Graph()
......@@ -146,8 +146,8 @@ class D2Graph(object):
for d_idx, dg in enumerate(self.index[dmer]):
# Create a node name
if not dg in nodes:
nodes[dg] = next_idx
next_idx += 1
nodes[dg] = str(dg)
# next_idx += 1
# Add the node
G.add_node(nodes[dg])
......
......@@ -166,7 +166,8 @@ class Dgraph(object):
def __repr__(self):
# print(self.halves)
representation = str(self.center) + " " + str(self.score) + "/" + str(self.get_optimal_score()) + " "
representation = "" if self.idx == -1 else f"{self.idx} "
representation += str(self.center) + " " + str(self.score) + "/" + str(self.get_optimal_score()) + " "
representation += "[" + ", ".join([f"{node} {self.connexity[0][node]}" for node in self.halves[0]]) + "]"
representation += "[" + ", ".join([f"{node} {self.connexity[1][node]}" for node in self.halves[1]]) + "]"
return representation
......@@ -174,10 +175,9 @@ class Dgraph(object):
""" From a barcode graph, compute all the possible max d-graphs by node.
@param graph A barcode graph
@param n_best Only keep n d-graphs (the nearest to 1.0 ratio)
@return A dictionary associating each node to its list of all possible d-graphs. The d-graphs are sorted by decreasing ratio.
"""
def compute_all_max_d_graphs(graph, n_best=100, debug=False):
def compute_all_max_d_graphs(graph, debug=False):
d_graphs = {}
for node in list(graph.nodes()):
......
......@@ -11,12 +11,13 @@ import networkx as nx
def parse_args():
parser = argparse.ArgumentParser(description='Process some integers.')
parser.add_argument('filename', type=str,
help='The output file to evalute')
help='The file to evalute')
parser.add_argument('--type', '-t', choices=["d2", "path"], default="path", required=True,
help="Define the data type to evaluate. Must be 'd2' or 'path'.")
parser.add_argument('--light-print', '-l', action='store_true',
help='Print only wrong nodes and paths')
args = parser.parse_args()
return args
......@@ -30,6 +31,8 @@ def load_graph(filename):
exit()
# ------------- Path Graph -------------
def mols_from_node(node_name):
return [int(idx) for idx in node_name.split(":")[1].split(".")[0].split("_")]
......@@ -42,7 +45,7 @@ def mols_from_node(node_name):
@param file_pointer Where to print the output. If set to stdout, then pretty print. If set to None, don't print anything.
@return A tuple containing two dictionaries. The first one with theoritical frequencies of each node, the second one with observed frequencies.
"""
def parse_graph_frequencies(graph):
def parse_path_graph_frequencies(graph):
# Compute origin nodes formated as `{idx}:{mol1_id}_{mol2_id}_...`
observed_frequencies = {}
origin_node_names = []
......@@ -94,7 +97,7 @@ def parse_graph_path(graph):
return neighborhood
def print_summary(frequencies, neighborhood, light_print=False, file_pointer=sys.stdout):
def print_path_summary(frequencies, neighborhood, light_print=False, file_pointer=sys.stdout):
if file_pointer == None:
return
......@@ -148,13 +151,171 @@ def print_summary(frequencies, neighborhood, light_print=False, file_pointer=sys
print(f"Under/Over splitting: {under_split} - {over_split}")
# ------------- D2 Graph -------------
def parse_dg_name(name):
name = name.replace("]", "").replace(' [', '[')
header, h1, h2 = name.split('[')
# Parse header
header = header.split(" ")
idx = central = score = -1
if len(header) == 3:
idx, central, score = header
else:
central, score = header
# Parse hands
h1 = h1.split(', ')
h2 = h2.split(', ')
return (idx, central, score), h1, h2
def print_d2_summary(connected_components, light_print=False):
print("--- Global summary ---")
print(f"Number of connected components: {len(connected_components)}")
print(f"Total number of nodes: {sum([len(x) for x in connected_components])}")
print(f"The 5 largest components: {[len(x) for x in connected_components][:5]}")
print("--- Largest component analysis ---")
# def component_to_nearest_neighbor_graph(component):
# nng = nx.Graph()
# nng.add_nodes_from(component.nodes())
# for edge in component.edges():
# node1, node2 = edge
# node1 = parse_dg_name(node1)
# node2 = parse_dg_name(node2)
# central1 = mols_from_node(node1[0][1])
# central2 = frozenset(mols_from_node(node2[0][1]))
# for mol1 in central1:
# if mol1-1 in central2 or mol1+1 in central2:
# nng.add_edge(edge[0], edge[1])
# componnents = list(nx.connected_components(nng))
# print([len(x) for x in componnents])
# componnents.sort(key=lambda x: -len(x))
# componnents = [nng.subgraph(x) for x in componnents]
# nx.write_gexf(componnents[0], "data/d2_reducted.gexf")
# return nng, componnents
def compute_next_nodes(d2_component):
next_nodes = {}
for node in d2_component.nodes():
# Parse the current node name
head, h1, h2 = parse_dg_name(node)
next_nodes[node] = {}
neighbors = list(d2_component.neighbors(node))
# Get the current molecule idxs
molecule_idxs = mols_from_node(head[1])
for mol_idx in molecule_idxs:
nexts = []
for neighbor in neighbors:
nei_head, _, _ = parse_dg_name(neighbor)
nei_mols = mols_from_node(nei_head[1])
nei_mols = [x for x in nei_mols if x > mol_idx]
# If there are molecule next
if len(nei_mols) > 0:
next_nei_mol = min(nei_mols)
nexts.append((next_nei_mol, neighbor))
nexts.sort(key=lambda x: x[0])
next_nodes[node][mol_idx] = nexts
# print(next_nodes)
return next_nodes
def compute_longest_increasing_paths(d2_component):
next_nodes = compute_next_nodes(d2_component)
# Compute the longest path for each node
longest_paths = {}
for idx, start_node in enumerate(next_nodes):
# print(f"{idx}/{len(next_nodes)}")
for mol_idx in next_nodes[start_node]:
recursive_longest_path(start_node, mol_idx , next_nodes, longest_paths)
# Get the longest path size
max_len, node_val, mol_idx = 0, None, -1
for node in longest_paths:
for mol in longest_paths[node]:
length, _, _ = longest_paths[node][mol]
if max_len < length:
max_len = length
node_val = node
mol_idx = mol
# Backtrack the longest path
path = backtrack_longest_path(node_val, mol_idx, longest_paths)
return path
def backtrack_longest_path(node, molecule, longest_paths, path=[]):
if node == None:
return path
path.append(node)
print(node, molecule)
length, next_node, next_mol = longest_paths[node][molecule]
return backtrack_longest_path(next_node, next_mol, longest_paths, path)
def recursive_longest_path(current_node, current_molecule, next_nodes, longest_paths):
# Dynamic programming
if current_node in longest_paths and current_molecule in longest_paths[current_node]:
return longest_paths[current_node][current_molecule]
longest = 0
longest_next = None
min_mol_idx = current_molecule + 10000
# Recursively compute the longest path
for mol_idx, node in next_nodes[current_node][current_molecule]:
size, _, _ = recursive_longest_path(node, mol_idx, next_nodes, longest_paths)
if size + 1 > longest:
longest = size + 1
longest_next = node
min_mol_idx = mol_idx
# If there is an alternative path with shorter distance
elif size + 1 == longest and mol_idx < min_mol_idx:
longest = size + 1
longest_next = node
min_mol_idx = mol_idx
# Save the result
if not current_node in longest_paths:
longest_paths[current_node] = {}
longest_paths[current_node][current_molecule] = (longest, longest_next, min_mol_idx)
return longest_paths[current_node][current_molecule]
def main():
args = parse_args()
graph = load_graph(args.filename)
frequencies = parse_graph_frequencies(graph)
neighborhood = parse_graph_path(graph)
print_summary(frequencies, neighborhood, light_print=args.light_print)
if args.type == "path":
frequencies = parse_path_graph_frequencies(graph)
neighborhood = parse_graph_path(graph)
print_path_summary(frequencies, neighborhood, light_print=args.light_print)
elif args.type == "d2":
components = list(nx.connected_components(graph))
components.sort(key=lambda x: -len(x))
component = graph.subgraph(components[0])
longest_path = compute_longest_increasing_paths(component)
print_d2_summary(components, longest_path, light_print=args.light_print)
if __name__ == "__main__":
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment