#!/usr/bin/env python3 import sys import argparse from termcolor import colored import networkx as nx sys.setrecursionlimit(10000) def parse_args(): parser = argparse.ArgumentParser(description='Process a d2 graph (complete graph or path) to evaluate its quality.') parser.add_argument('filename', type=str, help='The file to evalute') parser.add_argument('--type', '-t', choices=["d2", "path", "d2-2annotate", "dgraphs"], default="path", required=True, help="Define the data type to evaluate. Must be 'd2' or 'path' or 'd2-2annotate' (Rayan's hack).") parser.add_argument('--light-print', '-l', action='store_true', help='Print only wrong nodes and paths') parser.add_argument('--max_gap', '-g', type=int, default=0, help="Allow to jump over max_gap nodes during the increasing path search") parser.add_argument('--barcode_graph', '-b', help="Path to the barcode graph corresponding to the d2_graph to analyse.") parser.add_argument('--optimization_file', '-o', help="If the main file is a d2, a file formatted for optimization can be set. This file will be used to compute the coverage of the longest path on the barcode graph.") args = parser.parse_args() return args def transform_bg(graph): idx = 0 node_names = {} nx.set_node_attributes(graph, 0, 'score') nx.set_node_attributes(graph, "", 'barcode_edges') for node in graph.nodes(): graph.nodes[node]['udg'] = f"[{node}][][]" node_names[node] = str(idx) idx += 1 graph = nx.relabel_nodes(graph, node_names) return graph # ------------- Path Graph ------------- def mols_from_node(node_name): return [int(idx) for idx in node_name.split(":")[1].split(".")[0].split("_")] def parse_udg_qualities(graph): """ Compute the quality for the best udgs present in the graph. All the node names must be under the format : {idx}:{mol1_id}_{mol2_id}_...{molx_id}.other_things_here :param graph: The networkx graph representing the deconvolved graph :return: A tuple containing two dictionaries. The first one with theoretical frequencies of each node, the second one with observed frequencies. """ dg_per_node = {} for node, data in graph.nodes(data=True): str_udg = data["udg"] central, h1, h2 = str_to_udg_lists(str_udg) if central not in dg_per_node: dg_per_node[central] = [] dg_per_node[central].append(data["udg"]) for node in dg_per_node: print(node, dg_per_node[node]) print(len(dg_per_node)) return dg_per_node def parse_path_graph_frequencies(graph, barcode_graph): """ Compute appearance frequencies from node names. All the node names must be under the format : {idx}:{mol1_id}_{mol2_id}_...{molx_id}.other_things_here :param graph: The networkx graph representing the deconvolved graph :param barcode_graph: The barcode graph :return: A tuple containing two dictionaries. The first one with theoretical frequencies of each node, the second one with observed frequencies. """ # Compute origin nodes formatted as `{idx}:{mol1_id}_{mol2_id}_...` observed_frequencies = {} real_frequencies = {} origin_node_names = [] node_per_barcode = {} for node, data in graph.nodes(data=True): parsed = parse_dg_name(graph, node) origin_name = parsed[0][1] if origin_name not in node_per_barcode: node_per_barcode[origin_name] = [] node_per_barcode[origin_name].append(node) # Count frequency if origin_name not in observed_frequencies: observed_frequencies[origin_name] = 0 origin_node_names.append(origin_name) observed_frequencies[origin_name] += 1 # Theoretical frequencies real_frequencies = {node_id: len(node_id.split(":")[1].split("_")) for node_id in barcode_graph.nodes()} return real_frequencies, observed_frequencies, node_per_barcode def parse_graph_path(graph): """ This function aims to look for direct molecule neighbors. If a node has more than 2 direct neighbors, it's not rightly split """ neighborhood = {} for node in graph.nodes(): molecules = mols_from_node(node) neighbors = list(graph.neighbors(node)) neighborhood[node] = [] for mol in molecules: for nei in neighbors: nei_mols = mols_from_node(nei) if mol-1 in nei_mols: neighborhood[node].append(nei) if mol+1 in nei_mols: neighborhood[node].append(nei) return neighborhood def print_path_summary(frequencies, light_print=False, file_pointer=sys.stdout): if file_pointer is None: return print("--- Nodes analysis ---", file=file_pointer) theoretical_frequencies, observed_frequencies, node_per_barcode = frequencies for key in theoretical_frequencies: obs, the = observed_frequencies[key] if key in observed_frequencies else 0, theoretical_frequencies[key] result = f"{key}: {obs}/{the}" if file_pointer == sys.stdout: result = colored(result, 'green' if obs==the else 'red') if light_print and obs == the: continue print(result, file=file_pointer) print("--- Global summary ---", file=file_pointer) # --- Frequency usage --- # Tags distinct_theoretical_nodes = len(theoretical_frequencies) distinct_observed_nodes = len(observed_frequencies) print(f"Distinct barcodes: {distinct_observed_nodes}/{distinct_theoretical_nodes}", file=file_pointer) # molecules cumulative_theoretical_nodes = sum(theoretical_frequencies.values()) cumulative_observed_nodes = sum(observed_frequencies.values()) print(f"Molecules: {cumulative_observed_nodes}/{cumulative_theoretical_nodes}", file=file_pointer) # Wrong splits over_split = 0 under_split = 0 for barcode in theoretical_frequencies: observed = observed_frequencies[barcode] if barcode in observed_frequencies else 0 theoretic = theoretical_frequencies[barcode] over_split += max(observed-theoretic, 0) under_split += max(theoretic-observed, 0) print(f"Under/Over splitting: {under_split} - {over_split}") def print_dgraphs_summary(frequencies, light_print=False): pass # ------------- D2 Graph ------------- def str_to_udg_lists(s): udg = s.replace("]", "").replace(' [', '[') return udg.split('[')[1:] # speeds up networkx access to attributes cached_udg_attr = None cached_score_attr = None def parse_dg_name(gr, name): global cached_udg_attr, cached_score_attr if cached_udg_attr is None: cached_udg_attr = nx.get_node_attributes(gr, 'udg') udg = cached_udg_attr[name] res = str_to_udg_lists(udg) if len(res) != 3: print("parsing problem:",res) central, h1, h2 = res idx = name if cached_score_attr is None: cached_score_attr = nx.get_node_attributes(gr, 'score') score = cached_score_attr[name] # Parse hands h1 = h1.split(', ') h2 = h2.split(', ') return (idx, central, score), h1, h2 def path_to_jumps(path): chuncks = [] prev_start = -1000 current_molecule = -1000 for mol, node in path: # If there is a gap if mol > current_molecule + 1: chuncks.append((prev_start, current_molecule)) prev_start = mol current_molecule = mol # Add the last piece chuncks.append((prev_start, current_molecule)) del chuncks[0] return chuncks def print_d2_summary(connected_components, longest_path, coverage_vars=(0, 0), light_print=False): print("--- Global summary ---") print(f"Number of connected components: {len(connected_components)}") print(f"Total number of nodes: {sum([len(x) for x in connected_components])}") no_singleton = [x for x in connected_components if len(x) > 1] print(f"There are {len(no_singleton)} connex components with at least 2 nodes") print(f"The 5 largest components: {[len(x) for x in connected_components][:5]}") print("--- Largest component analysis ---") # Get the list of node idx path_dg_idx = [int(x[1].split(" ")[0]) for x in longest_path] # print("\n".join(longest_path)) if not light_print: print("Longest path for increasing molecule number:") print(path_dg_idx) print(f"Size of the longest path: {len(longest_path)}") if not light_print: print("Jumps in central nodes:") print(path_to_jumps(longest_path)) print(f"Number of usable coverage variables: {len(coverage_vars[1])}") print(f"Coverage: {len(coverage_vars[0])}/{len(coverage_vars[1])}") if not light_print: print(f"Missing coverage variables:\n{coverage_vars[1]-coverage_vars[0]}") def _get_distant_neighbors(graph, node, dist): neighbors = set() to_compute = [node] for _ in range(dist): next_compute = [] for node in to_compute: for neighbor in graph[node]: if neighbor not in neighbors: neighbors.add(neighbor) next_compute.append(neighbor) to_compute = next_compute return neighbors def compute_next_nodes(d2_component, max_jumps=0): # First parse dg names dg_names = {} for node in d2_component.nodes(): dg_names[node] = parse_dg_name(d2_component,node) next_nodes = {} for node in d2_component.nodes(): # Parse the current node name head, h1, h2 = dg_names[node] next_nodes[node] = {} # Get the current molecule idxs molecule_idxs = mols_from_node(head[1]) #print("node",node,"dg name",dg_names[node],"mol idxs",molecule_idxs) for mol_idx in molecule_idxs: nexts = [] # for neighbor in d2_component[node]: for neighbor in _get_distant_neighbors(d2_component, node, max_jumps+1): # nei_head: central node of the neighbor of 'node' nei_head, _, _ = dg_names[neighbor] nei_mols = mols_from_node(nei_head[1]) # only consider neighbor molecules that are strictly bigger than the current molecule idx considered (from 'node') nei_mols = [x for x in nei_mols if x > mol_idx] # If there are molecule next if len(nei_mols) > 0: next_nei_mol = min(nei_mols) # append to the neighbors of (node,mol_idx) that 'neighbor' if it contains a molecule that's bigger than mol_idx nexts.append((next_nei_mol, neighbor)) nexts.sort(key=lambda x: x[0]) next_nodes[node][mol_idx] = nexts # print("next nodes of node",node,"mol idx",mol_idx,":",next_nodes) return next_nodes def compute_longest_increasing_paths(d2_component, max_gap=0): next_nodes = compute_next_nodes(d2_component, max_jumps=max_gap) sys.setrecursionlimit(len(d2_component.nodes)*2) # Compute the longest path for each node longest_paths = {} for idx, start_node in enumerate(next_nodes): # print(f"{idx}/{len(next_nodes)}") for mol_idx in next_nodes[start_node]: recursive_longest_path(start_node, mol_idx, next_nodes, longest_paths) # Get the longest path size, # across all barcode graph nodes and all molecules in these barcodes max_len, node_val, mol_idx = 0, None, -1 for node in longest_paths: for mol in longest_paths[node]: length, _, _ = longest_paths[node][mol] if max_len < length: max_len = length node_val = node mol_idx = mol # Backtrack the longest path path = backtrack_longest_path(node_val, mol_idx, longest_paths) return path def compute_shortest_edit_path(path): min_mol = float("inf") max_mol = 0 molecules = {} path_extremes = [] node_names = {} # Parse molecules for node_name, data in path.nodes(data=True): udg = data["udg"] central_node = udg.split(']')[0][1:] node_names[node_name] = central_node if len(list(path[node_name])) == 1: path_extremes.append(node_name) mol_names = central_node.split(":")[1].split('_') for mol_name in mol_names: mol_idx = int(mol_name) if mol_idx < min_mol: min_mol = mol_idx if mol_idx >= max_mol: max_mol = mol_idx molecules[mol_idx] = central_node # create barcode path from molecules molecule_order = [] for idx in range(min_mol, max_mol+1): if idx in molecules: molecule_order.append(molecules[idx]) # create barcode path from d2_path first = path_extremes[0] last = path_extremes[1] d2_path_order = [first, list(path[first])[0]] while d2_path_order[-1] != last: neighbors = list(path[d2_path_order[-1]]) if neighbors[0] == d2_path_order[-2]: d2_path_order.append(neighbors[1]) else: d2_path_order.append(neighbors[0]) d2_path_order = [node_names[x] for x in d2_path_order] edit_path = edit_distance(molecule_order, d2_path_order) filtered_edit_path = [x[0] for x in edit_path if x[0] == x[1]] reverse_edit_path = edit_distance(molecule_order, d2_path_order[::-1]) reverse_filtered_edit_path = [x[0] for x in reverse_edit_path if x[0] == x[1]] if len(filtered_edit_path) > len(reverse_filtered_edit_path): return filtered_edit_path else: return reverse_filtered_edit_path def edit_distance(array_vertical, array_horizontal): dynamic = [[float("inf") for column in range(len(array_horizontal)+1)] for row in range(len(array_vertical)+1)] # Fill init line and column for i in range(len(array_horizontal)+1): dynamic[0][i] = i for i in range(len(array_vertical)+1): dynamic[i][0] = i # Fill the array for row in range(1, len(array_vertical)+1): for column in range(1, len(array_horizontal)+1): if array_vertical[row-1] == array_horizontal[column-1]: dynamic[row][column] = min(dynamic[row-1][column-1], dynamic[row-1][column]+1, dynamic[row][column-1]+1) else: dynamic[row][column] = min(dynamic[row-1][column-1]+1, dynamic[row-1][column]+1, dynamic[row][column-1]+1) # Compute alignment row = len(array_vertical) column = len(array_horizontal) path = [(array_vertical[row-1], array_horizontal[column-1])] while row != 0 and column != 0: score = dynamic[row][column] # Match if array_vertical[row-1] == array_horizontal[column-1] and dynamic[row-1][column-1] == dynamic[row][column]: row -= 1 column -= 1 elif array_vertical[row-1] != array_horizontal[column-1] and dynamic[row-1][column-1] == dynamic[row][column] - 1: row -= 1 column -= 1 elif dynamic[row-1][column] == dynamic[row][column] - 1: row -= 1 elif dynamic[row][column-1] == dynamic[row][column] - 1: column -= 1 else: print("Huge problem in edit distance !", file=sys.stderr) return [] path.append((array_vertical[row-1], array_horizontal[column-1])) for finalize_row in range(row-1, -1, -1): path.append((array_vertical[finalize_row], array_horizontal[column])) for finalize_column in range(column-1, -1, -1): path.append((array_vertical[row], array_horizontal[finalize_column])) return path[::-1] def backtrack_longest_path(node, molecule, longest_paths, path=[]): if node is None: return path path.append((molecule, node)) length, next_node, next_mol = longest_paths[node][molecule] return backtrack_longest_path(next_node, next_mol, longest_paths, path) def recursive_longest_path(current_node, current_molecule, next_nodes, longest_paths): # Dynamic programming if current_node in longest_paths and current_molecule in longest_paths[current_node]: #print("getting cached result for node",current_node,"mol",current_molecule,longest_paths[current_node][current_molecule]) return longest_paths[current_node][current_molecule] longest = 0 longest_next = None min_mol_idx = float('inf') # Recursively compute the longest path for mol_idx, node in next_nodes[current_node][current_molecule]: size, _, _ = recursive_longest_path(node, mol_idx, next_nodes, longest_paths) if size + 1 > longest: longest = size + 1 longest_next = node min_mol_idx = mol_idx # If there is an alternative path with shorter distance elif size + 1 == longest and mol_idx < min_mol_idx: longest = size + 1 longest_next = node min_mol_idx = mol_idx # Save the result if not current_node in longest_paths: longest_paths[current_node] = {} longest_paths[current_node][current_molecule] = (longest, longest_next, min_mol_idx) return longest_paths[current_node][current_molecule] def compute_covered_variables(graph, path): path_nodes = set() for mol, node_name in path: path_nodes.add(node_name) used_vars = set() total_vars = set() for node, data in graph.nodes(data=True): vars = data["barcode_edges"].split(" ") total_vars.update(vars) if node in path_nodes: used_vars.update(vars) return used_vars, total_vars # returns True iff there exist x in mol1 such that there exists y in mol2 and |x-y| <= some_value def nearby_udg_molecules(mols1, mols2): for x in mols1: for y in mols2: if abs(x-y) <= 5: return True return False def verify_graph_edges(d2_component): udg_molecules_dict=dict() for node in d2_component.nodes(): # Parse the current node name head, c1, c2 = parse_dg_name(d2_component,node) # Construct the molecule(s) that this udg really 'reflects' # i.e. the udg has a central node and two cliques # that central node is the result of merging of several molecules # ideally, only one of those molecules is connected to the molecules of the cliques # (there could be more than one though; in that case the udg is 'ambiguous') # udg_molecules aims to reflect the molecule(s) underlying this udg udg_molecules = set() # Get the current molecule idxs of central node molecule_idxs = mols_from_node(head[1]) # print("mol idxs", molecule_idxs) # Examine molecule idx's of cliques to see which are close to the central node # rationale: c1/c2 contain nearby molecule id's for mol_idx in molecule_idxs: nexts = [] for c in [c1,c2]: for c_node in c: nei_mols = mols_from_node(c_node.split()[0]) nei_mols = [x for x in nei_mols if x > mol_idx] # fixme: also look at the <= molecules for more robustness # If there are molecule next if len(nei_mols) > 0: next_nei_mol = min(nei_mols) nexts.append((next_nei_mol)) nexts.sort(key=lambda x: x) quality = sum([1.0/x if mol_idx+x in nexts else 0 for x in range(1,6)]) / sum([1.0/x for x in range(1,6)]) if quality > 0.6: # eyeballed but still arbitrary udg_molecules.add(mol_idx) # print("mol",mol_idx,molecule_idxs,"quality",quality,"nexts",nexts) udg_molecules_dict[head[0]]=udg_molecules # Then: annotate edges as to whether they're real (their udg_molecule(s) are nearby) or fake for n1, n2 , data in d2_component.edges(data=True): # Parse the current node name head, c1, c2 = parse_dg_name(d2_component,n1) node_udg_molecules = udg_molecules_dict[head[0]] n_head, n_c1, n_c2 = parse_dg_name(d2_component,n2) neighbor_udg_molecules = udg_molecules_dict[n_head[0]] if nearby_udg_molecules(node_udg_molecules, neighbor_udg_molecules): color = 'green' else: color = 'red' data['color'] = color # print("edge",node_udg_molecules,neighbor_udg_molecules,color) # also, annotate nodes by their putative molecule found for n, data in d2_component.nodes(data=True): # Parse the current node name head, c1, c2 = parse_dg_name(d2_component,n) node_udg_molecules = udg_molecules_dict[head[0]] data['udg_molecule']= '_'.join(list(map(str,node_udg_molecules))) # aggressive: delete nodes which have either no found udg_molecule, or two udg_molecules # turns out it's not a good strategy as the nodes with two udg_molecules are important to connect portions of graph # but what if we magically keep those where the two adjacent molecules are close together if True: d2_component = d2_component.copy() nodes_to_remove = [] for n, data in d2_component.nodes(data=True): # Parse the current node name head, c1, c2 = parse_dg_name(d2_component,n) if "_" in data['udg_molecule'] or data['udg_molecule'] == '': if "_" in data['udg_molecule']: m1, m2 = list(map(int,data['udg_molecule'].split("_"))) if abs(m2-m1) < 30: continue # don't remove that kind of nodes nodes_to_remove += [n] d2_component.remove_nodes_from(nodes_to_remove) print("removed",len(nodes_to_remove),"bad nodes") # aggressive: delete red edges if True: d2_component = d2_component.copy() edges_to_remove = [] for n1, n2, data in d2_component.edges(data=True): if data['color'] == 'red': edges_to_remove += [(n1,n2)] d2_component.remove_edges_from(edges_to_remove) print("removed",len(edges_to_remove),"bad edges") return d2_component def main(): args = parse_args() graph = nx.read_gexf(args.filename) if args.type == "path": if args.barcode_graph is None: print("--barcode_graph is required for path analysis", file=sys.stderr) exit(0) barcode_graph = nx.read_gexf(args.barcode_graph) # if len(list(nx.connected_components(graph))) != 1: # print([len(x) for x in list(nx.connected_components(graph))]) # exit("when running evaluate.py --type path, the graph should have a single connected component (it's supposed to be a path, after all)") # compute LIS over the path # longest_path = compute_longest_increasing_paths(graph) longest_path = compute_shortest_edit_path(graph) print("--- Largest component analysis ---") print(f"Size of the longest path: {len(longest_path)}") if not args.light_print: print(longest_path) # get over/under counted molecules print("--- Under/over molecule counts ---") frequencies = parse_path_graph_frequencies(graph, barcode_graph) print_path_summary(frequencies, light_print=args.light_print) print(f"Size of the longest path: {len(longest_path)}") elif args.type == "dgraphs": udg_per_node = parse_udg_qualities(graph) # print(udg_per_node) elif args.type == "d2": components = list(nx.connected_components(graph)) components.sort(key=lambda x: -len(x)) component = graph.subgraph(components[0]) longest_path = compute_longest_increasing_paths(component, max_gap=args.max_gap) vars = compute_covered_variables(graph, longest_path) print_d2_summary(components, longest_path, coverage_vars=vars, light_print=args.light_print) # added by Rayan # example: # python evaluate.py --type d2-2annotate ~/Dropbox/cnrs/projects/10x-barcodes/yoann_to_cedric_ilp/d2_graph.gexf elif args.type == "d2-2annotate": components = list(nx.connected_components(graph)) components.sort(key=lambda x: -len(x)) component = graph.subgraph(components[0]) component = verify_graph_edges(component) extension = args.filename.split('.')[-1] base_filename = '.'.join(args.filename.split('.')[:-1]) nx.write_gexf(component, base_filename+".verified."+extension) if __name__ == "__main__": main()