Commit 1c653eb8 authored by Yoann Dufresne's avatar Yoann Dufresne

allow jumps in increasing sequence

parent 97b187fc
......@@ -2,10 +2,10 @@
import sys
sys.setrecursionlimit(10000)
import argparse
from termcolor import colored
import networkx as nx
sys.setrecursionlimit(10000)
def parse_args():
......@@ -16,6 +16,7 @@ def parse_args():
help="Define the data type to evaluate. Must be 'd2' or 'path' or 'd2-2annotate' (Rayan's hack).")
parser.add_argument('--light-print', '-l', action='store_true',
help='Print only wrong nodes and paths')
parser.add_argument('--max_gap', '-g', type=int, default=0, help="Allow to jump over max_gap nodes during the increasing path search")
parser.add_argument('--barcode_graph', '-b', help="Path to the barcode graph corresponding to the d2_graph to analyse.")
parser.add_argument('--optimization_file', '-o',
help="If the main file is a d2, a file formatted for optimization can be set. This file will be used to compute the coverage of the longest path on the barcode graph.")
......@@ -55,8 +56,8 @@ def parse_udg_qualities(graph):
""" Compute the quality for the best udgs present in the graph.
All the node names must be under the format :
{idx}:{mol1_id}_{mol2_id}_...{molx_id}.other_things_here
:param graph: The networkx graph representinf the deconvolved graph
:return: A tuple containing two dictionaries. The first one with theoritical frequencies of each node, the second one with observed frequencies.
:param graph: The networkx graph representing the deconvolved graph
:return: A tuple containing two dictionaries. The first one with theoretical frequencies of each node, the second one with observed frequencies.
"""
dg_per_node = {}
......@@ -80,8 +81,7 @@ def parse_path_graph_frequencies(graph, barcode_graph):
All the node names must be under the format :
{idx}:{mol1_id}_{mol2_id}_...{molx_id}.other_things_here
:param graph: The networkx graph representing the deconvolved graph
:param only_wong: If True, don't print correct nodes
:param file_pointer: Where to print the output. If set to stdout, then pretty print. If set to None, don't print anything.
:param barcode_graph: The barcode graph
:return: A tuple containing two dictionaries. The first one with theoretical frequencies of each node, the second one with observed frequencies.
"""
# Compute origin nodes formatted as `{idx}:{mol1_id}_{mol2_id}_...`
......@@ -110,10 +110,10 @@ def parse_path_graph_frequencies(graph, barcode_graph):
return real_frequencies, observed_frequencies, node_per_barcode
""" This function aims to look for direct molecule neighbors.
If a node has more than 2 direct neighbors, it's not rightly splitted
"""
def parse_graph_path(graph):
""" This function aims to look for direct molecule neighbors.
If a node has more than 2 direct neighbors, it's not rightly split
"""
neighborhood = {}
for node in graph.nodes():
......@@ -241,10 +241,25 @@ def print_d2_summary(connected_components, longest_path, coverage_vars=(0, 0), l
print(f"Number of usable coverage variables: {len(coverage_vars[1])}")
print(f"Coverage: {len(coverage_vars[0])}/{len(coverage_vars[1])}")
print(f"Missing coverage variables:\n{coverage_vars[1]-coverage_vars[0]}")
if not light_print:
print(f"Missing coverage variables:\n{coverage_vars[1]-coverage_vars[0]}")
def _get_distant_neighbors(graph, node, dist):
neighbors = set()
to_compute = [node]
for _ in range(dist):
next_compute = []
for node in to_compute:
for neighbor in graph[node]:
if neighbor not in neighbors:
neighbors.add(neighbor)
next_compute.append(neighbor)
to_compute = next_compute
return neighbors
def compute_next_nodes(d2_component):
def compute_next_nodes(d2_component, max_jumps=0):
# First parse dg names
dg_names = {}
for node in d2_component.nodes():
......@@ -261,7 +276,8 @@ def compute_next_nodes(d2_component):
molecule_idxs = mols_from_node(head[1])
for mol_idx in molecule_idxs:
nexts = []
for neighbor in d2_component[node]:
# for neighbor in d2_component[node]:
for neighbor in _get_distant_neighbors(d2_component, node, max_jumps+1):
nei_head, _, _ = dg_names[neighbor]
nei_mols = mols_from_node(nei_head[1])
nei_mols = [x for x in nei_mols if x > mol_idx]
......@@ -278,15 +294,15 @@ def compute_next_nodes(d2_component):
return next_nodes
def compute_longest_increasing_paths(d2_component):
next_nodes = compute_next_nodes(d2_component)
def compute_longest_increasing_paths(d2_component, max_gap=0):
next_nodes = compute_next_nodes(d2_component, max_jumps=max_gap)
# Compute the longest path for each node
longest_paths = {}
for idx, start_node in enumerate(next_nodes):
# print(f"{idx}/{len(next_nodes)}")
for mol_idx in next_nodes[start_node]:
recursive_longest_path(start_node, mol_idx , next_nodes, longest_paths)
recursive_longest_path(start_node, mol_idx, next_nodes, longest_paths)
# Get the longest path size
max_len, node_val, mol_idx = 0, None, -1
......@@ -304,7 +320,7 @@ def compute_longest_increasing_paths(d2_component):
def backtrack_longest_path(node, molecule, longest_paths, path=[]):
if node == None:
if node is None:
return path
path.append((molecule, node))
......@@ -340,6 +356,40 @@ def recursive_longest_path(current_node, current_molecule, next_nodes, longest_p
return longest_paths[current_node][current_molecule]
# def longest_common_subsequence(barcode_true_path, barcoded_graph):
# """ Assume that the two graphs have an attribute barcode for each node and a unique node name"""
# path_nodes = []
# path_nodes_barcodes = []
# for node, data in barcode_true_path.nodes(data=True):
# path_nodes.append(node)
# path_nodes_barcodes.append(data["barcode"])
# path_nodes_to_idx = {n: idx for idx, n in enumerate(path_nodes)}
#
# graph_nodes = []
# graph_nodes_barcodes = []
# for node, data in barcoded_graph.nodes(data=True):
# graph_nodes.append(node)
# graph_nodes_barcodes.append(data["barcode"])
# graph_nodes_to_idx = {n: idx for idx, n in enumerate(graph_nodes)}
#
# dynamic_array = [[0 for _ in range(len(graph_nodes)+1)] for _ in range(len(path_nodes)+1)]
# for row in range(1, len(path_nodes)+1):
# path_node = path_nodes[row-1]
# path_barcode = path_nodes_barcodes[row-1]
#
# for column in range(1, len(graph_nodes)):
# graph_node = graph_nodes[column-1]
# graph_barcode = graph_nodes_barcodes[column-1]
#
# prev_scores = [dynamic_array[row-1][column]]
# for neighbor_node in barcoded_graph[graph_node]:
# neighbor_idx = graph_nodes_to_idx[neighbor_node]
# prev_scores.append(dynamic_array[row-1][neighbor_idx])
#
# match_point = 1 if path_barcode == graph_barcode else 0
# dynamic_array[row][column] = max(prev_scores) + match_point
def compute_covered_variables(graph, path):
path_nodes = set()
for mol, node_name in path:
......@@ -355,7 +405,7 @@ def compute_covered_variables(graph, path):
return used_vars, total_vars
# returns True iff there exist x in mol1 such that there exists y in mol2 and |x-y| <= some_value
# returns True iff there exist x in mol1 such that there exists y in mol2 and |x-y| <= some_value
def nearby_udg_molecules(mols1, mols2):
for x in mols1:
for y in mols2:
......@@ -370,19 +420,19 @@ def verify_graph_edges(d2_component):
head, c1, c2 = parse_dg_name(d2_component,node)
# Construct the molecule(s) that this udg really 'reflects'
# i.e. the udg has a central node and two cliques
# that central node is the result of merging of several molecules
# ideally, only one of those molecules is connected to the molecules of the cliques
# (there could be more than one though; in that case the udg is 'ambiguous')
# udg_molecules aims to reflect the molecule(s) underlying this udg
# i.e. the udg has a central node and two cliques
# that central node is the result of merging of several molecules
# ideally, only one of those molecules is connected to the molecules of the cliques
# (there could be more than one though; in that case the udg is 'ambiguous')
# udg_molecules aims to reflect the molecule(s) underlying this udg
udg_molecules = set()
# Get the current molecule idxs of central node
molecule_idxs = mols_from_node(head[1])
#print("mol idxs", molecule_idxs)
# print("mol idxs", molecule_idxs)
# Examine molecule idx's of cliques to see which are close to the central node
# rationale: c1/c2 contain nearby molecule id's
# rationale: c1/c2 contain nearby molecule id's
for mol_idx in molecule_idxs:
nexts = []
for c in [c1,c2]:
......@@ -397,9 +447,9 @@ def verify_graph_edges(d2_component):
nexts.sort(key=lambda x: x)
quality = sum([1.0/x if mol_idx+x in nexts else 0 for x in range(1,6)]) / sum([1.0/x for x in range(1,6)])
if quality > 0.6: eyeballed but still arbitrary
if quality > 0.6: # eyeballed but still arbitrary
udg_molecules.add(mol_idx)
#print("mol",mol_idx,molecule_idxs,"quality",quality,"nexts",nexts)
# print("mol",mol_idx,molecule_idxs,"quality",quality,"nexts",nexts)
udg_molecules_dict[head[0]]=udg_molecules
......@@ -417,7 +467,7 @@ def verify_graph_edges(d2_component):
else:
color = 'red'
data['color'] = color
#print("edge",node_udg_molecules,neighbor_udg_molecules,color)
# print("edge",node_udg_molecules,neighbor_udg_molecules,color)
# also, annotate nodes by their putative molecule found
for n, data in d2_component.nodes(data=True):
......@@ -438,7 +488,7 @@ def verify_graph_edges(d2_component):
if "_" in data['udg_molecule'] or data['udg_molecule'] == '':
if "_" in data['udg_molecule']:
m1, m2 = list(map(int,data['udg_molecule'].split("_")))
if abs(m2-m1) < 30: continue don't remove that kind of nodes
if abs(m2-m1) < 30: continue # don't remove that kind of nodes
nodes_to_remove += [n]
d2_component.remove_nodes_from(nodes_to_remove)
print("removed",len(nodes_to_remove),"bad nodes")
......@@ -473,7 +523,7 @@ def main():
components.sort(key=lambda x: -len(x))
component = graph.subgraph(components[0])
longest_path = compute_longest_increasing_paths(component)
longest_path = compute_longest_increasing_paths(component, max_gap=args.max_gap)
vars = compute_covered_variables(graph, longest_path)
print_d2_summary(components, longest_path, coverage_vars=vars, light_print=args.light_print)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment