Commit a6a5f6de authored by Yoann Dufresne's avatar Yoann Dufresne
Browse files

repository clarification

parent 0d3dcbf7
#C3BI paul avait émis la notion de d-graph
# "Lets call a graph a d-graph if the vertices can be ordered on a line and any vertices within d/2 of each other are connected by an edge. Observe that a d-graph has all vertices of degree d, except at the very end and beginning. For example, for d=6, vertex at position 10 will be connected to 7,8,9,11,12,13. A d-graph (sort of?) captures what the neighborhood of a molecule would look like."
# alternatively, observe that a d-graph can be seen as a line in a special overlap graph, where the nodes are the set of neighbors of a certain barcode graph node node, and two nodes in the special overlap graph are linked by an edge if the neighbors intersect over d-1 (or d-2?) elements
# arguments: graphml file
# attempts to deconvolve a barcode graph using d-graph detection
import sys
import itertools
import operator
from collections import Counter
from networkx import nx
#from networkx.algorithms import community
import community # python-louvain
import sys
if len(sys.argv) == 1:
print("args: .graphml or .gexf file")
if ".graphml" in sys.argv[1]:
G = nx.read_graphml(sys.argv[1])
elif ".gexf" in sys.argv[1]:
G = nx.read_gexf(sys.argv[1])
# too strict
def strict_d_line_compatible_neighbors(node1, neigh1, node2, neigh2):
neigh1 = set(neigh1) | set([node1])
neigh2 = set(neigh2) | set([node2])
inter = neigh1 & neigh2
diff1 = neigh1 - neigh2
diff2 = neigh2 - neigh1
if len(inter) >= len(neigh1)-1 and len(inter) >= len(neigh2) - 1 and len(diff1) <= 1 and len(diff2) <= 1:
return True
return False
def d_line_compatible_neighbors(node1, neigh1, node2, neigh2):
neigh1 = set(neigh1) | set([node1])
neigh2 = set(neigh2) | set([node2])
inter = neigh1 & neigh2
diff1 = neigh1 - neigh2
diff2 = neigh2 - neigh1
if len(inter) >= len(neigh1)-wiggle and len(inter) >= len(neigh2) - wiggle and len(diff1) <= wiggle and len(diff2) <= wiggle:
return True
return False
# actually a very loose definition of d-graph, where consecutive neighbors of node on the line don't need to be exactly of cardinality +1/-1
# (for that, use strict_d_line_compatible_neighbors)
# but can be +3/-3
def is_d_graph(graph, all_neighbors_graph):
Gn = nx.Graph() # create a graph of 'neighbor-compatibility': whether two nodes in the original graphs have 'almost' the same neighbors-set, apart from one to the left and one to the right (typical in d-graphs)
for node in graph.nodes():
for node1 in graph.nodes():
for node2 in graph.nodes():
if node1 <= node2: continue
neigh1 = list(graph.neighbors(node1))
neigh2 = list(graph.neighbors(node2))
#print(node1, sorted(neigh1))
#print(node2, sorted(neigh2))
if d_line_compatible_neighbors(node1, neigh1, node2, neigh2):
#print("tentative d graph",list(nx.connected_components(Gn)))
if len(list(nx.connected_components(Gn))) != 1:
return False
# this is really a heuristic:
# for all nodes in the putative d-graph, make sure their neighbors are all in majority in the component rathe than the whole graph
# this is a critical filter to remove putative d-graphs that are made of multiple molecules
# see e.g. 100_5_2-neighbors_of_molecule_21_83 without that filter..
# 3 d-graphs found (2 ok), dont 1 pas ok:
# d graph found ['10:24_65', '12:66_19', '22:25_81', '37:22_42', '46:63_86', '9:85_45'] -> chimere des molecules voisines de 21 et 83 via le barcode 22:25_81
for node1 in graph.nodes():
neigh1 = set(list(all_neighbors_graph.neighbors(node1)))
if len(neigh1 & set(graph.nodes())) < len(neigh1) / 2:
#print("is not a d graph due to bad separation with rest",graph)
return False
return True
def compute_score(d_graphs_set,G):
score = 0
for d_graph in d_graphs_set:
for node in d_graph:
for neigh in G.neighbors(node):
score += 1 if neigh in d_graph else -4
return score
debug = False
def deconvolve2(G,node,dont_separate=False):
global debug
neighbors = list(G.neighbors(node))
G2 = nx.Graph(G.subgraph(neighbors))
if debug:
nx.write_graphml(G2,node + ".graphml")
# add node labels
for g2_node in G2.nodes():
G2.nodes[g2_node]['label'] = str(g2_node)
# make sure there is a single connected component
while True:
ccs = list(nx.connected_components(G2))
if len(ccs) == 1:
# else artificially and weakly connect components to run fluid community
n1 = list(ccs[0])[0]
n2 = list(ccs[1])[0]
d_graphs = []
cliques = list(nx.find_cliques(G2))
# remove cliques of size < 4
cliques = [c for c in cliques if len(c) >= 3]
# enumerate all d-graphs
print(len(cliques),"cliques,",len(G2.nodes()),"nodes in neighbors of", node)
#print("cliques:", cliques)
for nb_cliques in range(len(cliques),0,-1):
for candidate in itertools.combinations(cliques, nb_cliques):
candidate_graph = nx.Graph(G2.subgraph([x for y in candidate for x in y ]))
#print("testing for d graph",sorted(list(candidate_graph.nodes())))
if len(candidate_graph.nodes()) <= 5: continue # don't consider small graphs # bit of a hack
if is_d_graph(candidate_graph, G2):
#print("d graph found",sorted(list(candidate_graph.nodes())))
d_graphs += [tuple(sorted(candidate_graph.nodes()))]
d_graphs = list(set(d_graphs)) # dedup
print(len(d_graphs),"d-graphs found in neighbors of",node)
# remove ones included in others
to_remove = set()
for i,d_graph1 in enumerate(d_graphs):
for d_graph2 in d_graphs:
if (set(d_graph1).issubset(set(d_graph2))) and len(d_graph1) < len(d_graph2):
to_remove |= set([i])
d_graphs = [d for (i, d) in enumerate(d_graphs) if i not in to_remove]
print("removed",len(to_remove),"redundant d graphs")
# find the best set of d-graphs which minimize a certain score
scores = []
for nb_dgraphs in range(4,0,-1):
for candidate in itertools.combinations(d_graphs, nb_dgraphs):
node_counter = Counter([x for y in candidate for x in y ])
candidate = [ [node for node in d if node_counter[node] == 1 ] for d in candidate] # remove nodes common to multiple d-graphs
candidate = [ d for d in candidate if len(d) >= 5] # remove small d-graphs
if len(candidate) == 0:
score = compute_score(candidate,G2)
scores += [(score, candidate)]
print("scores",[(x,"%d d-graphs" % len(d)) for x,d in sorted(scores)][:-3])
best_d_graph_decomposition = max(scores)[1]
print("best decomposition:",len(best_d_graph_decomposition),"graphs")
d_graphs = best_d_graph_decomposition
# refine d_graphs by removing common nodes
dups = Counter()
for d_graph in d_graphs:
for d_node in d_graph:
dups[d_node] += 1
multi_nodes = [x for x in dups if dups[x] > 1]
print(len(multi_nodes),"multi-d-graphs nodes found, removing them", multi_nodes)
for i in range(len(d_graphs)):
for multi_node in multi_nodes:
if multi_node in d_graphs[i]:
# label nodes by their d_graph
d_graphs_nodes = {}
for i,d_graph in enumerate(d_graphs):
for d_node in d_graph:
if d_node in d_graphs_nodes:
exit("uh oh it's not a partition")
d_graphs_nodes[d_node] = i
# TODO place the remaining nodes in combinations of d-graphs, depending of their neighborhoors
# some nodes might not be in any d-graph
# also what to do with the multi_nodes
# those will be in their own little dummy MI
for g2_node in G2.nodes():
if g2_node not in d_graphs_nodes:
d_graphs_nodes[g2_node]=99 # Nine-Nine!
#communities = dict([(x,1) for x in G2.nodes()]) # dummy communities
communities = dict([(x,d_graphs_nodes[x]) for x in G2.nodes() if x in d_graphs_nodes])
print("community sizes:",[len([c for c,i in communities.items() if i == community_id]) for community_id in set(communities.values())])
#print([[(c,i) for c,i in communities.items() if i == community_id] for community_id in set(communities.values())]) # debug
if len(communities) == 1:
return # nothing to separate here
if dont_separate:
# creation of a new node per community
for node2,community_id in communities.items():
G.add_node(node+"-MI%d" % community_id)
G.add_edge(node+"-MI%d" % community_id, node2, contigs=(G[node][node2]['contigs'] if 'contigs' in G[node][node2] else ""))
# for debugging
G2.nodes[node2]['mi'] = community_id
debug = False
if debug:
node = "3:80_10"
debug_dont_separate = True # run the algo on each node but dont separate the nodes
g_nodes = list(G.nodes())
for node in g_nodes:
......@@ -199,6 +199,8 @@ def print_d2_summary(connected_components, longest_path, covered_vars={}, light_
print("--- Global summary ---")
print(f"Number of connected components: {len(connected_components)}")
print(f"Total number of nodes: {sum([len(x) for x in connected_components])}")
no_singleton = [x for x in connected_components if len(x) > 1]
print(f"There are {len(no_singleton)} connex components with at least 2 nodes")
print(f"The 5 largest components: {[len(x) for x in connected_components][:5]}")
print("--- Largest component analysis ---")
#!/usr/bin/env python3
from networkx import nx
from copy import deepcopy
def compute_network_distances(graph, max_dist=3):
distances = {}
# Init all the distances
g_nodes = list(graph.nodes())
for node in g_nodes:
distances[node] = {node:0}
# Transmit distances over max_dist iterations
for _ in range(max_dist):
old_distances = deepcopy(distances)
# For each node, transmit list of distances
for node in g_nodes:
neighbors = list(graph.neighbors(node))
# for each neighbor of the current node, transmit all the distances already present.
for neighbor in neighbors:
for key, val in old_distances[node].items():
if not key in distances[neighbor]:
distances[neighbor][key] = val + 1
return distances
def print_distances(distances):
for key, val in distances.items():
print(key, len(val))
def evolved_distances(graph, node):
prev_distances = compute_network_distances(graph, 3)
# Remove node
neighbors = list(graph.neighbors(node))
next_distances = compute_network_distances(graph, 3)
# Add removed node
for neighbor in neighbors:
graph.add_edge(node, neighbor)
# print distance evolutions between x and y nodes (different from node)
for origin, destinations in prev_distances.items():
if origin == node:
for destination, distance in destinations.items():
if destination == node:
if not destination in next_distances[origin]:
print(f"{origin}--{destination} {distance}->X")
elif next_distances[origin][destination] != distance:
print(f"{origin}--{destination} {distance}->{next_distances[origin][destination]}")
# if not dest in next_distances:
# print(f"{node}--{dest} {dist}->X")
# elif next_distances[dest] == dist:
# print(f"{node}--{dest} {dist}->{next_distances[dest]}")
graph = nx.read_graphml("simple_duplicated_3links.graphml")
# graph = nx.read_graphml("simulated_barcodes.graphml")
nodes = list(graph.nodes())
# nodes = ["7"]
for node in nodes:
print(f"node {node}")
evolved_distances(graph, node)
# print_distances(distances)
import networkx as nx
g = nx.read_graphml("simple_duplicated_3links.graphml")
# List all the nodes for adjacency tests.
for n, nbrs in g.adj.items():
neighbors = [x[0] for x in nbrs.items()]
to_split = False
# For all nodes B and C neighbors of A
for idx in range(len(neighbors)):
for jdx in range(idx+1, len(neighbors)):
ni = neighbors[idx]
nj = neighbors[jdx]
# Need to plit A in A' and A'' if B and C are not neighbors
if not ni in g.adj[nj]:
# print(f"{ni},{nj}")
to_split = True
# Split or not split, that is the question
print(f"{n} split ? {to_split}")
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment