Commit f5430a9b authored by Yoann Dufresne's avatar Yoann Dufresne

add subdirectories

parent cc574d80
# 10X-deconvolve
Trying to deconvolve single tag assignment for multiple molecules
\ No newline at end of file
Trying to deconvolve single tag assignment for multiple molecules
## Run the deconvolution
python3 src/deconvolve.py
## Run the tests
pytest tests
import sys
print(sys.readline())
import networkx as nx
import d_graph as dgm
class D2Graph(object):
"""D2Graph (read it (d-graph)²)"""
def __init__(self, graph):
super(D2Graph, self).__init__()
self.graph = graph
# Compute all the d-graphs
self.d_graphs = dgm.compute_all_max_d_graphs(self.graph)
# Index all the d-graphes
self.index = self.create_index()
def create_index(self):
index = {}
return index
def save_to_file(self, filename):
next_idx = 0
nodes = {}
G = nx.Graph()
# for dmer in self.dmers:
# for d_idx, dg in enumerate(self.dmers[dmer]):
# # Create a node name
# if not dg in nodes:
# nodes[dg] = next_idx
# next_idx += 1
# # Add the node
# G.add_node(nodes[dg])
# # Add the edges
# for prev_node in self.dmers[dmer][:d_idx]:
# G.add_edge(nodes[dg], nodes[prev_node])
nx.write_gexf(G,filename)
\ No newline at end of file
......@@ -2,13 +2,16 @@ import networkx as nx
import math
from functools import total_ordering
@total_ordering
class Dgraph(object):
"""docstring for Dgraph"""
def __init__(self):
def __init__(self, center):
super(Dgraph, self).__init__()
self.center = center
self.score = 0
self.halves = [[],[]]
self.halves = [None,None]
self.connexity = [None,None]
""" Compute the d-graph quality (score) according to the connectivity between the two halves.
......@@ -20,6 +23,8 @@ class Dgraph(object):
self.score = 0
self.halves[0] = h1
self.halves[1] = h2
self.connexity[0] = {key:0 for key in self.halves[0]}
self.connexity[1] = {key:0 for key in self.halves[1]}
# Compute link arities
for node1 in h1:
......@@ -28,6 +33,13 @@ class Dgraph(object):
for node2 in h2:
if node1 == node2 or node2 in neighbors:
self.score += 1
self.connexity[0][node1] += 1
self.connexity[1][node2] += 1
# Sort the halves by descending connexity
connex = self.connexity
self.halves[0].sort(reverse=True, key=lambda v: connex[0][v])
self.halves[1].sort(reverse=True, key=lambda v: connex[1][v])
def get_link_ratio(self):
......@@ -39,6 +51,11 @@ class Dgraph(object):
return max_len * (max_len - 1) / 2
def to_ordered_list(self):
# TODO : Can't be uniq (see for corrections)
return self.halves[0][::-1] + [self.center] + self.halves[1]
def __eq__(self, other):
my_tuple = (self.get_link_ratio(), self.get_optimal_score())
other_tuple = (other.get_link_ratio(), other.get_optimal_score())
......@@ -52,8 +69,17 @@ class Dgraph(object):
other_tuple = (other.get_link_ratio(), other.get_optimal_score())
return (my_tuple < other_tuple)
def __hash__(self):
return frozenset(self.to_ordered_list()).__hash__()
def __repr__(self):
return str(self.score) + "/" + str(self.get_optimal_score()) + " " + str(self.halves[0]) + str(self.halves[1])
# print(self.halves)
representation = self.center + " " + str(self.score) + "/" + str(self.get_optimal_score()) + " "
representation += "[" + ", ".join([f"{node} {self.connexity[0][node]}" for node in self.halves[0]]) + "]"
representation += "[" + ", ".join([f"{node} {self.connexity[1][node]}" for node in self.halves[1]]) + "]"
return representation
""" From a barcode graph, compute all the possible max d-graphs by node.
......@@ -61,7 +87,7 @@ class Dgraph(object):
@param n_best Only keep n d-graphs (the nearest to 1.0 ratio)
@return A dictionary associating each node to its list of all possible d-graphs. The d-graphs are sorted by decreasing ratio.
"""
def compute_all_max_d_graphs(graph, n_best=10, max_overlap=2):
def compute_all_max_d_graphs(graph, n_best=10):
d_graphes = {}
for node in list(graph.nodes()):
......@@ -78,7 +104,7 @@ def compute_all_max_d_graphs(graph, n_best=10, max_overlap=2):
clq2 = cliques[clq2_idx]
# Check for d-graph candidates
d_graph = Dgraph()
d_graph = Dgraph(node)
d_graph.put_halves(clq1, clq2, neighbors_graph)
optimal_score = d_graph.get_optimal_score()
......@@ -90,7 +116,7 @@ def compute_all_max_d_graphs(graph, n_best=10, max_overlap=2):
# Cut the the distribution queue
d_graphes[node] = sorted(node_d_graphes)
print(node_d_graphes)
d_graphes[node] = sorted(node_d_graphes)[:n_best]
# print(node_d_graphes)
return d_graphes
#!/usr/bin/env python3
import sys
import math
import networkx as nx
import itertools
import d_graph as dg
import d2_graph as d2
def main():
# Parsing the input file
filename = sys.argv[1]
G = None
if filename.endswith('.graphml'):
G = nx.read_graphml(filename)
elif filename.endswith('.gexf'):
G = nx.read_gexf(filename)
d2g = d2.D2Graph(G)
d2g.save_to_file("data/d2_graph.gexf")
if __name__ == "__main__":
main()
#!/usr/bin/env python3
import sys
import math
import networkx as nx
import itertools
def local_deconvolve(G,node, verbose=0):
neighbors = list(G.neighbors(node))
nei_len = len(neighbors)
# Extract neighbors from the graph
G_neighbors = nx.Graph(G.subgraph(neighbors))
communities = get_communities(G_neighbors, max_overlap=0, verbose=verbose-1)
# Continue only if something need to be splited.
if len(communities) <= 1:
if verbose > 0:
print("node",node,nei_len,"neighbors")
print("No split\n")
return
# Split communities
for idx, community in enumerate(communities):
# Add community center
node_name = f"{node}.{idx}"
G.add_node(node_name)
# Add links from the center to the community
for neighbor in community:
G.add_edge(node_name, neighbor)
# Remove old node
G.remove_node(node)
if verbose > 0:
print("node",node,nei_len,"neighbors")
print("splitted into", len(communities), "parts\n")
def get_communities(G, max_overlap=1, strict=True, verbose=0):
# Half d-graphs are cliques. So compute max cliques
cliques = list(nx.find_cliques(G))
if verbose > 0:
print("clique list")
for clq in cliques:
print(clq)
print()
candidate_d_graphs = []
# d-graphs are composed of 2 cliques related by the current node.
# The overlap between the 2 cliques determine the node order in the d-graph.
for idx, clq1 in enumerate(cliques):
to_compare = cliques[idx+1:]
for clq2 in to_compare:
# Test if d-graphs only if there are less than max overlapping nodes
# between the two half
overlap = 0
for val in clq1:
if val in clq2:
overlap += 1
if overlap > max_overlap:
# print(overlap, "is too high overlap")
continue
# Check for d-graph candidates
d_graph = compute_d_graph(clq1, clq2, G, verbose=verbose-1, max_diff_size=0)
if d_graph != None:
candidate_d_graphs.append(d_graph)
# Extract communities from all the possible d-graphes in the neighborood.
# This is a minimal covering d_graph algorithm.
minimal_d_graphes, unpartitionned = filter_d_graphs(candidate_d_graphs, max_overlap=max_overlap)
if strict and len(unpartitionned) > 0:
if verbose > 0:
print("Partialy unpartionned. Aborted")
return [list(G.nodes())]
communities = [list(set(d_graph[0]+d_graph[1])) for d_graph in minimal_d_graphes]
# complete unpartitionned nodes
to_add = []
for idx, d_graph in enumerate(communities):
for node in d_graph:
neighbors = G.neighbors(node)
for nei in neighbors:
if nei in unpartitionned:
to_add.append(node)
break
unpartitionned.extend(list(set(to_add)))
# If no community detected, return one big.
if len(unpartitionned) == len(list(G.nodes())):
return [list(G.nodes())]
# add unpartitionned if not empty
elif len(unpartitionned) > 0:
communities.append(unpartitionned)
if verbose > 0:
for community in communities:
print(community)
return communities
""" This function take two cliques in the graph G and try to find if they are 2 halfes
of a d-graph.
1 - The algorithm compute overlap between nodes from first clique to the second and from the
second to the first.
2 - Filter the clique pairs that have less than n*(n-1)/2 traversing edges (necessary critera
to be considered as d-graph).
3 - Find the node order in the half d-graphs by sorting nodes by traversing order.
4 - return a paair of half d-graph ordered from the central node (not present in G).
@param cliq1 (resp clq2) the clique that is candidate to be composed of half of the d-graph nodes.
@param G the graph of the neighbors of the central node (not present).
@return A pair of lists that are the 2 halves of the d-graph ordered from the center.
"""
def compute_d_graph(clq1, clq2, G, max_diff_size=1, verbose=0):
# Compute the arities between the cliques
arities1 = {name:0 for name in clq1}
arities2 = {name:0 for name in clq2}
sum_edges = 0
if len(clq1) != len(clq2):
# Limit the number of recursions
if abs(len(clq1)-len(clq2)) > max_diff_size:
return None
# Recursion on the biggest clique to reduce complexity.
smallest, largest = (clq1, clq2) if len(clq2) > len(clq1) else (clq2, clq1)
minimal_weighted_d_graph = None
minimal_weight = math.inf
for idx in range(len(largest)):
recur_d_graph = compute_d_graph(smallest, largest[:idx]+largest[idx+1:], G, verbose=verbose)
if recur_d_graph != None and recur_d_graph[2] < minimal_weight:
minimal_weighted_d_graph = recur_d_graph
minimal_weight = recur_d_graph[2]
if verbose > 0:
print(f"Recursive calls for:\n{clq1}\n{clq2}\n")
print(minimal_weighted_d_graph, "\n")
print("/ Recursive\n")
return minimal_weighted_d_graph
min_clq_size = min(len(clq1), len(clq2))
# Compute link arities
for node1 in clq1:
neighbors = list(G.neighbors(node1))
for node2 in clq2:
if node1 == node2 or node2 in neighbors:
# print(node1, "-", node2)
arities1[node1] += 1
arities2[node2] += 1
sum_edges += 1
# if verbose:
# print(clq1, clq2)
# print(arities1, arities2, "\n")
# Reject if not enought edges
if sum_edges < min_clq_size * (min_clq_size-1) / 2:
return None
# if verbose:
# print(clq1, clq2)
# print(arities1, arities2, "\n")
# order lists
lst1 = [key for key, value in sorted(arities1.items(), key=lambda tup: -tup[1])]
lst2 = [key for key, value in sorted(arities2.items(), key=lambda tup: -tup[1])]
if verbose > 0:
print(min_clq_size)
print(lst1, "\n", lst2, "\n")
# Return the 2 halves of the d-graph
return lst1, lst2, sum_edges
""" Filter the candiates regarding their compatibilities
"""
def filter_d_graphs(candidates, max_overlap=0):
# Count for each node the number of their apparition (regarding the half overlap)
selected = {}
counts_by_size = [{} for _ in range(max_overlap+1)]
sorted_d_graphs = [[] for _ in range(max_overlap+1)]
for d_graph in candidates:
# Compute intersection of the two halves
common_length = len(set(d_graph[0]) & set(d_graph[1]))
sorted_d_graphs[common_length].append(d_graph)
# Count occurences
for node in itertools.chain(d_graph[0], d_graph[1]):
if not node in counts_by_size[common_length]:
counts_by_size[common_length][node] = 0
counts_by_size[common_length][node] += 1
selected[node] = False
# take d_graphes with nodes that appears only once
filtered = []
partitionned = False
for overlap_size in range(max_overlap+1):
# Look for d_graphs with overlapping halves first, then 1 node, ...
for d_graph in sorted_d_graphs[overlap_size]:
common_length = len(set(d_graph[0]) & set(d_graph[1]))
for node in itertools.chain(d_graph[0], d_graph[1]):
# Count appearance
total_count = 0
for length in range(overlap_size+1):
total_count += counts_by_size[common_length][node] if node in counts_by_size[common_length] else 0
# Add d-graph
if total_count == 1:
# Add the d_graph to the selection
filtered.append(d_graph)
# register selection of the nodes
for node in itertools.chain(d_graph[0], d_graph[1]):
selected[node] = True
# Over for this d-graph
break
# Stop if all nodes are selected
over = True
for val in selected.values():
if not val:
over = False
break
if over:
partitionned = True
break
# If the partionning is not complete, try to subdivise the node anyway.
# Split the node in n d_graph partitions + 1 unpartitionned set of nodes
unpartitionned = [node for node in selected if not selected[node]]
if len(unpartitionned) == len(selected):
return [], unpartitionned
else:
return filtered, unpartitionned
import d_graph as dg
def main():
# Parsing the input file
filename = sys.argv[1]
G = None
if filename.endswith('.graphml'):
G = nx.read_graphml(filename)
elif filename.endswith('.gexf'):
G = nx.read_gexf(filename)
dgraphs = dg.compute_all_max_d_graphs(G)
for node in list(G.nodes())[:10]:
print(node, dgraphs[node])
print("...")
# # Deconvolve
# g_nodes = list(G.nodes())
# for node in g_nodes:
# local_deconvolve(G,node, verbose=2 if (node.startswith("0:")) else 0)
# # exit()
# print(len(g_nodes), "->", len(list(G.nodes())))
# nx.write_graphml(G,sys.argv[1]+".deconvolved.graphml")
if __name__ == "__main__":
main()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment