diff --git a/README.md b/README.md index e5a3c735ad516f31b631fce459da331e8fd6ac7a..d9a47230927c96c18b6ff11b5fd2f4f0cd409e00 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,11 @@ # 10X-deconvolve -Trying to deconvolve single tag assignment for multiple molecules \ No newline at end of file +Trying to deconvolve single tag assignment for multiple molecules + +## Run the deconvolution + + python3 deconvolution/deconvolve.py + +## Run the tests + + pytest tests diff --git a/deconvolution/__init__.py b/deconvolution/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/deconvolution/__pycache__/__init__.cpython-36.pyc b/deconvolution/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5d3444309bd58fe1ef0d8b30490283817d068583 Binary files /dev/null and b/deconvolution/__pycache__/__init__.cpython-36.pyc differ diff --git a/deconvolution/__pycache__/d_graph.cpython-36.pyc b/deconvolution/__pycache__/d_graph.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1794d28ddeb9be54f9fd484998e04657f0e2f7da Binary files /dev/null and b/deconvolution/__pycache__/d_graph.cpython-36.pyc differ diff --git a/deconvolution/count.py b/deconvolution/count.py new file mode 100644 index 0000000000000000000000000000000000000000..f51334da6d1d13fb9dfbac250bd63645447aa5b2 --- /dev/null +++ b/deconvolution/count.py @@ -0,0 +1,3 @@ +import sys + +print(sys.stdin.readline()) diff --git a/deconvolution/d2_graph.py b/deconvolution/d2_graph.py new file mode 100644 index 0000000000000000000000000000000000000000..ba67b395f7346dca835f97dc70a0e906f9658c72 --- /dev/null +++ b/deconvolution/d2_graph.py @@ -0,0 +1,44 @@ +import networkx as nx +import d_graph as dgm + + +class D2Graph(object): + """D2Graph (read it (d-graph)²)""" + def __init__(self, graph): + super(D2Graph, self).__init__() + self.graph = graph + + # Compute all the d-graphs + self.d_graphs
= dgm.compute_all_max_d_graphs(self.graph) + + # Index all the d-graphes + self.index = self.create_index() + + + def create_index(self): + index = {} + return index + + + def save_to_file(self, filename): + next_idx = 0 + nodes = {} + G = nx.Graph() + + # for dmer in self.dmers: + # for d_idx, dg in enumerate(self.dmers[dmer]): + # # Create a node name + # if not dg in nodes: + # nodes[dg] = next_idx + # next_idx += 1 + + # # Add the node + # G.add_node(nodes[dg]) + + # # Add the edges + # for prev_node in self.dmers[dmer][:d_idx]: + # G.add_edge(nodes[dg], nodes[prev_node]) + + nx.write_gexf(G,filename) + + \ No newline at end of file diff --git a/d_graph.py b/deconvolution/d_graph.py similarity index 67% rename from d_graph.py rename to deconvolution/d_graph.py index 30ac6cf14a3e3d83ce08d1a34f8e3749f58939e7..4ce1185b3e5ca9815a3aa82ed21f2e1705fcdc33 100644 --- a/d_graph.py +++ b/deconvolution/d_graph.py @@ -2,13 +2,16 @@ import networkx as nx import math from functools import total_ordering + @total_ordering class Dgraph(object): """docstring for Dgraph""" - def __init__(self): + def __init__(self, center): super(Dgraph, self).__init__() + self.center = center self.score = 0 - self.halves = [[],[]] + self.halves = [None,None] + self.connexity = [None,None] """ Compute the d-graph quality (score) according to the connectivity between the two halves. 
@@ -20,6 +23,8 @@ class Dgraph(object): self.score = 0 self.halves[0] = h1 self.halves[1] = h2 + self.connexity[0] = {key:0 for key in self.halves[0]} + self.connexity[1] = {key:0 for key in self.halves[1]} # Compute link arities for node1 in h1: @@ -28,6 +33,13 @@ class Dgraph(object): for node2 in h2: if node1 == node2 or node2 in neighbors: self.score += 1 + self.connexity[0][node1] += 1 + self.connexity[1][node2] += 1 + + # Sort the halves by descending connexity + connex = self.connexity + self.halves[0].sort(reverse=True, key=lambda v: connex[0][v]) + self.halves[1].sort(reverse=True, key=lambda v: connex[1][v]) def get_link_ratio(self): @@ -39,6 +51,11 @@ class Dgraph(object): return max_len * (max_len - 1) / 2 + def to_ordered_list(self): + # TODO : Can't be uniq (see for corrections) + return self.halves[0][::-1] + [self.center] + self.halves[1] + + def __eq__(self, other): my_tuple = (self.get_link_ratio(), self.get_optimal_score()) other_tuple = (other.get_link_ratio(), other.get_optimal_score()) @@ -52,8 +69,17 @@ class Dgraph(object): other_tuple = (other.get_link_ratio(), other.get_optimal_score()) return (my_tuple < other_tuple) + + def __hash__(self): + return frozenset(self.to_ordered_list()).__hash__() + + def __repr__(self): - return str(self.score) + "/" + str(self.get_optimal_score()) + " " + str(self.halves[0]) + str(self.halves[1]) + # print(self.halves) + representation = self.center + " " + str(self.score) + "/" + str(self.get_optimal_score()) + " " + representation += "[" + ", ".join([f"{node} {self.connexity[0][node]}" for node in self.halves[0]]) + "]" + representation += "[" + ", ".join([f"{node} {self.connexity[1][node]}" for node in self.halves[1]]) + "]" + return representation """ From a barcode graph, compute all the possible max d-graphs by node. @@ -61,7 +87,7 @@ class Dgraph(object): @param n_best Only keep n d-graphs (the nearest to 1.0 ratio) @return A dictionary associating each node to its list of all possible d-graphs. 
The d-graphs are sorted by decreasing ratio. """ -def compute_all_max_d_graphs(graph, n_best=10, max_overlap=2): +def compute_all_max_d_graphs(graph, n_best=10): d_graphes = {} for node in list(graph.nodes()): @@ -78,7 +104,7 @@ def compute_all_max_d_graphs(graph, n_best=10, max_overlap=2): clq2 = cliques[clq2_idx] # Check for d-graph candidates - d_graph = Dgraph() + d_graph = Dgraph(node) d_graph.put_halves(clq1, clq2, neighbors_graph) optimal_score = d_graph.get_optimal_score() @@ -90,7 +116,7 @@ def compute_all_max_d_graphs(graph, n_best=10, max_overlap=2): # Cut the the distribution queue - d_graphes[node] = sorted(node_d_graphes) - print(node_d_graphes) + d_graphes[node] = sorted(node_d_graphes)[:n_best] + # print(node_d_graphes) return d_graphes diff --git a/deconvolve-dgraphs.py b/deconvolution/deconvolve-dgraphs.py similarity index 100% rename from deconvolve-dgraphs.py rename to deconvolution/deconvolve-dgraphs.py diff --git a/deconvolution/deconvolve.py b/deconvolution/deconvolve.py new file mode 100755 index 0000000000000000000000000000000000000000..d6cba263184932f812232b80ffdc6e3427c5ff6a --- /dev/null +++ b/deconvolution/deconvolve.py @@ -0,0 +1,26 @@ +#!/usr/bin/env python3 + +import sys +import math +import networkx as nx +import itertools + + + +import d_graph as dg +import d2_graph as d2 + +def main(): + # Parsing the input file + filename = sys.argv[1] + G = None + if filename.endswith('.graphml'): + G = nx.read_graphml(filename) + elif filename.endswith('.gexf'): + G = nx.read_gexf(filename) + + d2g = d2.D2Graph(G) + d2g.save_to_file("data/d2_graph.gexf") + +if __name__ == "__main__": + main() diff --git a/evaluate.py b/deconvolution/evaluate.py similarity index 100% rename from evaluate.py rename to deconvolution/evaluate.py diff --git a/generate_duplicated.py b/deconvolution/generate_duplicated.py similarity index 100% rename from generate_duplicated.py rename to deconvolution/generate_duplicated.py diff --git a/generate_fake_barcode_graph.py 
b/deconvolution/generate_fake_barcode_graph.py similarity index 100% rename from generate_fake_barcode_graph.py rename to deconvolution/generate_fake_barcode_graph.py diff --git a/generate_fake_molecule_graph.py b/deconvolution/generate_fake_molecule_graph.py similarity index 100% rename from generate_fake_molecule_graph.py rename to deconvolution/generate_fake_molecule_graph.py diff --git a/proxy.py b/deconvolution/proxy.py similarity index 100% rename from proxy.py rename to deconvolution/proxy.py diff --git a/requirements.txt b/deconvolution/requirements.txt similarity index 100% rename from requirements.txt rename to deconvolution/requirements.txt diff --git a/simplify.py b/deconvolution/simplify.py similarity index 100% rename from simplify.py rename to deconvolution/simplify.py diff --git a/deconvolve.py b/deconvolve.py deleted file mode 100755 index 0128d62e7130fd1bad851bb91d3e7401a4f2232b..0000000000000000000000000000000000000000 --- a/deconvolve.py +++ /dev/null @@ -1,278 +0,0 @@ -#!/usr/bin/env python3 - -import sys -import math -import networkx as nx -import itertools - - - -def local_deconvolve(G,node, verbose=0): - neighbors = list(G.neighbors(node)) - nei_len = len(neighbors) - - # Extract neighbors from the graph - G_neighbors = nx.Graph(G.subgraph(neighbors)) - communities = get_communities(G_neighbors, max_overlap=0, verbose=verbose-1) - - # Continue only if something need to be splited. 
- if len(communities) <= 1: - if verbose > 0: - print("node",node,nei_len,"neighbors") - print("No split\n") - return - - # Split communities - for idx, community in enumerate(communities): - # Add community center - node_name = f"{node}.{idx}" - G.add_node(node_name) - - # Add links from the center to the community - for neighbor in community: - G.add_edge(node_name, neighbor) - - # Remove old node - G.remove_node(node) - if verbose > 0: - print("node",node,nei_len,"neighbors") - print("splitted into", len(communities), "parts\n") - - -def get_communities(G, max_overlap=1, strict=True, verbose=0): - # Half d-graphs are cliques. So compute max cliques - cliques = list(nx.find_cliques(G)) - - if verbose > 0: - print("clique list") - for clq in cliques: - print(clq) - print() - - candidate_d_graphs = [] - - # d-graphs are composed of 2 cliques related by the current node. - # The overlap between the 2 cliques determine the node order in the d-graph. - for idx, clq1 in enumerate(cliques): - to_compare = cliques[idx+1:] - - for clq2 in to_compare: - # Test if d-graphs only if there are less than max overlapping nodes - # between the two half - overlap = 0 - for val in clq1: - if val in clq2: - overlap += 1 - if overlap > max_overlap: - # print(overlap, "is too high overlap") - continue - - # Check for d-graph candidates - d_graph = compute_d_graph(clq1, clq2, G, verbose=verbose-1, max_diff_size=0) - if d_graph != None: - candidate_d_graphs.append(d_graph) - - # Extract communities from all the possible d-graphes in the neighborood. - # This is a minimal covering d_graph algorithm. - minimal_d_graphes, unpartitionned = filter_d_graphs(candidate_d_graphs, max_overlap=max_overlap) - - if strict and len(unpartitionned) > 0: - if verbose > 0: - print("Partialy unpartionned. 
Aborted") - return [list(G.nodes())] - - communities = [list(set(d_graph[0]+d_graph[1])) for d_graph in minimal_d_graphes] - - # complete unpartitionned nodes - to_add = [] - for idx, d_graph in enumerate(communities): - for node in d_graph: - neighbors = G.neighbors(node) - for nei in neighbors: - if nei in unpartitionned: - to_add.append(node) - break - unpartitionned.extend(list(set(to_add))) - - # If no community detected, return one big. - if len(unpartitionned) == len(list(G.nodes())): - return [list(G.nodes())] - # add unpartitionned if not empty - elif len(unpartitionned) > 0: - communities.append(unpartitionned) - - if verbose > 0: - for community in communities: - print(community) - - return communities - - -""" This function take two cliques in the graph G and try to find if they are 2 halfes - of a d-graph. - 1 - The algorithm compute overlap between nodes from first clique to the second and from the - second to the first. - 2 - Filter the clique pairs that have less than n*(n-1)/2 traversing edges (necessary critera - to be considered as d-graph). - 3 - Find the node order in the half d-graphs by sorting nodes by traversing order. - 4 - return a paair of half d-graph ordered from the central node (not present in G). - @param cliq1 (resp clq2) the clique that is candidate to be composed of half of the d-graph nodes. - @param G the graph of the neighbors of the central node (not present). - @return A pair of lists that are the 2 halves of the d-graph ordered from the center. -""" -def compute_d_graph(clq1, clq2, G, max_diff_size=1, verbose=0): - # Compute the arities between the cliques - arities1 = {name:0 for name in clq1} - arities2 = {name:0 for name in clq2} - sum_edges = 0 - - if len(clq1) != len(clq2): - # Limit the number of recursions - if abs(len(clq1)-len(clq2)) > max_diff_size: - return None - - # Recursion on the biggest clique to reduce complexity. 
- smallest, largest = (clq1, clq2) if len(clq2) > len(clq1) else (clq2, clq1) - minimal_weighted_d_graph = None - minimal_weight = math.inf - for idx in range(len(largest)): - recur_d_graph = compute_d_graph(smallest, largest[:idx]+largest[idx+1:], G, verbose=verbose) - if recur_d_graph != None and recur_d_graph[2] < minimal_weight: - minimal_weighted_d_graph = recur_d_graph - minimal_weight = recur_d_graph[2] - - if verbose > 0: - print(f"Recursive calls for:\n{clq1}\n{clq2}\n") - print(minimal_weighted_d_graph, "\n") - print("/ Recursive\n") - - return minimal_weighted_d_graph - - - min_clq_size = min(len(clq1), len(clq2)) - - # Compute link arities - for node1 in clq1: - neighbors = list(G.neighbors(node1)) - - for node2 in clq2: - if node1 == node2 or node2 in neighbors: - # print(node1, "-", node2) - arities1[node1] += 1 - arities2[node2] += 1 - sum_edges += 1 - - # if verbose: - # print(clq1, clq2) - # print(arities1, arities2, "\n") - - # Reject if not enought edges - if sum_edges < min_clq_size * (min_clq_size-1) / 2: - return None - - # if verbose: - # print(clq1, clq2) - # print(arities1, arities2, "\n") - - # order lists - lst1 = [key for key, value in sorted(arities1.items(), key=lambda tup: -tup[1])] - lst2 = [key for key, value in sorted(arities2.items(), key=lambda tup: -tup[1])] - - if verbose > 0: - print(min_clq_size) - print(lst1, "\n", lst2, "\n") - - # Return the 2 halves of the d-graph - return lst1, lst2, sum_edges - - -""" Filter the candiates regarding their compatibilities -""" -def filter_d_graphs(candidates, max_overlap=0): - # Count for each node the number of their apparition (regarding the half overlap) - selected = {} - counts_by_size = [{} for _ in range(max_overlap+1)] - sorted_d_graphs = [[] for _ in range(max_overlap+1)] - for d_graph in candidates: - # Compute intersection of the two halves - common_length = len(set(d_graph[0]) & set(d_graph[1])) - sorted_d_graphs[common_length].append(d_graph) - - # Count occurences - for node 
in itertools.chain(d_graph[0], d_graph[1]): - if not node in counts_by_size[common_length]: - counts_by_size[common_length][node] = 0 - counts_by_size[common_length][node] += 1 - selected[node] = False - - # take d_graphes with nodes that appears only once - filtered = [] - partitionned = False - for overlap_size in range(max_overlap+1): - # Look for d_graphs with overlapping halves first, then 1 node, ... - for d_graph in sorted_d_graphs[overlap_size]: - common_length = len(set(d_graph[0]) & set(d_graph[1])) - - for node in itertools.chain(d_graph[0], d_graph[1]): - # Count appearance - total_count = 0 - for length in range(overlap_size+1): - total_count += counts_by_size[common_length][node] if node in counts_by_size[common_length] else 0 - # Add d-graph - if total_count == 1: - # Add the d_graph to the selection - filtered.append(d_graph) - - # register selection of the nodes - for node in itertools.chain(d_graph[0], d_graph[1]): - selected[node] = True - - # Over for this d-graph - break - # Stop if all nodes are selected - over = True - for val in selected.values(): - if not val: - over = False - break - if over: - partitionned = True - break - - # If the partionning is not complete, try to subdivise the node anyway. 
- # Split the node in n d_graph partitions + 1 unpartitionned set of nodes - unpartitionned = [node for node in selected if not selected[node]] - if len(unpartitionned) == len(selected): - return [], unpartitionned - else: - return filtered, unpartitionned - - -import d_graph as dg - -def main(): - # Parsing the input file - filename = sys.argv[1] - G = None - if filename.endswith('.graphml'): - G = nx.read_graphml(filename) - elif filename.endswith('.gexf'): - G = nx.read_gexf(filename) - - dgraphs = dg.compute_all_max_d_graphs(G) - for node in list(G.nodes())[:10]: - print(node, dgraphs[node]) - print("...") - - # # Deconvolve - # g_nodes = list(G.nodes()) - # for node in g_nodes: - # local_deconvolve(G,node, verbose=2 if (node.startswith("0:")) else 0) - # # exit() - - # print(len(g_nodes), "->", len(list(G.nodes()))) - - # nx.write_graphml(G,sys.argv[1]+".deconvolved.graphml") - -if __name__ == "__main__": - main() diff --git a/setup.py b/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391