Commit 02f87907 authored by Yoann Dufresne's avatar Yoann Dufresne

index and clique modification

parent af08126f
......@@ -17,15 +17,16 @@ rule compress_data:
input:
barcode_graph=f"{WORKDIR}/{{barcode_file}}.gexf",
d2_raw=f"{WORKDIR}/{{barcode_file}}_d2_raw_maxclq.gexf",
simplified_d2=f"{WORKDIR}/{{barcode_file}}_d2_simplified_maxclq.gexf",
d2_raw_louvain=f"{WORKDIR}/{{barcode_file}}_d2_raw_louvain.gexf",
simplified_d2_louvain=f"{WORKDIR}/{{barcode_file}}_d2_simplified_louvain.gexf"
simplified_d2=f"{WORKDIR}/{{barcode_file}}_d2_simplified_maxclq.gexf"
# d2_raw_louvain=f"{WORKDIR}/{{barcode_file}}_d2_raw_louvain.gexf",
# simplified_d2_louvain=f"{WORKDIR}/{{barcode_file}}_d2_simplified_louvain.gexf"
output:
zip=f"{WORKDIR}/{{barcode_file}}.tar.gz"
shell:
f"cd {WORKDIR}/ ;"
"if [ ! -d {wildcards.barcode_file} ]; then mkdir {wildcards.barcode_file} ; fi;"
"mv *.gexf {wildcards.barcode_file}/ ;"
"mv {wildcards.barcode_file}_* {wildcards.barcode_file}/ ;"
"cp {wildcards.barcode_file}.gexf {wildcards.barcode_file}/ ;"
"tar czf {wildcards.barcode_file}.tar.gz {wildcards.barcode_file}/ ;"
"rm -rf {wildcards.barcode_file}/ ;"
"cd - ;"
......
......@@ -4,7 +4,10 @@ from bidict import bidict
import sys
from deconvolution.dgraph.FixedDGIndex import FixedDGIndex
from deconvolution.dgraph.d_graph import Dgraph, compute_all_max_d_graphs
from deconvolution.dgraph.VariableDGIndex import VariableDGIndex
from deconvolution.dgraph.d_graph import Dgraph
from deconvolution.dgraph.CliqueDGFactory import CliqueDGFactory
from deconvolution.dgraph.LouvainDGFactory import LouvainDGFactory
class D2Graph(nx.Graph):
......@@ -56,11 +59,16 @@ class D2Graph(nx.Graph):
return self.subgraph(list(self.nodes()))
def construct_from_barcodes(self, index_size=3, verbose=True, debug=False, clique_mode=None):
def construct_from_barcodes(self, index_size=3, verbose=True, debug=False, clique_mode=None, threads=1):
# Compute all the d-graphs
if verbose:
print("Computing the unit d-graphs..")
self.d_graphs_per_node = compute_all_max_d_graphs(self.barcode_graph, debug=debug, clique_mode=clique_mode)
dg_factory = None
if clique_mode == "louvain":
dg_factory = LouvainDGFactory(self.barcode_graph)
else:
dg_factory = CliqueDGFactory(self.barcode_graph)
self.d_graphs_per_node = dg_factory.generate_all_dgraphs(debug=True)
if verbose:
counts = sum(len(x) for x in self.d_graphs_per_node.values())
print(f"\t {counts} computed d-graphs")
......@@ -75,9 +83,17 @@ class D2Graph(nx.Graph):
# Index all the d-graphs
if verbose:
print("Compute the dmer dgraph")
self.index = FixedDGIndex(size=index_size)
for dg in self.all_d_graphs:
print("\tIndexing")
# self.index = FixedDGIndex(size=index_size)
self.index = VariableDGIndex(size=2)
for idx, dg in enumerate(self.all_d_graphs):
if verbose:
print(f"\r\t{idx+1}/{len(self.all_d_graphs)}", end='')
self.index.add_dgraph(dg)
# self.var_index.add_dgraph(dg)
if verbose:
print()
print("\tFilter index")
self.index.filter_by_entry()
# self.index = self.create_index_from_tuples(index_size, verbose=verbose)
# self.filter_dominated_in_index(tuple_size=index_size, verbose=verbose)
......@@ -201,55 +217,3 @@ class D2Graph(nx.Graph):
# Distance computing and adding in the dist dicts
d = dg1.distance_to(dg2)
data["distance"] = d
def filter_dominated_in_index(self, tuple_size=3, verbose=True):
to_remove = set()
if verbose:
print("\tFilter dominated in dgraph")
# Find dominated
for dmer_idx, item in enumerate(self.index.items()):
dmer, dg_list = item
if verbose:
sys.stdout.write(f"\r\t{dmer_idx+1}/{len(self.index)}")
sys.stdout.flush()
undominated = list_domination_filter(dg_list)
# Register dominated
if len(dg_list) != len(undominated):
for dg in dg_list:
if dg not in undominated:
to_remove.add(dg)
self.index[dmer] = undominated
if verbose:
print()
print("\tDmer removal")
removable_dmers = set()
for r_idx, r_dg in enumerate(to_remove):
if verbose:
sys.stdout.write(f"\r\t{r_idx+1}/{len(to_remove)}")
sys.stdout.flush()
self.all_d_graphs.remove(r_dg)
self.d_graphs_per_node[r_dg.center].remove(r_dg)
# Remove dominated in dgraph
for dmer in itertools.combinations(r_dg.to_sorted_list(), tuple_size):
if r_dg in self.index[dmer]:
self.index[dmer] = list(filter(lambda x: x!=r_dg, self.index[dmer]))
if len(self.index[dmer]) == 0:
removable_dmers.add(dmer)
# Remove empty dmers
for dmer in removable_dmers:
del self.index[dmer]
if verbose:
print()
import networkx as nx
from abc import abstractmethod
from deconvolution.dgraph.FixedDGIndex import FixedDGIndex
class AbstractDGFactory:
def __init__(self, graph):
self.graph = graph
def generate_all_dgraphs(self, debug=False):
index = FixedDGIndex(size=1)
nb_nodes = len(self.graph.nodes())
for idx, node in enumerate(self.graph.nodes()):
if debug: print(f"\r{idx+1}/{nb_nodes} node analysis", end='')
neighbors = list(self.graph.neighbors(node))
subgraph = nx.Graph(self.graph.subgraph(neighbors))
# Fill the index by node
key = frozenset({node})
for dg in self.generate_by_node(node, subgraph):
index.add_value(key, dg)
index.filter_by_entry()
return index
@abstractmethod
def generate_by_node(self, node, subgraph):
pass
# def compute_all_max_d_graphs(graph, debug=False, clique_mode=None, threads=1):
# d_graphs = FixedDGIndex(size=1)
#
# nds = list(graph.nodes())
# for idx, node in enumerate(nds):
# # print(idx+1, '/', len(nds))
# #if "MI" not in str(node): continue # for debugging; only look at deconvolved nodes
# #print(f"\r{idx+1}/{len(graph.nodes())}")
# neighbors = list(graph.neighbors(node))
# neighbors_graph = nx.Graph(graph.subgraph(neighbors))
#
# node_d_graphs = set()
#
# mode_str = " "
# if clique_mode is None:
# # Find all the cliques (equivalent to compute all the candidate half d-graph)
# cliques = []
# for clique in nx.find_cliques(neighbors_graph):
# if len(clique) > 3:
# cliques.append(clique)
# mode_str += "(max-cliques)"
# elif clique_mode == "louvain":
# louvain = community.best_partition(neighbors_graph) # louvain
# # high resolution seems to work better
# communities = [[c for c,i in louvain.items() if i == clique_id] for clique_id in set(louvain.values())]
# mode_str += "(louvain)"
# cliques = []
# for comm in communities:
# # further decompose! into necessarily 2 communities
# community_as_graph = nx.Graph(graph.subgraph(comm))
# if len(community_as_graph.nodes()) <= 2:
# cliques += [community_as_graph.nodes()]
# else:
# cliques += map(list,nx.community.asyn_fluidc(community_as_graph,2))
#
# elif clique_mode == "testing":
# # k-clique communities
# #from networkx.algorithms.community import k_clique_communities
# #cliques = k_clique_communities(neighbors_graph, 3) # related to the d-graph d parameter
# from cdlib import algorithms
# cliques_dict = algorithms.node_perception(neighbors_graph, threshold=0.75, overlap_threshold=0.75) #typical output: Sizes of found cliques (testing): Counter({6: 4, 5: 3, 4: 2, 2: 1})
# #cliques_dict = algorithms.gdmp2(neighbors_graph, min_threshold=0.9) #typical output: sizes of found cliques (testing): Counter({3: 2, 5: 1})
# #cliques_dict = algorithms.angel(neighbors_graph, threshold=0.90) # very sensitive parameters: 0.84 and 0.88 don't work at all but 0.86 does sort of
# from collections import defaultdict
# cliques_dict2 = defaultdict(list)
# for (node, values) in cliques_dict.to_node_community_map().items():
# for value in values:
# cliques_dict2[value] += [node]
# cliques = list(cliques_dict2.values())
# mode_str += "(testing)"
#
# if debug: print("node", node, "has", len(cliques), "cliques in neighborhood (of size", len(neighbors), ")")
#
# cliques_debugging = True
# if cliques_debugging:
#
# from collections import Counter
# len_cliques = Counter(map(len,cliques))
#
# # Pair halves to create d-graphes
# for idx, clq1 in enumerate(cliques):
# for clq2_idx in range(idx+1, len(cliques)):
# clq2 = cliques[clq2_idx]
#
# # Check for d-graph candidates
# d_graph = Dgraph(node)
# d_graph.put_halves(clq1, clq2, neighbors_graph)
#
# factor = 0.5
# #if clique_mode == "testing": factor = 1 # still allows louvain's big communities
# #print("link div:",d_graph.get_link_divergence(),"opt:",d_graph.get_optimal_score(), "good d graph?",d_graph.get_link_divergence() <= d_graph.get_optimal_score() *factor)
# if d_graph.get_link_divergence() <= d_graph.get_optimal_score() * factor:
# node_d_graphs.add(d_graph)
#
# # Fill the index by node
# key = frozenset({node})
# for dg in node_d_graphs:
# d_graphs.add_value(key, dg)
#
# d_graphs.filter_by_entry()
# return d_graphs
import networkx as nx
from deconvolution.dgraph.AbstractDGFactory import AbstractDGFactory
from deconvolution.dgraph.d_graph import Dgraph
class CliqueDGFactory(AbstractDGFactory):
def __init__(self, graph, min_size_clique=4, dg_max_divergence_factor=0.5):
super(CliqueDGFactory, self).__init__(graph)
self.min_size = min_size_clique
self.dg_max_divergence_factor = dg_max_divergence_factor
def generate_by_node(self, node, subgraph):
node_d_graphs = set()
# Clique computation
cliques = []
for clique in nx.find_cliques(subgraph):
if len(clique) >= self.min_size:
cliques.append(clique)
# TODO: Index cliques to pair them faster
# d-graph computation
for idx, clq1 in enumerate(cliques):
for clq2_idx in range(idx+1, len(cliques)):
clq2 = cliques[clq2_idx]
# Check for d-graph candidates
d_graph = Dgraph(node)
d_graph.put_halves(clq1, clq2, subgraph)
if d_graph.get_link_divergence() <= d_graph.get_optimal_score() * self.dg_max_divergence_factor:
node_d_graphs.add(d_graph)
return node_d_graphs
import networkx as nx
from deconvolution.dgraph.AbstractDGFactory import AbstractDGFactory
from deconvolution.dgraph.d_graph import Dgraph
import community
class LouvainDGFactory(AbstractDGFactory):
def __init__(self, graph, dg_max_divergence_factor=0.5):
super(LouvainDGFactory, self).__init__(graph)
self.dg_max_divergence_factor = dg_max_divergence_factor
def generate_by_node(self, node, subgraph):
node_d_graphs = set()
louvain = community.best_partition(subgraph) # louvain
# high resolution seems to work better
communities = [[c for c,i in louvain.items() if i == clique_id] for clique_id in set(louvain.values())]
cliques = []
for comm in communities:
# further decompose! into necessarily 2 communities
community_as_graph = nx.Graph(subgraph.subgraph(comm))
if len(community_as_graph.nodes()) <= 2:
cliques += [community_as_graph.nodes()]
else:
cliques += map(list,nx.community.asyn_fluidc(community_as_graph,2))
# d-graph computation
for idx, clq1 in enumerate(cliques):
for clq2_idx in range(idx+1, len(cliques)):
clq2 = cliques[clq2_idx]
# Check for d-graph candidates
d_graph = Dgraph(node)
d_graph.put_halves(clq1, clq2, subgraph)
if d_graph.get_link_divergence() <= d_graph.get_optimal_score() * self.dg_max_divergence_factor:
node_d_graphs.add(d_graph)
return node_d_graphs
This diff is collapsed.
import argparse
import networkx as nx
from itertools import combinations
from multiprocessing import Pool
def parse_arguments():
parser = argparse.ArgumentParser(description='Transform a 10X barcode graph into a d2 graph. The program dig for the d-graphs and then merge them into a d2-graph.')
parser.add_argument('barcode_graph', help='The barcode graph file. Must be a gefx formated file.')
parser.add_argument('--threads', '-t', default=8, type=int, help='Number of thread to use for dgraph computation')
args = parser.parse_args()
return args
def main():
args = parse_arguments()
print(args.barcode_graph)
graph = nx.read_gexf(args.barcode_graph)
for node in graph.nodes():
neighbors = list(graph.neighbors(node))
subgraph = nx.Graph(graph.subgraph(neighbors))
index_cliques(subgraph, node)
def index_cliques(graph, node):
clique_index = {}
nb_cliques = 0
nb_unfiltered_cliques = 0
for clique in nx.find_cliques(graph):
nb_cliques += 1
if len(clique) > 3:
# print(clique)
nb_unfiltered_cliques += 1
print(nb_unfiltered_cliques, '/', nb_cliques)
if __name__ == "__main__":
main()
......@@ -11,6 +11,7 @@ def parse_arguments():
parser = argparse.ArgumentParser(description='Transform a 10X barcode graph into a d2 graph. The program dig for the d-graphs and then merge them into a d2-graph.')
parser.add_argument('barcode_graph', help='The barcode graph file. Must be a gefx formated file.')
parser.add_argument('--output_prefix', '-o', default="d2_graph", help="Output file prefix.")
parser.add_argument('--threads', '-t', default=8, type=int, help='Number of thread to use for dgraph computation')
parser.add_argument('--debug', '-d', action='store_true', help="Debug")
parser.add_argument('--maxclq', '-c', action='store_true', help="Enable max clique community detection (default behaviour)")
parser.add_argument('--louvain', '-l', action='store_true', help="Enable Louvain community detection instead of all max-cliques")
......@@ -54,7 +55,7 @@ def main():
dprint("D2 graph object created")
dprint("constructing d2 graph from barcode graph")
index_size = 8 #if clique_mode is None else 3
d2g.construct_from_barcodes(index_size=index_size, debug=debug, clique_mode=clique_mode)
d2g.construct_from_barcodes(index_size=index_size, debug=debug, clique_mode=clique_mode, threads=args.threads)
dprint("[debug] d2 graph constructed")
d2g.save(f"{args.output_prefix}.tsv")
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment