diff --git a/Snakefile_data_simu b/Snakefile_data_simu index 62e24a0e9b32c4f219dffa26380787a1e6e446c9..2c720599c810ca13cfec0f7e7ff547d953df24c8 100644 --- a/Snakefile_data_simu +++ b/Snakefile_data_simu @@ -1,6 +1,6 @@ OUTDIR="snake_exec" if "outdir" not in config else config["outdir"] -N=[1000] if "n" not in config else config["n"] # Number of molecule to simulate +N=[10000] if "n" not in config else config["n"] # Number of molecule to simulate D=[5] if "d" not in config else config["d"] # Average coverage of each molecule M=[2] if "m" not in config else config["m"] # Average number of molecule per barcode M_DEV=[0] if "m_dev" not in config else config["m_dev"] # Std deviation for merging number diff --git a/deconvolution/d_graph.py b/deconvolution/d_graph.py new file mode 100644 index 0000000000000000000000000000000000000000..c5eebf5efe02224b831d92185605d996a81b1493 --- /dev/null +++ b/deconvolution/d_graph.py @@ -0,0 +1,404 @@ +import networkx as nx +from functools import total_ordering +import community # pip install python-louvain + + +@total_ordering +class Dgraph(object): + """docstring for Dgraph""" + def __init__(self, center): + super(Dgraph, self).__init__() + self.idx = -1 + self.center = center + self.score = 0 + self.halves = [None,None] + self.connexity = [None,None] + self.nodes = [self.center] + self.node_set = set(self.nodes) + self.edges = [] + self.ordered_list = None + self.sorted_list = None + + self.marked = False + + + """ Static method to load a dgraph from a text + @param text the saved d-graph + @param barcode_graph Barcode graph from which the d-graph is extracted + @return a new d-graph object corresponding to the test + """ + def load(text, barcode_graph): + # basic split + text = text.replace(']', '') + head, h1, h2 = text.split('[') + + # Head parsing + center = head.replace(' ', '') + if center not in barcode_graph: + center = int(center) + dg = Dgraph(center) + + # Reload halves + h1 = [x.split(' ')[-2] for x in h1.split(',')] + h1 = [int(x) if x not in barcode_graph else x for x in h1] + h2 = [x.split(' ')[-2] for x in h2.split(',')] + h2 = [int(x) if x not in barcode_graph else x for x in h2] + dg.put_halves(h1, h2, barcode_graph) + + return dg + + + """ Compute the d-graph quality (score) according to the connectivity between the two halves. + @param h1 First half of the d-graph + @param h2 Second half of the d-graph + @param graph The connectivity graph + """ + def put_halves(self, h1, h2, graph): + self.score = 0 + self.halves[0] = h1 + for node in h1: + self.node_set.add(node) + self.nodes.append(node) + self.halves[1] = h2 + for node in h2: + self.node_set.add(node) + self.nodes.append(node) + + # self.nodes = sorted([self.center] + self.halves[0] + self.halves[1]) + self.connexity[0] = {key: 0 for key in self.halves[0]} + self.connexity[1] = {key: 0 for key in self.halves[1]} + self.edges = [] + + # Compute link arities + for node1 in self.halves[0]: + neighbors = set(graph.neighbors(node1)) + + for node2 in self.halves[1]: + if node1 == node2 or node2 in neighbors: + self.score += 1 + self.connexity[0][node1] += 1 + self.connexity[1][node2] += 1 + + # Compute links from the center to the other nodes + for idx, node1 in enumerate(self.nodes): + for node2 in self.nodes[idx+1:]: + if graph.has_edge(node1, node2): + if node1 < node2: + self.edges.append((node1, node2)) + else: + self.edges.append((node2, node1)) + + # Sort the halves by descending connexity + connex = self.connexity + self.halves[0].sort(reverse=True, key=lambda v: connex[0][v]) + self.halves[1].sort(reverse=True, key=lambda v: connex[1][v]) + + + def get_link_divergence(self): + return int(abs(self.score - self.get_optimal_score())) + + + def get_optimal_score(self): + max_len = max(len(self.halves[0]), len(self.halves[1])) + return int(max_len * (max_len - 1) / 2) + + + def to_sorted_list(self): + if self.sorted_list is None: + self.sorted_list = sorted(self.nodes) + return self.sorted_list + + + def to_ordered_lists(self): + if self.ordered_list is None: + hands = [[],[]] + for idx in range(2): + prev_connectivity = -1 + for node in self.halves[idx]: + # group nodes by similar connectivity + value = self.connexity[idx][node] + if value != prev_connectivity: + hands[idx].append([]) + prev_connectivity = value + hands[idx][-1].append(node) + + self.ordered_list = hands[0][::-1] + [[self.center]] + hands[1] + return self.ordered_list + + + def to_node_set(self): + return frozenset(self.to_sorted_list()) + + + def distance_to(self, dgraph): + nodes_1 = self.to_sorted_list() + nodes_2 = other_nodes = dgraph.to_sorted_list() + + dist = 0 + idx1, idx2 = 0, 0 + while idx1 != len(nodes_1) and idx2 != len(nodes_2): + if nodes_1[idx1] == nodes_2[idx2]: + idx1 += 1 + idx2 += 1 + else: + dist += 1 + if nodes_1[idx1] < nodes_2[idx2]: + idx1 += 1 + else: + idx2 += 1 + dist += len(nodes_1) - idx1 + len(nodes_2) - idx2 + + return dist + + + """ Verify if dg1 is dominated by dg2. The domination is determined by two points: All the nodes + of dg1 are part of dg2 and the divergeance of dg1 is greater than dg2. + @param dg1 (resp dg2) A d_graph object. + @return True if dg1 is dominated by dg2. + """ + def is_dominated(self, dg): + dg1_nodes = self.to_node_set() + dg2_nodes = dg.to_node_set() + + # domination first condition: inclusion of all the nodes + if not dg1_nodes.issubset(dg2_nodes): + return False + + # domination second condition + if len(dg1_nodes) == len(dg2_nodes): + if self.get_link_divergence() > dg.get_link_divergence(): + return True + elif self.get_link_divergence() >= dg.get_link_divergence(): + return True + + return False + + + def __eq__(self, other): + if other is None: + return False + + if self.idx != -1 and self.idx == other.idx: + return True + + if self.node_set != other.node_set: + return False + + return self.to_ordered_lists() == other.to_ordered_lists() + + def __ne__(self, other): + return not (self == other) + + def __lt__(self, other): + my_tuple = (self.get_link_divergence(), self.get_optimal_score()) + other_tuple = (other.get_link_divergence(), other.get_optimal_score()) + return my_tuple < other_tuple + + + def __hash__(self): + nodelist = self.to_sorted_list() + nodelist = [str(x) for x in nodelist] + return ",".join(nodelist).__hash__() + + + def __ordered_hash__(self): + lst = self.to_ordered_lists() + + fwd_uniq_lst = [sorted(l) for l in lst] + fwd_str = ",".join([f"[{'-'.join(l)}]" for l in fwd_uniq_lst]) + fwd_hash = fwd_str.__hash__() + + rev_uniq_lst = [sorted(l) for l in lst[::-1]] + rev_str = ",".join([f"[{'-'.join(l)}]" for l in rev_uniq_lst]) + rev_hash = rev_str.__hash__() + + return int(min(fwd_hash, rev_hash)) + + + def __repr__(self): + # print(self.halves) + representation = str(self.center) + " " + representation += "[" + ", ".join([f"{node} {self.connexity[0][node]}" for node in self.halves[0]]) + "]" + representation += "[" + ", ".join([f"{node} {self.connexity[1][node]}" for node in self.halves[1]]) + "]" + return representation + + def _to_str_nodes(self): + str_nodes = [str(x) for x in self.nodes] + str_nodes.sort() + return str(str_nodes) + + +""" From a barcode graph, compute all the possible max d-graphs by node. + @param graph A barcode graph + @return A dictionary associating each node to its list of all possible d-graphs. The d-graphs are sorted by decreasing ratio. +""" +def compute_all_max_d_graphs(graph, debug=False, clique_mode=None): + d_graphs = {} + + for idx, node in enumerate(list(graph.nodes())): + #if "MI" not in str(node): continue # for debugging; only look at deconvolved nodes + #print(f"\r{idx+1}/{len(graph.nodes())}") + neighbors = list(graph.neighbors(node)) + neighbors_graph = nx.Graph(graph.subgraph(neighbors)) + + node_d_graphs = set() + + mode_str = " " + if clique_mode is None: + # Find all the cliques (equivalent to compute all the candidate half d-graph) + cliques = list(nx.find_cliques(neighbors_graph)) + mode_str += "(max-cliques)" + elif clique_mode == "louvain": + louvain = community.best_partition(neighbors_graph) # louvain + # high resolution seems to work better + communities = [[c for c,i in louvain.items() if i == clique_id] for clique_id in set(louvain.values())] + mode_str += "(louvain)" + cliques = [] + for comm in communities: + # further decompose! into necessarily 2 communities + community_as_graph = nx.Graph(graph.subgraph(comm)) + if len(community_as_graph.nodes()) <= 2: + cliques += [list(community_as_graph.nodes())] + else: + cliques += map(list,nx.community.asyn_fluidc(community_as_graph,2)) + + elif clique_mode == "testing": + # k-clique communities + #from networkx.algorithms.community import k_clique_communities + #cliques = k_clique_communities(neighbors_graph, 3) # related to the d-graph d parameter + from cdlib import algorithms + cliques_dict = algorithms.node_perception(neighbors_graph, threshold=0.75, overlap_threshold=0.75) #typical output: Sizes of found cliques (testing): Counter({6: 4, 5: 3, 4: 2, 2: 1}) + #cliques_dict = algorithms.gdmp2(neighbors_graph, min_threshold=0.9) #typical output: sizes of found cliques (testing): Counter({3: 2, 5: 1}) + #cliques_dict = algorithms.angel(neighbors_graph, threshold=0.90) # very sensitive parameters: 0.84 and 0.88 don't work at all but 0.86 does sort of + from collections import defaultdict + cliques_dict2 = defaultdict(list) + for (node, values) in cliques_dict.to_node_community_map().items(): + for value in values: + cliques_dict2[value] += [node] + cliques = list(cliques_dict2.values()) + mode_str += "(testing)" + + #print("cliques", len(cliques)) + if debug: print("node",node,"has",len(cliques),"cliques in neighborhood (of size",len(neighbors),")") + + cliques_debugging = True + if cliques_debugging: + #cliques_graph = nx.make_max_clique_graph(neighbors_graph) + #if debug: print("node",node,"clique graph has",len(cliques_graph.nodes()),"nodes",len(cliques_graph.edges()),"edges") + #nx.write_gexf(cliques_graph, str(node) +".gexf") + + from collections import Counter + len_cliques = Counter(map(len,cliques)) + #print("sizes of found cliques%s:" % mode_str, len_cliques) + + # Pair halves to create d-graphes + for idx, clq1 in enumerate(cliques): + for clq2_idx in range(idx+1, len(cliques)): + clq2 = cliques[clq2_idx] + + # Check for d-graph candidates + d_graph = Dgraph(node) + d_graph.put_halves(clq1, clq2, neighbors_graph) + + factor = 0.5 + #if clique_mode == "testing": factor = 1 # still allows louvain's big communities + #print("link div:",d_graph.get_link_divergence(),"opt:",d_graph.get_optimal_score(), "good d graph?",d_graph.get_link_divergence() <= d_graph.get_optimal_score() *factor) + if d_graph.get_link_divergence() <= d_graph.get_optimal_score() * factor: + node_d_graphs.add(d_graph) + + #print("d-graphs", len(node_d_graphs)) + d_graphs[node] = brutal_list_domination_filter(sorted(node_d_graphs)) + #print("filtered", len(d_graphs[node])) + + return d_graphs + + +""" Add the new dg in the dgs list. If dg is dominated by another dg in the list, then it's + dropped. If any dg in the list is dominated by the dg to add, then, the new dg is added and + all the dominated dg are removed from the list. + @param dg A new dg to add/filter. + @param undominated_dgs_list A list of dg where any of them is dominated by another one. + @return The updated undominated list. +""" +def add_new_dg_regarding_domination(dg, undominated_dgs_list): + to_remove = [] + dominated = False + + # Search for domination relations + for u_dg in undominated_dgs_list: + if not dominated and dg.is_dominated(u_dg): + dominated = True + if u_dg.is_dominated(dg): + to_remove.append(u_dg) + + # Remove dominated values + size = len(undominated_dgs_list) + for dg2 in to_remove: + undominated_dgs_list.remove(dg2) + + # Add the new dg + if not dominated: + undominated_dgs_list.append(dg) + + return undominated_dgs_list + + + +def filter_dominated(d_graphs, overall=False, in_place=True): + if not overall: + return local_domination_filter(d_graphs, in_place) + + all_d_graphs = [] + for dgs in d_graphs.values(): + all_d_graphs.extend(dgs) + + all_d_graphs = list_domination_filter(all_d_graphs) + + return d_graphs + + +""" Filter the d-graphs by node. In a list of d-graph centered on a node n, if a d-graph is + completely included in another and have a highest distance score to the optimal, then it is + filtered out. + @param d_graphs All the d-graphs to filter, sorted by central node. + @param in_place If true, modify the content of d_graph with the filtered version. If False, + copy all the content in a new dictionary. + @return The filtered dictionary of d-graph per node. +""" +def local_domination_filter(d_graphs, in_place=True): + filtered = d_graphs if in_place else {} + + # Filter node by node + for node, d_graph_list in d_graphs.items(): + lst = str(sorted([str(x) for x in d_graph_list])) + filtered[node] = brutal_list_domination_filter(d_graph_list, node_name=str(node)) + + return filtered + + +""" Filter the input d-graphs list. In the list of d-graph centered on a node n, if a d-graph is + completely included in another and have a highest distance score to the optimal, then it is + filtered out. + @param d_graphs All the d-graphs to filter. + @return The filtered dictionary of d-graph per node. +""" +def list_domination_filter(d_graphs): + filtered = [] + + # Filter d-graph by d-graph + for dg in d_graphs: + add_new_dg_regarding_domination(dg, filtered) + + return set(filtered) + +def brutal_list_domination_filter(d_graphs, node_name=""): + undominated = set(d_graphs) + to_remove = set() + + for dg1 in d_graphs: + for dg2 in d_graphs: + if dg1.is_dominated(dg2): + to_remove.add(dg1) + break + + return undominated - to_remove diff --git a/deconvolution/main/generate_genome_molecule_graph.py b/deconvolution/main/generate_genome_molecule_graph.py new file mode 100755 index 0000000000000000000000000000000000000000..fa6974321bd25269707e59face5207ba15b8a2f6 --- /dev/null +++ b/deconvolution/main/generate_genome_molecule_graph.py @@ -0,0 +1,85 @@ +#!/usr/bin/env python3 + +import argparse + +import numpy as np +import networkx as nx +from intervaltree import Interval, IntervalTree + +def parse_arguments(): + parser = argparse.ArgumentParser(description='Generate a fake 10X molecule graph.') + parser.add_argument('--num_molecule', '-n', type=int, required=True, help='The number of molecule in the final graph') + parser.add_argument('--genome_size', '-g', type=int, required=True, help='genome size') + parser.add_argument('--avg_mol_len', '-l', type=int, required=True, help='avg molecule size') + parser.add_argument('--min_mol_ovl', '-b', type=int, default=0, help='minimum overlap length between molecules') + parser.add_argument('--rnd_seed', '-s', type=int, default=-1, help='Random seed. Used for reproducibility purpose. Please do not use it in production.') + parser.add_argument('--output', '-o', help="Output filename") + + args = parser.parse_args() + + return args + +def compute_ovl_length(s1,e1,s2,e2): + if s1 > s2: + s1,e1,s2,e2 = s2,e2,s1,e1 + return e1-s2 + +def is_contained(itree,start,end): + for itv in itree.overlap(start,end): + start2, end2 = itv.begin, itv.end + if start2 <= start and end2 >= end: + return True + return False + + +def generate_graph(nb_molecules, genome_size, avg_mol_size, min_mol_ovl, rnd_seed=None): + # Reproducibility + if rnd_seed != -1: + np.random.seed(rnd_seed) + G = nx.Graph() + + # generate molecules according to similar principle as LRSM + # but in addition make sure they're not contained in each other + molecules = dict() #Â mol index: (start,end) + itree = IntervalTree() + for idx in range(nb_molecules): + while True: #Â to avoid contained molecules + G.add_node(idx) + #pick a starting position (follows LRSIM) + start = np.random.randint(0,genome_size) + #pick a fragment size (follows LRSIM) + molecule_size = np.random.poisson(avg_mol_size) + end = start+molecule_size + if is_contained(itree,start,end): continue + print("molecule",idx,"start",start,"end",end) + molecules[idx] = (start, end) + itree.addi(start,end,idx) + break + + #Â create graph edges corresponding to molecules overlaps + for idx in molecules: + start,end = molecules[idx] + for itv in itree.overlap(start,end): + other_idx = itv.data + if idx==other_idx: continue + start2, end2 = itv.begin, itv.end + ovl_length = compute_ovl_length(start,end,start2,end2) + #print("overlap length",ovl_length) + if ovl_length < min_mol_ovl: continue + G.add_edge(idx, other_idx) + return G + +def save_graph(G, outfile): + import networkx as nx + nx.write_gexf(G, outfile) + + +if __name__ == "__main__": + args = parse_arguments() + G = generate_graph(args.num_molecule, args.genome_size, args.avg_mol_len, args.min_mol_ovl, args.rnd_seed) + + outfile = f"simulated_molecules_{args.num_molecule}_{args.genome_size}_{args.avg_mol_len}_{args.min_mol_ovl}.gexf" + if args.output: + outfile = args.output + save_graph(G, outfile) + diff --git a/requirements.txt b/requirements.txt index 68fb1824fc5797e6ba02c2d1301838887d894480..ab00fa7dbdd212dcf6d57bdee5e3240153af10e7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,3 +5,4 @@ pytest scipy numpy python-louvain +intervaltree diff --git a/test_louvain.sh b/test_louvain.sh index 813d1cc455734e05a19260f44713297bee12dc32..ee79f24d42bd5ee425f1ec7d02a36284c825d424 100755 --- a/test_louvain.sh +++ b/test_louvain.sh @@ -1,12 +1,14 @@ -rm -Rf snake_tests/simu_bar_n1000_d5_m2* +prefix=simu_bar_n1000_d6_m2 + +rm -Rf snake_tests/$prefix* snakemake -s Snakefile_data_simu snakemake -s Snakefile_d2 -mv snake_exec/simu_bar_n1000_d5_m2.tar.gz snake_tests/simu_bar_n1000_d5_m2.tar.gz +mv snake_exec/$prefix.tar.gz snake_tests/$prefix.tar.gz cd snake_tests -tar xf simu_bar_n1000_d5_m2.tar.gz +tar xf $prefix.tar.gz cd .. do_max_cliques=1 @@ -15,11 +17,11 @@ then echo "max-cliques" - python deconvolution/evaluate.py --type d2 --barcode_graph snake_tests/simu_bar_n1000_d5_m2/simu_bar_n1000_d5_m2.gexf snake_tests/simu_bar_n1000_d5_m2/simu_bar_n1000_d5_m2_d2_simplified.gexf + python deconvolution/evaluate.py --type d2 --barcode_graph snake_tests/"$prefix"/"$prefix".gexf snake_tests/"$prefix"/"$prefix"_d2_simplified.gexf - python deconvolution/d2_to_path.py snake_tests/simu_bar_n1000_d5_m2/simu_bar_n1000_d5_m2.gexf snake_tests/simu_bar_n1000_d5_m2/simu_bar_n1000_d5_m2_d2_simplified.gexf + python deconvolution/d2_to_path.py snake_tests/"$prefix"/"$prefix".gexf snake_tests/"$prefix"/"$prefix"_d2_simplified.gexf - python deconvolution/evaluate.py --type path --barcode_graph snake_tests/simu_bar_n1000_d5_m2/simu_bar_n1000_d5_m2.gexf _path.gexf + python deconvolution/evaluate.py --type path --barcode_graph snake_tests/"$prefix"/"$prefix".gexf _path.gexf fi @@ -29,11 +31,11 @@ then echo "louvain" - python deconvolution/evaluate.py --type d2 --barcode_graph snake_tests/simu_bar_n1000_d5_m2/simu_bar_n1000_d5_m2.gexf snake_tests/simu_bar_n1000_d5_m2/simu_bar_n1000_d5_m2_d2_simplified_louvain.gexf + python deconvolution/evaluate.py --type d2 --barcode_graph snake_tests/"$prefix"/"$prefix".gexf snake_tests/"$prefix"/"$prefix"_d2_simplified_louvain.gexf - python deconvolution/d2_to_path.py snake_tests/simu_bar_n1000_d5_m2/simu_bar_n1000_d5_m2.gexf snake_tests/simu_bar_n1000_d5_m2/simu_bar_n1000_d5_m2_d2_simplified_louvain.gexf + python deconvolution/d2_to_path.py snake_tests/"$prefix"/"$prefix".gexf snake_tests/"$prefix"/"$prefix"_d2_simplified_louvain.gexf - python deconvolution/evaluate.py --type path --barcode_graph snake_tests/simu_bar_n1000_d5_m2/simu_bar_n1000_d5_m2.gexf _path.gexf + python deconvolution/evaluate.py --type path --barcode_graph snake_tests/"$prefix"/"$prefix".gexf _path.gexf fi do_comtest=0 @@ -43,10 +45,10 @@ then echo "******************************* Testing algorithm" - python deconvolution/evaluate.py --type d2 --barcode_graph snake_tests/simu_bar_n1000_d5_m2/simu_bar_n1000_d5_m2.gexf snake_tests/simu_bar_n1000_d5_m2/simu_bar_n1000_d5_m2_d2_simplified_comtest.gexf + python deconvolution/evaluate.py --type d2 --barcode_graph snake_tests/"$prefix"/"$prefix".gexf snake_tests/"$prefix"/"$prefix"_d2_simplified_comtest.gexf - python deconvolution/d2_to_path.py snake_tests/simu_bar_n1000_d5_m2/simu_bar_n1000_d5_m2.gexf snake_tests/simu_bar_n1000_d5_m2/simu_bar_n1000_d5_m2_d2_simplified_comtest.gexf + python deconvolution/d2_to_path.py snake_tests/"$prefix"/"$prefix".gexf snake_tests/"$prefix"/"$prefix"_d2_simplified_comtest.gexf - python deconvolution/evaluate.py --type path --barcode_graph snake_tests/simu_bar_n1000_d5_m2/simu_bar_n1000_d5_m2.gexf _path.gexf + python deconvolution/evaluate.py --type path --barcode_graph snake_tests/"$prefix"/"$prefix".gexf _path.gexf fi