Commit 448f02f1 authored by Yoann Dufresne's avatar Yoann Dufresne
Browse files

merged Snakemake d2

parents c21b7e83 b79c63d1
OUTDIR="snake_exec" if "outdir" not in config else config["outdir"]
N=[1000] if "n" not in config else config["n"] # Number of molecule to simulate
N=[10000] if "n" not in config else config["n"] # Number of molecule to simulate
D=[5] if "d" not in config else config["d"] # Average coverage of each molecule
M=[2] if "m" not in config else config["m"] # Average number of molecule per barcode
M_DEV=[0] if "m_dev" not in config else config["m_dev"] # Std deviation for merging number
......
import networkx as nx
from functools import total_ordering
import community # pip install python-louvain
@total_ordering
class Dgraph(object):
"""docstring for Dgraph"""
def __init__(self, center):
super(Dgraph, self).__init__()
self.idx = -1
self.center = center
self.score = 0
self.halves = [None,None]
self.connexity = [None,None]
self.nodes = [self.center]
self.node_set = set(self.nodes)
self.edges = []
self.ordered_list = None
self.sorted_list = None
self.marked = False
""" Static method to load a dgraph from a text
@param text the saved d-graph
@param barcode_graph Barcode graph from which the d-graph is extracted
@return a new d-graph object corresponding to the test
"""
def load(text, barcode_graph):
# basic split
text = text.replace(']', '')
head, h1, h2 = text.split('[')
# Head parsing
center = head.replace(' ', '')
if center not in barcode_graph:
center = int(center)
dg = Dgraph(center)
# Reload halves
h1 = [x.split(' ')[-2] for x in h1.split(',')]
h1 = [int(x) if x not in barcode_graph else x for x in h1]
h2 = [x.split(' ')[-2] for x in h2.split(',')]
h2 = [int(x) if x not in barcode_graph else x for x in h2]
dg.put_halves(h1, h2, barcode_graph)
return dg
""" Compute the d-graph quality (score) according to the connectivity between the two halves.
@param h1 First half of the d-graph
@param h2 Second half of the d-graph
@param graph The connectivity graph
"""
def put_halves(self, h1, h2, graph):
self.score = 0
self.halves[0] = h1
for node in h1:
self.node_set.add(node)
self.nodes.append(node)
self.halves[1] = h2
for node in h2:
self.node_set.add(node)
self.nodes.append(node)
# self.nodes = sorted([self.center] + self.halves[0] + self.halves[1])
self.connexity[0] = {key: 0 for key in self.halves[0]}
self.connexity[1] = {key: 0 for key in self.halves[1]}
self.edges = []
# Compute link arities
for node1 in self.halves[0]:
neighbors = set(graph.neighbors(node1))
for node2 in self.halves[1]:
if node1 == node2 or node2 in neighbors:
self.score += 1
self.connexity[0][node1] += 1
self.connexity[1][node2] += 1
# Compute links from the center to the other nodes
for idx, node1 in enumerate(self.nodes):
for node2 in self.nodes[idx+1:]:
if graph.has_edge(node1, node2):
if node1 < node2:
self.edges.append((node1, node2))
else:
self.edges.append((node2, node1))
# Sort the halves by descending connexity
connex = self.connexity
self.halves[0].sort(reverse=True, key=lambda v: connex[0][v])
self.halves[1].sort(reverse=True, key=lambda v: connex[1][v])
def get_link_divergence(self):
return int(abs(self.score - self.get_optimal_score()))
def get_optimal_score(self):
max_len = max(len(self.halves[0]), len(self.halves[1]))
return int(max_len * (max_len - 1) / 2)
def to_sorted_list(self):
if self.sorted_list is None:
self.sorted_list = sorted(self.nodes)
return self.sorted_list
def to_ordered_lists(self):
if self.ordered_list is None:
hands = [[],[]]
for idx in range(2):
prev_connectivity = -1
for node in self.halves[idx]:
# group nodes by similar connectivity
value = self.connexity[idx][node]
if value != prev_connectivity:
hands[idx].append([])
prev_connectivity = value
hands[idx][-1].append(node)
self.ordered_list = hands[0][::-1] + [[self.center]] + hands[1]
return self.ordered_list
def to_node_set(self):
return frozenset(self.to_sorted_list())
def distance_to(self, dgraph):
nodes_1 = self.to_sorted_list()
nodes_2 = other_nodes = dgraph.to_sorted_list()
dist = 0
idx1, idx2 = 0, 0
while idx1 != len(nodes_1) and idx2 != len(nodes_2):
if nodes_1[idx1] == nodes_2[idx2]:
idx1 += 1
idx2 += 1
else:
dist += 1
if nodes_1[idx1] < nodes_2[idx2]:
idx1 += 1
else:
idx2 += 1
dist += len(nodes_1) - idx1 + len(nodes_2) - idx2
return dist
""" Verify if dg1 is dominated by dg2. The domination is determined by two points: All the nodes
of dg1 are part of dg2 and the divergeance of dg1 is greater than dg2.
@param dg1 (resp dg2) A d_graph object.
@return True if dg1 is dominated by dg2.
"""
def is_dominated(self, dg):
dg1_nodes = self.to_node_set()
dg2_nodes = dg.to_node_set()
# domination first condition: inclusion of all the nodes
if not dg1_nodes.issubset(dg2_nodes):
return False
# domination second condition
if len(dg1_nodes) == len(dg2_nodes):
if self.get_link_divergence() > dg.get_link_divergence():
return True
elif self.get_link_divergence() >= dg.get_link_divergence():
return True
return False
def __eq__(self, other):
if other is None:
return False
if self.idx != -1 and self.idx == other.idx:
return True
if self.node_set != other.node_set:
return False
return self.to_ordered_lists() == other.to_ordered_lists()
def __ne__(self, other):
return not (self == other)
def __lt__(self, other):
my_tuple = (self.get_link_divergence(), self.get_optimal_score())
other_tuple = (other.get_link_divergence(), other.get_optimal_score())
return my_tuple < other_tuple
def __hash__(self):
nodelist = self.to_sorted_list()
nodelist = [str(x) for x in nodelist]
return ",".join(nodelist).__hash__()
def __ordered_hash__(self):
lst = self.to_ordered_lists()
fwd_uniq_lst = [sorted(l) for l in lst]
fwd_str = ",".join([f"[{'-'.join(l)}]" for l in fwd_uniq_lst])
fwd_hash = fwd_str.__hash__()
rev_uniq_lst = [sorted(l) for l in lst[::-1]]
rev_str = ",".join([f"[{'-'.join(l)}]" for l in rev_uniq_lst])
rev_hash = rev_str.__hash__()
return int(min(fwd_hash, rev_hash))
def __repr__(self):
# print(self.halves)
representation = str(self.center) + " "
representation += "[" + ", ".join([f"{node} {self.connexity[0][node]}" for node in self.halves[0]]) + "]"
representation += "[" + ", ".join([f"{node} {self.connexity[1][node]}" for node in self.halves[1]]) + "]"
return representation
def _to_str_nodes(self):
str_nodes = [str(x) for x in self.nodes]
str_nodes.sort()
return str(str_nodes)
""" From a barcode graph, compute all the possible max d-graphs by node.
@param graph A barcode graph
@return A dictionary associating each node to its list of all possible d-graphs. The d-graphs are sorted by decreasing ratio.
"""
def compute_all_max_d_graphs(graph, debug=False, clique_mode=None):
d_graphs = {}
for idx, node in enumerate(list(graph.nodes())):
#if "MI" not in str(node): continue # for debugging; only look at deconvolved nodes
#print(f"\r{idx+1}/{len(graph.nodes())}")
neighbors = list(graph.neighbors(node))
neighbors_graph = nx.Graph(graph.subgraph(neighbors))
node_d_graphs = set()
mode_str = " "
if clique_mode is None:
# Find all the cliques (equivalent to compute all the candidate half d-graph)
cliques = list(nx.find_cliques(neighbors_graph))
mode_str += "(max-cliques)"
elif clique_mode == "louvain":
louvain = community.best_partition(neighbors_graph) # louvain
# high resolution seems to work better
communities = [[c for c,i in louvain.items() if i == clique_id] for clique_id in set(louvain.values())]
mode_str += "(louvain)"
cliques = []
for comm in communities:
# further decompose! into necessarily 2 communities
community_as_graph = nx.Graph(graph.subgraph(comm))
if len(community_as_graph.nodes()) <= 2:
cliques += [list(community_as_graph.nodes())]
else:
cliques += map(list,nx.community.asyn_fluidc(community_as_graph,2))
elif clique_mode == "testing":
# k-clique communities
#from networkx.algorithms.community import k_clique_communities
#cliques = k_clique_communities(neighbors_graph, 3) # related to the d-graph d parameter
from cdlib import algorithms
cliques_dict = algorithms.node_perception(neighbors_graph, threshold=0.75, overlap_threshold=0.75) #typical output: Sizes of found cliques (testing): Counter({6: 4, 5: 3, 4: 2, 2: 1})
#cliques_dict = algorithms.gdmp2(neighbors_graph, min_threshold=0.9) #typical output: sizes of found cliques (testing): Counter({3: 2, 5: 1})
#cliques_dict = algorithms.angel(neighbors_graph, threshold=0.90) # very sensitive parameters: 0.84 and 0.88 don't work at all but 0.86 does sort of
from collections import defaultdict
cliques_dict2 = defaultdict(list)
for (node, values) in cliques_dict.to_node_community_map().items():
for value in values:
cliques_dict2[value] += [node]
cliques = list(cliques_dict2.values())
mode_str += "(testing)"
#print("cliques", len(cliques))
if debug: print("node",node,"has",len(cliques),"cliques in neighborhood (of size",len(neighbors),")")
cliques_debugging = True
if cliques_debugging:
#cliques_graph = nx.make_max_clique_graph(neighbors_graph)
#if debug: print("node",node,"clique graph has",len(cliques_graph.nodes()),"nodes",len(cliques_graph.edges()),"edges")
#nx.write_gexf(cliques_graph, str(node) +".gexf")
from collections import Counter
len_cliques = Counter(map(len,cliques))
#print("sizes of found cliques%s:" % mode_str, len_cliques)
# Pair halves to create d-graphes
for idx, clq1 in enumerate(cliques):
for clq2_idx in range(idx+1, len(cliques)):
clq2 = cliques[clq2_idx]
# Check for d-graph candidates
d_graph = Dgraph(node)
d_graph.put_halves(clq1, clq2, neighbors_graph)
factor = 0.5
#if clique_mode == "testing": factor = 1 # still allows louvain's big communities
#print("link div:",d_graph.get_link_divergence(),"opt:",d_graph.get_optimal_score(), "good d graph?",d_graph.get_link_divergence() <= d_graph.get_optimal_score() *factor)
if d_graph.get_link_divergence() <= d_graph.get_optimal_score() * factor:
node_d_graphs.add(d_graph)
#print("d-graphs", len(node_d_graphs))
d_graphs[node] = brutal_list_domination_filter(sorted(node_d_graphs))
#print("filtered", len(d_graphs[node]))
return d_graphs
""" Add the new dg in the dgs list. If dg is dominated by another dg in the list, then it's
dropped. If any dg in the list is dominated by the dg to add, then, the new dg is added and
all the dominated dg are removed from the list.
@param dg A new dg to add/filter.
@param undominated_dgs_list A list of dg where any of them is dominated by another one.
@return The updated undominated list.
"""
def add_new_dg_regarding_domination(dg, undominated_dgs_list):
to_remove = []
dominated = False
# Search for domination relations
for u_dg in undominated_dgs_list:
if not dominated and dg.is_dominated(u_dg):
dominated = True
if u_dg.is_dominated(dg):
to_remove.append(u_dg)
# Remove dominated values
size = len(undominated_dgs_list)
for dg2 in to_remove:
undominated_dgs_list.remove(dg2)
# Add the new dg
if not dominated:
undominated_dgs_list.append(dg)
return undominated_dgs_list
def filter_dominated(d_graphs, overall=False, in_place=True):
if not overall:
return local_domination_filter(d_graphs, in_place)
all_d_graphs = []
for dgs in d_graphs.values():
all_d_graphs.extend(dgs)
all_d_graphs = list_domination_filter(all_d_graphs)
return d_graphs
""" Filter the d-graphs by node. In a list of d-graph centered on a node n, if a d-graph is
completely included in another and have a highest distance score to the optimal, then it is
filtered out.
@param d_graphs All the d-graphs to filter, sorted by central node.
@param in_place If true, modify the content of d_graph with the filtered version. If False,
copy all the content in a new dictionary.
@return The filtered dictionary of d-graph per node.
"""
def local_domination_filter(d_graphs, in_place=True):
filtered = d_graphs if in_place else {}
# Filter node by node
for node, d_graph_list in d_graphs.items():
lst = str(sorted([str(x) for x in d_graph_list]))
filtered[node] = brutal_list_domination_filter(d_graph_list, node_name=str(node))
return filtered
""" Filter the input d-graphs list. In the list of d-graph centered on a node n, if a d-graph is
completely included in another and have a highest distance score to the optimal, then it is
filtered out.
@param d_graphs All the d-graphs to filter.
@return The filtered dictionary of d-graph per node.
"""
def list_domination_filter(d_graphs):
filtered = []
# Filter d-graph by d-graph
for dg in d_graphs:
add_new_dg_regarding_domination(dg, filtered)
return set(filtered)
def brutal_list_domination_filter(d_graphs, node_name=""):
undominated = set(d_graphs)
to_remove = set()
for dg1 in d_graphs:
for dg2 in d_graphs:
if dg1.is_dominated(dg2):
to_remove.add(dg1)
break
return undominated - to_remove
#!/usr/bin/env python3
import argparse
import numpy as np
import networkx as nx
from intervaltree import Interval, IntervalTree
def parse_arguments():
parser = argparse.ArgumentParser(description='Generate a fake 10X molecule graph.')
parser.add_argument('--num_molecule', '-n', type=int, required=True, help='The number of molecule in the final graph')
parser.add_argument('--genome_size', '-g', type=int, required=True, help='genome size')
parser.add_argument('--avg_mol_len', '-l', type=int, required=True, help='avg molecule size')
parser.add_argument('--min_mol_ovl', '-b', type=int, default=0, help='minimum overlap length between molecules')
parser.add_argument('--rnd_seed', '-s', type=int, default=-1, help='Random seed. Used for reproducibility purpose. Please do not use it in production.')
parser.add_argument('--output', '-o', help="Output filename")
args = parser.parse_args()
return args
def compute_ovl_length(s1,e1,s2,e2):
if s1 > s2:
s1,e1,s2,e2 = s2,e2,s1,e1
return e1-s2
def is_contained(itree,start,end):
for itv in itree.overlap(start,end):
start2, end2 = itv.begin, itv.end
if start2 <= start and end2 >= end:
return True
return False
def generate_graph(nb_molecules, genome_size, avg_mol_size, min_mol_ovl, rnd_seed=None):
# Reproducibility
if rnd_seed != -1:
np.random.seed(rnd_seed)
G = nx.Graph()
# generate molecules according to similar principle as LRSM
# but in addition make sure they're not contained in each other
molecules = dict() # mol index: (start,end)
itree = IntervalTree()
for idx in range(nb_molecules):
while True: # to avoid contained molecules
G.add_node(idx)
#pick a starting position (follows LRSIM)
start = np.random.randint(0,genome_size)
#pick a fragment size (follows LRSIM)
molecule_size = np.random.poisson(avg_mol_size)
end = start+molecule_size
if is_contained(itree,start,end): continue
print("molecule",idx,"start",start,"end",end)
molecules[idx] = (start, end)
itree.addi(start,end,idx)
break
# create graph edges corresponding to molecules overlaps
for idx in molecules:
start,end = molecules[idx]
for itv in itree.overlap(start,end):
other_idx = itv.data
if idx==other_idx: continue
start2, end2 = itv.begin, itv.end
ovl_length = compute_ovl_length(start,end,start2,end2)
#print("overlap length",ovl_length)
if ovl_length < min_mol_ovl: continue
G.add_edge(idx, other_idx)
return G
def save_graph(G, outfile):
import networkx as nx
nx.write_gexf(G, outfile)
if __name__ == "__main__":
args = parse_arguments()
G = generate_graph(args.num_molecule, args.genome_size, args.avg_mol_len, args.min_mol_ovl, args.rnd_seed)
outfile = f"simulated_molecules_{args.num_molecule}_{args.genome_size}_{args.avg_mol_len}_{args.min_mol_ovl}.gexf"
if args.output:
outfile = args.output
save_graph(G, outfile)
rm -Rf snake_tests/simu_bar_n1000_d5_m2*
prefix=simu_bar_n1000_d6_m2
rm -Rf snake_tests/$prefix*
snakemake -s Snakefile_data_simu
snakemake -s Snakefile_d2
mv snake_exec/simu_bar_n1000_d5_m2.tar.gz snake_tests/simu_bar_n1000_d5_m2.tar.gz
mv snake_exec/$prefix.tar.gz snake_tests/$prefix.tar.gz
cd snake_tests
tar xf simu_bar_n1000_d5_m2.tar.gz
tar xf $prefix.tar.gz
cd ..
do_max_cliques=1
......@@ -15,11 +17,11 @@ then
echo "max-cliques"
python deconvolution/evaluate.py --type d2 --barcode_graph snake_tests/simu_bar_n1000_d5_m2/simu_bar_n1000_d5_m2.gexf snake_tests/simu_bar_n1000_d5_m2/simu_bar_n1000_d5_m2_d2_simplified.gexf
python deconvolution/evaluate.py --type d2 --barcode_graph snake_tests/"$prefix"/"$prefix".gexf snake_tests/"$prefix"/"$prefix"_d2_simplified.gexf
python deconvolution/d2_to_path.py snake_tests/simu_bar_n1000_d5_m2/simu_bar_n1000_d5_m2.gexf snake_tests/simu_bar_n1000_d5_m2/simu_bar_n1000_d5_m2_d2_simplified.gexf
python deconvolution/d2_to_path.py snake_tests/"$prefix"/"$prefix".gexf snake_tests/"$prefix"/"$prefix"_d2_simplified.gexf
python deconvolution/evaluate.py --type path --barcode_graph snake_tests/simu_bar_n1000_d5_m2/simu_bar_n1000_d5_m2.gexf _path.gexf
python deconvolution/evaluate.py --type path --barcode_graph snake_tests/"$prefix"/"$prefix".gexf _path.gexf
fi
......@@ -29,11 +31,11 @@ then
echo "louvain"
python deconvolution/evaluate.py --type d2 --barcode_graph snake_tests/simu_bar_n1000_d5_m2/simu_bar_n1000_d5_m2.gexf snake_tests/simu_bar_n1000_d5_m2/simu_bar_n1000_d5_m2_d2_simplified_louvain.gexf
python deconvolution/evaluate.py --type d2 --barcode_graph snake_tests/"$prefix"/"$prefix".gexf snake_tests/"$prefix"/"$prefix"_d2_simplified_louvain.gexf
python deconvolution/d2_to_path.py snake_tests/simu_bar_n1000_d5_m2/simu_bar_n1000_d5_m2.gexf snake_tests/simu_bar_n1000_d5_m2/simu_bar_n1000_d5_m2_d2_simplified_louvain.gexf
python deconvolution/d2_to_path.py snake_tests/"$prefix"/"$prefix".gexf snake_tests/"$prefix"/"$prefix"_d2_simplified_louvain.gexf
python deconvolution/evaluate.py --type path --barcode_graph snake_tests/simu_bar_n1000_d5_m2/simu_bar_n1000_d5_m2.gexf _path.gexf
python deconvolution/evaluate.py --type path --barcode_graph snake_tests/"$prefix"/"$prefix".gexf _path.gexf
fi
do_comtest=0
......@@ -43,10 +45,10 @@ then
echo "******************************* Testing algorithm"
python deconvolution/evaluate.py --type d2 --barcode_graph snake_tests/simu_bar_n1000_d5_m2/simu_bar_n1000_d5_m2.gexf snake_tests/simu_bar_n1000_d5_m2/simu_bar_n1000_d5_m2_d2_simplified_comtest.gexf
python deconvolution/evaluate.py --type d2 --barcode_graph snake_tests/"$prefix"/"$prefix".gexf snake_tests/"$prefix"/"$prefix"_d2_simplified_comtest.gexf
python deconvolution/d2_to_path.py snake_tests/simu_bar_n1000_d5_m2/simu_bar_n1000_d5_m2.gexf snake_tests/simu_bar_n1000_d5_m2/simu_bar_n1000_d5_m2_d2_simplified_comtest.gexf
python deconvolution/d2_to_path.py snake_tests/"$prefix"/"$prefix".gexf snake_tests/"$prefix"/"$prefix"_d2_simplified_comtest.gexf
python deconvolution/evaluate.py --type path --barcode_graph snake_tests/simu_bar_n1000_d5_m2/simu_bar_n1000_d5_m2.gexf _path.gexf
python deconvolution/evaluate.py --type path --barcode_graph snake_tests/"$prefix"/"$prefix".gexf _path.gexf
fi
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment