Commit 80e507a9 authored by rchikhi's avatar rchikhi
Browse files

added d graph detection algo

parent b5b9bdd4
#C3BI paul avait émis la notion de d-graph
# "Lets call a graph a d-graph if the vertices can be ordered on a line and any vertices within d/2 of each other are connected by an edge. Observe that a d-graph has all vertices of degree d, except at the very end and beginning. For example, for d=6, vertex at position 10 will be connected to 7,8,9,11,12,13. A d-graph (sort of?) captures what the neighborhood of a molecule would look like."
#
# alternatively, observe that a d-graph can be seen as a line in a special overlap graph, where the nodes are the set of neighbors of a certain barcode graph node node, and two nodes in the special overlap graph are linked by an edge if the neighbors intersect over d-1 (or d-2?) elements
# arguments: graphml file
# attempts to deconvolve a barcode graph using d-graph detection
import sys
import itertools
import operator
from collections import Counter
from networkx import nx
#from networkx.algorithms import community
import community # python-louvain
import sys
if len(sys.argv) == 1:
print("args: .graphml or .gexf file")
if ".graphml" in sys.argv[1]:
G = nx.read_graphml(sys.argv[1])
elif ".gexf" in sys.argv[1]:
G = nx.read_gexf(sys.argv[1])
print(G)
# too strict
def strict_d_line_compatible_neighbors(node1, neigh1, node2, neigh2):
neigh1 = set(neigh1) | set([node1])
neigh2 = set(neigh2) | set([node2])
inter = neigh1 & neigh2
diff1 = neigh1 - neigh2
diff2 = neigh2 - neigh1
if len(inter) >= len(neigh1)-1 and len(inter) >= len(neigh2) - 1 and len(diff1) <= 1 and len(diff2) <= 1:
return True
return False
def d_line_compatible_neighbors(node1, neigh1, node2, neigh2):
neigh1 = set(neigh1) | set([node1])
neigh2 = set(neigh2) | set([node2])
inter = neigh1 & neigh2
diff1 = neigh1 - neigh2
diff2 = neigh2 - neigh1
wiggle=2
if len(inter) >= len(neigh1)-wiggle and len(inter) >= len(neigh2) - wiggle and len(diff1) <= wiggle and len(diff2) <= wiggle:
return True
return False
# actually a very loose definition of d-graph, where consecutive neighbors of node on the line don't need to be exactly of cardinality +1/-1
# (for that, use strict_d_line_compatible_neighbors)
# but can be +3/-3
def is_d_graph(graph, all_neighbors_graph):
Gn = nx.Graph() # create a graph of 'neighbor-compatibility': whether two nodes in the original graphs have 'almost' the same neighbors-set, apart from one to the left and one to the right (typical in d-graphs)
for node in graph.nodes():
Gn.add_node(node)
for node1 in graph.nodes():
for node2 in graph.nodes():
if node1 <= node2: continue
neigh1 = list(graph.neighbors(node1))
neigh2 = list(graph.neighbors(node2))
#print(node1, sorted(neigh1))
#print(node2, sorted(neigh2))
if d_line_compatible_neighbors(node1, neigh1, node2, neigh2):
#print("compat",node1,node2)
Gn.add_edge(node1,node2)
#nx.write_graphml(graph,"test_G2.graphml")
#nx.write_graphml(Gn,"test_Gn.graphml")
#print("wrote")
#print("tentative d graph",list(nx.connected_components(Gn)))
if len(list(nx.connected_components(Gn))) != 1:
return False
# this is really a heuristic:
#
# for all nodes in the putative d-graph, make sure their neighbors are all in majority in the component rathe than the whole graph
# this is a critical filter to remove putative d-graphs that are made of multiple molecules
# see e.g. 100_5_2-neighbors_of_molecule_21_83 without that filter..
# 3 d-graphs found (2 ok), dont 1 pas ok:
# d graph found ['10:24_65', '12:66_19', '22:25_81', '37:22_42', '46:63_86', '9:85_45'] -> chimere des molecules voisines de 21 et 83 via le barcode 22:25_81
"""
for node1 in graph.nodes():
neigh1 = set(list(all_neighbors_graph.neighbors(node1)))
if len(neigh1 & set(graph.nodes())) < len(neigh1) / 2:
#print("is not a d graph due to bad separation with rest",graph)
return False
"""
return True
def compute_score(d_graphs_set,G):
score = 0
for d_graph in d_graphs_set:
for node in d_graph:
for neigh in G.neighbors(node):
score += 1 if neigh in d_graph else -4
return score
debug = False
def deconvolve2(G,node,dont_separate=False):
global debug
neighbors = list(G.neighbors(node))
print("node",node,len(neighbors),"neighbors")
G2 = nx.Graph(G.subgraph(neighbors))
if debug:
nx.write_graphml(G2,node + ".graphml")
# add node labels
for g2_node in G2.nodes():
G2.nodes[g2_node]['label'] = str(g2_node)
# make sure there is a single connected component
while True:
ccs = list(nx.connected_components(G2))
if len(ccs) == 1:
break
# else artificially and weakly connect components to run fluid community
n1 = list(ccs[0])[0]
n2 = list(ccs[1])[0]
G2.add_edge(n1,n2)
d_graphs = []
cliques = list(nx.find_cliques(G2))
# remove cliques of size < 4
cliques = [c for c in cliques if len(c) >= 3]
# enumerate all d-graphs
print(len(cliques),"cliques,",len(G2.nodes()),"nodes in neighbors of", node)
#print("cliques:", cliques)
for nb_cliques in range(len(cliques),0,-1):
for candidate in itertools.combinations(cliques, nb_cliques):
candidate_graph = nx.Graph(G2.subgraph([x for y in candidate for x in y ]))
#print("testing for d graph",sorted(list(candidate_graph.nodes())))
if len(candidate_graph.nodes()) <= 5: continue # don't consider small graphs # bit of a hack
if is_d_graph(candidate_graph, G2):
#print("d graph found",sorted(list(candidate_graph.nodes())))
d_graphs += [tuple(sorted(candidate_graph.nodes()))]
d_graphs = list(set(d_graphs)) # dedup
print(len(d_graphs),"d-graphs found in neighbors of",node)
# remove ones included in others
to_remove = set()
for i,d_graph1 in enumerate(d_graphs):
for d_graph2 in d_graphs:
if (set(d_graph1).issubset(set(d_graph2))) and len(d_graph1) < len(d_graph2):
to_remove |= set([i])
d_graphs = [d for (i, d) in enumerate(d_graphs) if i not in to_remove]
print("removed",len(to_remove),"redundant d graphs")
# find the best set of d-graphs which minimize a certain score
scores = []
for nb_dgraphs in range(4,0,-1):
for candidate in itertools.combinations(d_graphs, nb_dgraphs):
node_counter = Counter([x for y in candidate for x in y ])
candidate = [ [node for node in d if node_counter[node] == 1 ] for d in candidate] # remove nodes common to multiple d-graphs
candidate = [ d for d in candidate if len(d) >= 5] # remove small d-graphs
if len(candidate) == 0:
continue
score = compute_score(candidate,G2)
scores += [(score, candidate)]
print("scores",[(x,"%d d-graphs" % len(d)) for x,d in sorted(scores)][:-3])
best_d_graph_decomposition = max(scores)[1]
print("best decomposition:",len(best_d_graph_decomposition),"graphs")
d_graphs = best_d_graph_decomposition
print(d_graphs)
"""
# refine d_graphs by removing common nodes
dups = Counter()
for d_graph in d_graphs:
for d_node in d_graph:
dups[d_node] += 1
multi_nodes = [x for x in dups if dups[x] > 1]
print(len(multi_nodes),"multi-d-graphs nodes found, removing them", multi_nodes)
for i in range(len(d_graphs)):
for multi_node in multi_nodes:
if multi_node in d_graphs[i]:
d_graphs[i].remove(multi_node)
"""
# label nodes by their d_graph
d_graphs_nodes = {}
for i,d_graph in enumerate(d_graphs):
for d_node in d_graph:
if d_node in d_graphs_nodes:
exit("uh oh it's not a partition")
else:
d_graphs_nodes[d_node] = i
# TODO place the remaining nodes in combinations of d-graphs, depending of their neighborhoors
# some nodes might not be in any d-graph
# also what to do with the multi_nodes
# those will be in their own little dummy MI
for g2_node in G2.nodes():
if g2_node not in d_graphs_nodes:
d_graphs_nodes[g2_node]=99 # Nine-Nine!
#communities = dict([(x,1) for x in G2.nodes()]) # dummy communities
communities = dict([(x,d_graphs_nodes[x]) for x in G2.nodes() if x in d_graphs_nodes])
print("community sizes:",[len([c for c,i in communities.items() if i == community_id]) for community_id in set(communities.values())])
#print([[(c,i) for c,i in communities.items() if i == community_id] for community_id in set(communities.values())]) # debug
if len(communities) == 1:
return # nothing to separate here
if dont_separate:
return
# creation of a new node per community
for node2,community_id in communities.items():
G.add_node(node+"-MI%d" % community_id)
G.add_edge(node+"-MI%d" % community_id, node2, contigs=(G[node][node2]['contigs'] if 'contigs' in G[node][node2] else ""))
# for debugging
G2.nodes[node2]['mi'] = community_id
G.remove_node(node)
debug = False
if debug:
node = "3:80_10"
deconvolve2(G,node)
exit(0)
debug_dont_separate = True # run the algo on each node but dont separate the nodes
g_nodes = list(G.nodes())
for node in g_nodes:
deconvolve2(G,node,debug_dont_separate)
nx.write_graphml(G,sys.argv[1]+".deconvolved.graphml")
......@@ -40,7 +40,7 @@ def deconvolve2(G,node):
#cliques = list(community.asyn_lpa_communities(G2)) # seems accurate but hands
cliques = community.best_partition(G2)
print([len([c for c,i in cliques.items() if i == clique_id]) for clique_id in set(cliques.values())])
print("louvain communities",[len([c for c,i in cliques.items() if i == clique_id]) for clique_id in set(cliques.values())])
if len(cliques) == 1:
return # nothing to deconvolve here
......@@ -62,4 +62,4 @@ g_nodes = list(G.nodes())
for node in g_nodes:
deconvolve2(G,node)
nx.write_graphml(G,sys.argv[1]+".deconvolved.graphml")
\ No newline at end of file
nx.write_graphml(G,sys.argv[1]+".deconvolved.graphml")
......@@ -15,7 +15,7 @@ for idx, node in enumerate(G.nodes()):
barcodes = []
n = len(G.nodes())
available_molecules = set(G.nodes())
m = 5 # m molecules per barcode
m = 2 # m molecules per barcode
# Group molecules by barcode
import random
......@@ -49,6 +49,6 @@ for mol_edge in G.edges():
# print(G2.edges)
output = sys.argv[1].replace("molecule", "barcodes").replace(".graphml", f"_{m}.gexf")
output = sys.argv[1].replace("molecule", "barcode").replace(".graphml", f"_{m}.gexf")
nx.write_gexf(G2, output)
......@@ -3,7 +3,23 @@
# generate a bunch of molecules, in the form of python intervals (a,b), i.e. a<=i<b. here, b-a=10
# then find their overlaps >= 1 bases
n = 16
# rationale for realistic parameters:
# https://support.10xgenomics.com/de-novo-assembly/datasets/2.1.0/fly
# drosophila genome, 140 Mbp
# molecules of 70 kbp
# coverage of 70x
# number of barcodes: 50k (only) (zcat barcoded.fastq.gz | grep "^@" | awk '{print $2}' | sort | uniq |wc -l)
# number of molecule per barcode: 10 (~/10x/drosophila/chen_data_longranger_run_on_ref/outs$ samtools view phased_possorted_bam.bam | python ~/10x-barcode-graph/scripts/sam_stats.py)
# so in total, 500k molecules
# i.e. the molecule coverage is around 140Mbp/500kbp = 280x
# conservatively, it seems that we can get overlaps for at least 20 neighbor molecules
# so in that setting, considering each molecule as '1bp', i.e. scaling the genome down to 140Mbp/70kbp=2Mbp
# n = 2000
# o = 50
n = 100
o = 5
molecules_intervals = []
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment