Commit 6e12326a authored by Yoann Dufresne's avatar Yoann Dufresne

Merge branch 'master' of gitlab.pasteur.fr:ydufresne/10x-deconvolve

parents 5fd16e32 84920ea8
WORKDIR="snake_exec" if "outdir" not in config else config["outdir"]
N=[5000] if "n" not in config else config["n"] # Number of molecule to simulate
D=[5] if "d" not in config else config["d"] # Average coverage of each molecule
N=[10000] if "n" not in config else config["n"] # Number of molecule to simulate
D=[10] if "d" not in config else config["d"] # Average coverage of each molecule
M=[3] if "m" not in config else config["m"] # Average number of molecule per barcode
M_DEV=[0] if "m_dev" not in config else config["m_dev"] # Std deviation for merging number
M_DEV=[1] if "m_dev" not in config else config["m_dev"] # Std deviation for merging number
iter=1
# snake_experiments/simu_0_bar_n500_d5_m2.gexf
......
......@@ -10,6 +10,8 @@ import argparse
from termcolor import colored
import networkx as nx
graph_type = "interval" # will be set to "ecoli" if we detect that it is, in fact, a barcode graph simulated from ecoli
# this affects how we evaluate that path is correct: mols starts absolute diff < 9k for ecoli, < 5--10 for interval graphs
def parse_args():
parser = argparse.ArgumentParser(description='Process some integers.')
......@@ -47,6 +49,7 @@ given as input a list of molecules in central nodes,
return a stripped-down list that only includes molecules that overlap their neighbor
"""
def extract_likely_molecule_path_wrapper(path,starter_mol):
tolerance = 9000 if graph_type == "ecoli" else 10 # tolerance for ecoli =the max distance between two starting points, if min overlap is 6000 and mols at 15kbp of length
debug = False
res = []
direction = None
......@@ -54,7 +57,7 @@ def extract_likely_molecule_path_wrapper(path,starter_mol):
for i,mols in enumerate(path):
good_mol = None
for mol in mols:
if (abs(previous_mol[0] - mol[0])< 9000): # the max distance between two starting points, if min overlap is 6000 and mols at 15kbp of length
if (abs(previous_mol[0] - mol[0])< tolerance):
""" shouldn't enforce directionality. why? because a->b->c can sometimes be a->c->b and still be correct, because a,b,c are a clique """
#if i == 0:
#direction = "decreasing" if previous_mol[0] > mol[0] else "increasing"
......@@ -85,13 +88,26 @@ def is_there_path_acc(path): # don't consider overlaps smaller than 5kbp
""" converts a central node of a d-graph into its list of molecules (given the ground truth) """
def central_node_to_molecules(nodestr):
# format for a 2-merge: 1:NC_000913.3_298281_313280_0:0:0_0:0:0_2fb/1_NC_000913.3_338611_353610_0:0:0_0:0:0_37b/1
global graph_type
cur_node_mols = []
for ide in nodestr.split('NC_')[1:]: # specific to e coli
#print(ide)
x = ide.split("_")
start, end = int(x[1]), int(x[2])
cur_node_mols += [(start,end)]
if 'NC_' in nodestr: # ecoli specific
# format for a 2-merge: 1:NC_000913.3_298281_313280_0:0:0_0:0:0_2fb/1_NC_000913.3_338611_353610_0:0:0_0:0:0_37b/1
for ide in nodestr.split('NC_')[1:]: # specific to e coli
graph_type = "ecoli"
#print(ide)
x = ide.split("_")
start, end = int(x[1]), int(x[2])
cur_node_mols += [(start,end)]
else:
if ':' not in nodestr:
# a non-merged node
cur_node_mols += [(int(nodestr),int(nodestr)+1)]
else:
nodestr = nodestr.split(':')[1]
nodestr = nodestr.replace(']','').replace('[','')
for ide in nodestr.split('_'):
#print(ide)
cur_node_mols += [(int(ide),int(ide)+1)] # a tuple just for compatibility with the ecoli format.. should be just int(ide)
return cur_node_mols
def is_coherent_path(central_nodes,path_len):
......@@ -139,18 +155,38 @@ def evaluate_accuracy_paths(path_len,max_paths_per_node=5):
# ---- sensitivity evaluation
""" given an ordered list of molecules, determine if the graph contains a path of central nodse which have these molecules.
it does that by testing all possible combinations of d-graphs having those molecules in their central nodes """
it does that creating a multipartite graph with a source and a sink
"""
def is_there_path(graph,molecules_to_nodes,sought_path):
possible_central_nodes = []
for mol in sought_path:
MPG = nx.Graph() # a multipartite graph
# construct all levels of the multipartite graph
possible_central_nodes = []
for i,mol in enumerate(sought_path):
possible_central_nodes += [molecules_to_nodes[mol]]
#print(possible_central_nodes)
for mols in itertools.product(*possible_central_nodes):
if nx.is_connected(graph.subgraph(mols)):
#print("found connected path",mols)
return True
#print("found no connected paths",sought_path)
return False
for node in possible_central_nodes:
MPG.add_node("%d-%s" % (i,node))
# construct edges between levels
for i in range(len(sought_path)-1):
pcn_i = molecules_to_nodes[sought_path[i]]
pcn_ip1 = molecules_to_nodes[sought_path[i+1]]
for node_i,node_j in itertools.product(pcn_i,pcn_ip1):
#print(node_i,node_j)
if graph.has_edge(node_i,node_j):
MPG.add_edge("%d-%s" %(i,node_i),"%d-%s" % (i+1,node_j))
# add source/sink
MPG.add_node("source")
MPG.add_node("sink")
pcn_0 = molecules_to_nodes[sought_path[0]]
for node in pcn_0:
MPG.add_edge("source","0-%s" % node)
pcn_end = molecules_to_nodes[sought_path[-1]]
for node in pcn_end:
MPG.add_edge("%d-%s" % (len(sought_path)-1, node), "sink")
return nx.has_path(MPG,"source","sink")
def evaluate_sensitivity_paths(path_len,overlap_length=7000):
all_molecules = set()
......@@ -163,7 +199,7 @@ def evaluate_sensitivity_paths(path_len,overlap_length=7000):
for mol in mols:
molecules_to_nodes[mol] += [node]
all_molecules = sorted(all_molecules)
print("got",len(all_molecules),"molecules total")
#print("got",len(all_molecules),"molecules total")
nb_found, nb_not_found = 0,0
for i in range(len(all_molecules)-path_len+1):
sought_path = all_molecules[i:i+path_len]
......@@ -183,9 +219,14 @@ def main():
p = Pool(12)
#p.map(evaluate_accuracy_paths, [1,2,4,6,8,10,15,20,50,100,200,500])
p.map(evaluate_accuracy_paths, [2,4,10,100])
p.join()
#p.map(evaluate_sensitivity_paths, [1,2,3,4])
p.map(evaluate_sensitivity_paths, [2,4,10,100])
#evaluate_accuracy_paths(100)
#evaluate_sensitivity_paths(1)
#evaluate_sensitivity_paths(3)
#evaluate_sensitivity_paths(10)
p.close()
p.join()
if __name__ == "__main__":
main()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment