Skip to content
Snippets Groups Projects
Commit 84920ea8 authored by Rayan  CHIKHI's avatar Rayan CHIKHI
Browse files

initial modifs for interval graphs evaluation acc/sens

parent 4782df1e
No related branches found
No related tags found
No related merge requests found
WORKDIR="snake_exec" if "outdir" not in config else config["outdir"] WORKDIR="snake_exec" if "outdir" not in config else config["outdir"]
N=[5000] if "n" not in config else config["n"] # Number of molecule to simulate N=[10000] if "n" not in config else config["n"] # Number of molecule to simulate
D=[5] if "d" not in config else config["d"] # Average coverage of each molecule D=[10] if "d" not in config else config["d"] # Average coverage of each molecule
M=[3] if "m" not in config else config["m"] # Average number of molecule per barcode M=[3] if "m" not in config else config["m"] # Average number of molecule per barcode
M_DEV=[0] if "m_dev" not in config else config["m_dev"] # Std deviation for merging number M_DEV=[1] if "m_dev" not in config else config["m_dev"] # Std deviation for merging number
iter=1 iter=1
# snake_experiments/simu_0_bar_n500_d5_m2.gexf # snake_experiments/simu_0_bar_n500_d5_m2.gexf
......
...@@ -10,6 +10,8 @@ import argparse ...@@ -10,6 +10,8 @@ import argparse
from termcolor import colored from termcolor import colored
import networkx as nx import networkx as nx
graph_type = "interval" # will be set to "ecoli" if we detect that it is, in fact, a barcode graph simulated from ecoli
# this affects how we evaluate that path is correct: mols starts absolute diff < 9k for ecoli, < 5--10 for interval graphs
def parse_args(): def parse_args():
parser = argparse.ArgumentParser(description='Process some integers.') parser = argparse.ArgumentParser(description='Process some integers.')
...@@ -47,6 +49,7 @@ given as input a list of molecules in central nodes, ...@@ -47,6 +49,7 @@ given as input a list of molecules in central nodes,
return a stripped-down list that only includes molecules that overlap their neighbor return a stripped-down list that only includes molecules that overlap their neighbor
""" """
def extract_likely_molecule_path_wrapper(path,starter_mol): def extract_likely_molecule_path_wrapper(path,starter_mol):
tolerance = 9000 if graph_type == "ecoli" else 10 # tolerance for ecoli =the max distance between two starting points, if min overlap is 6000 and mols at 15kbp of length
debug = False debug = False
res = [] res = []
direction = None direction = None
...@@ -54,7 +57,7 @@ def extract_likely_molecule_path_wrapper(path,starter_mol): ...@@ -54,7 +57,7 @@ def extract_likely_molecule_path_wrapper(path,starter_mol):
for i,mols in enumerate(path): for i,mols in enumerate(path):
good_mol = None good_mol = None
for mol in mols: for mol in mols:
if (abs(previous_mol[0] - mol[0])< 9000): # the max distance between two starting points, if min overlap is 6000 and mols at 15kbp of length if (abs(previous_mol[0] - mol[0])< tolerance):
""" shouldn't enforce directionality. why? because a->b->c can sometimes be a->c->b and still be correct, because a,b,c are a clique """ """ shouldn't enforce directionality. why? because a->b->c can sometimes be a->c->b and still be correct, because a,b,c are a clique """
#if i == 0: #if i == 0:
#direction = "decreasing" if previous_mol[0] > mol[0] else "increasing" #direction = "decreasing" if previous_mol[0] > mol[0] else "increasing"
...@@ -85,13 +88,26 @@ def is_there_path_acc(path): # don't consider overlaps smaller than 5kbp ...@@ -85,13 +88,26 @@ def is_there_path_acc(path): # don't consider overlaps smaller than 5kbp
""" converts a central node of a d-graph into its list of molecules (given the ground truth) """ """ converts a central node of a d-graph into its list of molecules (given the ground truth) """
def central_node_to_molecules(nodestr): def central_node_to_molecules(nodestr):
# format for a 2-merge: 1:NC_000913.3_298281_313280_0:0:0_0:0:0_2fb/1_NC_000913.3_338611_353610_0:0:0_0:0:0_37b/1 global graph_type
cur_node_mols = [] cur_node_mols = []
for ide in nodestr.split('NC_')[1:]: # specific to e coli if 'NC_' in nodestr: # ecoli specific
#print(ide) # format for a 2-merge: 1:NC_000913.3_298281_313280_0:0:0_0:0:0_2fb/1_NC_000913.3_338611_353610_0:0:0_0:0:0_37b/1
x = ide.split("_") for ide in nodestr.split('NC_')[1:]: # specific to e coli
start, end = int(x[1]), int(x[2]) graph_type = "ecoli"
cur_node_mols += [(start,end)] #print(ide)
x = ide.split("_")
start, end = int(x[1]), int(x[2])
cur_node_mols += [(start,end)]
else:
if ':' not in nodestr:
# a non-merged node
cur_node_mols += [(int(nodestr),int(nodestr)+1)]
else:
nodestr = nodestr.split(':')[1]
nodestr = nodestr.replace(']','').replace('[','')
for ide in nodestr.split('_'):
#print(ide)
cur_node_mols += [(int(ide),int(ide)+1)] # a tuple just for compatibility with the ecoli format.. should be just int(ide)
return cur_node_mols return cur_node_mols
def is_coherent_path(central_nodes,path_len): def is_coherent_path(central_nodes,path_len):
...@@ -183,7 +199,7 @@ def evaluate_sensitivity_paths(path_len,overlap_length=7000): ...@@ -183,7 +199,7 @@ def evaluate_sensitivity_paths(path_len,overlap_length=7000):
for mol in mols: for mol in mols:
molecules_to_nodes[mol] += [node] molecules_to_nodes[mol] += [node]
all_molecules = sorted(all_molecules) all_molecules = sorted(all_molecules)
print("got",len(all_molecules),"molecules total") #print("got",len(all_molecules),"molecules total")
nb_found, nb_not_found = 0,0 nb_found, nb_not_found = 0,0
for i in range(len(all_molecules)-path_len+1): for i in range(len(all_molecules)-path_len+1):
sought_path = all_molecules[i:i+path_len] sought_path = all_molecules[i:i+path_len]
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment