initial modifs for interval graphs evaluation acc/sens

84920ea8 · Rayan CHIKHI · 4782df1e · 84920ea8 · 84920ea8
Commit 84920ea8 authored 5 years ago by Rayan CHIKHI
--- a/Snakefile_data_simu
+++ b/Snakefile_data_simu
 WORKDIR="snake_exec" if "outdir" not in config else config["outdir"]
-N=[5000] if "n" not in config else config["n"] # Number of molecule to simulate
+N=[10000] if "n" not in config else config["n"] # Number of molecule to simulate
-D=[5] if "d" not in config else config["d"] # Average coverage of each molecule
+D=[10] if "d" not in config else config["d"] # Average coverage of each molecule
 M=[3] if "m" not in config else config["m"] # Average number of molecule per barcode
-M_DEV=[0] if "m_dev" not in config else config["m_dev"] # Std deviation for merging number
+M_DEV=[1] if "m_dev" not in config else config["m_dev"] # Std deviation for merging number
 iter=1
 # snake_experiments/simu_0_bar_n500_d5_m2.gexf

--- a/deconvolution/main/d2_random_path_evaluation.py
+++ b/deconvolution/main/d2_random_path_evaluation.py
@@ -10,6 +10,8 @@ import argparse
 from termcolor import colored
 import networkx as nx
+graph_type = "interval" # will be set to "ecoli" if we detect that it is, in fact, a barcode graph simulated from ecoli
+# this affects how we evaluate that path is correct: mols starts absolute diff < 9k for ecoli, < 5--10 for interval graphs
 def parse_args():
    parser = argparse.ArgumentParser(description='Process some integers.')
@@ -47,6 +49,7 @@ given as input a list of molecules in central nodes,
 return a stripped-down list that only includes molecules that overlap their neighbor
 """
 def extract_likely_molecule_path_wrapper(path,starter_mol):
+    tolerance = 9000 if graph_type == "ecoli" else 10 # tolerance for ecoli =the max distance between two starting points, if min overlap is 6000 and mols at 15kbp of length
    debug = False
    res = []
    direction = None
@@ -54,7 +57,7 @@ def extract_likely_molecule_path_wrapper(path,starter_mol):
    for i,mols in enumerate(path):
        good_mol = None
        for mol in mols:
-            if (abs(previous_mol[0] - mol[0])< 9000): # the max distance between two starting points, if min overlap is 6000 and mols at 15kbp of length
+            if (abs(previous_mol[0] - mol[0])< tolerance): 
                """ shouldn't enforce directionality. why? because a->b->c can sometimes be a->c->b and still be correct, because a,b,c are a clique """
                #if i == 0:
                    #direction = "decreasing" if previous_mol[0] > mol[0] else "increasing"
@@ -85,13 +88,26 @@ def is_there_path_acc(path): # don't consider overlaps smaller than 5kbp
 """ converts a central node of a d-graph into its list of molecules (given the ground truth) """
 def central_node_to_molecules(nodestr):
-    # format for a 2-merge: 1:NC_000913.3_298281_313280_0:0:0_0:0:0_2fb/1_NC_000913.3_338611_353610_0:0:0_0:0:0_37b/1
+    global graph_type
    cur_node_mols = []
-    for ide in nodestr.split('NC_')[1:]: # specific to e coli
+    if 'NC_' in nodestr: # ecoli specific
-        #print(ide)
+        # format for a 2-merge: 1:NC_000913.3_298281_313280_0:0:0_0:0:0_2fb/1_NC_000913.3_338611_353610_0:0:0_0:0:0_37b/1
-        x = ide.split("_")
+        for ide in nodestr.split('NC_')[1:]: # specific to e coli
-        start, end = int(x[1]), int(x[2])
+            graph_type = "ecoli"
-        cur_node_mols += [(start,end)]
+            #print(ide)
+            x = ide.split("_")
+            start, end = int(x[1]), int(x[2])
+            cur_node_mols += [(start,end)]
+    else:
+        if ':' not in nodestr:
+            # a non-merged node
+            cur_node_mols += [(int(nodestr),int(nodestr)+1)]
+        else:
+           nodestr = nodestr.split(':')[1]
+           nodestr = nodestr.replace(']','').replace('[','')
+           for ide in nodestr.split('_'):
+               #print(ide)
+               cur_node_mols += [(int(ide),int(ide)+1)] # a tuple just for compatibility with the ecoli format.. should be just int(ide)
    return cur_node_mols 
 def is_coherent_path(central_nodes,path_len):
@@ -183,7 +199,7 @@ def evaluate_sensitivity_paths(path_len,overlap_length=7000):
        for mol in mols:
            molecules_to_nodes[mol] += [node]
    all_molecules = sorted(all_molecules)
-    print("got",len(all_molecules),"molecules total")
+    #print("got",len(all_molecules),"molecules total")
    nb_found, nb_not_found = 0,0
    for i in range(len(all_molecules)-path_len+1):
        sought_path  = all_molecules[i:i+path_len]