diff --git a/Snakefile_data_simu b/Snakefile_data_simu index 5971238c04cf75c573c39183288a8385ab4eec20..35570c7b9f30ce5ea734a2c9b88348a7f40bfa25 100644 --- a/Snakefile_data_simu +++ b/Snakefile_data_simu @@ -1,9 +1,9 @@ WORKDIR="snake_exec" if "outdir" not in config else config["outdir"] -N=[5000] if "n" not in config else config["n"] # Number of molecule to simulate -D=[5] if "d" not in config else config["d"] # Average coverage of each molecule +N=[10000] if "n" not in config else config["n"] # Number of molecule to simulate +D=[10] if "d" not in config else config["d"] # Average coverage of each molecule M=[3] if "m" not in config else config["m"] # Average number of molecule per barcode -M_DEV=[0] if "m_dev" not in config else config["m_dev"] # Std deviation for merging number +M_DEV=[1] if "m_dev" not in config else config["m_dev"] # Std deviation for merging number iter=1 # snake_experiments/simu_0_bar_n500_d5_m2.gexf diff --git a/deconvolution/main/d2_random_path_evaluation.py b/deconvolution/main/d2_random_path_evaluation.py index 857f9fcc5f6d3ad172a336ea14556659bebbcb46..e9783402f5b4b942a841a2bda7ac4555ee71c3e4 100755 --- a/deconvolution/main/d2_random_path_evaluation.py +++ b/deconvolution/main/d2_random_path_evaluation.py @@ -10,6 +10,8 @@ import argparse from termcolor import colored import networkx as nx +graph_type = "interval" # will be set to "ecoli" if we detect that it is, in fact, a barcode graph simulated from ecoli +# this affects how we evaluate that path is correct: mols starts absolute diff < 9k for ecoli, < 5--10 for interval graphs def parse_args(): parser = argparse.ArgumentParser(description='Process some integers.') @@ -47,6 +49,7 @@ given as input a list of molecules in central nodes, return a stripped-down list that only includes molecules that overlap their neighbor """ def extract_likely_molecule_path_wrapper(path,starter_mol): + tolerance = 9000 if graph_type == "ecoli" else 10 # tolerance for ecoli =the max distance between two starting points, if min overlap is 6000 and mols at 15kbp of length debug = False res = [] direction = None @@ -54,7 +57,7 @@ def extract_likely_molecule_path_wrapper(path,starter_mol): for i,mols in enumerate(path): good_mol = None for mol in mols: - if (abs(previous_mol[0] - mol[0])< 9000): # the max distance between two starting points, if min overlap is 6000 and mols at 15kbp of length + if (abs(previous_mol[0] - mol[0])< tolerance): """ shouldn't enforce directionality. why? because a->b->c can sometimes be a->c->b and still be correct, because a,b,c are a clique """ #if i == 0: #direction = "decreasing" if previous_mol[0] > mol[0] else "increasing" @@ -85,13 +88,26 @@ def is_there_path_acc(path): # don't consider overlaps smaller than 5kbp """ converts a central node of a d-graph into its list of molecules (given the ground truth) """ def central_node_to_molecules(nodestr): - # format for a 2-merge: 1:NC_000913.3_298281_313280_0:0:0_0:0:0_2fb/1_NC_000913.3_338611_353610_0:0:0_0:0:0_37b/1 + global graph_type cur_node_mols = [] - for ide in nodestr.split('NC_')[1:]: # specific to e coli - #print(ide) - x = ide.split("_") - start, end = int(x[1]), int(x[2]) - cur_node_mols += [(start,end)] + if 'NC_' in nodestr: # ecoli specific + # format for a 2-merge: 1:NC_000913.3_298281_313280_0:0:0_0:0:0_2fb/1_NC_000913.3_338611_353610_0:0:0_0:0:0_37b/1 + for ide in nodestr.split('NC_')[1:]: # specific to e coli + graph_type = "ecoli" + #print(ide) + x = ide.split("_") + start, end = int(x[1]), int(x[2]) + cur_node_mols += [(start,end)] + else: + if ':' not in nodestr: + # a non-merged node + cur_node_mols += [(int(nodestr),int(nodestr)+1)] + else: + nodestr = nodestr.split(':')[1] + nodestr = nodestr.replace(']','').replace('[','') + for ide in nodestr.split('_'): + #print(ide) + cur_node_mols += [(int(ide),int(ide)+1)] # a tuple just for compatibility with the ecoli format.. should be just int(ide) return cur_node_mols def is_coherent_path(central_nodes,path_len): @@ -183,7 +199,7 @@ def evaluate_sensitivity_paths(path_len,overlap_length=7000): for mol in mols: molecules_to_nodes[mol] += [node] all_molecules = sorted(all_molecules) - print("got",len(all_molecules),"molecules total") + #print("got",len(all_molecules),"molecules total") nb_found, nb_not_found = 0,0 for i in range(len(all_molecules)-path_len+1): sought_path = all_molecules[i:i+path_len]