Commit c0bf8e4c authored by Yoann Dufresne's avatar Yoann Dufresne
Browse files

evaluation of path ok

parent d1bb245c
......@@ -2,12 +2,12 @@
import sys
import csv
import argparse
from termcolor import colored
import networkx as nx
from d2_graph import D2Graph
def parse_args():
parser = argparse.ArgumentParser(description='Process some integers.')
......@@ -17,8 +17,9 @@ def parse_args():
help="Define the data type to evaluate. Must be 'd2' or 'path' or 'd2-2annotate' (Rayan's hack).")
parser.add_argument('--light-print', '-l', action='store_true',
help='Print only wrong nodes and paths')
parser.add_argument('--barcode_graph', '-b', help="Path to the barcode graph corresponding to the d2_graph to analyse.")
parser.add_argument('--optimization_file', '-o',
help="If the main file is a d2, a file formated for optimization can be set. This file will be used to compute the coverage of the longest path on the barcode graph.")
help="If the main file is a d2, a file formatted for optimization can be set. This file will be used to compute the coverage of the longest path on the barcode graph.")
args = parser.parse_args()
return args
......@@ -59,34 +60,30 @@ def mols_from_node(node_name):
@param file_pointer Where to print the output. If set to stdout, then pretty print. If set to None, don't print anything.
@return A tuple containing three dictionaries. The first one with theoretical frequencies of each node, the second one with observed frequencies, the third one with the list of graph nodes attached to each origin node.
"""
def parse_path_graph_frequencies(graph, barcode_graph=None):
    """Compare how often each barcode node is used in a path graph.

    Origin (barcode) node names are formatted as
    `{idx}:{mol1_id}_{mol2_id}_...`; a barcode node is expected to appear
    once per molecule listed in its name.

    @param graph The path graph to evaluate. Each node must carry a
        "center" attribute holding the name of its origin barcode node.
    @param barcode_graph The barcode graph whose node names define the
        theoretical frequencies. If None, the theoretical frequencies are
        computed only for the origin names observed in `graph`.
    @return A tuple of three dictionaries: the theoretical frequency of
        each origin node, its observed frequency in `graph`, and the list
        of `graph` nodes attached to each origin node.
    """
    observed_frequencies = {}
    node_per_barcode = {}

    for node, data in graph.nodes(data=True):
        origin_name = data["center"]
        node_per_barcode.setdefault(origin_name, []).append(node)
        observed_frequencies[origin_name] = observed_frequencies.get(origin_name, 0) + 1

    # Without a barcode graph, only the origins seen in the path graph can
    # be evaluated (missing barcodes are then invisible to the caller).
    if barcode_graph is None:
        origin_names = list(observed_frequencies)
    else:
        origin_names = barcode_graph.nodes()

    # A node should be split into as many copies as it contains molecules.
    real_frequencies = {
        node_id: len(node_id.split(":")[1].split("_"))
        for node_id in origin_names
    }

    return real_frequencies, observed_frequencies, node_per_barcode
""" This function aims to look for direct molecule neighbors.
......@@ -111,57 +108,44 @@ def parse_graph_path(graph):
return neighborhood
def print_path_summary(frequencies, light_print=False, file_pointer=sys.stdout):
    """Print a per-node and a global report on barcode usage frequencies.

    @param frequencies A tuple (theoretical frequencies, observed
        frequencies, nodes per barcode) as returned by
        parse_path_graph_frequencies. Only the first two are used here.
    @param light_print If True, only print the nodes whose observed
        frequency differs from the theoretical one.
    @param file_pointer Where to print the output. If set to stdout, then
        pretty print (colors). If set to None, don't print anything.
    """
    if file_pointer is None:
        return

    theoretical_frequencies, observed_frequencies, _ = frequencies
    # Colors are only meaningful on the terminal, not in a regular file.
    use_colors = file_pointer is sys.stdout

    print("--- Nodes analysis ---", file=file_pointer)
    for key, the in theoretical_frequencies.items():
        # A barcode missing from the path graph has been observed 0 times.
        obs = observed_frequencies.get(key, 0)
        if light_print and obs == the:
            continue

        result = f"{key}: {obs}/{the}"
        if use_colors:
            result = colored(result, 'green' if obs == the else 'red')
        print(result, file=file_pointer)

    print("--- Global summary ---", file=file_pointer)
    # --- Frequency usage ---
    # Tags
    distinct_theoretical_nodes = len(theoretical_frequencies)
    distinct_observed_nodes = len(observed_frequencies)
    print(f"Distinct barcodes: {distinct_observed_nodes}/{distinct_theoretical_nodes}", file=file_pointer)

    # Molecules
    cumulative_theoretical_nodes = sum(theoretical_frequencies.values())
    cumulative_observed_nodes = sum(observed_frequencies.values())
    print(f"Molecules: {cumulative_observed_nodes}/{cumulative_theoretical_nodes}", file=file_pointer)

    # Wrong splits
    over_split = 0
    under_split = 0
    for barcode, theoretic in theoretical_frequencies.items():
        observed = observed_frequencies.get(barcode, 0)
        over_split += max(observed - theoretic, 0)
        under_split += max(theoretic - observed, 0)
    # Sent to file_pointer like the rest of the report (it used to always
    # go to stdout).
    print(f"Under/Over splitting: {under_split} - {over_split}", file=file_pointer)
......@@ -230,32 +214,6 @@ def print_d2_summary(connected_components, longest_path, covered_vars={}, light_
falses.append(idx)
print(f"Coverage: {nb_true}/{len(covered_vars)}")
print(f"Uncovered_values:\n{falses}")
# def component_to_nearest_neighbor_graph(component):
# nng = nx.Graph()
# nng.add_nodes_from(component.nodes())
# for edge in component.edges():
# node1, node2 = edge
# node1 = parse_dg_name(node1)
# node2 = parse_dg_name(node2)
# central1 = mols_from_node(node1[0][1])
# central2 = frozenset(mols_from_node(node2[0][1]))
# for mol1 in central1:
# if mol1-1 in central2 or mol1+1 in central2:
# nng.add_edge(edge[0], edge[1])
# componnents = list(nx.connected_components(nng))
# print([len(x) for x in componnents])
# componnents.sort(key=lambda x: -len(x))
# componnents = [nng.subgraph(x) for x in componnents]
# nx.write_gexf(componnents[0], "data/d2_reducted.gexf")
# return nng, componnents
def compute_next_nodes(d2_component):
......@@ -491,10 +449,10 @@ def main():
graph = load_graph(args.filename)
if args.type == "path":
frequencies = parse_path_graph_frequencies(graph)
neighborhood = parse_graph_path(graph)
barcode_graph = load_graph(args.barcode_graph)
frequencies = parse_path_graph_frequencies(graph, barcode_graph)
print_path_summary(frequencies, neighborhood, light_print=args.light_print)
print_path_summary(frequencies, light_print=args.light_print)
elif args.type == "d2":
components = list(nx.connected_components(graph))
components.sort(key=lambda x: -len(x))
......@@ -516,9 +474,10 @@ def main():
component = verify_graph_edges(component)
extension=args.filename.split('.')[-1]
base_filename='.'.join(args.filename.split('.')[:-1])
save_graph(component,base_filename+".verified."+extension)
extension = args.filename.split('.')[-1]
base_filename = '.'.join(args.filename.split('.')[:-1])
save_graph(component, base_filename+".verified."+extension)
if __name__ == "__main__":
main()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment