Commit c475c1d3 authored by Yoann Dufresne's avatar Yoann Dufresne
Browse files

evaluation

parent 398b47ac
#!/usr/bin/env python3
import sys
import argparse
from termcolor import colored
import networkx as nx
def parse_args():
parser = argparse.ArgumentParser(description='Process some integers.')
parser.add_argument('filename', type=str,
help='The output file to evalute')
parser.add_argument('--light-print', '-l', action='store_true',
help='Print only wrong nodes and paths')
args = parser.parse_args()
return args
def load_graph(filename):
if filename.endswith('.graphml'):
return nx.read_graphml(filename)
elif filename.endswith('.gexf'):
return nx.read_gexf(filename)
else:
print("Wrong file format. Require graphml or gefx format", file=sys.stderr)
exit()
def mols_from_node(node_name):
return [int(idx) for idx in node_name.split(":")[1].split(".")[0].split("_")]
""" Compute appearance frequencies from node names.
All the node names must be under the format :
{idx}:{mol1_id}_{mol2_id}_...{molx_id}.other_things_here
@param graph The networkx graph representinf the deconvolved graph
@param only_wong If True, don't print correct nodes
@param file_pointer Where to print the output. If set to stdout, then pretty print. If set to None, don't print anything.
@return A tuple containing two dictionaries. The first one with theoritical frequencies of each node, the second one with observed frequencies.
"""
def parse_graph_frequencies(graph):
# Compute origin nodes formated as `{idx}:{mol1_id}_{mol2_id}_...`
observed_frequencies = {}
origin_node_names = []
node_per_barcode = {}
for node in graph.nodes():
origin_name = node.split(".")[0]
if not origin_name in node_per_barcode:
node_per_barcode[origin_name] = []
node_per_barcode[origin_name].append(node)
# Count frequency
if not origin_name in observed_frequencies:
observed_frequencies[origin_name] = 0
origin_node_names.append(origin_name)
observed_frequencies[origin_name] += 1
# Compute wanted frequencies
theoritical_frequencies = {}
for node_name in origin_node_names:
_, composition = node_name.split(':')
mol_ids = composition.split('_')
# The node should be splited into the number of molecules inside itself
theoritical_frequencies[node_name] = len(mol_ids)
return theoritical_frequencies, observed_frequencies, node_per_barcode
""" This function aims to look for direct molecule neighbors.
If a node has more than 2 direct neighbors, it's not rightly splitted
"""
def parse_graph_path(graph):
neighborhood = {}
for node in graph.nodes():
molecules = mols_from_node(node)
neighbors = list(graph.neighbors(node))
neighborhood[node] = []
for mol in molecules:
for nei in neighbors:
nei_mols = mols_from_node(nei)
if mol-1 in nei_mols:
neighborhood[node].append(nei)
if mol+1 in nei_mols:
neighborhood[node].append(nei)
return neighborhood
def print_summary(frequencies, neighborhood, light_print=False, file_pointer=sys.stdout):
if file_pointer == None:
return
print("--- Nodes analysis ---", file=file_pointer)
theoritical_frequencies, observed_frequencies, node_per_barcode = frequencies
for key in theoritical_frequencies:
obs, the = observed_frequencies[key], theoritical_frequencies[key]
result = f"{key}: {obs}/{the}"
if file_pointer == sys.stdout:
result = colored(result, 'green' if obs==the else 'red')
# Compute neighborhood correctness
neighborhood_ok = True
for node in node_per_barcode[key]:
if len(neighborhood[node]) != 2:
neighborhood_ok = False
if light_print and obs==the and neighborhood_ok:
continue
print(result, file=file_pointer)
for node in node_per_barcode[key]:
text = f"\t{node}\t{' '.join(neighborhood[node])}"
if file_pointer == sys.stdout:
text = colored(text, 'green' if len(neighborhood[node]) == 2 else 'yellow')
print(text, file=file_pointer)
print("--- Global summary ---", file=file_pointer)
# --- Frequency usage ---
# Tags
distinct_theoritical_nodes = len(frequencies[0])
distinct_observed_nodes = len(frequencies[1])
print(f"Distinct barcodes: {distinct_observed_nodes}/{distinct_theoritical_nodes}", file=file_pointer)
# molecules
cumulative_theoritical_nodes = sum(frequencies[0].values())
cumulative_observed_nodes = sum(frequencies[1].values())
print(f"Molecules: {cumulative_observed_nodes}/{cumulative_theoritical_nodes}", file=file_pointer)
# Wrong splits
over_split = 0
under_split = 0
for barcode in frequencies[0]:
observed = frequencies[1][barcode]
theoritic = frequencies[0][barcode]
over_split += max(observed-theoritic, 0)
under_split += max(theoritic-observed, 0)
print(f"Under/Over splitting: {under_split} - {over_split}")
def main():
args = parse_args()
graph = load_graph(args.filename)
frequencies = parse_graph_frequencies(graph)
neighborhood = parse_graph_path(graph)
print_summary(frequencies, neighborhood, light_print=args.light_print)
if __name__ == "__main__":
main()
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment