evaluate.py 2.89 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
#!/usr/bin/env python3


import sys
import argparse
from termcolor import colored

import networkx as nx


def parse_args(argv=None):
    """Parse the command-line arguments of the evaluation script.

    @param argv Optional list of argument strings to parse. When None (the
                default), argparse reads the arguments of the running process.
    @return The argparse namespace; its `filename` attribute holds the path
            of the graph file to evaluate.
    """
    parser = argparse.ArgumentParser(
        description='Evaluate the node frequencies of a deconvolved graph.')
    parser.add_argument('filename', type=str,
                        help='The output file to evaluate')

    return parser.parse_args(argv)


def load_graph(filename):
    """Load a graph from a GraphML or GEXF file.

    The reader is chosen from the suffix of the file name. For any other
    suffix an error message is written to stderr and the process exits
    with status 1.

    @param filename Path of the file to read; must end with `.graphml` or `.gexf`.
    @return The graph built by the matching networkx reader.
    """
    if filename.endswith('.graphml'):
        return nx.read_graphml(filename)
    if filename.endswith('.gexf'):
        return nx.read_gexf(filename)

    print("Wrong file format. Require graphml or gexf format", file=sys.stderr)
    # sys.exit rather than the site-provided exit(): it is always available,
    # and a non-zero status tells the calling shell that the run failed.
    sys.exit(1)


""" Compute appearance frequencies from node names.
    All the node names must be under the format :
    {idx}:{mol1_id}_{mol2_id}_...{molx_id}.other_things_here
    @param graph The networkx graph representing the deconvolved graph
    @param only_wrong If True, don't print correct nodes
    @param file_pointer Where to print the output. If set to stdout, then pretty print. If set to None, don't print anything.
    @return A tuple containing two dictionaries. The first one with theoretical frequencies of each node, the second one with observed frequencies.
"""
def parse_graph_frequencies(graph, only_wrong=False, file_pointer=sys.stdout):
    """Compute appearance frequencies from node names.

    Node names are expected under the format:
    {idx}:{mol1_id}_{mol2_id}_...{molx_id}.other_things_here
    The part before the first dot is the origin name. A name without any
    dot is taken whole as its own origin name.

    @param graph Object whose nodes() method yields the node names (strings).
    @param only_wrong If True, don't print the nodes whose observed frequency matches the theoretical one.
    @param file_pointer Where to print the output. If it is sys.stdout, lines are colored (green when matching, red otherwise). If None, nothing is printed.
    @return A tuple of two dictionaries keyed by origin name: the theoretical frequencies (number of molecule ids in the name), then the observed frequencies (number of nodes sharing that origin name).
    @raise ValueError If an origin name does not contain exactly one ':'.
    """
    # Count how many nodes share each origin name `{idx}:{mol1_id}_{mol2_id}_...`
    observed_frequences = {}
    for node in graph.nodes():
        # partition() keeps the whole name when there is no dot; slicing on
        # find() == -1 would silently drop the last character instead.
        origin_name = node.partition(".")[0]
        observed_frequences[origin_name] = observed_frequences.get(origin_name, 0) + 1

    # Compute wanted frequencies: a node should be split into as many nodes
    # as there are molecules listed in its name.
    theoritical_frequencies = {}
    for node_name in observed_frequences:
        _, composition = node_name.split(':')
        theoritical_frequencies[node_name] = len(composition.split('_'))

    # Print results
    if file_pointer is not None:
        pretty = file_pointer is sys.stdout
        print("--- Frequency analysis ---", file=file_pointer)
        for key, the in theoritical_frequencies.items():
            obs = observed_frequences[key]
            if only_wrong and obs == the:
                continue

            result = f"{key}: {obs}/{the}"
            if pretty:
                result = colored(result, 'green' if obs == the else 'red')
            print(result, file=file_pointer)

    return theoritical_frequencies, observed_frequences


def print_summary(frequencies, file_pointer=sys.stdout):
    """Print the header line of the global summary section.

    @param frequencies Frequency data for the summary; not used by this function yet.
    @param file_pointer Stream handed to print() as its `file` argument.
    """
    header = "--- Global summary ---"
    print(header, file=file_pointer)



def main():
    """Script entry point: parse the arguments, load the graph, analyse it and print the summary."""
    cli = parse_args()
    deconvolved = load_graph(cli.filename)

    # The analysis result is handed to the summary printer as returned.
    print_summary(parse_graph_frequencies(deconvolved))


# Run the evaluation only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()