evaluate.py 3.91 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
#!/usr/bin/env python3


import sys
import argparse
from termcolor import colored

import networkx as nx


def parse_args():
    """ Parse the command line arguments of the script.
        @return The argparse namespace, with `filename` (path of the graph
        file to evaluate) and `light_print` (True when only wrong entries
        should be printed).
    """
    parser = argparse.ArgumentParser(
        description='Evaluate the node frequencies of a deconvolved graph file.')
    parser.add_argument('filename', type=str,
                        help='The output file to evaluate')
    parser.add_argument('--light-print', '-l', action='store_true',
                        help='Print only wrong nodes and paths')

    return parser.parse_args()


def load_graph(filename):
    """ Load a graph from a file, choosing the reader from the file extension.
        @param filename Path of the graph file. Must end with `.graphml` or `.gexf`.
        @return The graph returned by the matching networkx reader.
        Exits the program with status 1 when the extension is not supported.
    """
    if filename.endswith('.graphml'):
        return nx.read_graphml(filename)
    if filename.endswith('.gexf'):
        return nx.read_gexf(filename)

    print("Wrong file format. Require graphml or gexf format", file=sys.stderr)
    # Non-zero status so that calling scripts can detect the failure.
    sys.exit(1)


""" Compute appearance frequencies from node names.
    All the node names must be under the format :
    {idx}:{mol1_id}_{mol2_id}_...{molx_id}.other_things_here
    @param graph The networkx graph representing the deconvolved graph
    @param only_wrong If True, don't print correct nodes
    @param file_pointer Where to print the output. If set to stdout, then pretty print. If set to None, don't print anything.
    @return A tuple containing two dictionaries. The first one with theoretical frequencies of each node, the second one with observed frequencies.
"""
def parse_graph_frequencies(graph, only_wrong=False, file_pointer=sys.stdout):
    """ Compare how often each origin node appears with how often it should.
        Node names are expected to look like
        `{idx}:{mol1_id}_{mol2_id}_...{molx_id}.other_things_here`; the part
        before the first dot is the origin name.
        @param graph Object whose `nodes()` yields the node names (strings).
        @param only_wrong If True, don't print the entries where observed and expected counts match.
        @param file_pointer Where to print the report. Colored when it is sys.stdout; nothing is printed when None.
        @return A tuple of two dictionaries keyed by origin name: expected counts first, observed counts second.
        Raises ValueError when an origin name does not contain exactly one ':'.
    """
    # Count how many graph nodes share each origin name.
    observed_frequences = {}
    for node in graph.nodes():
        # partition() keeps the whole name when there is no dot; slicing on
        # find() == -1 would silently drop the last character instead.
        origin_name = node.partition('.')[0]
        observed_frequences[origin_name] = observed_frequences.get(origin_name, 0) + 1

    # An origin node should be split into one node per molecule it contains.
    theoritical_frequencies = {}
    for node_name in observed_frequences:
        _, composition = node_name.split(':')
        theoritical_frequencies[node_name] = len(composition.split('_'))

    # Print results
    if file_pointer is not None:
        print("--- Frequency analysis ---", file=file_pointer)
        for key, the in theoritical_frequencies.items():
            obs = observed_frequences[key]
            if only_wrong and obs == the:
                continue

            result = f"{key}: {obs}/{the}"
            # Colors are only applied for terminal output.
            if file_pointer is sys.stdout:
                result = colored(result, 'green' if obs == the else 'red')

            print(result, file=file_pointer)

    return theoritical_frequencies, observed_frequences


def print_summary(frequencies, file_pointer=sys.stdout):
    """ Print a global summary of the frequency analysis.
        @param frequencies A pair of dictionaries keyed by barcode: expected counts first, observed counts second.
        @param file_pointer Where to print the summary. Nothing is printed when None.
    """
    if file_pointer is None:
        return

    theoritical = frequencies[0]
    observed = frequencies[1]

    print("--- Global summary ---", file=file_pointer)

    # --- Frequency usage ---
    # Tags
    print(f"Distinct barcodes: {len(observed)}/{len(theoritical)}", file=file_pointer)
    # molecules
    cumulative_theoritical_nodes = sum(theoritical.values())
    cumulative_observed_nodes = sum(observed.values())
    print(f"Molecules: {cumulative_observed_nodes}/{cumulative_theoritical_nodes}", file=file_pointer)
    # Wrong splits: surplus nodes count as over-splitting, missing ones as under-splitting.
    over_split = 0
    under_split = 0
    for barcode, expected in theoritical.items():
        seen = observed[barcode]
        over_split += max(seen - expected, 0)
        under_split += max(expected - seen, 0)
    # Must go to file_pointer like the other lines, not unconditionally to stdout.
    print(f"Under/Over splitting: {under_split} - {over_split}", file=file_pointer)
106
107
108
109
110


def main():
    """ Script entry point: load the graph named on the command line, print
        its frequency analysis, then print the global summary.
    """
    args = parse_args()
    graph = load_graph(args.filename)
    # --light-print restricts the per-node report to the wrong entries.
    frequencies = parse_graph_frequencies(graph, only_wrong=args.light_print)

    print_summary(frequencies)


# Run the evaluation only when this file is executed as a script.
if __name__ == "__main__":
    main()