evaluate.py 5.37 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
#!/usr/bin/env python3


import sys
import argparse
from termcolor import colored

import networkx as nx


def parse_args():
    parser = argparse.ArgumentParser(description='Process some integers.')
    parser.add_argument('filename', type=str,
                        help='The output file to evalute')
Yoann Dufresne's avatar
Yoann Dufresne committed
15
16
    parser.add_argument('--light-print', '-l', action='store_true',
                        help='Print only wrong nodes and paths')
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32

    args = parser.parse_args()

    return args


def load_graph(filename):
    if filename.endswith('.graphml'):
        return nx.read_graphml(filename)
    elif filename.endswith('.gexf'):
        return nx.read_gexf(filename)
    else:
        print("Wrong file format. Require graphml or gefx format", file=sys.stderr)
        exit()


Yoann Dufresne's avatar
Yoann Dufresne committed
33
34
35
36
def mols_from_node(node_name):
    return [int(idx) for idx in node_name.split(":")[1].split(".")[0].split("_")]


37
38
39
40
41
42
""" Compute appearance frequencies from node names.
    All the node names must be under the format :
    {idx}:{mol1_id}_{mol2_id}_...{molx_id}.other_things_here
    @param graph The networkx graph representinf the deconvolved graph
    @param only_wong If True, don't print correct nodes
    @param file_pointer Where to print the output. If set to stdout, then pretty print. If set to None, don't print anything.
Yoann Dufresne's avatar
Yoann Dufresne committed
43
    @return A tuple containing two dictionaries. The first one with theoritical frequencies of each node, the second one with observed frequencies.
44
"""
Yoann Dufresne's avatar
Yoann Dufresne committed
45
def parse_graph_frequencies(graph):
46
    # Compute origin nodes formated as `{idx}:{mol1_id}_{mol2_id}_...`
Yoann Dufresne's avatar
Yoann Dufresne committed
47
    observed_frequencies = {}
48
    origin_node_names = []
Yoann Dufresne's avatar
Yoann Dufresne committed
49
    node_per_barcode = {}
50
    for node in graph.nodes():
Yoann Dufresne's avatar
Yoann Dufresne committed
51
52
53
54
55
        origin_name = node.split(".")[0]

        if not origin_name in node_per_barcode:
            node_per_barcode[origin_name] = []
        node_per_barcode[origin_name].append(node)
56
57

        # Count frequency
Yoann Dufresne's avatar
Yoann Dufresne committed
58
59
        if not origin_name in observed_frequencies:
            observed_frequencies[origin_name] = 0
60
            origin_node_names.append(origin_name)
Yoann Dufresne's avatar
Yoann Dufresne committed
61
        observed_frequencies[origin_name] += 1
62
63
64
65
66
67
68
69
70
71
    
    # Compute wanted frequencies
    theoritical_frequencies = {}
    for node_name in origin_node_names:
        _, composition = node_name.split(':')

        mol_ids = composition.split('_')
        # The node should be splited into the number of molecules inside itself
        theoritical_frequencies[node_name] = len(mol_ids)

Yoann Dufresne's avatar
Yoann Dufresne committed
72
    return theoritical_frequencies, observed_frequencies, node_per_barcode
73
74


Yoann Dufresne's avatar
Yoann Dufresne committed
75
76
77
78
79
80
81
82
83
""" This function aims to look for direct molecule neighbors.
    If a node has more than 2 direct neighbors, it's not rightly splitted
"""
def parse_graph_path(graph):
    neighborhood = {}

    for node in graph.nodes():
        molecules = mols_from_node(node)
        neighbors = list(graph.neighbors(node))
84

Yoann Dufresne's avatar
Yoann Dufresne committed
85
86
87
88
89
90
91
92
        neighborhood[node] = []
        for mol in molecules:
            for nei in neighbors:
                nei_mols = mols_from_node(nei)
                if mol-1 in nei_mols:
                    neighborhood[node].append(nei)
                if mol+1 in nei_mols:
                    neighborhood[node].append(nei)
93

Yoann Dufresne's avatar
Yoann Dufresne committed
94
    return neighborhood
95
96


Yoann Dufresne's avatar
Yoann Dufresne committed
97
def print_summary(frequencies, neighborhood, light_print=False, file_pointer=sys.stdout):
Yoann Dufresne's avatar
Yoann Dufresne committed
98
99
100
    if file_pointer == None:
        return

Yoann Dufresne's avatar
Yoann Dufresne committed
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
    print("--- Nodes analysis ---", file=file_pointer)
    theoritical_frequencies, observed_frequencies,  node_per_barcode = frequencies
    for key in theoritical_frequencies:
        obs, the = observed_frequencies[key], theoritical_frequencies[key]
        result = f"{key}: {obs}/{the}"

        if file_pointer == sys.stdout:
            result = colored(result, 'green' if obs==the else 'red')

        # Compute neighborhood correctness
        neighborhood_ok = True
        for node in node_per_barcode[key]:
            if len(neighborhood[node]) != 2:
                neighborhood_ok = False

Yoann Dufresne's avatar
Yoann Dufresne committed
116
        if light_print and obs==the and neighborhood_ok:
Yoann Dufresne's avatar
Yoann Dufresne committed
117
118
119
120
121
122
123
124
125
126
127
128
            continue

        print(result, file=file_pointer)
        for node in node_per_barcode[key]:
            text = f"\t{node}\t{' '.join(neighborhood[node])}"

            if file_pointer == sys.stdout:
                text = colored(text, 'green' if len(neighborhood[node]) == 2 else 'yellow')

            print(text, file=file_pointer)


129
130
    print("--- Global summary ---", file=file_pointer)

Yoann Dufresne's avatar
Yoann Dufresne committed
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
    # --- Frequency usage ---
    # Tags
    distinct_theoritical_nodes = len(frequencies[0])
    distinct_observed_nodes = len(frequencies[1])
    print(f"Distinct barcodes: {distinct_observed_nodes}/{distinct_theoritical_nodes}", file=file_pointer)
    # molecules
    cumulative_theoritical_nodes = sum(frequencies[0].values())
    cumulative_observed_nodes = sum(frequencies[1].values())
    print(f"Molecules: {cumulative_observed_nodes}/{cumulative_theoritical_nodes}", file=file_pointer)
    # Wrong splits
    over_split = 0
    under_split = 0
    for barcode in frequencies[0]:
        observed = frequencies[1][barcode]
        theoritic = frequencies[0][barcode]
        over_split += max(observed-theoritic, 0)
        under_split += max(theoritic-observed, 0)
    print(f"Under/Over splitting: {under_split} - {over_split}")
149
150
151
152
153


def main():
    args = parse_args()
    graph = load_graph(args.filename)
Yoann Dufresne's avatar
Yoann Dufresne committed
154
155
    frequencies = parse_graph_frequencies(graph)
    neighborhood = parse_graph_path(graph)
156

Yoann Dufresne's avatar
Yoann Dufresne committed
157
    print_summary(frequencies, neighborhood, light_print=args.light_print)
158
159
160
161


if __name__ == "__main__":
    main()