d_graph.py 14.5 KB
Newer Older
1 2
import networkx as nx
from functools import total_ordering
3
import community # pip install python-louvain
4

Yoann Dufresne's avatar
Yoann Dufresne committed
5

6 7 8
@total_ordering
class Dgraph(object):
    """docstring for Dgraph"""
Yoann Dufresne's avatar
Yoann Dufresne committed
9
    def __init__(self, center):
10
        super(Dgraph, self).__init__()
Yoann Dufresne's avatar
Yoann Dufresne committed
11
        self.idx = -1
Yoann Dufresne's avatar
Yoann Dufresne committed
12
        self.center = center
13
        self.score = 0
Yoann Dufresne's avatar
Yoann Dufresne committed
14 15
        self.halves = [None,None]
        self.connexity = [None,None]
16
        self.nodes = [self.center]
17
        self.node_set = set(self.nodes)
Yoann Dufresne's avatar
Yoann Dufresne committed
18
        self.edges = []
19
        self.ordered_list = None
20
        self.sorted_list = None
21

Yoann Dufresne's avatar
Yoann Dufresne committed
22 23
        self.marked = False

24

Yoann Dufresne's avatar
Yoann Dufresne committed
25
    """ Static method to load a lcp from a text
26 27 28 29 30 31 32 33 34 35
        @param text the saved d-graph
        @param barcode_graph Barcode graph from which the d-graph is extracted
        @return a new d-graph object corresponding to the test 
    """
    def load(text, barcode_graph):
        # basic split
        text = text.replace(']', '')
        head, h1, h2 = text.split('[')

        # Head parsing
Yoann Dufresne's avatar
Yoann Dufresne committed
36 37
        center = head.replace(' ', '')
        if center not in barcode_graph:
38 39
            center = int(center)
        dg = Dgraph(center)
40

41
        # Reload halves
42
        h1 = [x.split(' ')[-2] for x in h1.split(',')]
Yoann Dufresne's avatar
Yoann Dufresne committed
43
        h1 = [int(x) if x not in barcode_graph else x for x in h1]
44
        h2 = [x.split(' ')[-2] for x in h2.split(',')]
Yoann Dufresne's avatar
Yoann Dufresne committed
45
        h2 = [int(x) if x not in barcode_graph else x for x in h2]
46 47
        dg.put_halves(h1, h2, barcode_graph)

48
        return dg
Yoann Dufresne's avatar
Yoann Dufresne committed
49

50 51 52 53 54 55 56 57 58

    """ Compute the d-graph quality (score) according to the connectivity between the two halves.
        @param h1 First half of the d-graph
        @param h2 Second half of the d-graph
        @param graph The connectivity graph
    """
    def put_halves(self, h1, h2, graph):
        self.score = 0
        self.halves[0] = h1
59 60
        for node in h1:
            self.node_set.add(node)
Yoann Dufresne's avatar
Yoann Dufresne committed
61
            self.nodes.append(node)
62
        self.halves[1] = h2
63 64
        for node in h2:
            self.node_set.add(node)
Yoann Dufresne's avatar
Yoann Dufresne committed
65
            self.nodes.append(node)
66

Yoann Dufresne's avatar
Yoann Dufresne committed
67
        # self.nodes = sorted([self.center] + self.halves[0] + self.halves[1])
Yoann Dufresne's avatar
Yoann Dufresne committed
68 69
        self.connexity[0] = {key: 0 for key in self.halves[0]}
        self.connexity[1] = {key: 0 for key in self.halves[1]}
Yoann Dufresne's avatar
Yoann Dufresne committed
70
        self.edges = []
71 72

        # Compute link arities
73
        for node1 in self.halves[0]:
74 75
            neighbors = set(graph.neighbors(node1))

76
            for node2 in self.halves[1]:
77 78
                if node1 == node2 or node2 in neighbors:
                    self.score += 1
Yoann Dufresne's avatar
Yoann Dufresne committed
79 80 81
                    self.connexity[0][node1] += 1
                    self.connexity[1][node2] += 1

82 83 84 85
        # Compute links from the center to the other nodes
        for idx, node1 in enumerate(self.nodes):
            for node2 in self.nodes[idx+1:]:
                if graph.has_edge(node1, node2):
Yoann Dufresne's avatar
Yoann Dufresne committed
86 87
                    if node1 < node2:
                        self.edges.append((node1, node2))
88
                    else:
Yoann Dufresne's avatar
Yoann Dufresne committed
89 90
                        self.edges.append((node2, node1))

Yoann Dufresne's avatar
Yoann Dufresne committed
91 92 93 94
        # Sort the halves by descending connexity
        connex = self.connexity
        self.halves[0].sort(reverse=True, key=lambda v: connex[0][v])
        self.halves[1].sort(reverse=True, key=lambda v: connex[1][v])
95 96


97
    def get_link_divergence(self):
98
        return int(abs(self.score - self.get_optimal_score()))
99 100 101 102


    def get_optimal_score(self):
        max_len = max(len(self.halves[0]), len(self.halves[1]))
103
        return int(max_len * (max_len - 1) / 2)
104 105


106 107
    def to_sorted_list(self):
        if self.sorted_list is None:
Yoann Dufresne's avatar
Yoann Dufresne committed
108
            self.sorted_list = sorted(self.nodes)
109
        return self.sorted_list
Yoann Dufresne's avatar
Yoann Dufresne committed
110 111


112
    def to_ordered_lists(self):
113 114 115 116 117 118 119 120 121 122 123 124 125 126
        if self.ordered_list is None:
            hands = [[],[]]
            for idx in range(2):
                prev_connectivity = -1
                for node in self.halves[idx]:
                    # group nodes by similar connectivity
                    value = self.connexity[idx][node]
                    if value != prev_connectivity:
                        hands[idx].append([])
                        prev_connectivity = value
                    hands[idx][-1].append(node)

            self.ordered_list = hands[0][::-1] + [[self.center]] + hands[1]
        return self.ordered_list
Yoann Dufresne's avatar
Yoann Dufresne committed
127 128


129 130
    def to_node_set(self):
        return frozenset(self.to_sorted_list())
131 132


133
    def distance_to(self, dgraph):
Yoann Dufresne's avatar
Yoann Dufresne committed
134 135
        nodes_1 = self.to_sorted_list()
        nodes_2 = other_nodes = dgraph.to_sorted_list()
136 137 138

        dist = 0
        idx1, idx2 = 0, 0
Yoann Dufresne's avatar
Yoann Dufresne committed
139 140
        while idx1 != len(nodes_1) and idx2 != len(nodes_2):
            if nodes_1[idx1] == nodes_2[idx2]:
141 142 143 144
                idx1 += 1
                idx2 += 1
            else:
                dist += 1
Yoann Dufresne's avatar
Yoann Dufresne committed
145
                if nodes_1[idx1] < nodes_2[idx2]:
146 147 148
                    idx1 += 1
                else:
                    idx2 += 1
Yoann Dufresne's avatar
Yoann Dufresne committed
149
        dist += len(nodes_1) - idx1 + len(nodes_2) - idx2
150 151 152 153

        return dist


154 155 156 157 158 159
    """ Verify if dg1 is dominated by dg2. The domination is determined by two points: All the nodes
    of dg1 are part of dg2 and the divergeance of dg1 is greater than dg2.
    @param dg1 (resp dg2) A d_graph object.
    @return True if dg1 is dominated by dg2.
    """
    def is_dominated(self, dg):
160 161
        dg1_nodes = self.to_node_set()
        dg2_nodes = dg.to_node_set()
162 163 164 165 166 167 168

        # domination first condition: inclusion of all the nodes
        if not dg1_nodes.issubset(dg2_nodes):
            return False

        # domination second condition
        if len(dg1_nodes) == len(dg2_nodes):
169
            if self.get_link_divergence() > dg.get_link_divergence():
170
                return True
171
        elif self.get_link_divergence() >= dg.get_link_divergence():
172 173 174 175 176
            return True

        return False


177
    def __eq__(self, other):
178 179 180
        if other is None:
            return False

Yoann Dufresne's avatar
Yoann Dufresne committed
181
        if self.idx != -1 and self.idx == other.idx:
182 183 184
            return True

        if self.node_set != other.node_set:
Yoann Dufresne's avatar
Yoann Dufresne committed
185 186
            return False

Yoann Dufresne's avatar
Yoann Dufresne committed
187
        return self.to_ordered_lists() == other.to_ordered_lists()
188 189 190 191 192

    def __ne__(self, other):
        return not (self == other)

    def __lt__(self, other):
193 194
        my_tuple = (self.get_link_divergence(), self.get_optimal_score())
        other_tuple = (other.get_link_divergence(), other.get_optimal_score())
Yoann Dufresne's avatar
Yoann Dufresne committed
195
        return my_tuple < other_tuple
196

Yoann Dufresne's avatar
Yoann Dufresne committed
197

198
    def __hash__(self):
199
        nodelist = self.to_sorted_list()
200
        nodelist = [str(x) for x in nodelist]
201 202
        return ",".join(nodelist).__hash__()

Yoann Dufresne's avatar
Yoann Dufresne committed
203

204 205 206 207 208 209 210 211 212 213 214 215 216 217
    def __ordered_hash__(self):
        lst = self.to_ordered_lists()

        fwd_uniq_lst = [sorted(l) for l in lst]
        fwd_str = ",".join([f"[{'-'.join(l)}]" for l in fwd_uniq_lst])
        fwd_hash = fwd_str.__hash__()

        rev_uniq_lst = [sorted(l) for l in lst[::-1]]
        rev_str = ",".join([f"[{'-'.join(l)}]" for l in rev_uniq_lst])
        rev_hash = rev_str.__hash__()

        return int(min(fwd_hash, rev_hash))


218
    def __repr__(self):
Yoann Dufresne's avatar
Yoann Dufresne committed
219
        # print(self.halves)
220
        representation = str(self.center) + " "
Yoann Dufresne's avatar
Yoann Dufresne committed
221 222 223
        representation += "[" + ", ".join([f"{node} {self.connexity[0][node]}" for node in self.halves[0]]) + "]"
        representation += "[" + ", ".join([f"{node} {self.connexity[1][node]}" for node in self.halves[1]]) + "]"
        return representation
224

Yoann Dufresne's avatar
Yoann Dufresne committed
225 226 227 228 229
    def _to_str_nodes(self):
        str_nodes = [str(x) for x in self.nodes]
        str_nodes.sort()
        return str(str_nodes)

230 231 232 233 234

""" From a barcode graph, compute all the possible max d-graphs by node.
    @param graph A barcode graph
    @return A dictionary associating each node to its list of all possible d-graphs. The d-graphs are sorted by decreasing ratio.
"""
235
def compute_all_max_d_graphs(graph, debug=False, clique_mode=None):
236
    d_graphs = {}
237

Yoann Dufresne's avatar
Yoann Dufresne committed
238
    for idx, node in enumerate(list(graph.nodes())):
239
        #if "MI" not in str(node): continue    # for debugging; only look at deconvolved nodes
240
        #print(f"\r{idx+1}/{len(graph.nodes())}")
241 242 243
        neighbors = list(graph.neighbors(node))
        neighbors_graph = nx.Graph(graph.subgraph(neighbors))

244
        node_d_graphs = set()
245

246 247 248 249 250 251 252 253 254 255 256 257 258 259 260
        mode_str = " " 
        if clique_mode is None:
            # Find all the cliques (equivalent to compute all the candidate half d-graph)
            cliques = list(nx.find_cliques(neighbors_graph))
            mode_str += "(max-cliques)"
        elif clique_mode == "louvain":
            louvain = community.best_partition(neighbors_graph) # louvain
            # high resolution seems to work better
            communities = [[c for c,i in louvain.items() if i == clique_id] for clique_id in set(louvain.values())]
            mode_str += "(louvain)"
            cliques = []
            for comm in communities:
                # further decompose! into necessarily 2 communities
                community_as_graph = nx.Graph(graph.subgraph(comm))
                if len(community_as_graph.nodes()) <= 2:
rchikhi's avatar
merge  
rchikhi committed
261
                    cliques += [list(community_as_graph.nodes())]
262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280
                else:
                    cliques += map(list,nx.community.asyn_fluidc(community_as_graph,2))
                
        elif clique_mode == "testing":
            # k-clique communities
            #from networkx.algorithms.community import k_clique_communities
            #cliques = k_clique_communities(neighbors_graph, 3) # related to the d-graph d parameter
            from cdlib import algorithms
            cliques_dict = algorithms.node_perception(neighbors_graph, threshold=0.75, overlap_threshold=0.75) #typical output: Sizes of found cliques (testing): Counter({6: 4, 5: 3, 4: 2, 2: 1})
            #cliques_dict = algorithms.gdmp2(neighbors_graph, min_threshold=0.9) #typical output: sizes of found cliques (testing): Counter({3: 2, 5: 1})
            #cliques_dict = algorithms.angel(neighbors_graph, threshold=0.90) # very sensitive parameters: 0.84 and 0.88 don't work at all but 0.86 does sort of
            from collections import defaultdict
            cliques_dict2 = defaultdict(list)
            for (node, values) in cliques_dict.to_node_community_map().items():
                for value in values:
                    cliques_dict2[value] += [node]
            cliques = list(cliques_dict2.values())
            mode_str += "(testing)"

281
        #print("cliques", len(cliques))
282 283 284 285 286 287 288 289 290 291
        if debug: print("node",node,"has",len(cliques),"cliques in neighborhood (of size",len(neighbors),")")
        
        cliques_debugging = True
        if cliques_debugging:
            #cliques_graph = nx.make_max_clique_graph(neighbors_graph)
            #if debug: print("node",node,"clique graph has",len(cliques_graph.nodes()),"nodes",len(cliques_graph.edges()),"edges")
            #nx.write_gexf(cliques_graph, str(node) +".gexf")

            from collections import Counter
            len_cliques = Counter(map(len,cliques))
292
            #print("sizes of found cliques%s:" % mode_str, len_cliques)
293

294 295 296 297 298 299
        # Pair halves to create d-graphes
        for idx, clq1 in enumerate(cliques):
            for clq2_idx in range(idx+1, len(cliques)):
                clq2 = cliques[clq2_idx]

                # Check for d-graph candidates
Yoann Dufresne's avatar
Yoann Dufresne committed
300
                d_graph = Dgraph(node)
301
                d_graph.put_halves(clq1, clq2, neighbors_graph)
Yoann Dufresne's avatar
Yoann Dufresne committed
302

303 304 305 306
                factor = 0.5 
                #if clique_mode == "testing": factor = 1 # still allows louvain's big communities
                #print("link div:",d_graph.get_link_divergence(),"opt:",d_graph.get_optimal_score(), "good d graph?",d_graph.get_link_divergence() <= d_graph.get_optimal_score() *factor)
                if d_graph.get_link_divergence() <= d_graph.get_optimal_score() * factor:
Yoann Dufresne's avatar
Yoann Dufresne committed
307
                    node_d_graphs.add(d_graph)
308

309
        #print("d-graphs", len(node_d_graphs))
Yoann Dufresne's avatar
Yoann Dufresne committed
310
        d_graphs[node] = brutal_list_domination_filter(sorted(node_d_graphs))
311
        #print("filtered", len(d_graphs[node]))
312 313 314 315 316 317 318 319 320 321 322 323 324

    return d_graphs


""" Add the new dg in the dgs list. If dg is dominated by another dg in the list, then it's
    dropped. If any dg in the list is dominated by the dg to add, then, the new dg is added and
    all the dominated dg are removed from the list.
    @param dg A new dg to add/filter.
    @param undominated_dgs_list A list of dg where any of them is dominated by another one.
    @return The updated undominated list. 
"""
def add_new_dg_regarding_domination(dg, undominated_dgs_list):
    to_remove = []
325
    dominated = False
326 327 328

    # Search for domination relations
    for u_dg in undominated_dgs_list:
329 330 331
        if not dominated and dg.is_dominated(u_dg):
            dominated = True
        if u_dg.is_dominated(dg):
332 333 334
            to_remove.append(u_dg)

    # Remove dominated values
335
    size = len(undominated_dgs_list)
336 337 338 339
    for dg2 in to_remove:
        undominated_dgs_list.remove(dg2)

    # Add the new dg
340 341
    if not dominated:
        undominated_dgs_list.append(dg)
342 343 344 345

    return undominated_dgs_list


346 347 348 349 350 351 352 353 354 355 356 357 358 359

def filter_dominated(d_graphs, overall=False, in_place=True):
    if not overall:
        return local_domination_filter(d_graphs, in_place)

    all_d_graphs = []
    for dgs in d_graphs.values():
        all_d_graphs.extend(dgs)

    all_d_graphs = list_domination_filter(all_d_graphs)

    return d_graphs


360
""" Filter the d-graphs by node. In a list of d-graph centered on a node n, if a d-graph is
361
    completely included in another and have a highest distance score to the optimal, then it is
362 363 364
    filtered out.
    @param d_graphs All the d-graphs to filter, sorted by central node.
    @param in_place If true, modify the content of d_graph with the filtered version. If False,
365 366
    copy all the content in a new dictionary.
    @return The filtered dictionary of d-graph per node.
367
"""
368
def local_domination_filter(d_graphs, in_place=True):
369 370 371 372
    filtered = d_graphs if in_place else {}

    # Filter node by node
    for node, d_graph_list in d_graphs.items():
Yoann Dufresne's avatar
Yoann Dufresne committed
373 374
        lst = str(sorted([str(x) for x in d_graph_list]))
        filtered[node] = brutal_list_domination_filter(d_graph_list, node_name=str(node))
375

376
    return filtered
377

378 379

""" Filter the input d-graphs list. In the list of d-graph centered on a node n, if a d-graph is
380
    completely included in another and have a highest distance score to the optimal, then it is
381 382
    filtered out.
    @param d_graphs All the d-graphs to filter.
383
    @return The filtered dictionary of d-graph per node.
384 385 386 387 388 389 390
"""
def list_domination_filter(d_graphs):
    filtered = []

    # Filter d-graph by d-graph
    for dg in d_graphs:
        add_new_dg_regarding_domination(dg, filtered)
391

392 393
    return set(filtered)

Yoann Dufresne's avatar
Yoann Dufresne committed
394
def brutal_list_domination_filter(d_graphs, node_name=""):
395
    undominated = set(d_graphs)
Yoann Dufresne's avatar
Yoann Dufresne committed
396
    to_remove = set()
397 398 399 400

    for dg1 in d_graphs:
        for dg2 in d_graphs:
            if dg1.is_dominated(dg2):
Yoann Dufresne's avatar
Yoann Dufresne committed
401
                to_remove.add(dg1)
402 403
                break

Yoann Dufresne's avatar
Yoann Dufresne committed
404
    return undominated - to_remove