Commit af2011d4 authored by Yoann Dufresne's avatar Yoann Dufresne

refactoring udg to lcp

parent f2e45da9
......@@ -2,7 +2,7 @@
A compilation of scripts and pipelines to count and extract scaffolds of barcodes from linked reads datasets.
**WARNING**: This code is a proof of concept, not a usable software for production. If the code is too slow for your tests or you are encontering some bugs (maybe it's a feature ? :p) don't hesitate to contact us via the issues or with a direct mail to me (yoann [dot] dufresne [at] pasteur [dot] fr).
**WARNING**: This code is a proof of concept, not usable software for production. If the code is too slow for your tests or you are encountering some bugs (maybe it's a feature ? :p) don't hesitate to contact us via the issues or with a direct mail to me (yoann \[dot] dufresne \[at] pasteur \[dot] fr).
## Nomenclature warnings
During the process of writing a scientific article, some of the data structure names have been modified.
......
......@@ -5,7 +5,7 @@ import community # pip install python-louvain
@total_ordering
class Dgraph(object):
"""docstring for Dgraph"""
"""docstring for Lcp"""
def __init__(self, center):
super(Dgraph, self).__init__()
self.idx = -1
......@@ -313,11 +313,11 @@ def compute_all_max_d_graphs(graph, debug=False, clique_mode=None):
return d_graphs
""" Add the new dg in the dgs list. If dg is dominated by another dg in the list, then it's
dropped. If any dg in the list is dominated by the dg to add, then, the new dg is added and
all the dominated dg are removed from the list.
@param dg A new dg to add/filter.
@param undominated_dgs_list A list of dg where any of them is dominated by another one.
""" Add the new lcp in the dgs list. If lcp is dominated by another lcp in the list, then it's
dropped. If any lcp in the list is dominated by the lcp to add, then, the new lcp is added and
all the dominated lcp are removed from the list.
@param lcp A new lcp to add/filter.
@param undominated_dgs_list A list of lcp where none of them is dominated by another one.
@return The updated undominated list.
"""
def add_new_dg_regarding_domination(dg, undominated_dgs_list):
......@@ -336,7 +336,7 @@ def add_new_dg_regarding_domination(dg, undominated_dgs_list):
for dg2 in to_remove:
undominated_dgs_list.remove(dg2)
# Add the new dg
# Add the new lcp
if not dominated:
undominated_dgs_list.append(dg)
......
......@@ -36,7 +36,7 @@ class AbstractDGIndex(dict):
def add_dgraph(self, dg):
""" Generate all the set needed for keys in the lcp and push the d-graph as value.
For fixed size of the lcp all the sets of this size will be generated as key.
Otherwise, all the set of size at least len(dg) - size will be generated.
Otherwise, all the sets of size at least len(lcp) - size will be generated.
"""
pass
......
......@@ -2,7 +2,7 @@ import networkx as nx
from collections import Counter
from deconvolution.lcp.AbstractLcpFactory import AbstractDGFactory
from deconvolution.lcp.lcp import Dgraph
from deconvolution.lcp.lcp import Lcp
from deconvolution.lcp import AbstractDGIndex
......@@ -111,7 +111,7 @@ class CliqueDGFactory(AbstractDGFactory):
clq1 = cliques[idx1]
clq2 = cliques[idx2]
# Create candidate udg
d_graph = Dgraph(central_node)
d_graph = Lcp(central_node)
d_graph.put_halves(list(clq1), list(clq2), subgraph)
node_lcps.add(d_graph)
......
import networkx as nx
from deconvolution.lcp.AbstractLcpFactory import AbstractDGFactory
from deconvolution.lcp.lcp import Dgraph
from deconvolution.lcp.lcp import Lcp
import community
class LouvainDGFactory(AbstractDGFactory):
......@@ -33,7 +33,7 @@ class LouvainDGFactory(AbstractDGFactory):
clq2 = cliques[clq2_idx]
# Check for d-graph candidates
d_graph = Dgraph(node)
d_graph = Lcp(node)
d_graph.put_halves(clq1, clq2, subgraph)
if d_graph.get_link_divergence() <= d_graph.get_optimal_score() * self.dg_max_divergence_factor:
......
......@@ -2,7 +2,7 @@ import networkx as nx
import random
def generate_d_graph_chain(size, d):
""" Generate a d-graph chain (succession of unit d-graphs).
""" Generate a lcp chain (succession of unit d-graphs).
If you slice any 2*d+1 part of the graph, it will be a unit d-graph
:param size The number of nodes in the chain (should not be less than 2*d+1)
:param d The number of connection on the left and on the right for any node
......@@ -12,7 +12,7 @@ def generate_d_graph_chain(size, d):
def generate_approx_d_graph_chain(size, d_max, d_avg, size_reduction=0, rnd_seed=-1):
""" Generate an almost d-graph chain (succession of unit d-graphs). Almost because they are d-graphs in average
""" Generate an almost d-graph chain (succession of lcp). Almost because they are lcp in average
with a coverage variation.
:param size The number of nodes in the chain (should not be less than 2*d+1)
:param d_max The max number of connection on the left and on the right for any node
......@@ -69,15 +69,15 @@ def generate_approx_d_graph_chain(size, d_max, d_avg, size_reduction=0, rnd_seed
return G
""" Merge 2 nodes of a graph G.
def merge_nodes(G, node1, node2):
""" Merge 2 nodes of a graph G.
The new node has edges from both of the previous nodes (dereplicated).
If a link between node1 and node2 exists, it's discarded.
@param G The graph to manipulate
@param node1 First node to merge
@param node2 Second node to merge
@return The name of the new node in G
"""
def merge_nodes(G, node1, node2):
:param G: The graph to manipulate
:param node1: First node to merge
:param node2: Second node to merge
:return: The name of the new node in G
"""
# Create the new node
new_node = f"{node1}_{node2}" if node1 < node2 else f"{node2}_{node1}"
G.add_node(new_node)
......
......@@ -3,10 +3,10 @@ from functools import total_ordering
@total_ordering
class Dgraph(object):
"""docstring for Dgraph"""
class Lcp(object):
"""docstring for Lcp"""
def __init__(self, center):
super(Dgraph, self).__init__()
super(Lcp, self).__init__()
self.idx = -1
self.center = center
self.score = 0
......@@ -21,8 +21,8 @@ class Dgraph(object):
@staticmethod
def load(lcp_txt, score, variables):
""" Static method to load a lcp from a text
:param lcp_txt: the saved d-graph
:return: a new d-graph object corresponding to the test
:param lcp_txt: the saved lcp
:return: a new lcp object corresponding to the text
"""
# basic split
lcp_txt = lcp_txt.replace(']', '')
......@@ -30,31 +30,31 @@ class Dgraph(object):
# Head parsing
center = head.replace(' ', '')
dg = Dgraph(center)
lcp = Lcp(center)
# Reload halves
h1 = [x for x in h1.split(',')]
h2 = [x for x in h2.split(',')]
dg.halves[0] = h1
dg.node_set.update(h1)
dg.nodes.extend(h1)
dg.halves[1] = h2
dg.node_set.update(h2)
dg.nodes.extend(h2)
lcp.halves[0] = h1
lcp.node_set.update(h1)
lcp.nodes.extend(h1)
lcp.halves[1] = h2
lcp.node_set.update(h2)
lcp.nodes.extend(h2)
# Score parsing
dg.score = int(score.split('/')[0])
lcp.score = int(score.split('/')[0])
# covering variable loading
dg.edges = {int(x) for x in variables.split(' ')}
lcp.edges = {int(x) for x in variables.split(' ')}
return dg
return lcp
def put_halves(self, h1, h2, graph):
""" Compute the d-graph quality (score) according to the connectivity between the two halves.
:param h1: First half of the d-graph
:param h2: Second half of the d-graph
""" Create the lcp and compute the lcp quality (score) according to the connectivity between the two cliques.
:param h1: First clique of the lcp
:param h2: Second clique of the lcp
:param graph: The barcode graph
"""
self.score = 0
......@@ -131,9 +131,9 @@ class Dgraph(object):
return self.center, left, right
def distance_to(self, dgraph):
def distance_to(self, lcp):
nodes_1 = self.to_sorted_list()
nodes_2 = other_nodes = dgraph.to_sorted_list()
nodes_2 = other_nodes = lcp.to_sorted_list()
dist = 0
idx1, idx2 = 0, 0
......@@ -151,24 +151,24 @@ class Dgraph(object):
return dist
""" Verify if dg1 is dominated by dg2. The domination is determined by two points: All the nodes
of dg1 are part of dg2 and the divergeance of dg1 is greater than dg2.
@param dg1 (resp dg2) A d_graph object.
@return True if dg1 is dominated by dg2.
def is_dominated(self, lcp):
""" Verify if lcp1 (self) is dominated by lcp2 (the lcp parameter). The domination is determined by two points: All the nodes
of lcp1 are part of lcp2 and the divergence of lcp1 is greater than that of lcp2 (or equal to it when lcp2 has strictly more nodes).
:param lcp: A Lcp object.
:return: True if lcp1 is dominated by lcp2.
"""
def is_dominated(self, dg):
dg1_nodes = self.to_node_set()
dg2_nodes = dg.to_node_set()
lcp1_nodes = self.to_node_set()
lcp2_nodes = lcp.to_node_set()
# domination first condition: inclusion of all the nodes
if not dg1_nodes.issubset(dg2_nodes):
if not lcp1_nodes.issubset(lcp2_nodes):
return False
# domination second condition
if len(dg1_nodes) == len(dg2_nodes):
if self.get_link_divergence() > dg.get_link_divergence():
if len(lcp1_nodes) == len(lcp2_nodes):
if self.get_link_divergence() > lcp.get_link_divergence():
return True
elif self.get_link_divergence() >= dg.get_link_divergence():
elif self.get_link_divergence() >= lcp.get_link_divergence():
return True
return False
......
......@@ -5,7 +5,7 @@ import sys
# from deconvolution.lcp.FixedDGIndex import FixedDGIndex
from deconvolution.lcp.VariableDGIndex import VariableDGIndex
from deconvolution.lcp.lcp import Dgraph
from deconvolution.lcp.lcp import Lcp
from deconvolution.lcp.CliqueLcpFactory import CliqueDGFactory
from deconvolution.lcp.LouvainDGFactory import LouvainDGFactory
......@@ -130,14 +130,14 @@ class LcpGraph(nx.Graph):
self.bidict_nodes = {}
for idx, node in enumerate(self.nodes(data=True)):
node, data = node
dg = Dgraph.load(data["udg"], data["score"], data["barcode_edges"])
dg = Lcp.load(data["udg"], data["score"], data["barcode_edges"])
self.variables.update(dg.edges)
self.bidict_nodes[node] = dg
self.all_lcp.append(dg)
if dg.idx == -1:
dg.idx = int(node)
self.node_by_idx[dg.idx] = dg
# self.node_by_name[node] = dg
# self.node_by_name[node] = lcp
self.bidict_nodes = bidict(self.bidict_nodes)
......
......@@ -266,7 +266,7 @@ def _get_distant_neighbors(graph, node, dist):
return neighbors
def compute_next_nodes(d2_component, max_jumps=0):
# First parse dg names
# First parse lcp names
dg_names = {}
for node in d2_component.nodes():
dg_names[node] = parse_dg_name(d2_component,node)
......@@ -280,7 +280,7 @@ def compute_next_nodes(d2_component, max_jumps=0):
# Get the current molecule idxs
molecule_idxs = mols_from_node(head[1])
#print("node",node,"dg name",dg_names[node],"mol idxs",molecule_idxs)
#print("node",node,"lcp name",dg_names[node],"mol idxs",molecule_idxs)
for mol_idx in molecule_idxs:
nexts = []
......
......@@ -3,7 +3,7 @@ from random import randint
from deconvolution.lcp.FixedDGIndex import FixedDGIndex
from deconvolution.lcp.VariableDGIndex import VariableDGIndex
from deconvolution.lcp.lcp import Dgraph
from deconvolution.lcp.lcp import Lcp
from deconvolution.lcp.graph_manipulator import generate_d_graph_chain
......@@ -56,7 +56,7 @@ def _generate_dg(d):
h2 = list(G.subgraph([2*d-x for x in range(d)]).nodes())
# d-graph construction
dg = Dgraph(center)
dg = Lcp(center)
dg.put_halves(h1, h2, G)
return dg
......
......@@ -19,8 +19,8 @@ class TestD2Graph(unittest.TestCase):
# d2.construct_from_barcodes(neighbor_threshold=0, min_size_clique=d, verbose=False)
# print("after", d)
#
# # for dg in d2.all_lcp:
# # print(dg.score, dg.get_link_divergence(), dg)
# # for lcp in d2.all_lcp:
# # print(lcp.score, lcp.get_link_divergence(), lcp)
# # print()
#
# # Test the number of d-graphs
......@@ -86,7 +86,7 @@ class TestD2Graph(unittest.TestCase):
# Test all_lcp
self.assertEqual(len(d2_reloaded.all_lcp), len(d2.all_lcp))
# Verify dg idxs
# Verify lcp idxs
reloaded_idxs = [dg.idx for dg in d2_reloaded.all_lcp]
for dg in d2.all_lcp:
self.assertTrue(dg.idx in reloaded_idxs)
......
......@@ -2,7 +2,7 @@ import unittest
import networkx as nx
from d_graph_data import unit_d_graph
from deconvolution.lcp.lcp import Dgraph
from deconvolution.lcp.lcp import Lcp
from deconvolution.lcp import graph_manipulator as gm
from deconvolution.lcp.CliqueLcpFactory import CliqueDGFactory
......@@ -12,7 +12,7 @@ class TestDGraph(unittest.TestCase):
def test_construction(self):
center, h1, h2, G = unit_d_graph
# Test basic construction
dg = Dgraph(center)
dg = Lcp(center)
self.assertEqual(center, dg.center)
self.assertEqual(0, dg.score)
......@@ -34,7 +34,7 @@ class TestDGraph(unittest.TestCase):
h2 = list(G.subgraph([size-1-x for x in range(d)]).nodes())
# d-graph construction
dg = Dgraph(center)
dg = Lcp(center)
dg.put_halves(h1, h2, G)
# Test the internal arity
......@@ -43,20 +43,20 @@ class TestDGraph(unittest.TestCase):
def test_optimal_score(self):
center, h1, h2, G = unit_d_graph
dg = Dgraph(center)
dg = Lcp(center)
dg.put_halves(h1, h2, G)
# Must be the number of transitive edges
self.assertEqual(3, dg.get_optimal_score())
def test_divergence(self):
center, h1, h2, G = unit_d_graph
dg = Dgraph(center)
dg = Lcp(center)
dg.put_halves(h1, h2, G)
self.assertEqual(0.0, dg.get_link_divergence())
def test_dg_to_list(self):
center, h1, h2, G = unit_d_graph
dg = Dgraph(center)
dg = Lcp(center)
dg.put_halves(h1, h2, G)
lst = dg.to_ordered_lists()
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment