Commit f2e45da9 authored by Yoann Dufresne's avatar Yoann Dufresne

udg -> lcp refactoring

parent 5cc89481
......@@ -22,7 +22,7 @@ class Dgraph(object):
self.marked = False
""" Static method to load a dgraph from a text
""" Static method to load a lcp from a text
@param text the saved d-graph
@param barcode_graph Barcode graph from which the d-graph is extracted
@return a new d-graph object corresponding to the test
......
......@@ -17,7 +17,7 @@ class AbstractDGIndex(dict):
pass
def add_value(self, key_set, dgraph):
""" Add the couple key (set of barcodes) and value (dgraph) at the right place in the dict
""" Add the couple key (set of barcodes) and value (lcp) at the right place in the dict
"""
pass
# Test the key size
......@@ -34,8 +34,8 @@ class AbstractDGIndex(dict):
@abstractmethod
def add_dgraph(self, dg):
""" Generate all the set needed for keys in the dgraph and push the d-graph as value.
For fixed size of the dgraph all the sets of this size will be generated as key.
""" Generate all the set needed for keys in the lcp and push the d-graph as value.
For fixed size of the lcp all the sets of this size will be generated as key.
Otherwise, all the set of size at least len(dg) - size will be generated.
"""
pass
......
......@@ -3,7 +3,7 @@ import sys
from abc import abstractmethod
from multiprocessing import Pool, Value
from deconvolution.dgraph.FixedDGIndex import FixedDGIndex, AbstractDGIndex
from deconvolution.lcp.FixedDGIndex import FixedDGIndex, AbstractDGIndex
counter = None
......@@ -45,7 +45,7 @@ class AbstractDGFactory:
global counter
counter = Value('i', 0)
def generate_all_dgraphs(self, verbose=False, threads=8):
def generate_all_lcps(self, verbose=False, threads=8):
index = FixedDGIndex(size=1)
factory = self
nb_nodes = len(self.graph.nodes())
......
import networkx as nx
from collections import Counter
from deconvolution.dgraph.AbstractDGFactory import AbstractDGFactory
from deconvolution.dgraph.d_graph import Dgraph
from deconvolution.dgraph import AbstractDGIndex
from deconvolution.lcp.AbstractLcpFactory import AbstractDGFactory
from deconvolution.lcp.lcp import Dgraph
from deconvolution.lcp import AbstractDGIndex
class CliqueDGFactory(AbstractDGFactory):
......@@ -24,7 +24,7 @@ class CliqueDGFactory(AbstractDGFactory):
def generate_by_node(self, central_node, subgraph):
node_d_graphs = set()
node_lcps = set()
node_neighbors = {node: [x for x in subgraph[node]] for node in subgraph.nodes}
# Clique computation
......@@ -113,7 +113,7 @@ class CliqueDGFactory(AbstractDGFactory):
# Create candidate udg
d_graph = Dgraph(central_node)
d_graph.put_halves(list(clq1), list(clq2), subgraph)
node_d_graphs.add(d_graph)
node_lcps.add(d_graph)
if self.debug is not None:
mwm_results.append(" <-> ".join(sorted([clique_names[idx1], clique_names[idx2]])))
......@@ -126,4 +126,4 @@ class CliqueDGFactory(AbstractDGFactory):
for result in mwm_results:
matching.write(f"{result}\n")
return node_d_graphs
return node_lcps
from itertools import combinations
from deconvolution.dgraph.AbstractDGIndex import AbstractDGIndex
from deconvolution.lcp.AbstractDGIndex import AbstractDGIndex
class FixedDGIndex(AbstractDGIndex):
......@@ -11,7 +11,7 @@ class FixedDGIndex(AbstractDGIndex):
def _verify_key(self, key_set, dg_size=0):
if len(key_set) != self.size:
raise ValueError("Wrong set size in the dgraph")
raise ValueError("Wrong set size in the lcp")
def add_dgraph(self, dg):
......
import networkx as nx
from deconvolution.dgraph.AbstractDGFactory import AbstractDGFactory
from deconvolution.dgraph.d_graph import Dgraph
from deconvolution.lcp.AbstractLcpFactory import AbstractDGFactory
from deconvolution.lcp.lcp import Dgraph
import community
class LouvainDGFactory(AbstractDGFactory):
......
from itertools import combinations
from deconvolution.dgraph.AbstractDGIndex import AbstractDGIndex
from deconvolution.lcp.AbstractDGIndex import AbstractDGIndex
class VariableDGIndex(AbstractDGIndex):
......@@ -11,7 +11,7 @@ class VariableDGIndex(AbstractDGIndex):
def _verify_key(self, key_set, dg_size=0):
if len(key_set) < dg_size - self.size:
raise ValueError("Wrong set size in the dgraph")
raise ValueError("Wrong set size in the lcp")
def add_dgraph(self, dg):
......
......@@ -20,7 +20,7 @@ class Dgraph(object):
@staticmethod
def load(lcp_txt, score, variables):
""" Static method to load a dgraph from a text
""" Static method to load a lcp from a text
:param lcp_txt: the saved d-graph
:return: a new d-graph object corresponding to the test
"""
......
......@@ -3,11 +3,11 @@ import itertools
from bidict import bidict
import sys
# from deconvolution.dgraph.FixedDGIndex import FixedDGIndex
from deconvolution.dgraph.VariableDGIndex import VariableDGIndex
from deconvolution.dgraph.d_graph import Dgraph
from deconvolution.dgraph.CliqueDGFactory import CliqueDGFactory
from deconvolution.dgraph.LouvainDGFactory import LouvainDGFactory
# from deconvolution.lcp.FixedDGIndex import FixedDGIndex
from deconvolution.lcp.VariableDGIndex import VariableDGIndex
from deconvolution.lcp.lcp import Dgraph
from deconvolution.lcp.CliqueLcpFactory import CliqueDGFactory
from deconvolution.lcp.LouvainDGFactory import LouvainDGFactory
class LcpGraph(nx.Graph):
......@@ -80,7 +80,7 @@ class LcpGraph(nx.Graph):
lcp_factory = LouvainDGFactory(self.barcode_graph)
else:
lcp_factory = CliqueDGFactory(self.barcode_graph, min_size_clique=min_size_clique, debug=self.debug, debug_path=self.debug_path)
self.lcp_per_node = lcp_factory.generate_all_dgraphs(threads=threads, verbose=verbose)
self.lcp_per_node = lcp_factory.generate_all_lcps(threads=threads, verbose=verbose)
if verbose:
counts = sum(len(x) for x in self.lcp_per_node.values())
print(f"\t {counts} computed d-graphs")
......
from deconvolution.lcpgraph.lcp_path import Path
""" Greedy algorithm. Start with th most probable unitig (ie lowest normalized penalty first and largest unitig for
equalities). Then extends on both side to the nearest interesting unitig.
an interesting unitig is a unitig that add new covering variables.
"""
def construct_path_from_unitigs(unitigs, d2g):
# Initiate covering variables
used_variables = {x: False for x in d2g.barcode_edge_idxs}
# Initiate the path with longest sure unitig
path = Path(d2g)
first_utg = unitigs.pop(0)
path.add_path(first_utg)
print(len(path), path.covering_score())
# While it's possible, search to add unitigs
while len(unitigs) > 0:
# Search for next unitig
path_extension = _search_way_to_next_unitig(path, unitigs, d2g)
if path_extension is None:
break
# TODO: Search for unitigs on the other side of the path
# Add the unitig to the path
path.add_path(path_extension)
print(len(path), path.covering_score())
unitigs = [utg for utg in unitigs if path.covering_difference(utg) > 0]
return path
""" Look for the next unitig to join from right endpoint of the path
@param path Currently created path
@param unitigs List of unitigs of interest (be careful to fill this list with unitigs that add new covered variables
@param lcpg The lcpg to use as path support
@return (path_to, utg) Respectively the path to the next unitig of interest (from the right of the current path) and
the corresponding unitig (linked to the path from left to right)
"""
def _search_way_to_next_unitig(path, unitigs, d2g):
# Register endpoints
endpoints = {}
for utg in unitigs:
endpoints[utg[0]] = utg
if len(utg) > 1:
endpoints[utg[-1]] = utg
# Search for next endpoint
best_paths = _search_endpoint(path[-1], endpoints, d2g, path)
if len(best_paths) == 0:
return None
# Search for the best path (unitig included) to add (The one that cover the most new variables)
the_best_path = None
covered_variables = 0
for extension in best_paths:
utg = endpoints[extension[-1]]
# if the utg is in the wrong size
if utg[-1] == extension[-1]:
utg.reverse()
complete_path = Path(d2g)
complete_path.add_path(extension[1:])
complete_path.add_path(utg[1:])
num_new_vars = path.covering_difference(complete_path)
if num_new_vars > covered_variables:
covered_variables = num_new_vars
the_best_path = complete_path
return the_best_path
""" Search the min-penalty paths from a start udg to any of the target nodes.
@param start_udg The start node of the path
@param targets A list of udg of interest
@param lcpg The lcpg to follow
@param forbidden_udgs excluded nodes from the lcpg
@return A list of equivalent paths
"""
def _search_endpoint(start_udg, targets, d2g, forbidden_udgs):
marked_nodes = {x: (None, None) for x in forbidden_udgs}
covered_variables = set(forbidden_udgs.covering_variables.keys())
# Init Dijkstra
to_explore = [start_udg]
marked_nodes[start_udg] = (0, None)
found = []
# Dijkstra part 1, compute penalties and previous nodes
while len(to_explore) > 0 and len(found) == 0:
# Select min penalty in to_explore
current = min(to_explore, key=lambda x: marked_nodes[x][0])
current_penalty = marked_nodes[current][0]
to_explore.remove(current)
# Filter neighbors by there covering values
neighbors = d2g.neighbors(str(current.idx))
filtered_neighbors = []
for n in neighbors:
nei = d2g.node_by_idx[int(n)]
if len(d2g.get_covering_variables([nei]) - covered_variables) > 0:
filtered_neighbors.append(n)
neighbors = filtered_neighbors
# Explore all the neighbors of the current node.
for nei_idx in neighbors:
nei_udg = d2g.node_by_idx[int(nei_idx)]
penalty = current_penalty + d2g[nei_idx][str(current.idx)]["distance"]
if nei_udg in marked_nodes:
# Forbidden node
if marked_nodes[nei_udg][0] is None:
continue
# Already inserted but update the min penalty to reach the udg
if penalty < marked_nodes[nei_udg][0]:
marked_nodes[nei_udg] = penalty, current
else:
# First encounter: insert to explore and set min penalty
marked_nodes[nei_udg] = (penalty, current)
to_explore.append(nei_udg)
# Set stop if a target is found
if nei_udg in targets:
if len(found) == 0:
found.append(nei_udg)
elif marked_nodes[found[0]][0] == penalty:
found.append(nei_udg)
elif marked_nodes[found[0]][0] > penalty:
found = [nei_udg]
# Dijkstra part 2, retrieve the best paths
paths = []
for target in found:
# Start a path
node_path = [target]
# Add all nodes one by one to the first
while node_path[-1] != start_udg:
node_path.append(marked_nodes[node_path[-1]][1])
# Reverse the path to be on the right order
node_path.reverse()
# Create and add a real path object
path = Path(d2g)
path.add_path(node_path)
paths.append(path)
return paths
......@@ -6,24 +6,24 @@ from deconvolution.lcpgraph.lcp_path import Path
class Optimizer:
def __init__(self, d2g):
self.d2g = d2g
def __init__(self, lcpg):
self.lcpg = lcpg
def bb_solution(self, start_node=None, verbose=False):
nb_steps_since_last_score_up = 0
total_nb_steps = 0
first_pass = True
used_nodes = {n: False for n in self.d2g.nodes()}
used_nodes = {n: False for n in self.lcpg.nodes()}
# Init solution for
if start_node == None:
start_node = list(used_nodes.keys())
start_node = start_node[random.randint(0, len(start_node)-1)]
best_path = Path(self.d2g)
best_path = Path(self.lcpg)
stack = [[start_node]]
current_path = Path(self.d2g)
current_path = Path(self.lcpg)
last_increase_node = None
max_cov = len(current_path.covering_variables)
......@@ -52,14 +52,14 @@ class Optimizer:
nb_steps_since_last_score_up += 1
# 3 - Neighbors preparation
neighbors = [x for x in self.d2g[current_node] if not used_nodes[x]]
neighbors = [x for x in self.lcpg[current_node] if not used_nodes[x]]
opt = self
def node_sorting_value(node):
u = opt.d2g.node_by_idx[int(node)]
return (0 if len(opt.d2g.get_covering_variables(node)) - current_path.covering_value == 0 else 1,
u = opt.lcpg.node_by_idx[int(node)]
return (0 if len(opt.lcpg.get_covering_variables(node)) - current_path.covering_value == 0 else 1,
- u.get_link_divergence(),
- opt.d2g[str(u.idx)][str(current_node)]["distance"])
- opt.lcpg[str(u.idx)][str(current_node)]["distance"])
neighbors.sort(key=node_sorting_value)
stack.append(neighbors)
......@@ -74,10 +74,10 @@ class Optimizer:
# 5 - Reinit the search from a side and stop when done
if len(stack) == 0 or (total_nb_steps > 10000 and nb_steps_since_last_score_up >= total_nb_steps / 2):
if first_pass:
used_nodes = {n: False for n in self.d2g.nodes()}
best_path = Path(self.d2g)
used_nodes = {n: False for n in self.lcpg.nodes()}
best_path = Path(self.lcpg)
stack = [[last_increase_node]]
current_path = Path(self.d2g)
current_path = Path(self.lcpg)
if verbose:
print("Start again !")
import time
......
......@@ -4,7 +4,7 @@ import networkx as nx
import random
import argparse
import deconvolution.dgraph.graph_manipulator as gm
import deconvolution.lcp.graph_manipulator as gm
def parse_arguments():
......
......@@ -2,7 +2,7 @@
import argparse
from deconvolution.dgraph import graph_manipulator as gm
from deconvolution.lcp import graph_manipulator as gm
def parse_arguments():
......
......@@ -11,7 +11,7 @@ def parse_arguments():
parser = argparse.ArgumentParser(description='Transform a barcode graph into a lcp graph. The program dig for a set of lcps and then merge them into a lcp graph.')
parser.add_argument('barcode_graph', help='The barcode graph file. Must be a gefx formated file.')
parser.add_argument('--outfile', '-o', default="", help="Output file name for lcp graph.")
parser.add_argument('--threads', '-t', default=8, type=int, help='Number of thread to use for dgraph computation')
parser.add_argument('--threads', '-t', default=8, type=int, help='Number of thread to use for lcp computation')
parser.add_argument('--debug', '-d', action='store_true', help="Debug")
parser.add_argument('--verbose', '-v', action='store_true', help="Verbose")
parser.add_argument('--edge_divergence_threshold', '-dt', default=0.25, type=float, help='Divergence threshold value to link two udgs in the d2-graph')
......
......@@ -4,7 +4,7 @@ import networkx as nx
from collections import Counter
from experiments.CliqueGraph import CliqueGraph
from deconvolution.dgraph.CliqueDGFactory import CliqueDGFactory
from deconvolution.lcp.CliqueLcpFactory import CliqueDGFactory
def parse_arguments():
......@@ -83,7 +83,7 @@ def analyse_clique_graph(barcode_graph):
def analyse_d_graphs(barcode_graph, threads=8, verbose=False):
# Generate udgs
factory = CliqueDGFactory(barcode_graph, 1)
udg_per_node = factory.generate_all_dgraphs(threads=threads, verbose=verbose)
udg_per_node = factory.generate_all_lcps(threads=threads, verbose=verbose)
# Remove duplicate udgs
udgs = {}
for udg_node_lst in udg_per_node.values():
......@@ -120,7 +120,7 @@ def main2():
import os
os.mkdir(debug_path)
factory = CliqueDGFactory(g, 1, debug=args.debug, debug_path=debug_path)
udg_per_node = factory.generate_all_dgraphs(threads=args.threads, verbose=args.verbose)
udg_per_node = factory.generate_all_lcps(threads=args.threads, verbose=args.verbose)
# Remove duplicate udgs
udgs = set()
for udg_node_lst in udg_per_node.values():
......
......@@ -4,7 +4,7 @@ from distutils.core import setup
setup(
name='10X-deconvolve',
version='0.2dev',
packages=['deconvolution.lcpgraph', 'deconvolution.dgraph', 'deconvolution.main', 'experiments', 'tests'],
packages=['deconvolution.lcpgraph', 'deconvolution.lcp', 'deconvolution.main', 'experiments', 'tests'],
license='AGPL V3',
long_description=open('README.md').read(),
)
import unittest
from random import randint
from deconvolution.dgraph.FixedDGIndex import FixedDGIndex
from deconvolution.dgraph.VariableDGIndex import VariableDGIndex
from deconvolution.dgraph.d_graph import Dgraph
from deconvolution.dgraph.graph_manipulator import generate_d_graph_chain
from deconvolution.lcp.FixedDGIndex import FixedDGIndex
from deconvolution.lcp.VariableDGIndex import VariableDGIndex
from deconvolution.lcp.lcp import Dgraph
from deconvolution.lcp.graph_manipulator import generate_d_graph_chain
class TestIndex(unittest.TestCase):
......
......@@ -4,7 +4,7 @@ import networkx as nx
from scipy.special import comb
from deconvolution.lcpgraph.lcp_graph import LcpGraph
from deconvolution.dgraph import graph_manipulator as gm
from deconvolution.lcp import graph_manipulator as gm
class TestD2Graph(unittest.TestCase):
......
......@@ -2,9 +2,9 @@ import unittest
import networkx as nx
from d_graph_data import unit_d_graph
from deconvolution.dgraph.d_graph import Dgraph
from deconvolution.dgraph import graph_manipulator as gm
from deconvolution.dgraph.CliqueDGFactory import CliqueDGFactory
from deconvolution.lcp.lcp import Dgraph
from deconvolution.lcp import graph_manipulator as gm
from deconvolution.lcp.CliqueLcpFactory import CliqueDGFactory
class TestDGraph(unittest.TestCase):
......@@ -65,7 +65,7 @@ class TestDGraph(unittest.TestCase):
def test_generation_no_variability_DGCliqueFactory(self):
barcode_graph = nx.read_gexf("test_data/bar_1000_5_2.gexf")
factory = CliqueDGFactory(barcode_graph, 1)
udg_per_node = factory.generate_all_dgraphs()
udg_per_node = factory.generate_all_lcps()
all_udgs = set()
for udgs in udg_per_node.values():
all_udgs.update(udgs)
......@@ -73,7 +73,7 @@ class TestDGraph(unittest.TestCase):
size = len(all_udgs)
for _ in range(5):
factory = CliqueDGFactory(barcode_graph, 1)
udg_per_node = factory.generate_all_dgraphs()
udg_per_node = factory.generate_all_lcps()
all_udgs = set()
for udgs in udg_per_node.values():
all_udgs.update(udgs)
......
import unittest
from deconvolution.dgraph import graph_manipulator as gm
from deconvolution.lcp import graph_manipulator as gm
class TestGraphManipulation(unittest.TestCase):
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment