Commit f2e45da9 authored by Yoann Dufresne's avatar Yoann Dufresne

udg -> lcp refactoring

parent 5cc89481
...@@ -22,7 +22,7 @@ class Dgraph(object): ...@@ -22,7 +22,7 @@ class Dgraph(object):
self.marked = False self.marked = False
""" Static method to load a dgraph from a text """ Static method to load a lcp from a text
@param text the saved d-graph @param text the saved d-graph
@param barcode_graph Barcode graph from which the d-graph is extracted @param barcode_graph Barcode graph from which the d-graph is extracted
@return a new d-graph object corresponding to the test @return a new d-graph object corresponding to the test
......
...@@ -17,7 +17,7 @@ class AbstractDGIndex(dict): ...@@ -17,7 +17,7 @@ class AbstractDGIndex(dict):
pass pass
def add_value(self, key_set, dgraph): def add_value(self, key_set, dgraph):
""" Add the couple key (set of barcodes) and value (dgraph) at the right place in the dict """ Add the couple key (set of barcodes) and value (lcp) at the right place in the dict
""" """
pass pass
# Test the key size # Test the key size
...@@ -34,8 +34,8 @@ class AbstractDGIndex(dict): ...@@ -34,8 +34,8 @@ class AbstractDGIndex(dict):
@abstractmethod @abstractmethod
def add_dgraph(self, dg): def add_dgraph(self, dg):
""" Generate all the set needed for keys in the dgraph and push the d-graph as value. """ Generate all the set needed for keys in the lcp and push the d-graph as value.
For fixed size of the dgraph all the sets of this size will be generated as key. For fixed size of the lcp all the sets of this size will be generated as key.
Otherwise, all the set of size at least len(dg) - size will be generated. Otherwise, all the set of size at least len(dg) - size will be generated.
""" """
pass pass
......
...@@ -3,7 +3,7 @@ import sys ...@@ -3,7 +3,7 @@ import sys
from abc import abstractmethod from abc import abstractmethod
from multiprocessing import Pool, Value from multiprocessing import Pool, Value
from deconvolution.dgraph.FixedDGIndex import FixedDGIndex, AbstractDGIndex from deconvolution.lcp.FixedDGIndex import FixedDGIndex, AbstractDGIndex
counter = None counter = None
...@@ -45,7 +45,7 @@ class AbstractDGFactory: ...@@ -45,7 +45,7 @@ class AbstractDGFactory:
global counter global counter
counter = Value('i', 0) counter = Value('i', 0)
def generate_all_dgraphs(self, verbose=False, threads=8): def generate_all_lcps(self, verbose=False, threads=8):
index = FixedDGIndex(size=1) index = FixedDGIndex(size=1)
factory = self factory = self
nb_nodes = len(self.graph.nodes()) nb_nodes = len(self.graph.nodes())
......
import networkx as nx import networkx as nx
from collections import Counter from collections import Counter
from deconvolution.dgraph.AbstractDGFactory import AbstractDGFactory from deconvolution.lcp.AbstractLcpFactory import AbstractDGFactory
from deconvolution.dgraph.d_graph import Dgraph from deconvolution.lcp.lcp import Dgraph
from deconvolution.dgraph import AbstractDGIndex from deconvolution.lcp import AbstractDGIndex
class CliqueDGFactory(AbstractDGFactory): class CliqueDGFactory(AbstractDGFactory):
...@@ -24,7 +24,7 @@ class CliqueDGFactory(AbstractDGFactory): ...@@ -24,7 +24,7 @@ class CliqueDGFactory(AbstractDGFactory):
def generate_by_node(self, central_node, subgraph): def generate_by_node(self, central_node, subgraph):
node_d_graphs = set() node_lcps = set()
node_neighbors = {node: [x for x in subgraph[node]] for node in subgraph.nodes} node_neighbors = {node: [x for x in subgraph[node]] for node in subgraph.nodes}
# Clique computation # Clique computation
...@@ -113,7 +113,7 @@ class CliqueDGFactory(AbstractDGFactory): ...@@ -113,7 +113,7 @@ class CliqueDGFactory(AbstractDGFactory):
# Create candidate udg # Create candidate udg
d_graph = Dgraph(central_node) d_graph = Dgraph(central_node)
d_graph.put_halves(list(clq1), list(clq2), subgraph) d_graph.put_halves(list(clq1), list(clq2), subgraph)
node_d_graphs.add(d_graph) node_lcps.add(d_graph)
if self.debug is not None: if self.debug is not None:
mwm_results.append(" <-> ".join(sorted([clique_names[idx1], clique_names[idx2]]))) mwm_results.append(" <-> ".join(sorted([clique_names[idx1], clique_names[idx2]])))
...@@ -126,4 +126,4 @@ class CliqueDGFactory(AbstractDGFactory): ...@@ -126,4 +126,4 @@ class CliqueDGFactory(AbstractDGFactory):
for result in mwm_results: for result in mwm_results:
matching.write(f"{result}\n") matching.write(f"{result}\n")
return node_d_graphs return node_lcps
from itertools import combinations from itertools import combinations
from deconvolution.dgraph.AbstractDGIndex import AbstractDGIndex from deconvolution.lcp.AbstractDGIndex import AbstractDGIndex
class FixedDGIndex(AbstractDGIndex): class FixedDGIndex(AbstractDGIndex):
...@@ -11,7 +11,7 @@ class FixedDGIndex(AbstractDGIndex): ...@@ -11,7 +11,7 @@ class FixedDGIndex(AbstractDGIndex):
def _verify_key(self, key_set, dg_size=0): def _verify_key(self, key_set, dg_size=0):
if len(key_set) != self.size: if len(key_set) != self.size:
raise ValueError("Wrong set size in the dgraph") raise ValueError("Wrong set size in the lcp")
def add_dgraph(self, dg): def add_dgraph(self, dg):
......
import networkx as nx import networkx as nx
from deconvolution.dgraph.AbstractDGFactory import AbstractDGFactory from deconvolution.lcp.AbstractLcpFactory import AbstractDGFactory
from deconvolution.dgraph.d_graph import Dgraph from deconvolution.lcp.lcp import Dgraph
import community import community
class LouvainDGFactory(AbstractDGFactory): class LouvainDGFactory(AbstractDGFactory):
......
from itertools import combinations from itertools import combinations
from deconvolution.dgraph.AbstractDGIndex import AbstractDGIndex from deconvolution.lcp.AbstractDGIndex import AbstractDGIndex
class VariableDGIndex(AbstractDGIndex): class VariableDGIndex(AbstractDGIndex):
...@@ -11,7 +11,7 @@ class VariableDGIndex(AbstractDGIndex): ...@@ -11,7 +11,7 @@ class VariableDGIndex(AbstractDGIndex):
def _verify_key(self, key_set, dg_size=0): def _verify_key(self, key_set, dg_size=0):
if len(key_set) < dg_size - self.size: if len(key_set) < dg_size - self.size:
raise ValueError("Wrong set size in the dgraph") raise ValueError("Wrong set size in the lcp")
def add_dgraph(self, dg): def add_dgraph(self, dg):
......
...@@ -20,7 +20,7 @@ class Dgraph(object): ...@@ -20,7 +20,7 @@ class Dgraph(object):
@staticmethod @staticmethod
def load(lcp_txt, score, variables): def load(lcp_txt, score, variables):
""" Static method to load a dgraph from a text """ Static method to load a lcp from a text
:param lcp_txt: the saved d-graph :param lcp_txt: the saved d-graph
:return: a new d-graph object corresponding to the test :return: a new d-graph object corresponding to the test
""" """
......
...@@ -3,11 +3,11 @@ import itertools ...@@ -3,11 +3,11 @@ import itertools
from bidict import bidict from bidict import bidict
import sys import sys
# from deconvolution.dgraph.FixedDGIndex import FixedDGIndex # from deconvolution.lcp.FixedDGIndex import FixedDGIndex
from deconvolution.dgraph.VariableDGIndex import VariableDGIndex from deconvolution.lcp.VariableDGIndex import VariableDGIndex
from deconvolution.dgraph.d_graph import Dgraph from deconvolution.lcp.lcp import Dgraph
from deconvolution.dgraph.CliqueDGFactory import CliqueDGFactory from deconvolution.lcp.CliqueLcpFactory import CliqueDGFactory
from deconvolution.dgraph.LouvainDGFactory import LouvainDGFactory from deconvolution.lcp.LouvainDGFactory import LouvainDGFactory
class LcpGraph(nx.Graph): class LcpGraph(nx.Graph):
...@@ -80,7 +80,7 @@ class LcpGraph(nx.Graph): ...@@ -80,7 +80,7 @@ class LcpGraph(nx.Graph):
lcp_factory = LouvainDGFactory(self.barcode_graph) lcp_factory = LouvainDGFactory(self.barcode_graph)
else: else:
lcp_factory = CliqueDGFactory(self.barcode_graph, min_size_clique=min_size_clique, debug=self.debug, debug_path=self.debug_path) lcp_factory = CliqueDGFactory(self.barcode_graph, min_size_clique=min_size_clique, debug=self.debug, debug_path=self.debug_path)
self.lcp_per_node = lcp_factory.generate_all_dgraphs(threads=threads, verbose=verbose) self.lcp_per_node = lcp_factory.generate_all_lcps(threads=threads, verbose=verbose)
if verbose: if verbose:
counts = sum(len(x) for x in self.lcp_per_node.values()) counts = sum(len(x) for x in self.lcp_per_node.values())
print(f"\t {counts} computed d-graphs") print(f"\t {counts} computed d-graphs")
......
from deconvolution.lcpgraph.lcp_path import Path
""" Greedy algorithm. Start with th most probable unitig (ie lowest normalized penalty first and largest unitig for
equalities). Then extends on both side to the nearest interesting unitig.
an interesting unitig is a unitig that add new covering variables.
"""
def construct_path_from_unitigs(unitigs, d2g):
# Initiate covering variables
used_variables = {x: False for x in d2g.barcode_edge_idxs}
# Initiate the path with longest sure unitig
path = Path(d2g)
first_utg = unitigs.pop(0)
path.add_path(first_utg)
print(len(path), path.covering_score())
# While it's possible, search to add unitigs
while len(unitigs) > 0:
# Search for next unitig
path_extension = _search_way_to_next_unitig(path, unitigs, d2g)
if path_extension is None:
break
# TODO: Search for unitigs on the other side of the path
# Add the unitig to the path
path.add_path(path_extension)
print(len(path), path.covering_score())
unitigs = [utg for utg in unitigs if path.covering_difference(utg) > 0]
return path
""" Look for the next unitig to join from right endpoint of the path
@param path Currently created path
@param unitigs List of unitigs of interest (be careful to fill this list with unitigs that add new covered variables
@param lcpg The lcpg to use as path support
@return (path_to, utg) Respectively the path to the next unitig of interest (from the right of the current path) and
the corresponding unitig (linked to the path from left to right)
"""
def _search_way_to_next_unitig(path, unitigs, d2g):
# Register endpoints
endpoints = {}
for utg in unitigs:
endpoints[utg[0]] = utg
if len(utg) > 1:
endpoints[utg[-1]] = utg
# Search for next endpoint
best_paths = _search_endpoint(path[-1], endpoints, d2g, path)
if len(best_paths) == 0:
return None
# Search for the best path (unitig included) to add (The one that cover the most new variables)
the_best_path = None
covered_variables = 0
for extension in best_paths:
utg = endpoints[extension[-1]]
# if the utg is in the wrong size
if utg[-1] == extension[-1]:
utg.reverse()
complete_path = Path(d2g)
complete_path.add_path(extension[1:])
complete_path.add_path(utg[1:])
num_new_vars = path.covering_difference(complete_path)
if num_new_vars > covered_variables:
covered_variables = num_new_vars
the_best_path = complete_path
return the_best_path
""" Search the min-penalty paths from a start udg to any of the target nodes.
@param start_udg The start node of the path
@param targets A list of udg of interest
@param lcpg The lcpg to follow
@param forbidden_udgs excluded nodes from the lcpg
@return A list of equivalent paths
"""
def _search_endpoint(start_udg, targets, d2g, forbidden_udgs):
marked_nodes = {x: (None, None) for x in forbidden_udgs}
covered_variables = set(forbidden_udgs.covering_variables.keys())
# Init Dijkstra
to_explore = [start_udg]
marked_nodes[start_udg] = (0, None)
found = []
# Dijkstra part 1, compute penalties and previous nodes
while len(to_explore) > 0 and len(found) == 0:
# Select min penalty in to_explore
current = min(to_explore, key=lambda x: marked_nodes[x][0])
current_penalty = marked_nodes[current][0]
to_explore.remove(current)
# Filter neighbors by there covering values
neighbors = d2g.neighbors(str(current.idx))
filtered_neighbors = []
for n in neighbors:
nei = d2g.node_by_idx[int(n)]
if len(d2g.get_covering_variables([nei]) - covered_variables) > 0:
filtered_neighbors.append(n)
neighbors = filtered_neighbors
# Explore all the neighbors of the current node.
for nei_idx in neighbors:
nei_udg = d2g.node_by_idx[int(nei_idx)]
penalty = current_penalty + d2g[nei_idx][str(current.idx)]["distance"]
if nei_udg in marked_nodes:
# Forbidden node
if marked_nodes[nei_udg][0] is None:
continue
# Already inserted but update the min penalty to reach the udg
if penalty < marked_nodes[nei_udg][0]:
marked_nodes[nei_udg] = penalty, current
else:
# First encounter: insert to explore and set min penalty
marked_nodes[nei_udg] = (penalty, current)
to_explore.append(nei_udg)
# Set stop if a target is found
if nei_udg in targets:
if len(found) == 0:
found.append(nei_udg)
elif marked_nodes[found[0]][0] == penalty:
found.append(nei_udg)
elif marked_nodes[found[0]][0] > penalty:
found = [nei_udg]
# Dijkstra part 2, retrieve the best paths
paths = []
for target in found:
# Start a path
node_path = [target]
# Add all nodes one by one to the first
while node_path[-1] != start_udg:
node_path.append(marked_nodes[node_path[-1]][1])
# Reverse the path to be on the right order
node_path.reverse()
# Create and add a real path object
path = Path(d2g)
path.add_path(node_path)
paths.append(path)
return paths
...@@ -6,24 +6,24 @@ from deconvolution.lcpgraph.lcp_path import Path ...@@ -6,24 +6,24 @@ from deconvolution.lcpgraph.lcp_path import Path
class Optimizer: class Optimizer:
def __init__(self, d2g): def __init__(self, lcpg):
self.d2g = d2g self.lcpg = lcpg
def bb_solution(self, start_node=None, verbose=False): def bb_solution(self, start_node=None, verbose=False):
nb_steps_since_last_score_up = 0 nb_steps_since_last_score_up = 0
total_nb_steps = 0 total_nb_steps = 0
first_pass = True first_pass = True
used_nodes = {n: False for n in self.d2g.nodes()} used_nodes = {n: False for n in self.lcpg.nodes()}
# Init solution for # Init solution for
if start_node == None: if start_node == None:
start_node = list(used_nodes.keys()) start_node = list(used_nodes.keys())
start_node = start_node[random.randint(0, len(start_node)-1)] start_node = start_node[random.randint(0, len(start_node)-1)]
best_path = Path(self.d2g) best_path = Path(self.lcpg)
stack = [[start_node]] stack = [[start_node]]
current_path = Path(self.d2g) current_path = Path(self.lcpg)
last_increase_node = None last_increase_node = None
max_cov = len(current_path.covering_variables) max_cov = len(current_path.covering_variables)
...@@ -52,14 +52,14 @@ class Optimizer: ...@@ -52,14 +52,14 @@ class Optimizer:
nb_steps_since_last_score_up += 1 nb_steps_since_last_score_up += 1
# 3 - Neighbors preparation # 3 - Neighbors preparation
neighbors = [x for x in self.d2g[current_node] if not used_nodes[x]] neighbors = [x for x in self.lcpg[current_node] if not used_nodes[x]]
opt = self opt = self
def node_sorting_value(node): def node_sorting_value(node):
u = opt.d2g.node_by_idx[int(node)] u = opt.lcpg.node_by_idx[int(node)]
return (0 if len(opt.d2g.get_covering_variables(node)) - current_path.covering_value == 0 else 1, return (0 if len(opt.lcpg.get_covering_variables(node)) - current_path.covering_value == 0 else 1,
- u.get_link_divergence(), - u.get_link_divergence(),
- opt.d2g[str(u.idx)][str(current_node)]["distance"]) - opt.lcpg[str(u.idx)][str(current_node)]["distance"])
neighbors.sort(key=node_sorting_value) neighbors.sort(key=node_sorting_value)
stack.append(neighbors) stack.append(neighbors)
...@@ -74,10 +74,10 @@ class Optimizer: ...@@ -74,10 +74,10 @@ class Optimizer:
# 5 - Reinit the search from a side and stop when done # 5 - Reinit the search from a side and stop when done
if len(stack) == 0 or (total_nb_steps > 10000 and nb_steps_since_last_score_up >= total_nb_steps / 2): if len(stack) == 0 or (total_nb_steps > 10000 and nb_steps_since_last_score_up >= total_nb_steps / 2):
if first_pass: if first_pass:
used_nodes = {n: False for n in self.d2g.nodes()} used_nodes = {n: False for n in self.lcpg.nodes()}
best_path = Path(self.d2g) best_path = Path(self.lcpg)
stack = [[last_increase_node]] stack = [[last_increase_node]]
current_path = Path(self.d2g) current_path = Path(self.lcpg)
if verbose: if verbose:
print("Start again !") print("Start again !")
import time import time
......
...@@ -4,7 +4,7 @@ import networkx as nx ...@@ -4,7 +4,7 @@ import networkx as nx
import random import random
import argparse import argparse
import deconvolution.dgraph.graph_manipulator as gm import deconvolution.lcp.graph_manipulator as gm
def parse_arguments(): def parse_arguments():
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
import argparse import argparse
from deconvolution.dgraph import graph_manipulator as gm from deconvolution.lcp import graph_manipulator as gm
def parse_arguments(): def parse_arguments():
......
...@@ -11,7 +11,7 @@ def parse_arguments(): ...@@ -11,7 +11,7 @@ def parse_arguments():
parser = argparse.ArgumentParser(description='Transform a barcode graph into a lcp graph. The program dig for a set of lcps and then merge them into a lcp graph.') parser = argparse.ArgumentParser(description='Transform a barcode graph into a lcp graph. The program dig for a set of lcps and then merge them into a lcp graph.')
parser.add_argument('barcode_graph', help='The barcode graph file. Must be a gefx formated file.') parser.add_argument('barcode_graph', help='The barcode graph file. Must be a gefx formated file.')
parser.add_argument('--outfile', '-o', default="", help="Output file name for lcp graph.") parser.add_argument('--outfile', '-o', default="", help="Output file name for lcp graph.")
parser.add_argument('--threads', '-t', default=8, type=int, help='Number of thread to use for dgraph computation') parser.add_argument('--threads', '-t', default=8, type=int, help='Number of thread to use for lcp computation')
parser.add_argument('--debug', '-d', action='store_true', help="Debug") parser.add_argument('--debug', '-d', action='store_true', help="Debug")
parser.add_argument('--verbose', '-v', action='store_true', help="Verbose") parser.add_argument('--verbose', '-v', action='store_true', help="Verbose")
parser.add_argument('--edge_divergence_threshold', '-dt', default=0.25, type=float, help='Divergence threshold value to link two udgs in the d2-graph') parser.add_argument('--edge_divergence_threshold', '-dt', default=0.25, type=float, help='Divergence threshold value to link two udgs in the d2-graph')
......
...@@ -4,7 +4,7 @@ import networkx as nx ...@@ -4,7 +4,7 @@ import networkx as nx
from collections import Counter from collections import Counter
from experiments.CliqueGraph import CliqueGraph from experiments.CliqueGraph import CliqueGraph
from deconvolution.dgraph.CliqueDGFactory import CliqueDGFactory from deconvolution.lcp.CliqueLcpFactory import CliqueDGFactory
def parse_arguments(): def parse_arguments():
...@@ -83,7 +83,7 @@ def analyse_clique_graph(barcode_graph): ...@@ -83,7 +83,7 @@ def analyse_clique_graph(barcode_graph):
def analyse_d_graphs(barcode_graph, threads=8, verbose=False): def analyse_d_graphs(barcode_graph, threads=8, verbose=False):
# Generate udgs # Generate udgs
factory = CliqueDGFactory(barcode_graph, 1) factory = CliqueDGFactory(barcode_graph, 1)
udg_per_node = factory.generate_all_dgraphs(threads=threads, verbose=verbose) udg_per_node = factory.generate_all_lcps(threads=threads, verbose=verbose)
# Remove duplicate udgs # Remove duplicate udgs
udgs = {} udgs = {}
for udg_node_lst in udg_per_node.values(): for udg_node_lst in udg_per_node.values():
...@@ -120,7 +120,7 @@ def main2(): ...@@ -120,7 +120,7 @@ def main2():
import os import os
os.mkdir(debug_path) os.mkdir(debug_path)
factory = CliqueDGFactory(g, 1, debug=args.debug, debug_path=debug_path) factory = CliqueDGFactory(g, 1, debug=args.debug, debug_path=debug_path)
udg_per_node = factory.generate_all_dgraphs(threads=args.threads, verbose=args.verbose) udg_per_node = factory.generate_all_lcps(threads=args.threads, verbose=args.verbose)
# Remove duplicate udgs # Remove duplicate udgs
udgs = set() udgs = set()
for udg_node_lst in udg_per_node.values(): for udg_node_lst in udg_per_node.values():
......
...@@ -4,7 +4,7 @@ from distutils.core import setup ...@@ -4,7 +4,7 @@ from distutils.core import setup
setup( setup(
name='10X-deconvolve', name='10X-deconvolve',
version='0.2dev', version='0.2dev',
packages=['deconvolution.lcpgraph', 'deconvolution.dgraph', 'deconvolution.main', 'experiments', 'tests'], packages=['deconvolution.lcpgraph', 'deconvolution.lcp', 'deconvolution.main', 'experiments', 'tests'],
license='AGPL V3', license='AGPL V3',
long_description=open('README.md').read(), long_description=open('README.md').read(),
) )
import unittest import unittest
from random import randint from random import randint
from deconvolution.dgraph.FixedDGIndex import FixedDGIndex from deconvolution.lcp.FixedDGIndex import FixedDGIndex
from deconvolution.dgraph.VariableDGIndex import VariableDGIndex from deconvolution.lcp.VariableDGIndex import VariableDGIndex
from deconvolution.dgraph.d_graph import Dgraph from deconvolution.lcp.lcp import Dgraph
from deconvolution.dgraph.graph_manipulator import generate_d_graph_chain from deconvolution.lcp.graph_manipulator import generate_d_graph_chain
class TestIndex(unittest.TestCase): class TestIndex(unittest.TestCase):
......
...@@ -4,7 +4,7 @@ import networkx as nx ...@@ -4,7 +4,7 @@ import networkx as nx
from scipy.special import comb from scipy.special import comb
from deconvolution.lcpgraph.lcp_graph import LcpGraph from deconvolution.lcpgraph.lcp_graph import LcpGraph
from deconvolution.dgraph import graph_manipulator as gm from deconvolution.lcp import graph_manipulator as gm
class TestD2Graph(unittest.TestCase): class TestD2Graph(unittest.TestCase):
......
...@@ -2,9 +2,9 @@ import unittest ...@@ -2,9 +2,9 @@ import unittest
import networkx as nx import networkx as nx
from d_graph_data import unit_d_graph from d_graph_data import unit_d_graph
from deconvolution.dgraph.d_graph import Dgraph from deconvolution.lcp.lcp import Dgraph
from deconvolution.dgraph import graph_manipulator as gm