Commit 6cb17069 authored by rchikhi's avatar rchikhi
Browse files

merge

parents 268fcbf6 02f87907
......@@ -2,6 +2,16 @@
Trying to deconvolve single tag assignments that are shared by multiple molecules
## Installation
Install the package from the root directory.
```bash
# For users
pip install . --user
# For developers
pip install -e . --user
```
## Scripts
Most of the scripts use argparse; run any of them with `--help` to see its options.
......@@ -48,5 +58,5 @@ Config parameters:
```bash
snakemake -s Snakefile_data_simu --config n=10000 m=[4,6,8,10,12] m_dev=[0,0.5,1,2,3]
snakemake -s Snakefile_d2 --config input=[snakemake -s Snakefile_d2 --config input=[snake_exec/simu_bar_n10000_d5_m10-dev0.5.gexf,snake_exec/simu_bar_n10000_d5_m10-dev0.gexf,snake_exec/simu_bar_n10000_d5_m10-dev1.gexf,snake_exec/simu_bar_n10000_d5_m10-dev2.gexf,snake_exec/simu_bar_n10000_d5_m10-dev3.gexf,snake_exec/simu_bar_n10000_d5_m12-dev0.5.gexf,snake_exec/simu_bar_n10000_d5_m12-dev0.gexf,snake_exec/simu_bar_n10000_d5_m12-dev1.gexf,snake_exec/simu_bar_n10000_d5_m12-dev2.gexf,snake_exec/simu_bar_n10000_d5_m12-dev3.gexf,snake_exec/simu_bar_n10000_d5_m4-dev0.5.gexf,snake_exec/simu_bar_n10000_d5_m4-dev0.gexf,snake_exec/simu_bar_n10000_d5_m4-dev1.gexf,snake_exec/simu_bar_n10000_d5_m4-dev2.gexf,snake_exec/simu_bar_n10000_d5_m4-dev3.gexf,snake_exec/simu_bar_n10000_d5_m6-dev0.5.gexf,snake_exec/simu_bar_n10000_d5_m6-dev0.gexf,snake_exec/simu_bar_n10000_d5_m6-dev1.gexf,snake_exec/simu_bar_n10000_d5_m6-dev2.gexf,snake_exec/simu_bar_n10000_d5_m6-dev3.gexf,snake_exec/simu_bar_n10000_d5_m8-dev0.5.gexf,snake_exec/simu_bar_n10000_d5_m8-dev0.gexf,snake_exec/simu_bar_n10000_d5_m8-dev1.gexf,snake_exec/simu_bar_n10000_d5_m8-dev2.gexf,snake_exec/simu_bar_n10000_d5_m8-dev3.gexf]]
snakemake -s Snakefile_d2 --config input=[snake_exec/simu_bar_n10000_d5_m10-dev0.5.gexf,snake_exec/simu_bar_n10000_d5_m10-dev0.gexf,snake_exec/simu_bar_n10000_d5_m10-dev1.gexf,snake_exec/simu_bar_n10000_d5_m10-dev2.gexf,snake_exec/simu_bar_n10000_d5_m10-dev3.gexf,snake_exec/simu_bar_n10000_d5_m12-dev0.5.gexf,snake_exec/simu_bar_n10000_d5_m12-dev0.gexf,snake_exec/simu_bar_n10000_d5_m12-dev1.gexf,snake_exec/simu_bar_n10000_d5_m12-dev2.gexf,snake_exec/simu_bar_n10000_d5_m12-dev3.gexf,snake_exec/simu_bar_n10000_d5_m4-dev0.5.gexf,snake_exec/simu_bar_n10000_d5_m4-dev0.gexf,snake_exec/simu_bar_n10000_d5_m4-dev1.gexf,snake_exec/simu_bar_n10000_d5_m4-dev2.gexf,snake_exec/simu_bar_n10000_d5_m4-dev3.gexf,snake_exec/simu_bar_n10000_d5_m6-dev0.5.gexf,snake_exec/simu_bar_n10000_d5_m6-dev0.gexf,snake_exec/simu_bar_n10000_d5_m6-dev1.gexf,snake_exec/simu_bar_n10000_d5_m6-dev2.gexf,snake_exec/simu_bar_n10000_d5_m6-dev3.gexf,snake_exec/simu_bar_n10000_d5_m8-dev0.5.gexf,snake_exec/simu_bar_n10000_d5_m8-dev0.gexf,snake_exec/simu_bar_n10000_d5_m8-dev1.gexf,snake_exec/simu_bar_n10000_d5_m8-dev2.gexf,snake_exec/simu_bar_n10000_d5_m8-dev3.gexf]
```
......@@ -17,15 +17,16 @@ rule compress_data:
input:
barcode_graph=f"{WORKDIR}/{{barcode_file}}.gexf",
d2_raw=f"{WORKDIR}/{{barcode_file}}_d2_raw_maxclq.gexf",
simplified_d2=f"{WORKDIR}/{{barcode_file}}_d2_simplified_maxclq.gexf",
d2_raw_louvain=f"{WORKDIR}/{{barcode_file}}_d2_raw_louvain.gexf",
simplified_d2_louvain=f"{WORKDIR}/{{barcode_file}}_d2_simplified_louvain.gexf"
simplified_d2=f"{WORKDIR}/{{barcode_file}}_d2_simplified_maxclq.gexf"
# d2_raw_louvain=f"{WORKDIR}/{{barcode_file}}_d2_raw_louvain.gexf",
# simplified_d2_louvain=f"{WORKDIR}/{{barcode_file}}_d2_simplified_louvain.gexf"
output:
zip=f"{WORKDIR}/{{barcode_file}}.tar.gz"
shell:
f"cd {WORKDIR}/ ;"
"if [ ! -d {wildcards.barcode_file} ]; then mkdir {wildcards.barcode_file} ; fi;"
"mv *.gexf {wildcards.barcode_file}/ ;"
"mv {wildcards.barcode_file}_* {wildcards.barcode_file}/ ;"
"cp {wildcards.barcode_file}.gexf {wildcards.barcode_file}/ ;"
"tar czf {wildcards.barcode_file}.tar.gz {wildcards.barcode_file}/ ;"
"rm -rf {wildcards.barcode_file}/ ;"
"cd - ;"
......@@ -37,7 +38,7 @@ rule d2_simplification:
output:
simplified_d2="{barcode_path}_d2_simplified_{method}.gexf"
shell:
"python3 deconvolution/d2_reduction.py -o {output.simplified_d2} {input.barcode_graph} {input.d2_raw}"
"python3 deconvolution/main/d2_reduction.py -o {output.simplified_d2} {input.barcode_graph} {input.d2_raw}"
rule d2_generation:
......@@ -46,7 +47,7 @@ rule d2_generation:
output:
d2_file=f"{WORKDIR}/{{file}}_d2_raw_{{method}}.gexf"
run:
shell(f"python3 deconvolution/to_d2_graph.py {{input.barcode_graph}} --{{wildcards.method}} -o {WORKDIR}/{{wildcards.file}}_d2_raw_{{wildcards.method}}")
shell(f"python3 deconvolution/main/to_d2_graph.py {{input.barcode_graph}} --{{wildcards.method}} -o {WORKDIR}/{{wildcards.file}}_d2_raw_{{wildcards.method}}")
rule setup_workdir:
......@@ -58,4 +59,4 @@ rule setup_workdir:
shell(f"if [ ! -d {WORKDIR} ]; then mkdir {WORKDIR}; fi;")
shell(f"cp {{input}} {WORKDIR}")
if input[-5:] != ".gexf":
shell("python3 deconvolution/gexf_converter.py {input}")
shell("python3 deconvolution/main/gexf_converter.py {input}")
......@@ -16,11 +16,11 @@ rule generate_barcodes:
output:
"{path}/simu_bar_{params}_m{m}-dev{md}.gexf"
shell:
"python3 deconvolution/generate_fake_barcode_graph.py --merging_depth {wildcards.m} --deviation {wildcards.md} --input_graph {input} --output {output}"
"python3 deconvolution/main/generate_fake_barcode_graph.py --merging_depth {wildcards.m} --deviation {wildcards.md} --input_graph {input} --output {output}"
rule generate_molecules:
output:
"{path}/simu_mol_n{n}_d{d}.gexf"
shell:
"python3 deconvolution/generate_fake_molecule_graph.py --num_molecule {wildcards.n} --avg_depth {wildcards.d} --output {output}"
"python3 deconvolution/main/generate_fake_molecule_graph.py --num_molecule {wildcards.n} --avg_depth {wildcards.d} --output {output}"
import networkx as nx
from itertools import combinations
from d2_path import Path, Unitig
from deconvolution.d2graph.d2_path import Unitig
""" Remove unnecessary transitions
......
......@@ -3,7 +3,11 @@ import itertools
from bidict import bidict
import sys
from d_graph import Dgraph, compute_all_max_d_graphs, filter_dominated, list_domination_filter
from deconvolution.dgraph.FixedDGIndex import FixedDGIndex
from deconvolution.dgraph.VariableDGIndex import VariableDGIndex
from deconvolution.dgraph.d_graph import Dgraph
from deconvolution.dgraph.CliqueDGFactory import CliqueDGFactory
from deconvolution.dgraph.LouvainDGFactory import LouvainDGFactory
class D2Graph(nx.Graph):
......@@ -55,23 +59,22 @@ class D2Graph(nx.Graph):
return self.subgraph(list(self.nodes()))
def construct_from_barcodes(self, index_size=3, verbose=True, debug=False, clique_mode=None):
import debug_disct as dd
def construct_from_barcodes(self, index_size=3, verbose=True, debug=False, clique_mode=None, threads=1):
# Compute all the d-graphs
if verbose:
print("Computing the unit d-graphs..")
self.d_graphs_per_node = compute_all_max_d_graphs(self.barcode_graph, debug=debug, clique_mode=clique_mode)
dg_factory = None
if clique_mode == "louvain":
dg_factory = LouvainDGFactory(self.barcode_graph)
else:
dg_factory = CliqueDGFactory(self.barcode_graph)
self.d_graphs_per_node = dg_factory.generate_all_dgraphs(debug=True)
if verbose:
counts = sum(len(x) for x in self.d_graphs_per_node.values())
print(f"\t {counts} computed d-graphs")
#self.d_graphs_per_node = filter_dominated(self.d_graphs_per_node)
#if verbose:
# counts = sum(len(x) for x in self.d_graphs_per_node.values())
# print(f"\t {counts} remaining d-graphs after first filter")
for d_graphs in self.d_graphs_per_node.values():
self.all_d_graphs.extend(d_graphs)
# Name the d-graphs
# Number the d_graphs
for idx, d_graph in enumerate(self.all_d_graphs):
d_graph.idx = idx
......@@ -79,15 +82,26 @@ class D2Graph(nx.Graph):
# Index all the d-graphs
if verbose:
print("Compute the dmer index")
self.index = self.create_index_from_tuples(index_size, verbose=verbose)
self.filter_dominated_in_index(tuple_size=index_size, verbose=verbose)
print("Compute the dmer dgraph")
print("\tIndexing")
# self.index = FixedDGIndex(size=index_size)
self.index = VariableDGIndex(size=2)
for idx, dg in enumerate(self.all_d_graphs):
if verbose:
print(f"\r\t{idx+1}/{len(self.all_d_graphs)}", end='')
self.index.add_dgraph(dg)
# self.var_index.add_dgraph(dg)
if verbose:
print()
print("\tFilter index")
self.index.filter_by_entry()
# self.index = self.create_index_from_tuples(index_size, verbose=verbose)
# self.filter_dominated_in_index(tuple_size=index_size, verbose=verbose)
# Compute node distances for pair of dgraphs that share at least 1 dmer.
if verbose:
print("Compute the graph")
# Create the graph
self.bidict_nodes = self.create_graph()
#self.compute_distances()
def get_covering_variables(self, udg):
......@@ -203,55 +217,3 @@ class D2Graph(nx.Graph):
# Distance computing and adding in the dist dicts
d = dg1.distance_to(dg2)
data["distance"] = d
def filter_dominated_in_index(self, tuple_size=3, verbose=True):
    """Remove dominated d-graphs from the dmer index and from this object's
    d-graph collections.

    :param tuple_size: size of the dmer tuples used as index keys
    :param verbose: when True, print progress to stdout
    """
    to_remove = set()
    if verbose:
        print("\tFilter dominated in index")
    # Find dominated
    for dmer_idx, item in enumerate(self.index.items()):
        dmer, dg_list = item
        if verbose:
            sys.stdout.write(f"\r\t{dmer_idx+1}/{len(self.index)}")
            sys.stdout.flush()
        # NOTE(review): presumably returns the subset of dg_list that is not
        # dominated by any other entry — confirm against list_domination_filter.
        undominated = list_domination_filter(dg_list)
        # Register dominated
        if len(dg_list) != len(undominated):
            for dg in dg_list:
                if dg not in undominated:
                    to_remove.add(dg)
            self.index[dmer] = undominated
    if verbose:
        print()
        print("\tDmer removal")
    removable_dmers = set()
    for r_idx, r_dg in enumerate(to_remove):
        if verbose:
            sys.stdout.write(f"\r\t{r_idx+1}/{len(to_remove)}")
            sys.stdout.flush()
        # Drop the dominated d-graph from both per-object collections.
        self.all_d_graphs.remove(r_dg)
        self.d_graphs_per_node[r_dg.center].remove(r_dg)
        # Remove dominated in index
        for dmer in itertools.combinations(r_dg.to_sorted_list(), tuple_size):
            if r_dg in self.index[dmer]:
                self.index[dmer] = list(filter(lambda x: x!=r_dg, self.index[dmer]))
                if len(self.index[dmer]) == 0:
                    # Entry emptied out; deletion is deferred to avoid
                    # mutating the dict while combinations are processed.
                    removable_dmers.add(dmer)
    # Remove empty dmers
    for dmer in removable_dmers:
        del self.index[dmer]
    if verbose:
        print()
import networkx as nx
from d2_path import Path
from deconvolution.d2graph.d2_path import Path
""" Greedy algorithm. Start with th most probable unitig (ie lowest normalized penalty first and largest unitig for
equalities). Then extends on both side to the nearest interesting unitig.
......
import random
from d2_path import Path
import networkx as nx
from deconvolution.d2graph.d2_path import Path
class Optimizer:
......
def save(dict, filename):
    """Dump a key -> d-graph-list mapping to a text file.

    Each entry is written as a header line "<count> <key>" followed by one
    line per d-graph holding the sorted list of its nodes.
    NOTE(review): the parameter name `dict` shadows the builtin; kept as-is
    for interface compatibility.

    :param dict: mapping of key -> iterable of objects exposing `.nodes`
    :param filename: destination path (overwritten)
    """
    with open(filename, "w") as fp:
        for key, array in dict.items():
            fp.write(f"{len(array)} {key}\n")
            body = "\n".join(str(sorted(graph.nodes)) for graph in array)
            fp.write(body + "\n")
    print(filename, "saved")
def load(filename):
    """Load a dictionary previously written by save().

    The file format is: a header line "<count> <key>" (the key may contain
    spaces) followed by <count> lines, one stored value per line.

    :param filename: path of the file to parse
    :return: dict mapping each key to its list of value strings
    """
    d = {}
    with open(filename) as fp:
        value = None   # key currently being filled; None means "expect a header"
        nb_vals = 0    # number of value lines still expected for `value`
        for line in fp:
            line = line.strip()
            if value is None:  # fix: identity test instead of `== None`
                # Header line: split on the first space only.
                first_space = line.find(' ')
                nb_vals = int(line[:first_space])
                value = line[first_space+1:]
                d[value] = []
                # Fix: a zero-count record has no value lines, so the next
                # line is another header (previously the parser got stuck).
                if nb_vals == 0:
                    value = None
            else:
                d[value].append(line)
                nb_vals -= 1
                if nb_vals == 0:
                    value = None
    print(filename, "loaded")
    return d
def compare(d1, d2):
    """Print the differences between two key -> d-graph-list dictionaries.

    d1 values are objects exposing `.nodes`; d2 values are their string
    representations (e.g. from a save()/load() round trip). Keys present in
    only one dict are reported, as are keys whose value lists differ.

    :param d1: mapping key -> iterable of objects with `.nodes`
    :param d2: mapping key -> iterable of strings
    """
    remaining = set(d2.keys())
    for key in d1:
        if key not in d2:
            print(key, "not present in d2")
            # Fix: skip to the next key — removing it from `remaining`
            # (built from d2's keys) would raise KeyError.
            continue
        remaining.remove(key)
        l1 = sorted(str(sorted(x.nodes)) for x in d1[key])
        l2 = sorted(str(x) for x in d2[key])
        if l1 != l2:
            print(f"{key}: disimilar lists:")
            s1 = set(l1)
            s2 = set(l2)
            print(s1 - s2)
            print(s2 - s1)
    for key in remaining:
        print(key, "not present in d1")
import networkx as nx
from abc import abstractmethod
from deconvolution.dgraph.FixedDGIndex import FixedDGIndex
class AbstractDGFactory:
    """Template-method base class: iterates every node of a barcode graph and
    delegates the per-node d-graph generation to subclasses."""

    def __init__(self, graph):
        # Barcode graph the d-graphs are generated from.
        self.graph = graph

    def generate_all_dgraphs(self, debug=False):
        """Build an index of all candidate d-graphs, one entry per node.

        :param debug: when True, print a progress line to stdout
        :return: a FixedDGIndex keyed by frozenset({node}), domination-filtered
        """
        index = FixedDGIndex(size=1)
        total = len(self.graph.nodes())
        for position, node in enumerate(self.graph.nodes()):
            if debug: print(f"\r{position+1}/{total} node analysis", end='')
            # Restrict the candidate search to the node's direct neighbourhood.
            neighbourhood = nx.Graph(self.graph.subgraph(list(self.graph.neighbors(node))))
            # Fill the index by node
            entry = frozenset({node})
            for dgraph in self.generate_by_node(node, neighbourhood):
                index.add_value(entry, dgraph)
        index.filter_by_entry()
        return index

    @abstractmethod
    def generate_by_node(self, node, subgraph):
        """Return the candidate d-graphs centred on `node`, computed inside
        its neighbourhood subgraph. Implemented by subclasses."""
        pass
# def compute_all_max_d_graphs(graph, debug=False, clique_mode=None, threads=1):
# d_graphs = FixedDGIndex(size=1)
#
# nds = list(graph.nodes())
# for idx, node in enumerate(nds):
# # print(idx+1, '/', len(nds))
# #if "MI" not in str(node): continue # for debugging; only look at deconvolved nodes
# #print(f"\r{idx+1}/{len(graph.nodes())}")
# neighbors = list(graph.neighbors(node))
# neighbors_graph = nx.Graph(graph.subgraph(neighbors))
#
# node_d_graphs = set()
#
# mode_str = " "
# if clique_mode is None:
# # Find all the cliques (equivalent to compute all the candidate half d-graph)
# cliques = []
# for clique in nx.find_cliques(neighbors_graph):
# if len(clique) > 3:
# cliques.append(clique)
# mode_str += "(max-cliques)"
# elif clique_mode == "louvain":
# louvain = community.best_partition(neighbors_graph) # louvain
# # high resolution seems to work better
# communities = [[c for c,i in louvain.items() if i == clique_id] for clique_id in set(louvain.values())]
# mode_str += "(louvain)"
# cliques = []
# for comm in communities:
# # further decompose! into necessarily 2 communities
# community_as_graph = nx.Graph(graph.subgraph(comm))
# if len(community_as_graph.nodes()) <= 2:
# cliques += [community_as_graph.nodes()]
# else:
# cliques += map(list,nx.community.asyn_fluidc(community_as_graph,2))
#
# elif clique_mode == "testing":
# # k-clique communities
# #from networkx.algorithms.community import k_clique_communities
# #cliques = k_clique_communities(neighbors_graph, 3) # related to the d-graph d parameter
# from cdlib import algorithms
# cliques_dict = algorithms.node_perception(neighbors_graph, threshold=0.75, overlap_threshold=0.75) #typical output: Sizes of found cliques (testing): Counter({6: 4, 5: 3, 4: 2, 2: 1})
# #cliques_dict = algorithms.gdmp2(neighbors_graph, min_threshold=0.9) #typical output: sizes of found cliques (testing): Counter({3: 2, 5: 1})
# #cliques_dict = algorithms.angel(neighbors_graph, threshold=0.90) # very sensitive parameters: 0.84 and 0.88 don't work at all but 0.86 does sort of
# from collections import defaultdict
# cliques_dict2 = defaultdict(list)
# for (node, values) in cliques_dict.to_node_community_map().items():
# for value in values:
# cliques_dict2[value] += [node]
# cliques = list(cliques_dict2.values())
# mode_str += "(testing)"
#
# if debug: print("node", node, "has", len(cliques), "cliques in neighborhood (of size", len(neighbors), ")")
#
# cliques_debugging = True
# if cliques_debugging:
#
# from collections import Counter
# len_cliques = Counter(map(len,cliques))
#
# # Pair halves to create d-graphes
# for idx, clq1 in enumerate(cliques):
# for clq2_idx in range(idx+1, len(cliques)):
# clq2 = cliques[clq2_idx]
#
# # Check for d-graph candidates
# d_graph = Dgraph(node)
# d_graph.put_halves(clq1, clq2, neighbors_graph)
#
# factor = 0.5
# #if clique_mode == "testing": factor = 1 # still allows louvain's big communities
# #print("link div:",d_graph.get_link_divergence(),"opt:",d_graph.get_optimal_score(), "good d graph?",d_graph.get_link_divergence() <= d_graph.get_optimal_score() *factor)
# if d_graph.get_link_divergence() <= d_graph.get_optimal_score() * factor:
# node_d_graphs.add(d_graph)
#
# # Fill the index by node
# key = frozenset({node})
# for dg in node_d_graphs:
# d_graphs.add_value(key, dg)
#
# d_graphs.filter_by_entry()
# return d_graphs
from abc import abstractmethod
class AbstractDGIndex(dict):
    """Index of dgraphs keyed by frozensets of barcodes.

    Each key in the index is a set of barcodes and each value is the set of
    dgraphs containing these barcodes. Keys are normalised to frozensets on
    every access (see __keytransform__).
    """

    def __init__(self, size=3):
        """
        :param size: Usage vary regarding the subclass used
        """
        super(AbstractDGIndex, self).__init__()
        self.size = size

    @abstractmethod
    def _verify_key(self, key_set, dg_size=0):
        """Validate a key before insertion; subclasses raise on a bad size."""
        pass

    def add_value(self, key_set, dgraph):
        """Add the couple key (set of barcodes) and value (dgraph) at the
        right place in the dict.

        :param key_set: iterable of barcodes identifying the entry
        :param dgraph: d-graph to register under that entry
        """
        # Test the key size (policy delegated to the subclass).
        # Fix: removed a stray dead `pass` statement that preceded this call.
        self._verify_key(key_set)
        key_set = frozenset(key_set)
        # Add the key if not already present
        if key_set not in self:
            self[key_set] = set()
        # Associate the value with the key
        self[key_set].add(dgraph)

    @abstractmethod
    def add_dgraph(self, dg):
        """Generate all the sets needed for keys in the dgraph and push the
        d-graph as value.
        For fixed size of the dgraph all the sets of this size will be
        generated as key. Otherwise, all the sets of size at least
        len(dg) - size will be generated.
        """
        pass

    def _filter_entry(self, key_set):
        """For one entry in the index, filter out dominated dgraphs.

        :param key_set: The entry to filter
        :return: the set of dgraphs removed from that entry
        :raises KeyError: if the entry is absent from the index
        """
        # Verify presence in the index
        if key_set not in self:
            raise KeyError("The set is not present in the index")
        # n² filtering: drop every dgraph dominated by some other one.
        dgs = self[key_set]
        to_remove = set()
        for dg1 in dgs:
            for dg2 in dgs:
                if dg1.is_dominated(dg2):
                    to_remove.add(dg1)
                    break
        self[key_set] = dgs - to_remove
        return to_remove

    def filter_by_entry(self):
        """Apply the domination filter to every entry of the index."""
        # Fix: dropped the unused `removed = ...` local.
        for key_set in self:
            self._filter_entry(key_set)
        # TODO: remove globaly ?

    def __contains__(self, key):
        key = frozenset(key)
        return super(AbstractDGIndex, self).__contains__(key)

    def __getitem__(self, key):
        return super(AbstractDGIndex, self).__getitem__(self.__keytransform__(key))

    def __keytransform__(self, key):
        # Normalise any iterable key to a hashable frozenset.
        return frozenset(key)
import networkx as nx
from deconvolution.dgraph.AbstractDGFactory import AbstractDGFactory
from deconvolution.dgraph.d_graph import Dgraph
class CliqueDGFactory(AbstractDGFactory):
    """Generates d-graph candidates by pairing maximal cliques found in a
    node's neighbourhood subgraph."""

    def __init__(self, graph, min_size_clique=4, dg_max_divergence_factor=0.5):
        """
        :param graph: barcode graph to analyse
        :param min_size_clique: smallest clique size kept as a d-graph half
        :param dg_max_divergence_factor: acceptance threshold multiplier on
            the optimal score
        """
        super(CliqueDGFactory, self).__init__(graph)
        self.min_size = min_size_clique
        self.dg_max_divergence_factor = dg_max_divergence_factor

    def generate_by_node(self, node, subgraph):
        """Return the set of accepted d-graphs centred on `node`."""
        candidates = set()

        # Keep only the maximal cliques that are large enough to be halves.
        large_cliques = [clique for clique in nx.find_cliques(subgraph)
                         if len(clique) >= self.min_size]

        # TODO: Index cliques to pair them faster
        # Pair every two distinct cliques into a d-graph candidate.
        for first_idx, half1 in enumerate(large_cliques):
            for half2 in large_cliques[first_idx + 1:]:
                candidate = Dgraph(node)
                candidate.put_halves(half1, half2, subgraph)
                # Accept only candidates whose divergence stays under the
                # configured fraction of the optimal score.
                if candidate.get_link_divergence() <= candidate.get_optimal_score() * self.dg_max_divergence_factor:
                    candidates.add(candidate)

        return candidates
from itertools import combinations
from deconvolution.dgraph.AbstractDGIndex import AbstractDGIndex
class FixedDGIndex(AbstractDGIndex):
    """Index whose keys are barcode subsets of one fixed cardinality."""

    def __init__(self, size=3):
        super(FixedDGIndex, self).__init__(size=size)

    def _verify_key(self, key_set, dg_size=0):
        """Reject any key whose cardinality differs from the fixed size."""
        if len(key_set) != self.size:
            raise ValueError("Wrong set size in the dgraph")

    def add_dgraph(self, dg):
        """Register `dg` under every `size`-element combination of its barcodes."""
        for combo in combinations(dg.node_set, self.size):
            self.add_value(frozenset(combo), dg)
import networkx as nx
from deconvolution.dgraph.AbstractDGFactory import AbstractDGFactory
from deconvolution.dgraph.d_graph import Dgraph
import community
class LouvainDGFactory(AbstractDGFactory):
    """Generates d-graph candidates by pairing Louvain communities (further
    split in two) found in a node's neighbourhood subgraph."""

    def __init__(self, graph, dg_max_divergence_factor=0.5):
        """
        :param graph: barcode graph to analyse
        :param dg_max_divergence_factor: acceptance threshold multiplier on
            the optimal score
        """
        super(LouvainDGFactory, self).__init__(graph)
        self.dg_max_divergence_factor = dg_max_divergence_factor

    def generate_by_node(self, node, subgraph):
        """Return the set of accepted d-graphs centred on `node`."""
        accepted = set()

        partition = community.best_partition(subgraph)  # louvain
        # high resolution seems to work better
        groups = [[member for member, group_id in partition.items() if group_id == current]
                  for current in set(partition.values())]

        halves = []
        for group in groups:
            # further decompose! into necessarily 2 communities
            group_graph = nx.Graph(subgraph.subgraph(group))
            if len(group_graph.nodes()) <= 2:
                # NOTE: appended unconverted (a NodeView), matching the
                # original behaviour.
                halves.append(group_graph.nodes())
            else:
                halves.extend(map(list, nx.community.asyn_fluidc(group_graph, 2)))

        # Pair every two distinct halves into a d-graph candidate.
        for first_idx, half1 in enumerate(halves):
            for second_idx in range(first_idx + 1, len(halves)):
                half2 = halves[second_idx]
                candidate = Dgraph(node)
                candidate.put_halves(half1, half2, subgraph)
                # Accept only candidates whose divergence stays under the
                # configured fraction of the optimal score.
                if candidate.get_link_divergence() <= candidate.get_optimal_score() * self.dg_max_divergence_factor:
                    accepted.add(candidate)

        return accepted
from itertools import combinations
from deconvolution.dgraph.AbstractDGIndex import AbstractDGIndex
class VariableDGIndex(AbstractDGIndex):