Commit 498df34a authored by Yoann Dufresne's avatar Yoann Dufresne
Browse files

clique graph tests

parent 2122230c
OUTDIR="snake_exec" if "outdir" not in config else config["outdir"]
N=[10000] if "n" not in config else config["n"] # Number of molecule to simulate
D=[5] if "d" not in config else config["d"] # Average coverage of each molecule
M=[2] if "m" not in config else config["m"] # Average number of molecule per barcode
M_DEV=[0] if "m_dev" not in config else config["m_dev"] # Std deviation for merging number
rule all:
input:
expand(f"{OUTDIR}/simu_bar_n{{n}}_d{{d}}_m{{m}}-dev{{md}}.gexf", n=N, m=M, d=D, md=M_DEV)
rule generate_barcodes:
input:
"{path}/simu_mol_{params}.gexf"
output:
"{path}/simu_bar_{params}_m{m}-dev{md}.gexf"
shell:
"python3 deconvolution/main/generate_fake_barcode_graph.py --merging_depth {wildcards.m} --deviation {wildcards.md} --input_graph {input} --output {output}"
rule generate_molecules:
output:
"{path}/simu_mol_n{n}_d{d}.gexf"
shell:
"python3 deconvolution/main/generate_fake_molecule_graph.py --num_molecule {wildcards.n} --avg_depth {wildcards.d} --output {output}"
......@@ -62,7 +62,7 @@ class D2Graph(nx.Graph):
return self.subgraph(list(self.nodes()))
def construct_from_barcodes(self, index_size=3, verbose=True, clique_mode=None, threads=1):
def construct_from_barcodes(self, neighbor_threshold=0.25, verbose=True, clique_mode=None, threads=1):
# Compute all the d-graphs
if verbose:
print("Computing the unit d-graphs..")
......@@ -83,28 +83,10 @@ class D2Graph(nx.Graph):
d_graph.idx = idx
self.node_by_idx[idx] = d_graph
# # Index all the d-graphs
# if verbose:
# print("Compute the dmer dgraph")
# print("\tIndexing")
# # self.index = FixedDGIndex(size=index_size)
# self.index = VariableDGIndex(size=index_size)
# for idx, dg in enumerate(self.all_d_graphs):
# if verbose:
# print(f"\r\t{idx+1}/{len(self.all_d_graphs)}", end='')
# self.index.add_dgraph(dg)
# # self.var_index.add_dgraph(dg)
# if verbose:
# print()
# print("\tFilter index")
# self.index.filter_by_entry()
# # self.index = self.create_index_from_tuples(index_size, verbose=verbose)
# # self.filter_dominated_in_index(tuple_size=index_size, verbose=verbose)
# # Compute node distances for pair of dgraphs that share at least 1 dmer.
if verbose:
print("Compute the graph")
# Create the graph
self.bidict_nodes = self.create_graph_from_node_neighborhoods()
self.bidict_nodes = self.create_graph_from_node_neighborhoods(neighbor_threshold)
def get_covering_variables(self, udg):
......
......@@ -7,10 +7,9 @@ from deconvolution.dgraph import AbstractDGIndex
class CliqueDGFactory(AbstractDGFactory):
def __init__(self, graph, min_size_clique=4, dg_max_divergence_factor=0.5, debug=False, debug_path="."):
def __init__(self, graph, min_size_clique=4, debug=False, debug_path="."):
super(CliqueDGFactory, self).__init__(graph, debug=debug)
self.min_size = min_size_clique
self.dg_max_divergence_factor = dg_max_divergence_factor
if debug:
self.debug_path = debug_path
......
......@@ -64,8 +64,7 @@ def main():
d2g = d2.D2Graph(G, debug=debug, debug_path=debug_path)
dprint("D2 graph object created")
dprint("constructing d2 graph from barcode graph")
index_size = 4 #if clique_mode is None else 3
d2g.construct_from_barcodes(index_size=index_size, clique_mode=clique_mode, threads=args.threads)
d2g.construct_from_barcodes(neighbor_threshold=d2_threshold, clique_mode=clique_mode, threads=args.threads)
dprint("[debug] d2 graph constructed")
# d2g.save(f"{args.output_prefix}.tsv")
......
import networkx as nx
class CliqueGraph(nx.Graph):
def __init__(self, g):
nx.Graph.__init__(self)
self.listed_cliques = set()
self.clique_per_node = {}
self._nodes_from_graph(g)
def _nodes_from_graph(self, g):
self.listed_cliques = set()
self.clique_per_node = {n:set() for n in g.nodes()}
# Generate the graph per node
for n in g.nodes():
# Extract the local neighborhood induced subgraph
neighbors = g.neighbors(n)
subgraph = g.subgraph(neighbors)
# Max clique search
cliques = nx.find_cliques(subgraph)
for clique in cliques:
clique = frozenset(clique)
# Do nothing if the clique was already detected
if clique in self.listed_cliques:
continue
# Add it to the nodes, and to the index
self.add_node(clique)
self.listed_cliques.add(clique)
for node in clique:
self.clique_per_node[node].add(clique)
import argparse
import networkx as nx
from collections import Counter
from experiments.CliqueGraph import CliqueGraph
from deconvolution.dgraph.CliqueDGFactory import CliqueDGFactory
def parse_arguments():
parser = argparse.ArgumentParser(description="Tests on graph barcode")
parser.add_argument('barcode_graph', help='The barcode graph file. Must be a gexf formatted file.')
args = parser.parse_args()
return args
def is_continuous(barcode_multiset):
# Save barcode provenance
originated_barcode = {}
for barcode in barcode_multiset:
for molecule in barcode:
originated_barcode[molecule] = barcode
# Create a continuous array of molecule id
ordered_molecules = list(originated_barcode.keys())
ordered_molecules.sort()
# Look for contiguous molecules
nb_barcode = sum(barcode_multiset.values())
for idx in range(nb_barcode, len(ordered_molecules)+1):
first_idx = idx - nb_barcode
last_idx = idx - 1
# If the gap is to big continue
if ordered_molecules[first_idx] + nb_barcode - 1 != ordered_molecules[last_idx]:
continue
# Verify the number of different barcode involved
involved_barcodes = [originated_barcode[ordered_molecules[idx]] for idx in range(first_idx, idx)]
involved_barcodes_multiset = Counter(involved_barcodes)
if sum(involved_barcodes_multiset.values()) != nb_barcode:
continue
# Verify the barcode content equality
involved_match = True
for key, val in involved_barcodes_multiset.items():
if barcode_multiset[key] != val:
involved_match = False
if involved_match:
return True
return False
def iterable_to_barcode_multiset(clique):
barcodes = []
for b in clique:
ids = (int(x) for x in b.split(":")[1].split("_"))
barcodes.append(ids)
return Counter(barcodes)
def analyse_clique_graph(barcode_graph):
clique_graph = CliqueGraph(barcode_graph)
continuous = 0
for clique in clique_graph.nodes():
# Transform the clique in barcode set
bms = iterable_to_barcode_multiset(clique)
# Check the contiguity
if is_continuous(bms):
continuous += 1
return continuous, len(clique_graph.nodes())
def analyse_d_graphs(barcode_graph):
# Generate udgs
factory = CliqueDGFactory(barcode_graph, 1)
udg_per_node = factory.generate_all_dgraphs()
# Remove duplicate udgs
udgs = {}
for udg_node_lst in udg_per_node.values():
for udg in udg_node_lst:
barcodes = (x for x in udg.to_sorted_list())
bms = iterable_to_barcode_multiset(barcodes)
udgs[barcodes] = bms
continuous = 0
for udg in udgs.values():
if is_continuous(udg):
continuous += 1
return continuous, len(udgs)
def main():
args = parse_arguments()
g = nx.read_gexf(args.barcode_graph)
continuous, total = analyse_clique_graph(g)
print("cliques")
print(continuous, "/", total)
continuous, total = analyse_d_graphs(g)
print("udgs")
print(continuous, "/", total)
if __name__ == "__main__":
main()
......@@ -4,7 +4,7 @@ from distutils.core import setup
setup(
name='10X-deconvolve',
version='0.1dev',
packages=['deconvolution.d2graph', 'deconvolution.dgraph', 'deconvolution.main'],
packages=['deconvolution.d2graph', 'deconvolution.dgraph', 'deconvolution.main', 'experiments'],
license='AGPL V3',
long_description=open('README.md').read(),
)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment