Commit 288449cc authored by Yoann Dufresne

deconvolution updates

parent 47b4117e
import glob
WORKDIR = "snake_tests"
DATA = "data"
DATA = "real_data"
SAMPLE = "ema_spades.sam"
#DATA = "real_data"
#SAMPLE = "ema_spades_minovl15k_cont2k.sam"
rule all:
input:
f"{WORKDIR}/simulated_barcode_2000_d5-r0.5_f3-0.5.tar.gz"
f"{WORKDIR}/{SAMPLE}.tar.gz"
rule compress_data:
@@ -39,15 +43,28 @@ rule d2_generation:
output:
d2_file=f"{WORKDIR}/{{file}}_d2_raw.gexf"
shell:
f"python3 --version;"
f"python3 deconvolution/to_d2_graph.py {{input.barcode_graph}} -o {WORKDIR}/{{wildcards.file}}_d2_raw"
def define_graph_input(wildcards):
    # Find the data file matching the wildcard and point the rule at its copy in WORKDIR
    matches = glob.glob(f"{DATA}/{wildcards.file}.*")
    if not matches:
        raise FileNotFoundError(f"No input matching {DATA}/{wildcards.file}.*")
    return WORKDIR + matches[0][matches[0].rfind('/'):]
rule convert_file:
input: define_graph_input
output:
f"{WORKDIR}/{{file}}.gexf"
shell:
"python3 deconvolution/gexf_converter.py {input}"
rule setup_workdir:
input:
barcode_graph=f"{DATA}/{{file}}.gexf"
barcode_graph=f"{DATA}/{{file}}"
output:
f"{WORKDIR}/{{file}}.gexf"
f"{WORKDIR}/{{file}}"
shell:
f"if [ ! -d {WORKDIR} ]; then mkdir {WORKDIR}; fi;"
f"cp {{input.barcode_graph}} {WORKDIR}"
@@ -64,10 +64,10 @@ class D2Graph(nx.Graph):
if verbose:
counts = sum(len(x) for x in self.d_graphs_per_node.values())
print(f"\t {counts} computed d-graphs")
self.d_graphs_per_node = filter_dominated(self.d_graphs_per_node)
if verbose:
counts = sum(len(x) for x in self.d_graphs_per_node.values())
print(f"\t {counts} remaining d-graphs after first filter")
#self.d_graphs_per_node = filter_dominated(self.d_graphs_per_node)
#if verbose:
# counts = sum(len(x) for x in self.d_graphs_per_node.values())
# print(f"\t {counts} remaining d-graphs after first filter")
for d_graphs in self.d_graphs_per_node.values():
self.all_d_graphs.extend(d_graphs)
@@ -87,7 +87,7 @@ class D2Graph(nx.Graph):
print("Compute the graph")
# Create the graph
self.bidict_nodes = self.create_graph()
self.compute_distances()
#self.compute_distances()
def get_covering_variables(self, udg):
@@ -162,23 +162,11 @@ class D2Graph(nx.Graph):
return index
def compute_distances(self):
for x, y, data in self.edges(data=True):
dg1 = self.node_by_idx[x]
dg2 = self.node_by_idx[y]
if dg1 == dg2:
continue
# Distance computing and adding in the dist dicts
d = dg1.distance_to(dg2)
data["distance"] = d
def create_graph(self):
nodes = {}
for dmer in self.index:
dgs = list(self.index[dmer])
dgs = list(set(self.index[dmer]))
for d_idx, dg in enumerate(dgs):
# Create a node name
if dg not in nodes:
@@ -194,13 +182,29 @@ class D2Graph(nx.Graph):
# Add the edges
for prev_node in dgs[:d_idx]:
if prev_node != dg:
self.add_edge(nodes[dg], nodes[prev_node])
for prev_idx in range(d_idx):
prev_dg = dgs[prev_idx]
                    # Only add an edge when the two d-graphs are close to each other
d = dg.distance_to(prev_dg)
if d <= 5:
self.add_edge(nodes[dg], nodes[prev_dg], distance=d)
return bidict(nodes)
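As a side note, a toy sketch (not from the repository) of the thresholded edge construction used in create_graph: candidate d-graphs sharing a d-mer are linked only when their pairwise distance is at most 5. Node names and contents are invented; only the threshold mirrors the code.

import networkx as nx

toy = nx.Graph()
candidates = {"dg_0": {"A", "B", "C"}, "dg_1": {"A", "B", "D"}, "dg_2": {"X", "Y", "Z"}}
names = list(candidates)
for i, u in enumerate(names):
    for v in names[:i]:
        d = len(candidates[u] ^ candidates[v])  # size of the symmetric difference
        if d <= 5:
            toy.add_edge(u, v, distance=d)

print(list(toy.edges(data=True)))  # only the dg_0-dg_1 edge (distance 2) survives the threshold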
def compute_distances(self):
for x, y, data in self.edges(data=True):
dg1 = self.node_by_idx[x]
dg2 = self.node_by_idx[y]
if dg1 == dg2:
continue
            # Compute the distance and store it as an edge attribute
d = dg1.distance_to(dg2)
data["distance"] = d
def filter_dominated_in_index(self, tuple_size=3, verbose=True):
to_remove = set()
......
@@ -104,8 +104,7 @@ class Dgraph(object):
def to_sorted_list(self):
if self.sorted_list is None:
self.sorted_list = self.halves[0]+ [self.center] + self.halves[1]
self.sorted_list.sort()
self.sorted_list = sorted(self.nodes)
return self.sorted_list
@@ -131,21 +130,22 @@ class Dgraph(object):
def distance_to(self, dgraph):
other_nodes = dgraph.nodes
nodes_1 = self.to_sorted_list()
        nodes_2 = dgraph.to_sorted_list()
dist = 0
idx1, idx2 = 0, 0
while idx1 != len(self.nodes) and idx2 != len(other_nodes):
if self.nodes[idx1] == other_nodes[idx2]:
while idx1 != len(nodes_1) and idx2 != len(nodes_2):
if nodes_1[idx1] == nodes_2[idx2]:
idx1 += 1
idx2 += 1
else:
dist += 1
if self.nodes[idx1] < other_nodes[idx2]:
if nodes_1[idx1] < nodes_2[idx2]:
idx1 += 1
else:
idx2 += 1
dist += len(self.nodes) - idx1 + len(other_nodes) - idx2
dist += len(nodes_1) - idx1 + len(nodes_2) - idx2
return dist
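For clarity, a self-contained illustration (toy data, not from the repository) of the merge-style distance above: it counts the symmetric difference between two sorted node lists.

def sorted_list_distance(nodes_1, nodes_2):
    # Walk both sorted lists in parallel and count the unmatched elements
    dist = 0
    idx1 = idx2 = 0
    while idx1 != len(nodes_1) and idx2 != len(nodes_2):
        if nodes_1[idx1] == nodes_2[idx2]:
            idx1 += 1
            idx2 += 1
        else:
            dist += 1
            if nodes_1[idx1] < nodes_2[idx2]:
                idx1 += 1
            else:
                idx2 += 1
    # Whatever remains in either list is unmatched
    dist += len(nodes_1) - idx1 + len(nodes_2) - idx2
    return dist

assert sorted_list_distance(["A", "B", "C", "D"], ["B", "C", "E"]) == 3  # A, D and E are unmatched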
@@ -234,13 +234,15 @@ class Dgraph(object):
def compute_all_max_d_graphs(graph, debug=False):
d_graphs = {}
for node in list(graph.nodes()):
for idx, node in enumerate(list(graph.nodes())):
print(f"\r{idx+1}/{len(graph.nodes())}")
neighbors = list(graph.neighbors(node))
neighbors_graph = nx.Graph(graph.subgraph(neighbors))
node_d_graphs = set()
        # Find all the cliques (equivalent to computing all the candidate d-graph halves)
cliques = list(nx.find_cliques(neighbors_graph))
print("cliques", len(cliques))
        # Pair halves to create d-graphs
for idx, clq1 in enumerate(cliques):
@@ -254,8 +256,9 @@ def compute_all_max_d_graphs(graph, debug=False):
if d_graph.get_link_divergence() <= d_graph.get_optimal_score() / 2:
node_d_graphs.add(d_graph)
d_graphs[node] = sorted(node_d_graphs)
print("d-graphs", len(node_d_graphs))
d_graphs[node] = brutal_list_domination_filter(sorted(node_d_graphs))
print("filtered", len(d_graphs[node]))
return d_graphs
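A small standalone sketch (toy graph, invented for illustration) of the neighbourhood-clique step above: the maximal cliques of a node's neighbourhood subgraph are the candidate halves that later get paired into d-graphs.

import networkx as nx

# Node "c" has four neighbours whose induced subgraph contains three maximal cliques
g = nx.Graph([("c", "a"), ("c", "b"), ("c", "d"), ("c", "e"),
              ("a", "b"), ("d", "e"), ("b", "d")])
neighbourhood = nx.Graph(g.subgraph(g.neighbors("c")))
print(list(nx.find_cliques(neighbourhood)))  # three candidate halves around "c": {a,b}, {b,d}, {d,e} (order may vary)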
......
def save(d_graphs, filename):
    # One header line "<count> <key>" per node, then one line per sorted d-graph node list
    with open(filename, "w") as fp:
        for key, array in d_graphs.items():
            fp.write(str(len(array)) + " " + key + "\n")
            fp.write('\n'.join([str(sorted(x.nodes)) for x in array]) + "\n")
    print(filename, "saved")
def load(filename):
d = {}
with open(filename) as fp:
value = None
nb_vals = 0
for line in fp:
line = line.strip()
            if value is None:
first_space = line.find(' ')
nb_vals = int(line[:first_space])
value = line[first_space+1:]
d[value] = []
else:
d[value].append(line.strip())
nb_vals -= 1
if nb_vals == 0:
value = None
print(filename, "loaded")
return d
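A possible usage sketch for save() and load() above. DummyDgraph is a hypothetical stand-in that only provides the .nodes attribute save() reads, and the file name is illustrative; note that load() returns the node lists back as plain strings.

from collections import namedtuple

DummyDgraph = namedtuple("DummyDgraph", "nodes")

d_graphs = {"node_1": [DummyDgraph(nodes=["A", "B", "C"]), DummyDgraph(nodes=["B", "D"])]}
save(d_graphs, "d_graphs.txt")    # writes the header "2 node_1", then one sorted node list per line
reloaded = load("d_graphs.txt")   # {'node_1': ["['A', 'B', 'C']", "['B', 'D']"]}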
def compare(d1, d2):
remaining = set(d2.keys())
for key in d1:
        if key not in d2:
            print(key, "not present in d2")
            continue
        remaining.remove(key)
l1 = sorted([str(sorted(x.nodes)) for x in d1[key]])
l2 = sorted([str(x) for x in d2[key]])
if l1 != l2:
print(f"{key}: disimilar lists:")
s1 = set(l1)
s2 = set(l2)
print(s1 - s2)
print(s2 - s1)
for key in remaining:
print(key, "not present in d1")
import networkx as nx
import argparse
def parse_arguments():
    parser = argparse.ArgumentParser(description='Convert graph files from various formats into gexf')
parser.add_argument('input_graph', help='Input file with graph to convert')
args = parser.parse_args()
return args
loading_functions = [
nx.read_gpickle,
nx.read_gml,
nx.read_graphml,
nx.read_leda,
nx.read_yaml,
nx.read_pajek,
nx.read_shp
]
def load_graph(filename):
G = None
for loader in loading_functions:
try:
G = loader(filename)
break
except Exception:
continue
if G is None:
raise nx.exception.NetworkXError("No corresponding format")
return G
if __name__ == "__main__":
args = parse_arguments()
print("Loading graph")
G = load_graph(args.input_graph)
print("Writing graph")
    last_dot = args.input_graph.rfind('.')
    output = args.input_graph[:last_dot] + ".gexf"
nx.write_gexf(G, output)
#!/usr/bin/env python3
import networkx as nx
import argparse
def parse_arguments():
    parser = argparse.ArgumentParser(description='Analyse graph clique complexity')
parser.add_argument('graph', help='The input graph to analyse')
args = parser.parse_args()
return args
def analyse_neighbors_cliques(graph):
clique_sizes = []
for node in graph.nodes():
neighbors = list(graph.neighbors(node))
subgraph = graph.subgraph(neighbors)
cliques = list(nx.find_cliques(subgraph))
print(node, " \t", len(neighbors), " \t", len(cliques))
clique_sizes.append(len(cliques))
print(max(clique_sizes))
def main():
args = parse_arguments()
graph = nx.read_gexf(args.graph)
analyse_neighbors_cliques(graph)
if __name__ == "__main__":
main()
@@ -28,7 +28,7 @@ def main():
G = nx.read_gexf(filename)
    # The index size must be adapted to the data; 8 is good for d=5
d2g = d2.D2Graph(G)
d2g.construct_from_barcodes(index_size=8)
d2g.construct_from_barcodes(index_size=4)
d2g.save(f"{args.output_prefix}.tsv")
nx.write_gexf(d2g, f"{args.output_prefix}.gexf")
......
@@ -13,7 +13,7 @@ from tests.d_graph_data import complete_graph
class TestD2Graph(unittest.TestCase):
def test_construction(self):
d2 = D2Graph(complete_graph)
d2.construct_from_barcodes(index_size=6, verbose=False)
d2.construct_from_barcodes(index_size=4, verbose=False)
# Evaluate the number of candidate unit d_graphs generated
for node, candidates in d2.d_graphs_per_node.items():
......