Commit cb354bcb authored by Yoann Dufresne's avatar Yoann Dufresne

merged main

parents c97d2571 34adf732
......@@ -2,6 +2,8 @@
A compilation of scripts and pipelines to count and extract scaffolds of barcodes from linked reads datasets.
**WARNING**: This code is a proof of concept, not a usable software for production. If the code is too slow for your tests or you are encontering some bugs (maybe it's a feature ? :p) don't hesitate to contact us via the issues or with a direct mail to me (yoann [dot] dufresne [at] pasteur [dot] fr).
## Nomenclature warnings
During the process of writing a scientific article, some of the datastructure names have been modified.
In this repository the majority of the names are old names.
......@@ -27,6 +29,20 @@ Install the package from the root directory.
For the majority of the scripts, argparse is used.
To know how to use it please use the -h command line option.
### Test the complete pipeline on simulated data
For a complete test, we made a bunch of snakemake files.
If you are looking for a complete pipeline from synthetic data generation, you should look into the "Snakefile_d2_eval" file.
You can play with the N (number of molecules in the interval graph), M (average number of merge to perform in a barcode), DEV (standard deviation on merge) variables to see impact on performances.
These values are arrays. You can enter multiple values and all the combinations will be done.
A summary is output in the tsv file "{WORKDIR}/eval_compare_maxclique.tsv.
Warning: the pipeline can be very slow for huge number of parameters.
Command to run the pipeline:
```
snakemake -s Snakefile_d2_eval
```
### Data simulation
* generate_fake_molecule_graph.py: Create a linear molecule graph, where the molecules are linked to the d molecules on their left and d molecules on their right.
......
......@@ -35,14 +35,13 @@ rule compress_data:
rule d2_simplification:
input:
barcode_graph="{barcode_path}.gexf",
d2_raw="{barcode_path}_d2_raw_{method}.gexf"
output:
simplified_d2="{barcode_path}_d2_simplified_{method}.gexf"
wildcard_constraints:
method="[A-Za-z0-9]+"
shell:
"python3 deconvolution/main/d2_reduction.py -o {output.simplified_d2} {input.barcode_graph} {input.d2_raw}"
"python3 deconvolution/main/d2_reduction.py -o {output.simplified_d2} {input.d2_raw}"
rule d2_generation:
......@@ -54,7 +53,7 @@ rule d2_generation:
wildcard_constraints:
method="[A-Za-z0-9]+"
run:
shell(f"python3 deconvolution/main/to_d2_graph.py {{input.barcode_graph}} --{{wildcards.method}} -t {{threads}} -o {WORKDIR}/{{wildcards.file}}_d2_raw_{{wildcards.method}}")
shell(f"python3 deconvolution/main/to_d2_graph.py {{input.barcode_graph}} --{{wildcards.method}} -t {{threads}} -o {WORKDIR}/{{wildcards.file}}_d2_raw_{{wildcards.method}}.gexf")
rule setup_workdir:
......
......@@ -3,10 +3,10 @@ include: "Snakefile_d2"
include: "Snakefile_d2_path"
WORKDIR = "snake_experiments" if "workdir" not in config else config["workdir"]
N = [5000, 10000]
N = [5000]
D = [10]
M = [2, 3]
DEV = [0, 1]
M = [2]
DEV = [0]
rule generate_compare:
input:
......
......@@ -5,14 +5,13 @@ threshold = 0.9
rule d2_path_generation:
input:
barcode="{path}.gexf",
d2="{path}_d2_{type}_{method}.gexf"
output:
"{path}_d2_{type}_{method}_path.gexf"
run:
best = 0
for _ in range(number_try):
shell("python3 deconvolution/main/d2_to_path.py {input.barcode} {input.d2} > {output}_tmp.out")
shell("python3 deconvolution/main/d2_to_path.py {input.d2} > {output}_tmp.out")
score = 0
with open(f"{output}_tmp.out") as out:
score_line = out.readlines()[-2].strip()
......
......@@ -12,26 +12,19 @@ from deconvolution.dgraph.LouvainDGFactory import LouvainDGFactory
class D2Graph(nx.Graph):
"""D2Graph (read it (d-graph)²)"""
def __init__(self, barcode_graph, debug=False, debug_path='.'):
def __init__(self, debug=False, debug_path='.'):
super(D2Graph, self).__init__()
self.all_d_graphs = []
self.d_graphs_per_node = {}
self.node_by_idx = {}
self.barcode_graph = barcode_graph
self.barcode_graph = None
self.index = None
self.variables = set()
self.variables_per_lcp = {}
# Number the edges from original graph
self.barcode_edge_idxs = {}
self.nb_uniq_edge = 0
for idx, edge in enumerate(self.barcode_graph.edges()):
if edge == (edge[1], edge[0]):
self.nb_uniq_edge += 1
if edge in self.barcode_edge_idxs:
print("Edge already present")
self.barcode_edge_idxs[edge] = idx
self.barcode_edge_idxs[(edge[1], edge[0])] = idx
self.debug = debug
self.debug_path = debug_path
......@@ -58,13 +51,27 @@ class D2Graph(nx.Graph):
# Node by idx
G.node_by_idx = self.node_by_idx
G.variables = self.variables.copy()
return G
def clone(self):
return self.subgraph(list(self.nodes()))
def construct_from_barcodes(self, neighbor_threshold=0.25, min_size_clique=4, verbose=False, clique_mode=None, threads=1):
def construct_from_barcodes(self, barcode_graph, neighbor_threshold=0.25, min_size_clique=4, verbose=False, clique_mode=None, threads=1):
self.barcode_graph = barcode_graph
# Number the edges from original graph
for idx, edge in enumerate(self.barcode_graph.edges()):
if edge == (edge[1], edge[0]):
self.nb_uniq_edge += 1
if edge in self.barcode_edge_idxs:
print("Edge already present")
self.barcode_edge_idxs[edge] = idx
self.barcode_edge_idxs[(edge[1], edge[0])] = idx
self.barcode_graph[edge[0]][edge[1]]["uniq_idx"] = idx
# Compute all the d-graphs
if verbose:
print("Computing the unit d-graphs..")
......@@ -102,27 +109,13 @@ class D2Graph(nx.Graph):
lcp = self.get_lcp(obj)
if lcp not in self.variables_per_lcp:
variables = []
for e in lcp.edges:
variables.append(self.barcode_edge_idxs[e])
for e_idx in lcp.edges:
variables.append(e_idx)
self.variables_per_lcp[lcp] = variables
return self.variables_per_lcp[lcp]
def save(self, filename):
with open(filename, "w") as fp:
# First line nb_nodes nb_cov_var
fp.write(f"{len(self.all_d_graphs)} {int((len(self.barcode_edge_idxs)+self.nb_uniq_edge)/2)}\n")
# Write the edges per d_graph
for d_graph in self.all_d_graphs:
fp.write(f"{d_graph.idx} {' '.join([str(self.barcode_edge_idxs[e]) for e in d_graph.edges])}\n")
# Write the distances
for x, y, data in self.edges(data=True):
fp.write(f"{x} {y} {data['distance']}\n")
def load(self, filename):
# Reload the graph
G = nx.read_gexf(filename)
......@@ -137,7 +130,8 @@ class D2Graph(nx.Graph):
self.bidict_nodes = {}
for idx, node in enumerate(self.nodes(data=True)):
node, data = node
dg = Dgraph.load(data["udg"], self.barcode_graph)
dg = Dgraph.load(data["udg"], data["score"], data["barcode_edges"])
self.variables.update(dg.edges)
self.bidict_nodes[node] = dg
self.all_d_graphs.append(dg)
if dg.idx == -1:
......@@ -181,11 +175,11 @@ class D2Graph(nx.Graph):
nodes[dg] = dg.idx
self.add_node(nodes[dg])
# Add covering barcode edges
barcode_edges = " ".join([str(self.barcode_edge_idxs[x]) for x in dg.edges])
barcode_edges = " ".join([str(x) for x in dg.edges])
self.nodes[nodes[dg]]["barcode_edges"] = barcode_edges
self.nodes[nodes[dg]]["score"] = f"{dg.score}/{dg.get_optimal_score()}"
self.nodes[nodes[dg]]["udg"] = str(dg)
self.nodes[nodes[dg]]["central_node_barcode"] = str(dg).split(']')[0]+']'
self.nodes[nodes[dg]]["central_node_barcode"] = str(dg.center)
# Create the edges from neighbor edges
for dg in self.all_d_graphs:
......
......@@ -10,7 +10,7 @@ class Path(list):
def __init__(self, d2g):
super(Path, self).__init__()
self.d2g = d2g
self.covering_variables = {x: 0 for x in self.d2g.barcode_edge_idxs.values()}
self.covering_variables = {x: 0 for x in self.d2g.variables}
self.covering_value = 0
# a succession of Counter (multiset)
self.barcode_order = []
......@@ -22,8 +22,7 @@ class Path(list):
lcp = self.d2g.get_lcp(obj)
# Update the covering variables
for barcode_edge in lcp.edges:
edge_idx = self.d2g.barcode_edge_idxs[barcode_edge]
for edge_idx in lcp.edges:
if self.covering_variables[edge_idx] == 0:
self.covering_value += 1
self.covering_variables[edge_idx] += 1
......@@ -75,10 +74,9 @@ class Path(list):
self._pop_barcodes(lcp)
# Update the covering variables
for barcode_edge in lcp.edges:
edge_idx = self.d2g.barcode_edge_idxs[barcode_edge]
self.covering_variables[edge_idx] -= 1
if self.covering_variables[edge_idx] == 0:
for e_idx in lcp.edges:
self.covering_variables[e_idx] -= 1
if self.covering_variables[e_idx] == 0:
self.covering_value -= 1
return lcp
......@@ -137,7 +135,7 @@ class Path(list):
d2p.nodes[udg.idx]["center"] = udg.center
d2p.nodes[udg.idx]["udg"] = str(udg)
d2p.nodes[udg.idx]["score"] = f"{udg.score}/{udg.get_optimal_score()}"
barcode_edges = " ".join([str(self.d2g.barcode_edge_idxs[x]) for x in udg.edges])
barcode_edges = " ".join([str(x) for x in udg.edges])
d2p.nodes[udg.idx]["barcode_edges"] = barcode_edges
# add the edges
......
......@@ -11,60 +11,63 @@ class Dgraph(object):
self.center = center
self.score = 0
self.halves = [[], []]
self.connexity = [[], []]
self.connexity = [{}, {}]
self.nodes = [self.center]
self.node_set = set(self.nodes)
self.edges = []
self.ordered_list = None
self.sorted_list = None
self.edges = set()
self.marked = False
@staticmethod
def load(lcp_txt, score, variables):
""" Static method to load a dgraph from a text
@param text the saved d-graph
@param barcode_graph Barcode graph from which the d-graph is extracted
@return a new d-graph object corresponding to the test
:param lcp_txt: the saved d-graph
:return: a new d-graph object corresponding to the test
"""
def load(text, barcode_graph):
# basic split
text = text.replace(']', '')
_, head, h1, h2 = text.split('[')
lcp_txt = lcp_txt.replace(']', '')
_, head, h1, h2 = lcp_txt.split('[')
# Head parsing
center = head.replace(' ', '')
if center not in barcode_graph:
center = int(center)
dg = Dgraph(center)
# Reload halves
h1 = [x for x in h1.split(',')]
h1 = [int(x) if x not in barcode_graph else x for x in h1]
h2 = [x for x in h2.split(',')]
h2 = [int(x) if x not in barcode_graph else x for x in h2]
dg.put_halves(h1, h2, barcode_graph)
dg.halves[0] = h1
dg.node_set.update(h1)
dg.nodes.extend(h1)
dg.halves[1] = h2
dg.node_set.update(h2)
dg.nodes.extend(h2)
# Score parsing
dg.score = int(score.split('/')[0])
# covering variable loading
dg.edges = {int(x) for x in variables.split(' ')}
return dg
def put_halves(self, h1, h2, graph):
""" Compute the d-graph quality (score) according to the connectivity between the two halves.
@param h1 First half of the d-graph
@param h2 Second half of the d-graph
@param graph The connectivity graph
:param h1: First half of the d-graph
:param h2: Second half of the d-graph
:param graph: The barcode graph
"""
def put_halves(self, h1, h2, graph):
self.score = 0
# First half
self.halves[0] = h1
for node in h1:
self.node_set.add(node)
self.nodes.append(node)
self.node_set.update(h1)
self.nodes.extend(h1)
self.connexity[0] = {key: 0 for key in h1}
# second half
self.halves[1] = h2
for node in h2:
self.node_set.add(node)
self.nodes.append(node)
# self.nodes = sorted([self.center] + self.halves[0] + self.halves[1])
self.connexity[0] = {key: 0 for key in self.halves[0]}
self.connexity[1] = {key: 0 for key in self.halves[1]}
self.edges = []
self.node_set.update(h2)
self.nodes.extend(h2)
self.connexity[1] = {key: 0 for key in h2}
# Compute link arities
for node1 in self.halves[0]:
......@@ -76,19 +79,10 @@ class Dgraph(object):
self.connexity[0][node1] += 1
self.connexity[1][node2] += 1
# Compute links from the center to the other nodes
for idx, node1 in enumerate(self.nodes):
for node2 in self.nodes[idx+1:]:
if graph.has_edge(node1, node2):
if node1 < node2:
self.edges.append((node1, node2))
else:
self.edges.append((node2, node1))
# Sort the halves by descending connexity
connex = self.connexity
self.halves[0].sort(reverse=True, key=lambda v: connex[0][v])
self.halves[1].sort(reverse=True, key=lambda v: connex[1][v])
# Compute covering variable
subgraph = graph.subgraph(self.nodes)
for _, _, id in subgraph.edges.data("uniq_idx"):
self.edges.add(id)
def get_link_divergence(self):
return int(abs(self.score - self.get_optimal_score()))
......@@ -104,7 +98,7 @@ class Dgraph(object):
def to_ordered_lists(self):
if self.ordered_list is None:
hands = [[],[]]
hands = [[], []]
for idx in range(2):
prev_connectivity = -1
for node in self.halves[idx]:
......
#!/usr/bin/env python3
import sys
with open(sys.argv[1]) as file:
header = file.readline()
nb_nodes, nb_variables = [int(x) for x in header.split()]
present_variables = [False]*nb_variables
for _ in range(nb_nodes):
line = file.readline()
variables = [int(x) for x in line.split()[1:]]
for var in variables:
present_variables[var] = True
for idx, val in enumerate(present_variables):
if not val:
print(f"{idx} not present")
\ No newline at end of file
#!/usr/bin/env python3
import networkx as nx
import argparse
def parse_arguments():
parser = argparse.ArgumentParser(description='Analysis graph cliques complexity')
parser.add_argument('graph', help='The input graph to analyse')
args = parser.parse_args()
return args
def analyse_neighbors_cliques(graph):
clique_sizes = []
for node in graph.nodes():
neighbors = list(graph.neighbors(node))
subgraph = graph.subgraph(neighbors)
cliques = list(nx.find_cliques(subgraph))
print(node, " \t", len(neighbors), " \t", len(cliques))
clique_sizes.append(len(cliques))
print(max(clique_sizes))
def main():
args = parse_arguments()
graph = nx.read_gexf(args.graph)
analyse_neighbors_cliques(graph)
if __name__ == "__main__":
main()
......@@ -10,12 +10,15 @@ from deconvolution.d2graph import d2_algorithms as d2a
def parse_arguments():
parser = argparse.ArgumentParser(description='Reduce the number of edges of a d2 graph. The reduction is performed on edges that are redundant.')
parser.add_argument('barcode_graph', help='The barcode graph file. Must be a gefx formated file.')
parser.add_argument('d2_graph', help='d2 graph to reduce. Must be a gefx formated file.')
parser.add_argument('--output_d2_name', '-o', default="transitively_reduced_d2.gexf", help="Output file prefix.")
parser.add_argument('--component_min_size', '-c', type=int, default=10, help="Minimum size of a component to keep it in the d2-graph, after the edge simplifications.")
parser.add_argument('lcp_graph', help='lcp graph to reduce. Must be a gefx formated file.')
parser.add_argument('--output_lcpg_name', '-o', default="", help="Output file prefix.")
parser.add_argument('--component_min_size', '-c', type=int, default=10, help="Minimum size of a component to keep it in the lcp graph, after the edge simplifications.")
args = parser.parse_args()
if args.output_lcpg_name == "":
args.output_lcpg_name = args.lcp_graph[:args.lcp_graph.rfind('.')] + "_reduced.gexf"
return args
......@@ -23,32 +26,30 @@ def main():
# Parsing the arguments and validate them
args = parse_arguments()
barcode_file = args.barcode_graph
d2_file = args.d2_graph
if (not barcode_file.endswith('.gexf')) or (not d2_file.endswith(".gexf")):
lcpg_name = args.lcp_graph
if not lcpg_name.endswith(".gexf"):
print("Inputs file must be gexf formatted", file=sys.stderr)
exit(1)
# Loading
print("--- d2 graph loading ---")
G = nx.read_gexf(barcode_file)
d2g = d2.D2Graph(G)
d2g.load(d2_file)
print("--- lcp graph loading ---")
lcpg = d2.D2Graph()
lcpg.load(lcpg_name)
# Algorithms for reduction
print("--- 3-reduction processing ---")
d2a.transitive_reduction(d2g)
d2a.transitive_reduction(lcpg)
# Take the principal component
print("--- Extract and write the graph ---")
components = [c for c in nx.connected_components(d2g) if len(c) > args.component_min_size]
components = [c for c in nx.connected_components(lcpg) if len(c) > args.component_min_size]
nodes = set()
for comp in components:
nodes.update(comp)
subgraph = d2g.subgraph(nodes)
subgraph = lcpg.subgraph(nodes)
# Write the simplified graph
nx.write_gexf(subgraph, args.output_d2_name)
nx.write_gexf(subgraph, args.output_lcpg_name)
if __name__ == "__main__":
......
#!/usr/bin/env python3
import networkx as nx
import argparse
import sys
import random
from deconvolution.d2graph import d2_graph as d2
from barcodes.partialorder import greedy_partial_order, bb_partial_order
def parse_arguments():
parser = argparse.ArgumentParser(description='Greedy construction of a path through the d2 graph.')
parser.add_argument('barcode_graph', help='The barcode graph file. Must be a gefx formatted file.')
parser.add_argument('d2_graph', help='d2 graph to reduce. Must be a gexf formatted file.')
parser.add_argument('--out_prefix', '-o', default="", help="Output file prefix.")
args = parser.parse_args()
if args.out_prefix == "":
args.out_prefix = '.'.join(args.d2_graph.split('.')[:-1])
return args
def main():
# Parsing the arguments and validate them
args = parse_arguments()
barcode_file = args.barcode_graph
d2_file = args.d2_graph
if (not barcode_file.endswith('.gexf')) or (not d2_file.endswith(".gexf")):
print("Inputs file must be gexf formatted", file=sys.stderr)
exit(1)
# Loading
G = nx.read_gexf(barcode_file)
d2g = d2.D2Graph(G)
d2g.load(d2_file)
# Take the principal component
largest_component_nodes = max(nx.connected_components(d2g), key=len)
largest_component = d2g.subgraph(largest_component_nodes)
all_nodes = list(largest_component.nodes())
rnd_node = all_nodes[random.randint(0, len(all_nodes)-1)]
# po = greedy_partial_order(largest_component, rnd_node)
for po in bb_partial_order(largest_component, rnd_node):
print("barcodes", len(po), "sets", po.len_sets, "udgs", po.len_udgs, "score", po.score)
if __name__ == "__main__":
main()
......@@ -9,15 +9,14 @@ from deconvolution.d2graph import d2_graph as d2, path_optimization as po
def parse_arguments():
parser = argparse.ArgumentParser(description='Greedy construction of a path through the d2 graph.')
parser.add_argument('barcode_graph', help='The barcode graph file. Must be a gefx formatted file.')
parser.add_argument('d2_graph', help='d2 graph to reduce. Must be a gexf formatted file.')
parser.add_argument('--out_prefix', '-o', default="", help="Output file prefix.")
parser.add_argument('lcp_graph', help='d2 graph to reduce. Must be a gexf formatted file.')
parser.add_argument('--outfile', '-o', default="", help="Output file prefix.")
parser.add_argument('--verbose', '-v', action="store_true", help="Verbose")
args = parser.parse_args()
if args.out_prefix == "":
args.out_prefix = '.'.join(args.d2_graph.split('.')[:-1])
if args.outfile == "":
args.outfile = '.'.join(args.lcp_graph.split('.')[:-1]) + "_path.gexf"
return args
......@@ -26,20 +25,18 @@ def main():
# Parsing the arguments and validate them
args = parse_arguments()
barcode_file = args.barcode_graph
d2_file = args.d2_graph
if (not barcode_file.endswith('.gexf')) or (not d2_file.endswith(".gexf")):
lcpg_name = args.lcp_graph
if not lcpg_name.endswith(".gexf"):
print("Inputs file must be gexf formatted", file=sys.stderr)
exit(1)
# Loading
G = nx.read_gexf(barcode_file)
d2g = d2.D2Graph(G)
d2g.load(d2_file)
lcpg = d2.D2Graph()
lcpg.load(lcpg_name)
# Take the principal component
largest_component_nodes = max(nx.connected_components(d2g), key=len)
largest_component = d2g.subgraph(largest_component_nodes)
largest_component_nodes = max(nx.connected_components(lcpg), key=len)
largest_component = lcpg.subgraph(largest_component_nodes)
# Start optimization
optimizer = po.Optimizer(largest_component)
......@@ -50,7 +47,7 @@ def main():
print()
print()
print(f"covering score: {path.covering_score()}")
path.save_gexf(f"{args.out_prefix}_path.gexf")
path.save_gexf(args.outfile)
print("Solution saved")
......
......@@ -25,24 +25,6 @@ def parse_args():
return args
def load_graph(filename):
if filename.endswith('.graphml'):
return nx.read_graphml(filename)
elif filename.endswith('.gexf'):
return nx.read_gexf(filename)
else:
print("Wrong file format. Require graphml or gefx format", file=sys.stderr)
exit()
def save_graph(g, filename):
if filename.endswith('.graphml'):
return nx.write_graphml(g,filename)
elif filename.endswith('.gexf'):
return nx.write_gexf(g,filename)
else:
print("Wrong file format. Require graphml or gefx format", file=sys.stderr)
exit()
def transform_bg(graph):
idx = 0
node_names = {}
......@@ -610,14 +592,14 @@ def verify_graph_edges(d2_component):
def main():
args = parse_args()
graph = load_graph(args.filename)
graph = nx.read_gexf(args.filename)
if args.type == "path":
if args.barcode_graph is None:
print("--barcode_graph is required for path analysis", file=sys.stderr)