...
 
Commits (6)
...@@ -2,6 +2,8 @@ ...@@ -2,6 +2,8 @@
A compilation of scripts and pipelines to count and extract scaffolds of barcodes from linked reads datasets. A compilation of scripts and pipelines to count and extract scaffolds of barcodes from linked reads datasets.
**WARNING**: This code is a proof of concept, not a usable software for production. If the code is too slow for your tests or you are encontering some bugs (maybe it's a feature ? :p) don't hesitate to contact us via the issues or with a direct mail to me (yoann [dot] dufresne [at] pasteur [dot] fr).
## Nomenclature warnings ## Nomenclature warnings
During the process of writing a scientific article, some of the datastructure names have been modified. During the process of writing a scientific article, some of the datastructure names have been modified.
In this repository the majority of the names are old names. In this repository the majority of the names are old names.
...@@ -27,6 +29,20 @@ Install the package from the root directory. ...@@ -27,6 +29,20 @@ Install the package from the root directory.
For the majority of the scripts, argparse is used. For the majority of the scripts, argparse is used.
To know how to use it please use the -h command line option. To know how to use it please use the -h command line option.
### Test the complete pipeline on simulated data
For a complete test, we made a bunch of snakemake files.
If you are looking for a complete pipeline from synthetic data generation, you should look into the "Snakefile_d2_eval" file.
You can play with the N (number of molecules in the interval graph), M (average number of merge to perform in a barcode), DEV (standard deviation on merge) variables to see impact on performances.
These values are arrays. You can enter multiple values and all the combinations will be done.
A summary is output in the tsv file "{WORKDIR}/eval_compare_maxclique.tsv.
Warning: the pipeline can be very slow for huge number of parameters.
Command to run the pipeline:
```
snakemake -s Snakefile_d2_eval
```
### Data simulation ### Data simulation
* generate_fake_molecule_graph.py: Create a linear molecule graph, where the molecules are linked to the d molecules on their left and d molecules on their right. * generate_fake_molecule_graph.py: Create a linear molecule graph, where the molecules are linked to the d molecules on their left and d molecules on their right.
......
...@@ -35,14 +35,13 @@ rule compress_data: ...@@ -35,14 +35,13 @@ rule compress_data:
rule d2_simplification: rule d2_simplification:
input: input:
barcode_graph="{barcode_path}.gexf",
d2_raw="{barcode_path}_d2_raw_{method}.gexf" d2_raw="{barcode_path}_d2_raw_{method}.gexf"
output: output:
simplified_d2="{barcode_path}_d2_simplified_{method}.gexf" simplified_d2="{barcode_path}_d2_simplified_{method}.gexf"
wildcard_constraints: wildcard_constraints:
method="[A-Za-z0-9]+" method="[A-Za-z0-9]+"
shell: shell:
"python3 deconvolution/main/d2_reduction.py -o {output.simplified_d2} {input.barcode_graph} {input.d2_raw}" "python3 deconvolution/main/d2_reduction.py -o {output.simplified_d2} {input.d2_raw}"
rule d2_generation: rule d2_generation:
...@@ -54,7 +53,7 @@ rule d2_generation: ...@@ -54,7 +53,7 @@ rule d2_generation:
wildcard_constraints: wildcard_constraints:
method="[A-Za-z0-9]+" method="[A-Za-z0-9]+"
run: run:
shell(f"python3 deconvolution/main/to_d2_graph.py {{input.barcode_graph}} --{{wildcards.method}} -t {{threads}} -o {WORKDIR}/{{wildcards.file}}_d2_raw_{{wildcards.method}}") shell(f"python3 deconvolution/main/to_d2_graph.py {{input.barcode_graph}} --{{wildcards.method}} -t {{threads}} -o {WORKDIR}/{{wildcards.file}}_d2_raw_{{wildcards.method}}.gexf")
rule setup_workdir: rule setup_workdir:
......
...@@ -3,10 +3,10 @@ include: "Snakefile_d2" ...@@ -3,10 +3,10 @@ include: "Snakefile_d2"
include: "Snakefile_d2_path" include: "Snakefile_d2_path"
WORKDIR = "snake_experiments" if "workdir" not in config else config["workdir"] WORKDIR = "snake_experiments" if "workdir" not in config else config["workdir"]
N = [5000, 10000] N = [5000]
D = [10] D = [10]
M = [2, 3] M = [2]
DEV = [0, 1] DEV = [0]
rule generate_compare: rule generate_compare:
input: input:
......
...@@ -5,14 +5,13 @@ threshold = 0.9 ...@@ -5,14 +5,13 @@ threshold = 0.9
rule d2_path_generation: rule d2_path_generation:
input: input:
barcode="{path}.gexf",
d2="{path}_d2_{type}_{method}.gexf" d2="{path}_d2_{type}_{method}.gexf"
output: output:
"{path}_d2_{type}_{method}_path.gexf" "{path}_d2_{type}_{method}_path.gexf"
run: run:
best = 0 best = 0
for _ in range(number_try): for _ in range(number_try):
shell("python3 deconvolution/main/d2_to_path.py {input.barcode} {input.d2} > {output}_tmp.out") shell("python3 deconvolution/main/d2_to_path.py {input.d2} > {output}_tmp.out")
score = 0 score = 0
with open(f"{output}_tmp.out") as out: with open(f"{output}_tmp.out") as out:
score_line = out.readlines()[-2].strip() score_line = out.readlines()[-2].strip()
......
...@@ -12,26 +12,19 @@ from deconvolution.dgraph.LouvainDGFactory import LouvainDGFactory ...@@ -12,26 +12,19 @@ from deconvolution.dgraph.LouvainDGFactory import LouvainDGFactory
class D2Graph(nx.Graph): class D2Graph(nx.Graph):
"""D2Graph (read it (d-graph)²)""" """D2Graph (read it (d-graph)²)"""
def __init__(self, barcode_graph, debug=False, debug_path='.'): def __init__(self, debug=False, debug_path='.'):
super(D2Graph, self).__init__() super(D2Graph, self).__init__()
self.all_d_graphs = [] self.all_d_graphs = []
self.d_graphs_per_node = {} self.d_graphs_per_node = {}
self.node_by_idx = {} self.node_by_idx = {}
self.barcode_graph = barcode_graph self.barcode_graph = None
self.index = None self.index = None
self.variables = set()
self.variables_per_lcp = {} self.variables_per_lcp = {}
# Number the edges from original graph
self.barcode_edge_idxs = {} self.barcode_edge_idxs = {}
self.nb_uniq_edge = 0 self.nb_uniq_edge = 0
for idx, edge in enumerate(self.barcode_graph.edges()):
if edge == (edge[1], edge[0]):
self.nb_uniq_edge += 1
if edge in self.barcode_edge_idxs:
print("Edge already present")
self.barcode_edge_idxs[edge] = idx
self.barcode_edge_idxs[(edge[1], edge[0])] = idx
self.debug = debug self.debug = debug
self.debug_path = debug_path self.debug_path = debug_path
...@@ -58,13 +51,27 @@ class D2Graph(nx.Graph): ...@@ -58,13 +51,27 @@ class D2Graph(nx.Graph):
# Node by idx # Node by idx
G.node_by_idx = self.node_by_idx G.node_by_idx = self.node_by_idx
G.variables = self.variables.copy()
return G return G
def clone(self): def clone(self):
return self.subgraph(list(self.nodes())) return self.subgraph(list(self.nodes()))
def construct_from_barcodes(self, neighbor_threshold=0.25, min_size_clique=4, verbose=False, clique_mode=None, threads=1): def construct_from_barcodes(self, barcode_graph, neighbor_threshold=0.25, min_size_clique=4, verbose=False, clique_mode=None, threads=1):
self.barcode_graph = barcode_graph
# Number the edges from original graph
for idx, edge in enumerate(self.barcode_graph.edges()):
if edge == (edge[1], edge[0]):
self.nb_uniq_edge += 1
if edge in self.barcode_edge_idxs:
print("Edge already present")
self.barcode_edge_idxs[edge] = idx
self.barcode_edge_idxs[(edge[1], edge[0])] = idx
self.barcode_graph[edge[0]][edge[1]]["uniq_idx"] = idx
# Compute all the d-graphs # Compute all the d-graphs
if verbose: if verbose:
print("Computing the unit d-graphs..") print("Computing the unit d-graphs..")
...@@ -102,27 +109,13 @@ class D2Graph(nx.Graph): ...@@ -102,27 +109,13 @@ class D2Graph(nx.Graph):
lcp = self.get_lcp(obj) lcp = self.get_lcp(obj)
if lcp not in self.variables_per_lcp: if lcp not in self.variables_per_lcp:
variables = [] variables = []
for e in lcp.edges: for e_idx in lcp.edges:
variables.append(self.barcode_edge_idxs[e]) variables.append(e_idx)
self.variables_per_lcp[lcp] = variables self.variables_per_lcp[lcp] = variables
return self.variables_per_lcp[lcp] return self.variables_per_lcp[lcp]
def save(self, filename):
with open(filename, "w") as fp:
# First line nb_nodes nb_cov_var
fp.write(f"{len(self.all_d_graphs)} {int((len(self.barcode_edge_idxs)+self.nb_uniq_edge)/2)}\n")
# Write the edges per d_graph
for d_graph in self.all_d_graphs:
fp.write(f"{d_graph.idx} {' '.join([str(self.barcode_edge_idxs[e]) for e in d_graph.edges])}\n")
# Write the distances
for x, y, data in self.edges(data=True):
fp.write(f"{x} {y} {data['distance']}\n")
def load(self, filename): def load(self, filename):
# Reload the graph # Reload the graph
G = nx.read_gexf(filename) G = nx.read_gexf(filename)
...@@ -137,7 +130,8 @@ class D2Graph(nx.Graph): ...@@ -137,7 +130,8 @@ class D2Graph(nx.Graph):
self.bidict_nodes = {} self.bidict_nodes = {}
for idx, node in enumerate(self.nodes(data=True)): for idx, node in enumerate(self.nodes(data=True)):
node, data = node node, data = node
dg = Dgraph.load(data["udg"], self.barcode_graph) dg = Dgraph.load(data["udg"], data["score"], data["barcode_edges"])
self.variables.update(dg.edges)
self.bidict_nodes[node] = dg self.bidict_nodes[node] = dg
self.all_d_graphs.append(dg) self.all_d_graphs.append(dg)
if dg.idx == -1: if dg.idx == -1:
...@@ -181,11 +175,11 @@ class D2Graph(nx.Graph): ...@@ -181,11 +175,11 @@ class D2Graph(nx.Graph):
nodes[dg] = dg.idx nodes[dg] = dg.idx
self.add_node(nodes[dg]) self.add_node(nodes[dg])
# Add covering barcode edges # Add covering barcode edges
barcode_edges = " ".join([str(self.barcode_edge_idxs[x]) for x in dg.edges]) barcode_edges = " ".join([str(x) for x in dg.edges])
self.nodes[nodes[dg]]["barcode_edges"] = barcode_edges self.nodes[nodes[dg]]["barcode_edges"] = barcode_edges
self.nodes[nodes[dg]]["score"] = f"{dg.score}/{dg.get_optimal_score()}" self.nodes[nodes[dg]]["score"] = f"{dg.score}/{dg.get_optimal_score()}"
self.nodes[nodes[dg]]["udg"] = str(dg) self.nodes[nodes[dg]]["udg"] = str(dg)
self.nodes[nodes[dg]]["central_node_barcode"] = str(dg).split(']')[0]+']' self.nodes[nodes[dg]]["central_node_barcode"] = str(dg.center)
# Create the edges from neighbor edges # Create the edges from neighbor edges
for dg in self.all_d_graphs: for dg in self.all_d_graphs:
......
...@@ -10,7 +10,7 @@ class Path(list): ...@@ -10,7 +10,7 @@ class Path(list):
def __init__(self, d2g): def __init__(self, d2g):
super(Path, self).__init__() super(Path, self).__init__()
self.d2g = d2g self.d2g = d2g
self.covering_variables = {x: 0 for x in self.d2g.barcode_edge_idxs.values()} self.covering_variables = {x: 0 for x in self.d2g.variables}
self.covering_value = 0 self.covering_value = 0
# a succession of Counter (multiset) # a succession of Counter (multiset)
self.barcode_order = [] self.barcode_order = []
...@@ -22,8 +22,7 @@ class Path(list): ...@@ -22,8 +22,7 @@ class Path(list):
lcp = self.d2g.get_lcp(obj) lcp = self.d2g.get_lcp(obj)
# Update the covering variables # Update the covering variables
for barcode_edge in lcp.edges: for edge_idx in lcp.edges:
edge_idx = self.d2g.barcode_edge_idxs[barcode_edge]
if self.covering_variables[edge_idx] == 0: if self.covering_variables[edge_idx] == 0:
self.covering_value += 1 self.covering_value += 1
self.covering_variables[edge_idx] += 1 self.covering_variables[edge_idx] += 1
...@@ -75,10 +74,9 @@ class Path(list): ...@@ -75,10 +74,9 @@ class Path(list):
self._pop_barcodes(lcp) self._pop_barcodes(lcp)
# Update the covering variables # Update the covering variables
for barcode_edge in lcp.edges: for e_idx in lcp.edges:
edge_idx = self.d2g.barcode_edge_idxs[barcode_edge] self.covering_variables[e_idx] -= 1
self.covering_variables[edge_idx] -= 1 if self.covering_variables[e_idx] == 0:
if self.covering_variables[edge_idx] == 0:
self.covering_value -= 1 self.covering_value -= 1
return lcp return lcp
...@@ -137,7 +135,7 @@ class Path(list): ...@@ -137,7 +135,7 @@ class Path(list):
d2p.nodes[udg.idx]["center"] = udg.center d2p.nodes[udg.idx]["center"] = udg.center
d2p.nodes[udg.idx]["udg"] = str(udg) d2p.nodes[udg.idx]["udg"] = str(udg)
d2p.nodes[udg.idx]["score"] = f"{udg.score}/{udg.get_optimal_score()}" d2p.nodes[udg.idx]["score"] = f"{udg.score}/{udg.get_optimal_score()}"
barcode_edges = " ".join([str(self.d2g.barcode_edge_idxs[x]) for x in udg.edges]) barcode_edges = " ".join([str(x) for x in udg.edges])
d2p.nodes[udg.idx]["barcode_edges"] = barcode_edges d2p.nodes[udg.idx]["barcode_edges"] = barcode_edges
# add the edges # add the edges
......
...@@ -11,60 +11,63 @@ class Dgraph(object): ...@@ -11,60 +11,63 @@ class Dgraph(object):
self.center = center self.center = center
self.score = 0 self.score = 0
self.halves = [[], []] self.halves = [[], []]
self.connexity = [[], []] self.connexity = [{}, {}]
self.nodes = [self.center] self.nodes = [self.center]
self.node_set = set(self.nodes) self.node_set = set(self.nodes)
self.edges = []
self.ordered_list = None self.ordered_list = None
self.sorted_list = None self.sorted_list = None
self.edges = set()
self.marked = False @staticmethod
def load(lcp_txt, score, variables):
""" Static method to load a dgraph from a text """ Static method to load a dgraph from a text
@param text the saved d-graph :param lcp_txt: the saved d-graph
@param barcode_graph Barcode graph from which the d-graph is extracted :return: a new d-graph object corresponding to the test
@return a new d-graph object corresponding to the test """
"""
def load(text, barcode_graph):
# basic split # basic split
text = text.replace(']', '') lcp_txt = lcp_txt.replace(']', '')
_, head, h1, h2 = text.split('[') _, head, h1, h2 = lcp_txt.split('[')
# Head parsing # Head parsing
center = head.replace(' ', '') center = head.replace(' ', '')
if center not in barcode_graph:
center = int(center)
dg = Dgraph(center) dg = Dgraph(center)
# Reload halves # Reload halves
h1 = [x for x in h1.split(',')] h1 = [x for x in h1.split(',')]
h1 = [int(x) if x not in barcode_graph else x for x in h1]
h2 = [x for x in h2.split(',')] h2 = [x for x in h2.split(',')]
h2 = [int(x) if x not in barcode_graph else x for x in h2]
dg.put_halves(h1, h2, barcode_graph) dg.halves[0] = h1
dg.node_set.update(h1)
dg.nodes.extend(h1)
dg.halves[1] = h2
dg.node_set.update(h2)
dg.nodes.extend(h2)
# Score parsing
dg.score = int(score.split('/')[0])
# covering variable loading
dg.edges = {int(x) for x in variables.split(' ')}
return dg return dg
""" Compute the d-graph quality (score) according to the connectivity between the two halves.
@param h1 First half of the d-graph
@param h2 Second half of the d-graph
@param graph The connectivity graph
"""
def put_halves(self, h1, h2, graph): def put_halves(self, h1, h2, graph):
""" Compute the d-graph quality (score) according to the connectivity between the two halves.
:param h1: First half of the d-graph
:param h2: Second half of the d-graph
:param graph: The barcode graph
"""
self.score = 0 self.score = 0
# First half
self.halves[0] = h1 self.halves[0] = h1
for node in h1: self.node_set.update(h1)
self.node_set.add(node) self.nodes.extend(h1)
self.nodes.append(node) self.connexity[0] = {key: 0 for key in h1}
# second half
self.halves[1] = h2 self.halves[1] = h2
for node in h2: self.node_set.update(h2)
self.node_set.add(node) self.nodes.extend(h2)
self.nodes.append(node) self.connexity[1] = {key: 0 for key in h2}
# self.nodes = sorted([self.center] + self.halves[0] + self.halves[1])
self.connexity[0] = {key: 0 for key in self.halves[0]}
self.connexity[1] = {key: 0 for key in self.halves[1]}
self.edges = []
# Compute link arities # Compute link arities
for node1 in self.halves[0]: for node1 in self.halves[0]:
...@@ -76,19 +79,10 @@ class Dgraph(object): ...@@ -76,19 +79,10 @@ class Dgraph(object):
self.connexity[0][node1] += 1 self.connexity[0][node1] += 1
self.connexity[1][node2] += 1 self.connexity[1][node2] += 1
# Compute links from the center to the other nodes # Compute covering variable
for idx, node1 in enumerate(self.nodes): subgraph = graph.subgraph(self.nodes)
for node2 in self.nodes[idx+1:]: for _, _, id in subgraph.edges.data("uniq_idx"):
if graph.has_edge(node1, node2): self.edges.add(id)
if node1 < node2:
self.edges.append((node1, node2))
else:
self.edges.append((node2, node1))
# Sort the halves by descending connexity
connex = self.connexity
self.halves[0].sort(reverse=True, key=lambda v: connex[0][v])
self.halves[1].sort(reverse=True, key=lambda v: connex[1][v])
def get_link_divergence(self): def get_link_divergence(self):
return int(abs(self.score - self.get_optimal_score())) return int(abs(self.score - self.get_optimal_score()))
...@@ -104,7 +98,7 @@ class Dgraph(object): ...@@ -104,7 +98,7 @@ class Dgraph(object):
def to_ordered_lists(self): def to_ordered_lists(self):
if self.ordered_list is None: if self.ordered_list is None:
hands = [[],[]] hands = [[], []]
for idx in range(2): for idx in range(2):
prev_connectivity = -1 prev_connectivity = -1
for node in self.halves[idx]: for node in self.halves[idx]:
......
#!/usr/bin/env python3
import sys
with open(sys.argv[1]) as file:
header = file.readline()
nb_nodes, nb_variables = [int(x) for x in header.split()]
present_variables = [False]*nb_variables
for _ in range(nb_nodes):
line = file.readline()
variables = [int(x) for x in line.split()[1:]]
for var in variables:
present_variables[var] = True
for idx, val in enumerate(present_variables):
if not val:
print(f"{idx} not present")
\ No newline at end of file
#!/usr/bin/env python3
import networkx as nx
import argparse
def parse_arguments():
parser = argparse.ArgumentParser(description='Analysis graph cliques complexity')
parser.add_argument('graph', help='The input graph to analyse')
args = parser.parse_args()
return args
def analyse_neighbors_cliques(graph):
clique_sizes = []
for node in graph.nodes():
neighbors = list(graph.neighbors(node))
subgraph = graph.subgraph(neighbors)
cliques = list(nx.find_cliques(subgraph))
print(node, " \t", len(neighbors), " \t", len(cliques))
clique_sizes.append(len(cliques))
print(max(clique_sizes))
def main():
args = parse_arguments()
graph = nx.read_gexf(args.graph)
analyse_neighbors_cliques(graph)
if __name__ == "__main__":
main()
...@@ -10,12 +10,15 @@ from deconvolution.d2graph import d2_algorithms as d2a ...@@ -10,12 +10,15 @@ from deconvolution.d2graph import d2_algorithms as d2a
def parse_arguments(): def parse_arguments():
parser = argparse.ArgumentParser(description='Reduce the number of edges of a d2 graph. The reduction is performed on edges that are redundant.') parser = argparse.ArgumentParser(description='Reduce the number of edges of a d2 graph. The reduction is performed on edges that are redundant.')
parser.add_argument('barcode_graph', help='The barcode graph file. Must be a gefx formated file.') parser.add_argument('lcp_graph', help='lcp graph to reduce. Must be a gefx formated file.')
parser.add_argument('d2_graph', help='d2 graph to reduce. Must be a gefx formated file.') parser.add_argument('--output_lcpg_name', '-o', default="", help="Output file prefix.")
parser.add_argument('--output_d2_name', '-o', default="transitively_reduced_d2.gexf", help="Output file prefix.") parser.add_argument('--component_min_size', '-c', type=int, default=10, help="Minimum size of a component to keep it in the lcp graph, after the edge simplifications.")
parser.add_argument('--component_min_size', '-c', type=int, default=10, help="Minimum size of a component to keep it in the d2-graph, after the edge simplifications.")
args = parser.parse_args() args = parser.parse_args()
if args.output_lcpg_name == "":
args.output_lcpg_name = args.lcp_graph[:args.lcp_graph.rfind('.')] + "_reduced.gexf"
return args return args
...@@ -23,32 +26,30 @@ def main(): ...@@ -23,32 +26,30 @@ def main():
# Parsing the arguments and validate them # Parsing the arguments and validate them
args = parse_arguments() args = parse_arguments()
barcode_file = args.barcode_graph lcpg_name = args.lcp_graph
d2_file = args.d2_graph if not lcpg_name.endswith(".gexf"):
if (not barcode_file.endswith('.gexf')) or (not d2_file.endswith(".gexf")):
print("Inputs file must be gexf formatted", file=sys.stderr) print("Inputs file must be gexf formatted", file=sys.stderr)
exit(1) exit(1)
# Loading # Loading
print("--- d2 graph loading ---") print("--- lcp graph loading ---")
G = nx.read_gexf(barcode_file) lcpg = d2.D2Graph()
d2g = d2.D2Graph(G) lcpg.load(lcpg_name)
d2g.load(d2_file)
# Algorithms for reduction # Algorithms for reduction
print("--- 3-reduction processing ---") print("--- 3-reduction processing ---")
d2a.transitive_reduction(d2g) d2a.transitive_reduction(lcpg)
# Take the principal component # Take the principal component
print("--- Extract and write the graph ---") print("--- Extract and write the graph ---")
components = [c for c in nx.connected_components(d2g) if len(c) > args.component_min_size] components = [c for c in nx.connected_components(lcpg) if len(c) > args.component_min_size]
nodes = set() nodes = set()
for comp in components: for comp in components:
nodes.update(comp) nodes.update(comp)
subgraph = d2g.subgraph(nodes) subgraph = lcpg.subgraph(nodes)
# Write the simplified graph # Write the simplified graph
nx.write_gexf(subgraph, args.output_d2_name) nx.write_gexf(subgraph, args.output_lcpg_name)
if __name__ == "__main__": if __name__ == "__main__":
......
#!/usr/bin/env python3
import networkx as nx
import argparse
import sys
import random
from deconvolution.d2graph import d2_graph as d2
from barcodes.partialorder import greedy_partial_order, bb_partial_order
def parse_arguments():
parser = argparse.ArgumentParser(description='Greedy construction of a path through the d2 graph.')
parser.add_argument('barcode_graph', help='The barcode graph file. Must be a gefx formatted file.')
parser.add_argument('d2_graph', help='d2 graph to reduce. Must be a gexf formatted file.')
parser.add_argument('--out_prefix', '-o', default="", help="Output file prefix.")
args = parser.parse_args()
if args.out_prefix == "":
args.out_prefix = '.'.join(args.d2_graph.split('.')[:-1])
return args
def main():
# Parsing the arguments and validate them
args = parse_arguments()
barcode_file = args.barcode_graph
d2_file = args.d2_graph
if (not barcode_file.endswith('.gexf')) or (not d2_file.endswith(".gexf")):
print("Inputs file must be gexf formatted", file=sys.stderr)
exit(1)
# Loading
G = nx.read_gexf(barcode_file)
d2g = d2.D2Graph(G)
d2g.load(d2_file)
# Take the principal component
largest_component_nodes = max(nx.connected_components(d2g), key=len)
largest_component = d2g.subgraph(largest_component_nodes)
all_nodes = list(largest_component.nodes())
rnd_node = all_nodes[random.randint(0, len(all_nodes)-1)]
# po = greedy_partial_order(largest_component, rnd_node)
for po in bb_partial_order(largest_component, rnd_node):
print("barcodes", len(po), "sets", po.len_sets, "udgs", po.len_udgs, "score", po.score)
if __name__ == "__main__":
main()
...@@ -9,15 +9,14 @@ from deconvolution.d2graph import d2_graph as d2, path_optimization as po ...@@ -9,15 +9,14 @@ from deconvolution.d2graph import d2_graph as d2, path_optimization as po
def parse_arguments(): def parse_arguments():
parser = argparse.ArgumentParser(description='Greedy construction of a path through the d2 graph.') parser = argparse.ArgumentParser(description='Greedy construction of a path through the d2 graph.')
parser.add_argument('barcode_graph', help='The barcode graph file. Must be a gefx formatted file.') parser.add_argument('lcp_graph', help='d2 graph to reduce. Must be a gexf formatted file.')
parser.add_argument('d2_graph', help='d2 graph to reduce. Must be a gexf formatted file.') parser.add_argument('--outfile', '-o', default="", help="Output file prefix.")
parser.add_argument('--out_prefix', '-o', default="", help="Output file prefix.")
parser.add_argument('--verbose', '-v', action="store_true", help="Verbose") parser.add_argument('--verbose', '-v', action="store_true", help="Verbose")
args = parser.parse_args() args = parser.parse_args()
if args.out_prefix == "": if args.outfile == "":
args.out_prefix = '.'.join(args.d2_graph.split('.')[:-1]) args.outfile = '.'.join(args.lcp_graph.split('.')[:-1]) + "_path.gexf"
return args return args
...@@ -26,20 +25,18 @@ def main(): ...@@ -26,20 +25,18 @@ def main():
# Parsing the arguments and validate them # Parsing the arguments and validate them
args = parse_arguments() args = parse_arguments()
barcode_file = args.barcode_graph lcpg_name = args.lcp_graph
d2_file = args.d2_graph if not lcpg_name.endswith(".gexf"):
if (not barcode_file.endswith('.gexf')) or (not d2_file.endswith(".gexf")):
print("Inputs file must be gexf formatted", file=sys.stderr) print("Inputs file must be gexf formatted", file=sys.stderr)
exit(1) exit(1)
# Loading # Loading
G = nx.read_gexf(barcode_file) lcpg = d2.D2Graph()
d2g = d2.D2Graph(G) lcpg.load(lcpg_name)
d2g.load(d2_file)
# Take the principal component # Take the principal component
largest_component_nodes = max(nx.connected_components(d2g), key=len) largest_component_nodes = max(nx.connected_components(lcpg), key=len)
largest_component = d2g.subgraph(largest_component_nodes) largest_component = lcpg.subgraph(largest_component_nodes)
# Start optimization # Start optimization
optimizer = po.Optimizer(largest_component) optimizer = po.Optimizer(largest_component)
...@@ -50,7 +47,7 @@ def main(): ...@@ -50,7 +47,7 @@ def main():
print() print()
print() print()
print(f"covering score: {path.covering_score()}") print(f"covering score: {path.covering_score()}")
path.save_gexf(f"{args.out_prefix}_path.gexf") path.save_gexf(args.outfile)
print("Solution saved") print("Solution saved")
......
...@@ -25,24 +25,6 @@ def parse_args(): ...@@ -25,24 +25,6 @@ def parse_args():
return args return args
def load_graph(filename):
if filename.endswith('.graphml'):
return nx.read_graphml(filename)
elif filename.endswith('.gexf'):
return nx.read_gexf(filename)
else:
print("Wrong file format. Require graphml or gefx format", file=sys.stderr)
exit()
def save_graph(g, filename):
if filename.endswith('.graphml'):
return nx.write_graphml(g,filename)
elif filename.endswith('.gexf'):
return nx.write_gexf(g,filename)
else:
print("Wrong file format. Require graphml or gefx format", file=sys.stderr)
exit()
def transform_bg(graph): def transform_bg(graph):
idx = 0 idx = 0
node_names = {} node_names = {}
...@@ -610,14 +592,14 @@ def verify_graph_edges(d2_component): ...@@ -610,14 +592,14 @@ def verify_graph_edges(d2_component):
def main(): def main():
args = parse_args() args = parse_args()
graph = load_graph(args.filename) graph = nx.read_gexf(args.filename)
if args.type == "path": if args.type == "path":
if args.barcode_graph is None: if args.barcode_graph is None:
print("--barcode_graph is required for path analysis", file=sys.stderr) print("--barcode_graph is required for path analysis", file=sys.stderr)
exit(0) exit(0)
barcode_graph = load_graph(args.barcode_graph) barcode_graph = nx.read_gexf(args.barcode_graph)
# if len(list(nx.connected_components(graph))) != 1: # if len(list(nx.connected_components(graph))) != 1:
# print([len(x) for x in list(nx.connected_components(graph))]) # print([len(x) for x in list(nx.connected_components(graph))])
# exit("when running evaluate.py --type path, the graph should have a single connected component (it's supposed to be a path, after all)") # exit("when running evaluate.py --type path, the graph should have a single connected component (it's supposed to be a path, after all)")
...@@ -660,7 +642,7 @@ def main(): ...@@ -660,7 +642,7 @@ def main():
extension = args.filename.split('.')[-1] extension = args.filename.split('.')[-1]
base_filename = '.'.join(args.filename.split('.')[:-1]) base_filename = '.'.join(args.filename.split('.')[:-1])
save_graph(component, base_filename+".verified."+extension) nx.write_gexf(component, base_filename+".verified."+extension)
if __name__ == "__main__": if __name__ == "__main__":
......
...@@ -8,18 +8,21 @@ from deconvolution.d2graph import d2_graph as d2 ...@@ -8,18 +8,21 @@ from deconvolution.d2graph import d2_graph as d2
def parse_arguments(): def parse_arguments():
parser = argparse.ArgumentParser(description='Transform a 10X barcode graph into a d2 graph. The program dig for the d-graphs and then merge them into a d2-graph.') parser = argparse.ArgumentParser(description='Transform a barcode graph into a lcp graph. The program dig for a set of lcps and then merge them into a lcp graph.')
parser.add_argument('barcode_graph', help='The barcode graph file. Must be a gefx formated file.') parser.add_argument('barcode_graph', help='The barcode graph file. Must be a gefx formated file.')
parser.add_argument('--output_prefix', '-o', default="d2_graph", help="Output file prefix.") parser.add_argument('--outfile', '-o', default="", help="Output file name for lcp graph.")
parser.add_argument('--threads', '-t', default=8, type=int, help='Number of thread to use for dgraph computation') parser.add_argument('--threads', '-t', default=8, type=int, help='Number of thread to use for dgraph computation')
parser.add_argument('--debug', '-d', action='store_true', help="Debug") parser.add_argument('--debug', '-d', action='store_true', help="Debug")
parser.add_argument('--verbose', '-v', action='store_true', help="Verbose") parser.add_argument('--verbose', '-v', action='store_true', help="Verbose")
parser.add_argument('--edge_divergence_threshold', '-dt', default=0.25, type=float, help='Divergence threshold value to link two udgs in the d2-graph') parser.add_argument('--edge_divergence_threshold', '-dt', default=0.25, type=float, help='Divergence threshold value to link two udgs in the d2-graph')
parser.add_argument('--maxclq', '-c', action='store_true', help="Enable max clique community detection (default behaviour)") parser.add_argument('--maxclq', '-c', action='store_true', help="Enable max clique community detection (default behaviour)")
parser.add_argument('--louvain', '-l', action='store_true', help="Enable Louvain community detection instead of all max-cliques") parser.add_argument('--louvain', '-l', action='store_true', help="Enable Louvain community detection instead of all max-cliques")
parser.add_argument('--comtest', '-k', action='store_true', help="Enable [placeholder] community detection algorithm instead of max-cliques")
args = parser.parse_args() args = parser.parse_args()
if args.outfile == "":
args.outfile = ".".join(args.barcode_graph.split(".")[:-1]) + "_lcpg.gexf"
return args return args
def main(): def main():
...@@ -28,26 +31,10 @@ def main(): ...@@ -28,26 +31,10 @@ def main():
# debug = args.debug # debug = args.debug
filename = args.barcode_graph filename = args.barcode_graph
barcode_graph = nx.read_gexf(filename)
def dprint(s):
from datetime import datetime
t = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
print(t, s)
dprint("loading barcode graph")
if filename.endswith('.gexf'):
G = nx.read_gexf(filename)
elif filename.endswith('.graphml'):
G = nx.read_graphml(filename)
else:
print("Input file must be gexf or graphml formatted", file=sys.stderr)
exit(1)
dprint("barcode graph loaded")
if args.louvain: if args.louvain:
clique_mode = "louvain" clique_mode = "louvain"
elif args.comtest:
clique_mode = "testing"
else: else:
clique_mode = None clique_mode = None
...@@ -56,23 +43,22 @@ def main(): ...@@ -56,23 +43,22 @@ def main():
debug_path = "/dev/null" debug_path = "/dev/null"
if args.debug: if args.debug:
debug = True debug = True
debug_path = f"{args.output_prefix}_debug" debug_path = ".".join(args.barcode_graph.split(".")[:-1]) + "_debug"
import os, shutil import os, shutil
if os.path.isdir(debug_path): if os.path.isdir(debug_path):
shutil.rmtree(debug_path) shutil.rmtree(debug_path)
os.mkdir(debug_path) os.mkdir(debug_path)
# Index size must be changed for general purpose. 8 is good for d=5 d2g = d2.D2Graph(debug=debug, debug_path=debug_path)
dprint("creating D2graph object") d2g.construct_from_barcodes(
d2g = d2.D2Graph(G, debug=debug, debug_path=debug_path) barcode_graph,
dprint("D2 graph object created") neighbor_threshold=args.edge_divergence_threshold,
dprint("constructing d2 graph from barcode graph") clique_mode=clique_mode,
d2g.construct_from_barcodes(neighbor_threshold=args.edge_divergence_threshold, clique_mode=clique_mode, threads=args.threads, verbose=args.verbose) threads=args.threads,
dprint("[debug] d2 graph constructed") verbose=args.verbose
dprint("[debug] %d nodes %d edges" % (len(d2g.nodes()), len(d2g.edges()))) )
# d2g.save(f"{args.output_prefix}.tsv") nx.write_gexf(d2g, args.outfile)
nx.write_gexf(d2g, f"{args.output_prefix}.gexf")
if __name__ == "__main__": if __name__ == "__main__":
......