Commit fedc3560 authored by Yoann Dufresne's avatar Yoann Dufresne
Browse files

allow non regular molecule repartition along the genome in the simulations

parent 9a61b76f
#!/usr/bin/env python3
import networkx as nx
import itertools as it
import sys
from bidict import bidict
barcode_graph = nx.read_gexf(sys.argv[1])
d_size = int(sys.argv[2])
# label molecule nodes
nodes = list(barcode_graph.nodes())
node_per_mol = {}
for node in nodes:
# Parse names
name, idxs = node.split(":")
idxs = [int(idx) for idx in idxs.split("_")]
# index nodes per molecule
for idx in idxs:
node_per_mol[idx] = node
# enumerate d-graphs
d_graphs = {}
for idx, start_idx in enumerate(range(len(nodes) - 2 * d_size)):
d_graphs[idx] = tuple([node_per_mol[idx] for idx in range(start_idx, start_idx + 2 * d_size + 1)])
d_graphs = bidict(d_graphs)
# compute d-graph edges by indexing then distance
index = {}
for idx, d_graph in d_graphs.items():
# Generate all tuplesize-mers
for dmer in it.combinations(d_graph, 3):
dmer = tuple(sorted(list(dmer)))
if not dmer in index:
index[dmer] = [d_graph]
else:
index[dmer].append(d_graph)
def distance(dg1, dg2):
dg1 = sorted(dg1)
dg2 = sorted(dg2)
distance = 0
idx1, idx2 = 0, 0
while idx1 < len(dg1) and idx2 < len(dg2):
if dg1[idx1] < dg2[idx2]:
idx1 += 1
distance += 1
elif dg1[idx1] > dg2[idx2]:
idx2 += 1
distance += 1
else:
idx1 += 1
idx2 += 1
distance += len(dg1) - idx1 + len(dg2) - idx2
return distance
distances = {idx:{} for idx in d_graphs}
for _, graphs in index.items():
for pair in it.combinations([i for i in range(len(graphs))], 2):
g1, g2 = graphs[pair[0]], graphs[pair[1]]
x, y = d_graphs.inverse[g1], d_graphs.inverse[g2]
distances[x][y] = distances[y][x] = distance(list(g1), list(g2))
# print(distances)
G2 = nx.Graph()
for idx in d_graphs:
G2.add_node(idx)
for idx1, nodes in distances.items():
for idx2, dist in nodes.items():
G2.add_edge(idx1, idx2)
# output = sys.argv[1].replace("molecule", "barcode").replace(".graphml", f"_{m}.gexf")
nx.write_gexf(G2, 'data/d2_simulated.gexf')
#!/usr/bin/env python3
import argparse
import sys
import graph_manipulator as gm
def parse_arguments():
parser = argparse.ArgumentParser(description='Generate a fake homogenous 10X molecule graph.')
parser.add_argument('--num_molecule', '-n', type=int, required=True, help='The number of molecule in the graph')
parser.add_argument('--depth', '-d', type=int, required=True, help='The number of melecule linked on each direction of the chain.')
parser = argparse.ArgumentParser(description='Generate a fake 10X molecule graph.')
parser.add_argument('--num_molecule', '-n', type=int, required=True, help='The number of molecule in the final graph')
parser.add_argument('--max_depth', '-md', type=int, default=-1, help='The max number of molecule linked on each direction of the chain. if not specified, ceil(avg_depth).')
parser.add_argument('--avg_depth', '-ad', type=int, required=True, help='The average number of molecule linked on each direction of the chain.')
parser.add_argument('--rnd_seed', '-s', type=int, default=None, help='Random seed. Used for reproducibility purpose. Please do not use it in production.')
parser.add_argument('--output', '-o', help="Output filename")
args = parser.parse_args()
if args.max_depth == -1:
args.max_depth = args.avg_depth
return args
def generate_graph(n, d):
return gm.generate_d_graph_chain(n, d)
def generate_graph(n, d_max, d_avg, rnd_seed=None):
return gm.generate_approx_d_graph_chain(n, d_max, d_avg, rnd_seed)
# return gm.generate_d_graph_chain(n, d_max)
def save_graph(G, outfile):
......@@ -27,9 +33,9 @@ def save_graph(G, outfile):
if __name__ == "__main__":
args = parse_arguments()
G = generate_graph(args.num_molecule, args.depth)
G = generate_graph(args.num_molecule, args.max_depth, args.avg_depth, args.rnd_seed)
outfile = f"simulated_molecules_{args.num_molecule}_{args.depth}.gexf"
outfile = f"simulated_molecules_{args.num_molecule}_{args.avg_depth}.gexf"
if args.output:
outfile = args.output
save_graph(G, outfile)
......
import networkx as nx
import random
""" Generate a d-graph chain (succession of unit d-graphs).
If you slice any 2*d+1 part of the graph, it will be a unit d-graph
@parem size The number of nodes in the chain (should not be less than 2*d+1)
@param d The number of connection on the left and on the right for any node
@return The d-graph chain
"""
def generate_d_graph_chain(size, d):
""" Generate a d-graph chain (succession of unit d-graphs).
If you slice any 2*d+1 part of the graph, it will be a unit d-graph
:param size The number of nodes in the chain (should not be less than 2*d+1)
:param d The number of connection on the left and on the right for any node
:return The d-graph chain
"""
G = nx.Graph()
for idx in range(size):
......@@ -21,6 +21,54 @@ def generate_d_graph_chain(size, d):
return G
def generate_approx_d_graph_chain(size, d_max, d_avg, rnd_seed=None):
""" Generate an almost d-graph chain (succession of unit d-graphs). Almost because they are d-graphs in average
with a coverage variation.
:param size The number of nodes in the chain (should not be less than 2*d+1)
:param d_max The max number of connection on the left and on the right for any node
:param d_avg The average d value in the graph (ie 2*d average coverage)
:param rnd_seed Fix the random seed for reproducibility
:return The d-graph chain
"""
# Reproducibility
if rnd_seed:
random.seed(rnd_seed)
# Sample size computation
sursample_needed = round(size * (2 * d_max - 2 * d_avg) / (4 * d_max))
total_size = size + sursample_needed
# Choose which idx to jump over
to_skip = random.sample(range(total_size), sursample_needed)
to_skip.sort()
# Generate sequence
G = nx.Graph()
previous_nodes = [None]* d_max
next_idx = 0
for idx in range(total_size):
if len(to_skip) > 0 and to_skip[0] == idx:
to_skip.pop(0)
previous_nodes.append(None)
else:
# Create the node
G.add_node(next_idx)
# link the node with previous ones
for node_idx in previous_nodes:
if node_idx is not None:
G.add_edge(next_idx, node_idx)
# Update the state
previous_nodes.append(next_idx)
next_idx += 1
# Remove oldest previous node
previous_nodes.pop(0)
return G
""" Merge 2 nodes of a graph G.
The new node have edges from both of the previous nodes (dereplicated).
If a link between node1 and node2 exist, it's discarded.
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment