Commit d0b9fb62 authored by Yoann Dufresne
Browse files

molecule generation variations

parent fedc3560
...@@ -10,7 +10,8 @@ def parse_arguments(): ...@@ -10,7 +10,8 @@ def parse_arguments():
parser.add_argument('--num_molecule', '-n', type=int, required=True, help='The number of molecule in the final graph') parser.add_argument('--num_molecule', '-n', type=int, required=True, help='The number of molecule in the final graph')
parser.add_argument('--max_depth', '-md', type=int, default=-1, help='The max number of molecule linked on each direction of the chain. if not specified, ceil(avg_depth).') parser.add_argument('--max_depth', '-md', type=int, default=-1, help='The max number of molecule linked on each direction of the chain. if not specified, ceil(avg_depth).')
parser.add_argument('--avg_depth', '-ad', type=int, required=True, help='The average number of molecule linked on each direction of the chain.') parser.add_argument('--avg_depth', '-ad', type=int, required=True, help='The average number of molecule linked on each direction of the chain.')
parser.add_argument('--rnd_seed', '-s', type=int, default=None, help='Random seed. Used for reproducibility purpose. Please do not use it in production.') parser.add_argument('--size_reduction', '-r', type=float, default=0.0, help='99%% of molecule size will be between average_size*size_reduction')
parser.add_argument('--rnd_seed', '-s', type=int, default=-1, help='Random seed. Used for reproducibility purpose. Please do not use it in production.')
parser.add_argument('--output', '-o', help="Output filename") parser.add_argument('--output', '-o', help="Output filename")
args = parser.parse_args() args = parser.parse_args()
...@@ -21,8 +22,8 @@ def parse_arguments(): ...@@ -21,8 +22,8 @@ def parse_arguments():
return args return args
def generate_graph(n, d_max, d_avg, rnd_seed=None): def generate_graph(n, d_max, d_avg, size_reduction, rnd_seed=None):
return gm.generate_approx_d_graph_chain(n, d_max, d_avg, rnd_seed) return gm.generate_approx_d_graph_chain(n, d_max, d_avg, size_reduction, rnd_seed)
# return gm.generate_d_graph_chain(n, d_max) # return gm.generate_d_graph_chain(n, d_max)
...@@ -33,7 +34,7 @@ def save_graph(G, outfile): ...@@ -33,7 +34,7 @@ def save_graph(G, outfile):
if __name__ == "__main__": if __name__ == "__main__":
args = parse_arguments() args = parse_arguments()
G = generate_graph(args.num_molecule, args.max_depth, args.avg_depth, args.rnd_seed) G = generate_graph(args.num_molecule, args.max_depth, args.avg_depth, args.size_reduction, args.rnd_seed)
outfile = f"simulated_molecules_{args.num_molecule}_{args.avg_depth}.gexf" outfile = f"simulated_molecules_{args.num_molecule}_{args.avg_depth}.gexf"
if args.output: if args.output:
......
...@@ -21,17 +21,18 @@ def generate_d_graph_chain(size, d): ...@@ -21,17 +21,18 @@ def generate_d_graph_chain(size, d):
return G return G
def generate_approx_d_graph_chain(size, d_max, d_avg, rnd_seed=None): def generate_approx_d_graph_chain(size, d_max, d_avg, size_reduction=0, rnd_seed=-1):
""" Generate an almost d-graph chain (succession of unit d-graphs). Almost because they are d-graphs in average """ Generate an almost d-graph chain (succession of unit d-graphs). Almost because they are d-graphs in average
with a coverage variation. with a coverage variation.
:param size The number of nodes in the chain (should not be less than 2*d+1) :param size The number of nodes in the chain (should not be less than 2*d+1)
:param d_max The max number of connection on the left and on the right for any node :param d_max The max number of connection on the left and on the right for any node
:param d_avg The average d value in the graph (ie 2*d average coverage) :param d_avg The average d value in the graph (ie 2*d average coverage)
:param size_reduction Randomly change the size of the molecule when created. 99% of the molecules will have a size over size*size_reduction
:param rnd_seed Fix the random seed for reproducibility :param rnd_seed Fix the random seed for reproducibility
:return The d-graph chain :return The d-graph chain
""" """
# Reproducibility # Reproducibility
if rnd_seed: if rnd_seed != -1:
random.seed(rnd_seed) random.seed(rnd_seed)
# Sample size computation # Sample size computation
...@@ -42,9 +43,13 @@ def generate_approx_d_graph_chain(size, d_max, d_avg, rnd_seed=None): ...@@ -42,9 +43,13 @@ def generate_approx_d_graph_chain(size, d_max, d_avg, rnd_seed=None):
to_skip = random.sample(range(total_size), sursample_needed) to_skip = random.sample(range(total_size), sursample_needed)
to_skip.sort() to_skip.sort()
# Init the random size variation
d_min = d_max*(1-size_reduction)
std_dev = (d_max-d_min)/2.5
# Generate sequence # Generate sequence
G = nx.Graph() G = nx.Graph()
previous_nodes = [None]* d_max previous_nodes = [None]* d_max * 2
next_idx = 0 next_idx = 0
for idx in range(total_size): for idx in range(total_size):
if len(to_skip) > 0 and to_skip[0] == idx: if len(to_skip) > 0 and to_skip[0] == idx:
...@@ -54,8 +59,13 @@ def generate_approx_d_graph_chain(size, d_max, d_avg, rnd_seed=None): ...@@ -54,8 +59,13 @@ def generate_approx_d_graph_chain(size, d_max, d_avg, rnd_seed=None):
# Create the node # Create the node
G.add_node(next_idx) G.add_node(next_idx)
# link the node with previous ones # size deviation computation
for node_idx in previous_nodes: nb_nodes_to_connect = round(random.gauss(d_max, std_dev))
# Limit the deviation
nb_nodes_to_connect = max(nb_nodes_to_connect, 1)
nb_nodes_to_connect = min(nb_nodes_to_connect, 2 * d_max)
# link the node with previous ones regarding size deviation
for node_idx in previous_nodes[len(previous_nodes)-nb_nodes_to_connect:]:
if node_idx is not None: if node_idx is not None:
G.add_edge(next_idx, node_idx) G.add_edge(next_idx, node_idx)
......
Supports Markdown
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment