Commit a96e76f7 authored by Yoann Dufresne's avatar Yoann Dufresne
Browse files

rewrite molecule generation with tested code

parent 7e66edd0
#!/usr/bin/env python3 #!/usr/bin/env python3
# generate a bunch of molecules, in the form of python intervals (a,b), i.e. a<=i<b. here, b-a=10 import argparse
# then find their overlaps >= 1 bases import sys
# rationale for realistic parameters: import graph_manipulator as gm
# https://support.10xgenomics.com/de-novo-assembly/datasets/2.1.0/fly
# drosophila genome, 140 Mbp
# molecules of 70 kbp
# coverage of 70x
# number of barcodes: 50k (only) (zcat barcoded.fastq.gz | grep "^@" | awk '{print $2}' | sort | uniq |wc -l)
# number of molecules per barcode: 10 (~/10x/drosophila/chen_data_longranger_run_on_ref/outs$ samtools view phased_possorted_bam.bam | python ~/10x-barcode-graph/scripts/sam_stats.py)
# so in total, 500k molecules
# i.e. the molecule coverage is around 140Mbp/500kbp = 280x
# conservatively, it seems that we can get overlaps for at least 20 neighbor molecules
# so in that setting, considering each molecule as '1bp', i.e. scaling the genome down to 140Mbp/70kbp = 2000 bp
# n = 2000
# o = 50
def parse_arguments():
parser = argparse.ArgumentParser(description='Generate a fake homogenous 10X molecule graph.')
parser.add_argument('--num_molecule', '-n', type=int, required=True, help='The number of molecule in the graph')
parser.add_argument('--depth', '-d', type=int, required=True, help='The number of melecule linked on each direction of the chain.')
parser.add_argument('--output', '-o', help="Output filename")
n = 1500 args = parser.parse_args()
o = 6 return args
molecules_intervals = []
for i in range(n):
molecules_intervals += [(i,i+o)]
#print(molecules_intervals) def generate_graph(n, d):
return gm.generate_d_graph_chain(n, d)
def overlap(a, b):
    """Return True when the integer ranges described by *a* and *b* intersect.

    Each argument is a tuple of ``range()`` arguments, typically a half-open
    interval ``(start, stop)`` meaning ``start <= i < stop``.

    Returns:
        True if at least one integer belongs to both ranges, False otherwise
        (including when either range is empty).
    """
    range_a = range(*a)
    range_b = range(*b)
    if range_a.step == 1 and range_b.step == 1:
        # Contiguous half-open intervals share a value exactly when the
        # larger start lies strictly before the smaller stop; this is also
        # False for empty intervals, so no scan over the elements is needed.
        return max(range_a.start, range_b.start) < min(range_a.stop, range_b.stop)
    # Stepped ranges can interleave without touching, so test each element.
    return any(i in range_b for i in range_a)
#print(overlap(molecules_intervals[0],molecules_intervals[2])) def save_graph(G, outfile):
#print(overlap(molecules_intervals[0],molecules_intervals[20])) import networkx as nx
nx.write_gexf(G, outfile)
import networkx as nx
G = nx.Graph()
for i,a in enumerate(molecules_intervals): if __name__ == "__main__":
G.add_node(i) args = parse_arguments()
G = generate_graph(args.num_molecule, args.depth)
for i,a in enumerate(molecules_intervals): outfile = f"simulated_molecules_{args.num_molecule}_{args.depth}.gexf"
for j,b in enumerate(molecules_intervals): if args.output:
if i >= j: continue outfile = args.outfile
if overlap(a,b): save_graph(G, outfile)
G.add_edge(i,j)
print(G.edges())
nx.write_graphml(G, f"data/simulated_molecules_{n}_{o-1}.graphml")
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment