Commit a96e76f7 authored by Yoann Dufresne
Browse files

rewrite molecule generation with tested code

parent 7e66edd0
#!/usr/bin/env python3
# generate a bunch of molecules, in the form of python intervals (a,b), i.e. a<=i<b. here, b-a=10
# then find their overlaps >= 1 bases
import argparse
import sys
# rationale for realistic parameters:
# drosophila genome, 140 Mbp
# molecules of 70 kbp
# coverage of 70x
# number of barcodes: 50k (only) (zcat barcoded.fastq.gz | grep "^@" | awk '{print $2}' | sort | uniq |wc -l)
# number of molecule per barcode: 10 (~/10x/drosophila/chen_data_longranger_run_on_ref/outs$ samtools view phased_possorted_bam.bam | python ~/10x-barcode-graph/scripts/
# so in total, 500k molecules
# i.e. the molecule coverage is around 140Mbp/500kbp = 280x
# conservatively, it seems that we can get overlaps for at least 20 neighbor molecules
# so in that setting, considering each molecule as '1bp', i.e. scaling the genome down to 140Mbp/70kbp = 2kbp (2000 positions)
# n = 2000
# o = 50
import graph_manipulator as gm
def parse_arguments():
    """Parse the command-line options of the molecule-graph generator.

    Options:
        --num_molecule / -n (int, required): number of molecules in the graph.
        --depth / -d (int, required): number of molecules linked in each
            direction of the chain.
        --output / -o (optional): output filename; ``None`` when omitted.

    Returns:
        argparse.Namespace: namespace exposing ``num_molecule``, ``depth``
        and ``output``.
    """
    parser = argparse.ArgumentParser(description='Generate a fake homogenous 10X molecule graph.')
    parser.add_argument('--num_molecule', '-n', type=int, required=True, help='The number of molecule in the graph')
    parser.add_argument('--depth', '-d', type=int, required=True, help='The number of melecule linked on each direction of the chain.')
    parser.add_argument('--output', '-o', help="Output filename")
    return parser.parse_args()


# Module-level settings used by the interval-based generation code in this
# file: `n` is the number of intervals built with range(n) and `o` is the
# width added to each interval start. They are not CLI options, so they are
# kept outside parse_arguments().
n = 1500
o = 6
# One half-open (start, end) tuple per molecule: the interval beginning at
# position `start` ends at `start + o`, for every start in range(n).
molecules_intervals = [(start, start + o) for start in range(n)]
def generate_graph(n, d):
    """Build the molecule graph by delegating to ``graph_manipulator``.

    Both arguments are forwarded unchanged to
    ``gm.generate_d_graph_chain``; the command-line entry point passes
    ``--num_molecule`` as ``n`` and ``--depth`` as ``d``.

    Returns:
        Whatever ``gm.generate_d_graph_chain(n, d)`` returns (presumably a
        networkx graph, since the caller hands it to ``save_graph`` --
        confirm against graph_manipulator).
    """
    chain_graph = gm.generate_d_graph_chain(n, d)
    return chain_graph
def overlap(a, b):
    """Return True when the integer ranges described by ``a`` and ``b`` share a value.

    Each argument is a tuple that is unpacked into ``range()``, so
    ``(start, stop)`` denotes the half-open interval ``start <= i < stop``.
    Empty or reversed intervals never overlap anything.

    Args:
        a: arguments for the first ``range`` (typically ``(start, stop)``).
        b: arguments for the second ``range``.

    Returns:
        bool: True if at least one integer belongs to both ranges.
    """
    first, second = range(*a), range(*b)
    if first.step == 1 and second.step == 1:
        # Contiguous intervals intersect exactly when the later start lies
        # before the earlier stop; this avoids walking every element of `a`.
        return max(first.start, second.start) < min(first.stop, second.stop)
    # Stepped ranges have gaps, so the bounds test is not sufficient:
    # fall back to element-wise membership (O(1) per test on range objects).
    return any(value in second for value in first)
def save_graph(G, outfile):
    """Serialize graph ``G`` to ``outfile`` in GEXF format using networkx.

    Args:
        G: graph object handed to ``networkx.write_gexf``.
        outfile: destination passed to ``networkx.write_gexf``.
    """
    # Imported locally so the function does not rely on a module-level
    # networkx import being present.
    from networkx import write_gexf

    write_gexf(G, outfile)
import networkx as nx
G = nx.Graph()
for i,a in enumerate(molecules_intervals):
if __name__ == "__main__":
    # Entry point: parse the CLI options, build the chain graph, write it out.
    args = parse_arguments()
    G = generate_graph(args.num_molecule, args.depth)

    # Default file name encodes the two generation parameters.
    outfile = f"simulated_molecules_{args.num_molecule}_{args.depth}.gexf"
    if args.output:
        # argparse stores --output/-o under `output`; the previous
        # `args.outfile` raised AttributeError whenever -o was supplied.
        outfile = args.output
    save_graph(G, outfile)
nx.write_graphml(G, f"data/simulated_molecules_{n}_{o-1}.graphml")
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment