Skip to content
Snippets Groups Projects
Commit a96e76f7 authored by Yoann Dufresne's avatar Yoann Dufresne
Browse files

rewrite molecule generation with tested code

parent 7e66edd0
No related branches found
No related tags found
No related merge requests found
#!/usr/bin/env python3 #!/usr/bin/env python3
# generate a bunch of molecules, in the form of python intervals (a,b), i.e. a<=i<b. here, b-a=10 import argparse
# then find their overlaps >= 1 bases import sys
# rationale for realistic parameters: import graph_manipulator as gm
# https://support.10xgenomics.com/de-novo-assembly/datasets/2.1.0/fly
# drosophila genome, 140 Mbp
# molecules of 70 kbp
# coverage of 70x
# number of barcodes: 50k (only) (zcat barcoded.fastq.gz | grep "^@" | awk '{print $2}' | sort | uniq |wc -l)
# number of molecule per barcode: 10 (~/10x/drosophila/chen_data_longranger_run_on_ref/outs$ samtools view phased_possorted_bam.bam | python ~/10x-barcode-graph/scripts/sam_stats.py)
# so in total, 500k molecules
# i.e. the molecule coverage is around 140Mbp/500kbp = 280x
# conservatively, it seems that we can get overlaps for at least 20 neighbor molecules
# so in that setting, considering each molecule as '1bp', i.e. scaling the genome down to 140Mbp/70kbp=2Mbp
# n = 2000
# o = 50
def parse_arguments():
parser = argparse.ArgumentParser(description='Generate a fake homogenous 10X molecule graph.')
parser.add_argument('--num_molecule', '-n', type=int, required=True, help='The number of molecule in the graph')
parser.add_argument('--depth', '-d', type=int, required=True, help='The number of melecule linked on each direction of the chain.')
parser.add_argument('--output', '-o', help="Output filename")
n = 1500 args = parser.parse_args()
o = 6 return args
molecules_intervals = []
for i in range(n):
molecules_intervals += [(i,i+o)]
#print(molecules_intervals) def generate_graph(n, d):
return gm.generate_d_graph_chain(n, d)
def overlap(a,b):
for i in range(*a):
if i in range(*b):
return True
return False
#print(overlap(molecules_intervals[0],molecules_intervals[2])) def save_graph(G, outfile):
#print(overlap(molecules_intervals[0],molecules_intervals[20])) import networkx as nx
nx.write_gexf(G, outfile)
import networkx as nx
G = nx.Graph()
for i,a in enumerate(molecules_intervals): if __name__ == "__main__":
G.add_node(i) args = parse_arguments()
G = generate_graph(args.num_molecule, args.depth)
for i,a in enumerate(molecules_intervals): outfile = f"simulated_molecules_{args.num_molecule}_{args.depth}.gexf"
for j,b in enumerate(molecules_intervals): if args.output:
if i >= j: continue outfile = args.outfile
if overlap(a,b): save_graph(G, outfile)
G.add_edge(i,j)
print(G.edges())
nx.write_graphml(G, f"data/simulated_molecules_{n}_{o-1}.graphml")
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment