Skip to content
Snippets Groups Projects
Commit a96e76f7 authored by Yoann Dufresne's avatar Yoann Dufresne
Browse files

rewrite molecule generation with tested code

parent 7e66edd0
No related branches found
No related tags found
No related merge requests found
#!/usr/bin/env python3
# generate a bunch of molecules, in the form of python intervals (a,b), i.e. a<=i<b. here, b-a=10
# then find their overlaps >= 1 bases
import argparse
import sys
# rationale for realistic parameters:
# https://support.10xgenomics.com/de-novo-assembly/datasets/2.1.0/fly
# drosophila genome, 140 Mbp
# molecules of 70 kbp
# coverage of 70x
# number of barcodes: 50k (only) (zcat barcoded.fastq.gz | grep "^@" | awk '{print $2}' | sort | uniq |wc -l)
# number of molecule per barcode: 10 (~/10x/drosophila/chen_data_longranger_run_on_ref/outs$ samtools view phased_possorted_bam.bam | python ~/10x-barcode-graph/scripts/sam_stats.py)
# so in total, 500k molecules
# i.e. the molecule coverage is around 140Mbp/500kbp = 280x
# conservatively, it seems that we can get overlaps for at least 20 neighbor molecules
# so in that setting, considering each molecule as '1bp', i.e. scaling the genome down to 140Mbp/70kbp=2Mbp
# n = 2000
# o = 50
import graph_manipulator as gm
def parse_arguments():
parser = argparse.ArgumentParser(description='Generate a fake homogenous 10X molecule graph.')
parser.add_argument('--num_molecule', '-n', type=int, required=True, help='The number of molecule in the graph')
parser.add_argument('--depth', '-d', type=int, required=True, help='The number of melecule linked on each direction of the chain.')
parser.add_argument('--output', '-o', help="Output filename")
n = 1500
o = 6
args = parser.parse_args()
return args
molecules_intervals = []
for i in range(n):
molecules_intervals += [(i,i+o)]
#print(molecules_intervals)
def generate_graph(n, d):
return gm.generate_d_graph_chain(n, d)
def overlap(a,b):
for i in range(*a):
if i in range(*b):
return True
return False
#print(overlap(molecules_intervals[0],molecules_intervals[2]))
#print(overlap(molecules_intervals[0],molecules_intervals[20]))
def save_graph(G, outfile):
import networkx as nx
nx.write_gexf(G, outfile)
import networkx as nx
G = nx.Graph()
for i,a in enumerate(molecules_intervals):
G.add_node(i)
if __name__ == "__main__":
args = parse_arguments()
G = generate_graph(args.num_molecule, args.depth)
for i,a in enumerate(molecules_intervals):
for j,b in enumerate(molecules_intervals):
if i >= j: continue
if overlap(a,b):
G.add_edge(i,j)
outfile = f"simulated_molecules_{args.num_molecule}_{args.depth}.gexf"
if args.output:
outfile = args.outfile
save_graph(G, outfile)
print(G.edges())
nx.write_graphml(G, f"data/simulated_molecules_{n}_{o-1}.graphml")
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment