diff --git a/deconvolution/generate_fake_molecule_graph.py b/deconvolution/generate_fake_molecule_graph.py index 8abff70125dd52953552e57e5d76cdc6ab9f7f5b..2c61d5a0d93204226ccbebd3d5216429db739c8c 100755 --- a/deconvolution/generate_fake_molecule_graph.py +++ b/deconvolution/generate_fake_molecule_graph.py @@ -1,54 +1,36 @@ #!/usr/bin/env python3 -# generate a bunch of molecules, in the form of python intervals (a,b), i.e. a<=i<b. here, b-a=10 -# then find their overlaps >= 1 bases +import argparse +import sys -# rationale for realistic parameters: -# https://support.10xgenomics.com/de-novo-assembly/datasets/2.1.0/fly -# drosophila genome, 140 Mbp -# molecules of 70 kbp -# coverage of 70x -# number of barcodes: 50k (only) (zcat barcoded.fastq.gz | grep "^@" | awk '{print $2}' | sort | uniq |wc -l) -# number of molecule per barcode: 10 (~/10x/drosophila/chen_data_longranger_run_on_ref/outs$ samtools view phased_possorted_bam.bam | python ~/10x-barcode-graph/scripts/sam_stats.py) -# so in total, 500k molecules -# i.e. the molecule coverage is around 140Mbp/500kbp = 280x -# conservatively, it seems that we can get overlaps for at least 20 neighbor molecules -# so in that setting, considering each molecule as '1bp', i.e. scaling the genome down to 140Mbp/70kbp=2Mbp -# n = 2000 -# o = 50 +import graph_manipulator as gm +def parse_arguments(): + parser = argparse.ArgumentParser(description='Generate a fake homogenous 10X molecule graph.') + parser.add_argument('--num_molecule', '-n', type=int, required=True, help='The number of molecule in the graph') + parser.add_argument('--depth', '-d', type=int, required=True, help='The number of melecule linked on each direction of the chain.') + parser.add_argument('--output', '-o', help="Output filename") -n = 1500 -o = 6 + args = parser.parse_args() + return args -molecules_intervals = [] -for i in range(n): - molecules_intervals += [(i,i+o)] -#print(molecules_intervals) +def generate_graph(n, d): + return gm.generate_d_graph_chain(n, d) -def overlap(a,b): - for i in range(*a): - if i in range(*b): - return True - return False -#print(overlap(molecules_intervals[0],molecules_intervals[2])) -#print(overlap(molecules_intervals[0],molecules_intervals[20])) +def save_graph(G, outfile): + import networkx as nx + nx.write_gexf(G, outfile) -import networkx as nx -G = nx.Graph() -for i,a in enumerate(molecules_intervals): - G.add_node(i) +if __name__ == "__main__": + args = parse_arguments() + G = generate_graph(args.num_molecule, args.depth) -for i,a in enumerate(molecules_intervals): - for j,b in enumerate(molecules_intervals): - if i >= j: continue - if overlap(a,b): - G.add_edge(i,j) + outfile = f"simulated_molecules_{args.num_molecule}_{args.depth}.gexf" + if args.output: + outfile = args.outfile + save_graph(G, outfile) -print(G.edges()) - -nx.write_graphml(G, f"data/simulated_molecules_{n}_{o-1}.graphml")