From b79c63d11b225beafe181afbb2fd74a8d80203fc Mon Sep 17 00:00:00 2001 From: rchikhi <rayan.chikhi@univ-lille1.fr> Date: Fri, 24 Jan 2020 14:39:07 +0100 Subject: [PATCH] barcode graphe generation from genomes tuning --- .../main/generate_genome_molecule_graph.py | 37 ++++++++++++++----- requirements.txt | 1 + 2 files changed, 29 insertions(+), 9 deletions(-) diff --git a/deconvolution/main/generate_genome_molecule_graph.py b/deconvolution/main/generate_genome_molecule_graph.py index 08aa184..fa69743 100755 --- a/deconvolution/main/generate_genome_molecule_graph.py +++ b/deconvolution/main/generate_genome_molecule_graph.py @@ -19,27 +19,42 @@ def parse_arguments(): return args +def compute_ovl_length(s1,e1,s2,e2): + if s1 > s2: + s1,e1,s2,e2 = s2,e2,s1,e1 + return e1-s2 + +def is_contained(itree,start,end): + for itv in itree.overlap(start,end): + start2, end2 = itv.begin, itv.end + if start2 <= start and end2 >= end: + return True + return False + def generate_graph(nb_molecules, genome_size, avg_mol_size, min_mol_ovl, rnd_seed=None): # Reproducibility if rnd_seed != -1: np.random.seed(rnd_seed) G = nx.Graph() + + # generate molecules according to similar principle as LRSM + # but in addition make sure they're not contained in each other molecules = dict() #Â mol index: (start,end) + itree = IntervalTree() for idx in range(nb_molecules): + while True: #Â to avoid contained molecules G.add_node(idx) #pick a starting position (follows LRSIM) - start_pos = np.random.randint(0,genome_size) + start = np.random.randint(0,genome_size) #pick a fragment size (follows LRSIM) molecule_size = np.random.poisson(avg_mol_size) - print("molecule",idx,"start",start_pos,"end",start_pos+molecule_size) - molecules[idx] = (start_pos, start_pos+molecule_size) - - #Â compute overlaps between molecules - itree = IntervalTree() - for idx in molecules: - start,end = molecules[idx] - itree.addi( start,end,idx) + end = start+molecule_size + if is_contained(itree,start,end): continue + print("molecule",idx,"start",start,"end",end) + molecules[idx] = (start, end) + itree.addi(start,end,idx) + break #Â create graph edges corresponding to molecules overlaps for idx in molecules: @@ -47,6 +62,10 @@ def generate_graph(nb_molecules, genome_size, avg_mol_size, min_mol_ovl, rnd_see for itv in itree.overlap(start,end): other_idx = itv.data if idx==other_idx: continue + start2, end2 = itv.begin, itv.end + ovl_length = compute_ovl_length(start,end,start2,end2) + #print("overlap length",ovl_length) + if ovl_length < min_mol_ovl: continue G.add_edge(idx, other_idx) return G diff --git a/requirements.txt b/requirements.txt index 3fb587f..1998447 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,3 +4,4 @@ bidict pytest scipy python-louvain +intervaltree -- GitLab