From b79c63d11b225beafe181afbb2fd74a8d80203fc Mon Sep 17 00:00:00 2001
From: rchikhi <rayan.chikhi@univ-lille1.fr>
Date: Fri, 24 Jan 2020 14:39:07 +0100
Subject: [PATCH] barcode graphe generation from genomes tuning

---
 .../main/generate_genome_molecule_graph.py    | 37 ++++++++++++++-----
 requirements.txt                              |  1 +
 2 files changed, 29 insertions(+), 9 deletions(-)

diff --git a/deconvolution/main/generate_genome_molecule_graph.py b/deconvolution/main/generate_genome_molecule_graph.py
index 08aa184..fa69743 100755
--- a/deconvolution/main/generate_genome_molecule_graph.py
+++ b/deconvolution/main/generate_genome_molecule_graph.py
@@ -19,27 +19,42 @@ def parse_arguments():
 
     return args
 
+def compute_ovl_length(s1,e1,s2,e2):
+    if s1 > s2:
+        s1,e1,s2,e2 = s2,e2,s1,e1
+    return e1-s2
+
+def is_contained(itree,start,end):
+    for itv in itree.overlap(start,end):
+        start2, end2 = itv.begin, itv.end
+        if start2 <= start and end2 >= end:
+            return True
+    return False
+
 
 def generate_graph(nb_molecules, genome_size, avg_mol_size, min_mol_ovl, rnd_seed=None):
     # Reproducibility
     if rnd_seed != -1:
         np.random.seed(rnd_seed)
     G = nx.Graph()
+
+    # generate molecules according to similar principle as LRSM
+    # but in addition make sure they're not contained in each other
     molecules = dict() # mol index: (start,end)
+    itree = IntervalTree()
     for idx in range(nb_molecules):
+        while True: # to avoid contained molecules
             G.add_node(idx)
             #pick a starting position (follows LRSIM)
-            start_pos = np.random.randint(0,genome_size)
+            start = np.random.randint(0,genome_size)
             #pick a fragment size (follows LRSIM)
             molecule_size = np.random.poisson(avg_mol_size)
-            print("molecule",idx,"start",start_pos,"end",start_pos+molecule_size)
-            molecules[idx] = (start_pos, start_pos+molecule_size)
-
-    # compute overlaps between molecules
-    itree = IntervalTree()
-    for idx in molecules:
-        start,end = molecules[idx]
-        itree.addi( start,end,idx)
+            end = start+molecule_size
+            if is_contained(itree,start,end): continue
+            print("molecule",idx,"start",start,"end",end)
+            molecules[idx] = (start, end)
+            itree.addi(start,end,idx)
+            break
 
     # create graph edges corresponding to molecules overlaps
     for idx in molecules:
@@ -47,6 +62,10 @@ def generate_graph(nb_molecules, genome_size, avg_mol_size, min_mol_ovl, rnd_see
         for itv in itree.overlap(start,end):
             other_idx = itv.data
             if idx==other_idx: continue
+            start2, end2 = itv.begin, itv.end
+            ovl_length = compute_ovl_length(start,end,start2,end2)
+            #print("overlap length",ovl_length)
+            if ovl_length < min_mol_ovl: continue
             G.add_edge(idx, other_idx)
     return G
 
diff --git a/requirements.txt b/requirements.txt
index 3fb587f..1998447 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,3 +4,4 @@ bidict
 pytest
 scipy
 python-louvain
+intervaltree
-- 
GitLab