Commit b906f1b9 authored by rchikhi's avatar rchikhi
Browse files

some of my tests on alternative d_graphs construction techniques

parent 55032d52
import glob
WORKDIR = "snake_exec" if "workdir" not in config else config["workdir"]
INPUT = "data/simulated_barcode_1000_5_2.gexf" if "input" not in config else config["input"]
INPUT = "snake_tests/simu_bar_n1000_d5_m2.gexf" if "input" not in config else config["input"]
SAMPLE_NAME = INPUT[INPUT.rfind('/')+1:INPUT.rfind('.')]
rule all:
input:
f"{WORKDIR}/{SAMPLE_NAME}.tar.gz"
rule compress_data:
input:
barcode_graph=f"{WORKDIR}/{{barcode_file}}.gexf",
d2_raw=f"{WORKDIR}/{{barcode_file}}_d2_raw.gexf",
simplified_d2=f"{WORKDIR}/{{barcode_file}}_d2_simplified.gexf"
simplified_d2=f"{WORKDIR}/{{barcode_file}}_d2_simplified.gexf",
d2_raw_louvain=f"{WORKDIR}/{{barcode_file}}_d2_raw_louvain.gexf",
simplified_d2_louvain=f"{WORKDIR}/{{barcode_file}}_d2_simplified_louvain.gexf",
#d2_raw_comtest=f"{WORKDIR}/{{barcode_file}}_d2_raw_comtest.gexf",
#simplified_d2_comtest=f"{WORKDIR}/{{barcode_file}}_d2_simplified_comtest.gexf"
output:
zip=f"{WORKDIR}/{{barcode_file}}.tar.gz"
shell:
......@@ -24,7 +27,6 @@ rule compress_data:
"rm -rf {wildcards.barcode_file}/ ;"
"cd - ;"
rule d2_simplification:
input:
barcode_graph="{barcode_path}.gexf",
......@@ -43,6 +45,46 @@ rule d2_generation:
shell:
f"python3 deconvolution/to_d2_graph.py {{input.barcode_graph}} -o {WORKDIR}/{{wildcards.file}}_d2_raw"
rule d2_simplification_louvain:
input:
barcode_graph="{barcode_path}.gexf",
d2_raw="{barcode_path}_d2_raw_louvain.gexf"
output:
simplified_d2="{barcode_path}_d2_simplified_louvain.gexf"
shell:
"python3 deconvolution/d2_reduction.py -o {output.simplified_d2} {input.barcode_graph} {input.d2_raw}"
rule d2_generation_louvain:
input:
barcode_graph=f"{WORKDIR}/{{file}}.gexf"
output:
d2_file=f"{WORKDIR}/{{file}}_d2_raw_louvain.gexf"
shell:
f"python3 --version;"
f"python3 deconvolution/to_d2_graph.py {{input.barcode_graph}} --louvain -o {WORKDIR}/{{wildcards.file}}_d2_raw_louvain"
rule d2_simplification_comtest:
input:
barcode_graph="{barcode_path}.gexf",
d2_raw="{barcode_path}_d2_raw_comtest.gexf"
output:
simplified_d2="{barcode_path}_d2_simplified_comtest.gexf"
shell:
"python3 deconvolution/d2_reduction.py -o {output.simplified_d2} {input.barcode_graph} {input.d2_raw}"
rule d2_generation_comtest:
input:
barcode_graph=f"{WORKDIR}/{{file}}.gexf"
output:
d2_file=f"{WORKDIR}/{{file}}_d2_raw_comtest.gexf"
shell:
f"python3 --version;"
f"python3 deconvolution/to_d2_graph.py {{input.barcode_graph}} --comtest -o {WORKDIR}/{{wildcards.file}}_d2_raw_comtest"
#def define_graph_input(wildcards):
# lst = glob.glob(f"{DATA}/{wildcards.file}.*")
......
WORKDIR="snake_exec" if "workdir" not in config else config["workdir"]
OUTDIR="snake_tests" if "outdir" not in config else config["outdir"]
N=[1000] if "n" not in config else config["n"] # Number of molecule to simulate
D=[5] if "d" not in config else config["d"] # Average coverage of each molecule
M=[2] if "m" not in config else config["m"] # Average number of molecule per barcode
......@@ -7,7 +7,7 @@ M=[2] if "m" not in config else config["m"] # Average number of molecule per bar
rule all:
input:
expand(f"{WORKDIR}/simu_bar_n{{n}}_d{{d}}_m{{m}}.gexf", n=N, m=M, d=D)
expand(f"{OUTDIR}/simu_bar_n{{n}}_d{{d}}_m{{m}}.gexf", n=N, m=M, d=D)
rule generate_barcodes:
input:
......
......@@ -55,12 +55,12 @@ class D2Graph(nx.Graph):
return self.subgraph(list(self.nodes()))
def construct_from_barcodes(self, index_size=3, verbose=True, debug=False):
def construct_from_barcodes(self, index_size=3, verbose=True, debug=False, clique_mode=None):
import debug_disct as dd
# Compute all the d-graphs
if verbose:
print("Computing the unit d-graphs..")
self.d_graphs_per_node = compute_all_max_d_graphs(self.barcode_graph, debug=debug)
self.d_graphs_per_node = compute_all_max_d_graphs(self.barcode_graph, debug=debug, clique_mode=clique_mode)
if verbose:
counts = sum(len(x) for x in self.d_graphs_per_node.values())
print(f"\t {counts} computed d-graphs")
......
import networkx as nx
from functools import total_ordering
import community # pip install python-louvain
@total_ordering
......@@ -231,20 +232,64 @@ class Dgraph(object):
@param graph A barcode graph
@return A dictionary associating each node to its list of all possible d-graphs. The d-graphs are sorted by decreasing ratio.
"""
def compute_all_max_d_graphs(graph, debug=False):
def compute_all_max_d_graphs(graph, debug=False, clique_mode=None):
d_graphs = {}
for idx, node in enumerate(list(graph.nodes())):
#if "MI" not in str(node): continue # for debugging; only look at deconvolved nodes
print(f"\r{idx+1}/{len(graph.nodes())}")
neighbors = list(graph.neighbors(node))
neighbors_graph = nx.Graph(graph.subgraph(neighbors))
node_d_graphs = set()
# Find all the cliques (equivalent to compute all the candidate half d-graph)
cliques = list(nx.find_cliques(neighbors_graph))
print("cliques", len(cliques))
if debug: print("node",node,"has",len(cliques),"cliques")
mode_str = " "
if clique_mode is None:
# Find all the cliques (equivalent to compute all the candidate half d-graph)
cliques = list(nx.find_cliques(neighbors_graph))
mode_str += "(max-cliques)"
elif clique_mode == "louvain":
louvain = community.best_partition(neighbors_graph) # louvain
# high resolution seems to work better
communities = [[c for c,i in louvain.items() if i == clique_id] for clique_id in set(louvain.values())]
mode_str += "(louvain)"
cliques = []
for comm in communities:
# further decompose! into necessarily 2 communities
community_as_graph = nx.Graph(graph.subgraph(comm))
if len(community_as_graph.nodes()) <= 2:
cliques += [community_as_graph.nodes()]
else:
cliques += map(list,nx.community.asyn_fluidc(community_as_graph,2))
elif clique_mode == "testing":
# k-clique communities
#from networkx.algorithms.community import k_clique_communities
#cliques = k_clique_communities(neighbors_graph, 3) # related to the d-graph d parameter
from cdlib import algorithms
cliques_dict = algorithms.node_perception(neighbors_graph, threshold=0.75, overlap_threshold=0.75) #typical output: Sizes of found cliques (testing): Counter({6: 4, 5: 3, 4: 2, 2: 1})
#cliques_dict = algorithms.gdmp2(neighbors_graph, min_threshold=0.9) #typical output: sizes of found cliques (testing): Counter({3: 2, 5: 1})
#cliques_dict = algorithms.angel(neighbors_graph, threshold=0.90) # very sensitive parameters: 0.84 and 0.88 don't work at all but 0.86 does sort of
from collections import defaultdict
cliques_dict2 = defaultdict(list)
for (node, values) in cliques_dict.to_node_community_map().items():
for value in values:
cliques_dict2[value] += [node]
cliques = list(cliques_dict2.values())
mode_str += "(testing)"
print("cliques", len(cliques))
if debug: print("node",node,"has",len(cliques),"cliques in neighborhood (of size",len(neighbors),")")
cliques_debugging = True
if cliques_debugging:
#cliques_graph = nx.make_max_clique_graph(neighbors_graph)
#if debug: print("node",node,"clique graph has",len(cliques_graph.nodes()),"nodes",len(cliques_graph.edges()),"edges")
#nx.write_gexf(cliques_graph, str(node) +".gexf")
from collections import Counter
len_cliques = Counter(map(len,cliques))
print("sizes of found cliques%s:" % mode_str, len_cliques)
# Pair halves to create d-graphes
for idx, clq1 in enumerate(cliques):
......@@ -255,7 +300,10 @@ def compute_all_max_d_graphs(graph, debug=False):
d_graph = Dgraph(node)
d_graph.put_halves(clq1, clq2, neighbors_graph)
if d_graph.get_link_divergence() <= d_graph.get_optimal_score() / 2:
factor = 0.5
#if clique_mode == "testing": factor = 1 # still allows louvain's big communities
#print("link div:",d_graph.get_link_divergence(),"opt:",d_graph.get_optimal_score(), "good d graph?",d_graph.get_link_divergence() <= d_graph.get_optimal_score() *factor)
if d_graph.get_link_divergence() <= d_graph.get_optimal_score() * factor:
node_d_graphs.add(d_graph)
print("d-graphs", len(node_d_graphs))
......
......@@ -38,7 +38,10 @@ if __name__ == "__main__":
print("Loading graph")
G = load_graph(args.input_graph)
print("Writing graph")
lst_point = args.input_graph.rfind('.')
output = args.input_graph[:lst_point] + ".gexf"
nx.write_gexf(G, output)
if output == args.input_graph:
print("gexf converter: nothing to do")
else:
print("Writing graph")
nx.write_gexf(G, output)
......@@ -12,6 +12,8 @@ def parse_arguments():
parser.add_argument('barcode_graph', help='The barcode graph file. Must be a gefx formated file.')
parser.add_argument('--output_prefix', '-o', default="d2_graph", help="Output file prefix.")
parser.add_argument('--debug', '-d', action='store_true', help="Debug")
parser.add_argument('--louvain', '-l', action='store_true', help="Enable Louvain community detection instead of all max-cliques")
parser.add_argument('--comtest', '-k', action='store_true', help="Enable [placeholder] community detection algorithm instead of max-cliques")
args = parser.parse_args()
return args
......@@ -38,12 +40,20 @@ def main():
exit(1)
dprint("barcode graph loaded")
if args.louvain:
clique_mode = "louvain"
elif args.comtest:
clique_mode = "testing"
else:
clique_mode = None
# Index size must be changed for general purpose. 8 is good for d=5
dprint("creating D2graph object")
d2g = d2.D2Graph(G)
dprint("D2 graph object created")
dprint("constructing d2 graph from barcode graph")
d2g.construct_from_barcodes(index_size=8, debug=debug)
index_size = 8 #if clique_mode is None else 3
d2g.construct_from_barcodes(index_size=index_size, debug=debug, clique_mode=clique_mode)
dprint("[debug] d2 graph constructed")
d2g.save(f"{args.output_prefix}.tsv")
......
rm -Rf snake_tests/simu_bar_n1000_d5_m2*
snakemake -s Snakefile_data_simu
snakemake -s Snakefile_d2
mv snake_exec/simu_bar_n1000_d5_m2.tar.gz snake_tests/simu_bar_n1000_d5_m2.tar.gz
cd snake_tests
tar xf simu_bar_n1000_d5_m2.tar.gz
cd ..
do_max_cliques=1
if [[ $do_max_cliques == 1 ]]
then
echo "max-cliques"
python deconvolution/evaluate.py --type d2 --barcode_graph snake_tests/simu_bar_n1000_d5_m2/simu_bar_n1000_d5_m2.gexf snake_tests/simu_bar_n1000_d5_m2/simu_bar_n1000_d5_m2_d2_simplified.gexf
python deconvolution/d2_to_path.py snake_tests/simu_bar_n1000_d5_m2/simu_bar_n1000_d5_m2.gexf snake_tests/simu_bar_n1000_d5_m2/simu_bar_n1000_d5_m2_d2_simplified.gexf
python deconvolution/evaluate.py --type path --barcode_graph snake_tests/simu_bar_n1000_d5_m2/simu_bar_n1000_d5_m2.gexf _path.gexf
fi
do_louvain=1
if [[ $do_louvain == 1 ]]
then
echo "louvain"
python deconvolution/evaluate.py --type d2 --barcode_graph snake_tests/simu_bar_n1000_d5_m2/simu_bar_n1000_d5_m2.gexf snake_tests/simu_bar_n1000_d5_m2/simu_bar_n1000_d5_m2_d2_simplified_louvain.gexf
python deconvolution/d2_to_path.py snake_tests/simu_bar_n1000_d5_m2/simu_bar_n1000_d5_m2.gexf snake_tests/simu_bar_n1000_d5_m2/simu_bar_n1000_d5_m2_d2_simplified_louvain.gexf
python deconvolution/evaluate.py --type path --barcode_graph snake_tests/simu_bar_n1000_d5_m2/simu_bar_n1000_d5_m2.gexf _path.gexf
fi
do_comtest=0
if [[ $do_comtest == 1 ]]
then
echo "******************************* Testing algorithm"
python deconvolution/evaluate.py --type d2 --barcode_graph snake_tests/simu_bar_n1000_d5_m2/simu_bar_n1000_d5_m2.gexf snake_tests/simu_bar_n1000_d5_m2/simu_bar_n1000_d5_m2_d2_simplified_comtest.gexf
python deconvolution/d2_to_path.py snake_tests/simu_bar_n1000_d5_m2/simu_bar_n1000_d5_m2.gexf snake_tests/simu_bar_n1000_d5_m2/simu_bar_n1000_d5_m2_d2_simplified_comtest.gexf
python deconvolution/evaluate.py --type path --barcode_graph snake_tests/simu_bar_n1000_d5_m2/simu_bar_n1000_d5_m2.gexf _path.gexf
fi
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment