Commit 799c6865 authored by Rayan Chikhi's avatar Rayan Chikhi

more control over ground truth overlaps; also script for generating figure of...

more control over ground truth overlaps; also script for generating figure of synthetic accuracy/sens
parent 0c032479
......@@ -69,6 +69,7 @@ def main():
dprint("constructing d2 graph from barcode graph")
d2g.construct_from_barcodes(neighbor_threshold=args.edge_divergence_threshold, clique_mode=clique_mode, threads=args.threads, verbose=args.verbose)
dprint("[debug] d2 graph constructed")
dprint("[debug] %d nodes %d edges" % (len(d2g.nodes()), len(d2g.edges())))
# d2g.save(f"{args.output_prefix}.tsv")
nx.write_gexf(d2g, f"{args.output_prefix}.gexf")
......
......@@ -10,7 +10,7 @@ do
echo "n=$n d=$d m=$m m_dev=$dev"
rm -f snake_exec/simu_0_bar_n"$n"_d"$d"_m"$m"-dev0.gexf &&
snakemake -s Snakefile_data_simu --config n=$n d=$d m=$m m_dev=$dev >/dev/null 2>&1
snakemake --cores 1 -s Snakefile_data_simu --config n=$n d=$d m=$m m_dev=$dev >/dev/null 2>&1
cd snake_exec && source CMD_acc_sens && cd ..
done
......
# compares a barcode graph to ground truth
# input: [tested.gpickle] [ground_truth.gpickle]
import sys
if len(sys.argv) < 3:
exit("input: [tested.gpickle] [ground_truth.gpickle]")
import sys
from collections import Counter
from networkx import nx
print("loading tested graph")
tested = nx.read_gpickle(sys.argv[1])
......
......@@ -5,7 +5,7 @@
# input: [reads_file.fastq.gz] [output_prefix]
#
# example:
# zcat ~/data/10x-ecoli/fastqs/ecoli.fq.gz | python ground_truth_barcode_graph.py /dev/stdin graph
# zcat ~/data/10x-ecoli/fastqs/ecoli.fq.gz | python ground_truth_barcode_graph.py /dev/stdin graph [min_overlap_length] [max_nb_overlaps_per_mol] [equi]
import sys, gzip
from Bio.SeqIO.QualityIO import FastqGeneralIterator
......@@ -16,6 +16,20 @@ from collections import Counter
reads_file = sys.argv[1]
overlap_size = 9000
if len(sys.argv) > 3:
overlap_size = int(sys.argv[3])
print("using custom overlap size %d" % overlap_size)
max_overlaps = None # limits the number of overlaps we record per molecule (keeping only the longest ones)
if len(sys.argv) > 4:
max_overlaps = int(sys.argv[4])
print("using custom maximal number of overlaps per molecule %d" % max_overlaps)
equi = False # among the neighbors of a molecule, force to have as many molecules before than as after (i.e. prevents a molecule from only overlapping with those after itself) - useful in conjunction with max_overlaps
if len(sys.argv) > 5:
equi = True
print("asking for equi before/after overlap repartition")
if reads_file.endswith('.gz'):
opener = gzip.open
......@@ -48,8 +62,8 @@ for title, seq, qual in FastqGeneralIterator(opener(reads_file,"rt")):
#print(read)
# WARNING: here if it crashes, just uncomment one of the two lines and comment the other
#chrom, posA, posB = read.split('_')[1:4] #specific to reference names with a "_"
chrom, posA, posB = read.split('_')[0:3] #specific to reference names without "_"
chrom, posA, posB = read.split('_')[1:4] #specific to reference names with a "_"
#chrom, posA, posB = read.split('_')[0:3] #specific to reference names without "_"
posA, posB = map(int,[posA,posB])
#print(posA,posB,barcode)
......@@ -154,6 +168,7 @@ for barcode in barcodes_labels:
long_label = ';'.join(barcodes_labels[barcode])
bG.add_node(barcode, label=long_label)
overlaps = defaultdict(list)
for i, molecule in enumerate(molecule_extents):
chrom, posA, posB = molecule_extents[molecule]
results = tree.find_overlap(posA,posB)
......@@ -161,18 +176,43 @@ for i, molecule in enumerate(molecule_extents):
for otherposA, otherposB, other_molecule in results:
other_molecule = vals[other_molecule]
otherchrom, otherposA_2, otherposB_2 = molecule_extents[other_molecule]
assert(otherposA == otherposA_2 and otherposB == otherposB_2)
if chrom == otherchrom and overlap(posA,posB,otherposA,otherposB) > overlap_size:
#print(posA,posB,molecule,"-",otherposA, otherposB, other_molecule)
mG.add_edge(molecule,other_molecule)
barcode = molecule.split("-")[0]
otherbarcode = other_molecule.split("-")[0]
assert(otherposA == otherposA_2 and otherposB == otherposB_2) # verifies the interval tree result
overlap_length = overlap(posA,posB,otherposA,otherposB)
if chrom == otherchrom and overlap_length >= overlap_size:
edge_label= '%s:%d-%d %s:%d-%d' %(chrom,posA,posB,otherchrom,otherposA,otherposB)
bG.add_edge(barcode,otherbarcode, label=edge_label)
overlaps[molecule] += [(overlap_length, other_molecule, edge_label)]
if i % 1000 == 1:
sys.stderr.write(".")
sys.stderr.flush()
for molecule in overlaps:
nb_needed_overlaps = float('inf') if max_overlaps is None else max_overlaps #- mG.degree(molecule)
nb_before, nb_after = 0,0 # some stats over the neighbors of a molecule
_, posA, posB = molecule_extents[molecule]
for (overlap_length, other_molecule, edge_label) in sorted(overlaps[molecule])[::-1]:
# examine whether that candidate neighbor starts before or after that current molecule
_, otherposA, otherposB = molecule_extents[other_molecule]
if otherposA > posA:
if equi and nb_after >= max_overlaps//2: continue
nb_after += 1
else:
if equi and nb_before >= max_overlaps//2: continue
nb_before += 1
#print(posA,posB,molecule,"-",otherposA, otherposB, other_molecule)
mG.add_edge(molecule,other_molecule)
barcode = molecule.split("-")[0]
otherbarcode = other_molecule.split("-")[0]
bG.add_edge(barcode,otherbarcode, label=edge_label)
nb_needed_overlaps -= 1
if nb_needed_overlaps == 0:
break
if nb_before != max_overlaps//2 or nb_after != max_overlaps//2:
print(molecule,"before/after %d/%d" % (nb_before,nb_after))
print("writing graphs")
print(len(bG.nodes()),"nodes",len(bG.edges()),"edges for the barcode graph")
print(len(mG.nodes()),"nodes",len(mG.edges()),"edges for the molecule graph")
......
......@@ -5,7 +5,7 @@ mv d2_graph.gexf simu_0_bar_n"$n"_d"$d"_m"$m"-dev"$dev".gexf.d2_graph.gexf
python ../deconvolution/main/d2_reduction.py --output_d2_name simu_0_bar_n"$n"_d"$d"_m"$m"-dev"$dev".gexf.d2_graph_reduced.gexf simu_0_bar_n"$n"_d"$d"_m"$m"-dev"$dev".gexf simu_0_bar_n"$n"_d"$d"_m"$m"-dev"$dev".gexf.d2_graph.gexf 2>&1 >/dev/null
echo "Gb $n $d $m $dev"
#echo "Gb $n $d $m $dev"
python ../deconvolution/main/d2_random_path_evaluation.py simu_0_bar_n"$n"_d"$d"_m"$m"-dev"$dev".gexf $n-$d-$dev Gb
echo "Lcp $n $d $m $dev"
#echo "Lcp $n $d $m $dev"
python ../deconvolution/main/d2_random_path_evaluation.py simu_0_bar_n"$n"_d"$d"_m"$m"-dev"$dev".gexf.d2_graph_reduced.gexf $n-$d-$dev Lcp
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment