Commit c97d2571 authored by Yoann Dufresne's avatar Yoann Dufresne

clean repo

parents cd20dda3 799c6865
...@@ -69,6 +69,7 @@ def main(): ...@@ -69,6 +69,7 @@ def main():
dprint("constructing d2 graph from barcode graph") dprint("constructing d2 graph from barcode graph")
d2g.construct_from_barcodes(neighbor_threshold=args.edge_divergence_threshold, clique_mode=clique_mode, threads=args.threads, verbose=args.verbose) d2g.construct_from_barcodes(neighbor_threshold=args.edge_divergence_threshold, clique_mode=clique_mode, threads=args.threads, verbose=args.verbose)
dprint("[debug] d2 graph constructed") dprint("[debug] d2 graph constructed")
dprint("[debug] %d nodes %d edges" % (len(d2g.nodes()), len(d2g.edges())))
# d2g.save(f"{args.output_prefix}.tsv") # d2g.save(f"{args.output_prefix}.tsv")
nx.write_gexf(d2g, f"{args.output_prefix}.gexf") nx.write_gexf(d2g, f"{args.output_prefix}.gexf")
......
...@@ -10,7 +10,7 @@ do ...@@ -10,7 +10,7 @@ do
echo "n=$n d=$d m=$m m_dev=$dev" echo "n=$n d=$d m=$m m_dev=$dev"
rm -f snake_exec/simu_0_bar_n"$n"_d"$d"_m"$m"-dev0.gexf && rm -f snake_exec/simu_0_bar_n"$n"_d"$d"_m"$m"-dev0.gexf &&
snakemake -s Snakefile_data_simu --config n=$n d=$d m=$m m_dev=$dev >/dev/null 2>&1 snakemake --cores 1 -s Snakefile_data_simu --config n=$n d=$d m=$m m_dev=$dev >/dev/null 2>&1
cd snake_exec && source CMD_acc_sens && cd .. cd snake_exec && source CMD_acc_sens && cd ..
done done
......
# compares a barcode graph to ground truth # compares a barcode graph to ground truth
# input: [tested.gpickle] [ground_truth.gpickle] import sys
if len(sys.argv) < 3:
exit("input: [tested.gpickle] [ground_truth.gpickle]")
import sys
from collections import Counter from collections import Counter
from networkx import nx from networkx import nx
print("loading tested graph") print("loading tested graph")
tested = nx.read_gpickle(sys.argv[1]) tested = nx.read_gpickle(sys.argv[1])
......
...@@ -5,7 +5,7 @@ ...@@ -5,7 +5,7 @@
# input: [reads_file.fastq.gz] [output_prefix] # input: [reads_file.fastq.gz] [output_prefix]
# #
# example: # example:
# zcat ~/data/10x-ecoli/fastqs/ecoli.fq.gz | python ground_truth_barcode_graph.py /dev/stdin graph # zcat ~/data/10x-ecoli/fastqs/ecoli.fq.gz | python ground_truth_barcode_graph.py /dev/stdin graph [min_overlap_length] [max_nb_overlaps_per_mol] [equi]
import sys, gzip import sys, gzip
from Bio.SeqIO.QualityIO import FastqGeneralIterator from Bio.SeqIO.QualityIO import FastqGeneralIterator
...@@ -16,6 +16,20 @@ from collections import Counter ...@@ -16,6 +16,20 @@ from collections import Counter
reads_file = sys.argv[1] reads_file = sys.argv[1]
overlap_size = 9000 overlap_size = 9000
if len(sys.argv) > 3:
overlap_size = int(sys.argv[3])
print("using custom overlap size %d" % overlap_size)
max_overlaps = None # limits the number of overlaps we record per molecule (keeping only the longest ones)
if len(sys.argv) > 4:
max_overlaps = int(sys.argv[4])
print("using custom maximal number of overlaps per molecule %d" % max_overlaps)
equi = False # among the neighbors of a molecule, force to have as many molecules before than as after (i.e. prevents a molecule from only overlapping with those after itself) - useful in conjunction with max_overlaps
if len(sys.argv) > 5:
equi = True
print("asking for equi before/after overlap repartition")
if reads_file.endswith('.gz'): if reads_file.endswith('.gz'):
opener = gzip.open opener = gzip.open
...@@ -48,8 +62,8 @@ for title, seq, qual in FastqGeneralIterator(opener(reads_file,"rt")): ...@@ -48,8 +62,8 @@ for title, seq, qual in FastqGeneralIterator(opener(reads_file,"rt")):
#print(read) #print(read)
# WARNING: here if it crashes, just uncomment one of the two lines and comment the other # WARNING: here if it crashes, just uncomment one of the two lines and comment the other
#chrom, posA, posB = read.split('_')[1:4] #specific to reference names with a "_" chrom, posA, posB = read.split('_')[1:4] #specific to reference names with a "_"
chrom, posA, posB = read.split('_')[0:3] #specific to reference names without "_" #chrom, posA, posB = read.split('_')[0:3] #specific to reference names without "_"
posA, posB = map(int,[posA,posB]) posA, posB = map(int,[posA,posB])
#print(posA,posB,barcode) #print(posA,posB,barcode)
...@@ -154,6 +168,7 @@ for barcode in barcodes_labels: ...@@ -154,6 +168,7 @@ for barcode in barcodes_labels:
long_label = ';'.join(barcodes_labels[barcode]) long_label = ';'.join(barcodes_labels[barcode])
bG.add_node(barcode, label=long_label) bG.add_node(barcode, label=long_label)
overlaps = defaultdict(list)
for i, molecule in enumerate(molecule_extents): for i, molecule in enumerate(molecule_extents):
chrom, posA, posB = molecule_extents[molecule] chrom, posA, posB = molecule_extents[molecule]
results = tree.find_overlap(posA,posB) results = tree.find_overlap(posA,posB)
...@@ -161,18 +176,43 @@ for i, molecule in enumerate(molecule_extents): ...@@ -161,18 +176,43 @@ for i, molecule in enumerate(molecule_extents):
for otherposA, otherposB, other_molecule in results: for otherposA, otherposB, other_molecule in results:
other_molecule = vals[other_molecule] other_molecule = vals[other_molecule]
otherchrom, otherposA_2, otherposB_2 = molecule_extents[other_molecule] otherchrom, otherposA_2, otherposB_2 = molecule_extents[other_molecule]
assert(otherposA == otherposA_2 and otherposB == otherposB_2) assert(otherposA == otherposA_2 and otherposB == otherposB_2) # verifies the interval tree result
if chrom == otherchrom and overlap(posA,posB,otherposA,otherposB) > overlap_size: overlap_length = overlap(posA,posB,otherposA,otherposB)
#print(posA,posB,molecule,"-",otherposA, otherposB, other_molecule) if chrom == otherchrom and overlap_length >= overlap_size:
mG.add_edge(molecule,other_molecule)
barcode = molecule.split("-")[0]
otherbarcode = other_molecule.split("-")[0]
edge_label= '%s:%d-%d %s:%d-%d' %(chrom,posA,posB,otherchrom,otherposA,otherposB) edge_label= '%s:%d-%d %s:%d-%d' %(chrom,posA,posB,otherchrom,otherposA,otherposB)
bG.add_edge(barcode,otherbarcode, label=edge_label) overlaps[molecule] += [(overlap_length, other_molecule, edge_label)]
if i % 1000 == 1: if i % 1000 == 1:
sys.stderr.write(".") sys.stderr.write(".")
sys.stderr.flush() sys.stderr.flush()
for molecule in overlaps:
nb_needed_overlaps = float('inf') if max_overlaps is None else max_overlaps #- mG.degree(molecule)
nb_before, nb_after = 0,0 # some stats over the neighbors of a molecule
_, posA, posB = molecule_extents[molecule]
for (overlap_length, other_molecule, edge_label) in sorted(overlaps[molecule])[::-1]:
# examine whether that candidate neighbor starts before or after that current molecule
_, otherposA, otherposB = molecule_extents[other_molecule]
if otherposA > posA:
if equi and nb_after >= max_overlaps//2: continue
nb_after += 1
else:
if equi and nb_before >= max_overlaps//2: continue
nb_before += 1
#print(posA,posB,molecule,"-",otherposA, otherposB, other_molecule)
mG.add_edge(molecule,other_molecule)
barcode = molecule.split("-")[0]
otherbarcode = other_molecule.split("-")[0]
bG.add_edge(barcode,otherbarcode, label=edge_label)
nb_needed_overlaps -= 1
if nb_needed_overlaps == 0:
break
if nb_before != max_overlaps//2 or nb_after != max_overlaps//2:
print(molecule,"before/after %d/%d" % (nb_before,nb_after))
print("writing graphs") print("writing graphs")
print(len(bG.nodes()),"nodes",len(bG.edges()),"edges for the barcode graph") print(len(bG.nodes()),"nodes",len(bG.edges()),"edges for the barcode graph")
print(len(mG.nodes()),"nodes",len(mG.edges()),"edges for the molecule graph") print(len(mG.nodes()),"nodes",len(mG.edges()),"edges for the molecule graph")
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment