Commit 10497ebc authored by rchikhi's avatar rchikhi
Browse files

eval scripts

parent 4541613e
# compares a barcode graph to ground truth
# input: [tested.gpickle] [ground_truth.gpickle]
import sys
from collections import Counter
from networkx import nx
print("loading tested graph")
tested = nx.read_gpickle(sys.argv[1])
print("loading ground truth graph")
ground = nx.read_gpickle(sys.argv[2])
print("graphs loaded")
common_nodes = set(tested.nodes()) & set(ground.nodes())
print(len(common_nodes),"common nodes")
tp,fp,fn = 0,0,0
for edge in tested.edges():
if edge in ground.edges():
tp += 1
else:
fp += 1
for edge in ground.edges():
if edge not in tested.edges():
fn += 1
print("tested edges:",len(tested.edges()))
print("ground edges:",len(ground.edges()))
print("TP:",tp)
print("FP:",fp)
print("FN:",fn)
# given a reads file produced by LRSIM and "longranger basic" and then passed through "lrsim_reinject_mi_in_barcoded_fastq.py" to get MI's
#
# construct ground truth barcode graph and molecule graph
#
# input: [reads_file.fastq.gz] [output_prefix]
#
# example:
# zcat ~/data/10x-ecoli/fastqs/ecoli.fq.gz | python ground_truth_barcode_graph.py /dev/stdin graph
import sys, gzip
from Bio.SeqIO.QualityIO import FastqGeneralIterator
from networkx import nx
from ncls import NCLS
import numpy as np
from collections import Counter
reads_file = sys.argv[1]
overlap_size = 9000
if reads_file.endswith('.gz'):
opener = gzip.open
else:
opener = open
"""
get the molecule intervals for each barcode+MI combination
"""
counter =0
highest_pos = 0
molecule_extents = dict()
nb_reads_per_mol = Counter()
for title, seq, qual in FastqGeneralIterator(opener(reads_file,"rt")):
# fastq format, @[read id] [barcode]
line = title.strip().split()
read = line[0][1:].split('/')[0] #strip @ and '/1'
barcode = "NA"
mi = "NA"
for x in line:
if x.startswith('BX'):
barcode = x.strip("-1")
if x.startswith('MI'):
mi = x
if barcode == "NA" or mi == "NA":
continue
molecule = barcode + "-" + mi # build a molecule graph actually
#print(read)
# WARNING: here if it crashes, just uncomment one of the two lines and comment the other
#chrom, posA, posB = read.split('_')[1:4] #specific to reference names with a "_"
chrom, posA, posB = read.split('_')[0:3] #specific to reference names without "_"
posA, posB = map(int,[posA,posB])
#print(posA,posB,barcode)
if posA > posB:
posA, posB = posB, posA
highest_pos = max(highest_pos,posB)
if molecule in molecule_extents:
oldchrom, oldA, oldB = molecule_extents[molecule]
else:
oldA, oldB = posA, posB
oldchrom = chrom
if oldchrom != chrom: print("weird molecule placement",oldchrom,chrom,posA,posB,oldA,oldB)
molecule_extents[molecule] = (chrom, min(oldA,posA),max(oldB,posB))
nb_reads_per_mol[molecule] += 1
counter += 1
if counter % 100000 == 0:
sys.stderr.write(".")
sys.stderr.flush()
# drop molecules that have <= 3 reads or have coverage < 0.05x (usually peak is supposed to be at 0.2x as per https://www.sciencedirect.com/science/article/pii/S2001037017300855)
# assuming 150bp reads
nb_removed_low_cov_molecules = 0
for molecule in nb_reads_per_mol:
chom, posA, posB = molecule_extents[molecule]
assert(posB >= posA)
molecule_length = posB-posA
if nb_reads_per_mol[molecule] <= 3:
del molecule_extents[molecule]
continue
if nb_reads_per_mol[molecule] * 150 <= 0.05 * molecule_length:
nb_removed_low_cov_molecules += 1
del molecule_extents[molecule]
continue
if nb_removed_low_cov_molecules > 0:
print("removed",nb_removed_low_cov_molecules,"low-coverage molecules")
#build the interval tree
intervals = []
starts = []
ends = []
vals = []
for molecule in molecule_extents:
chrom, posA, posB = molecule_extents[molecule]
intervals += [(posA,posB,molecule)]
#tree.addi(posA,posB,barcode)
starts += [posA]
ends += [posB]
vals += [molecule]
np_starts = np.array(starts, dtype=np.long)
np_ends = np.array(ends, dtype=np.long)
np_vals = np.array(range(len(vals)), dtype=np.long) #ncls doesn't associate strings, only integers, so i'm storing indices
tree = NCLS(np_starts,np_ends,np_vals)
print("tree constructed")
# compute molecule overlap length
# assuming they already intersect
#
# a------b
# c------d
#
# or
#
# a-----------b
# c-------d
#
def overlap(a,b,c,d):
if a > c:
a,b,c,d = c,d,a,b
if c >= a and d <= b: #contained
return d-c
if c >= a and d >= b:
return b-c
else:
print("dunno how to overlap",a,b,c,d)
exit(1)
# construct true barcode and molecule graphs
mG = nx.Graph()
bG = nx.Graph()
from collections import defaultdict
barcodes_labels = defaultdict(list)
for molecule in molecule_extents:
chrom, posA, posB = molecule_extents[molecule]
label = "%s:%d-%d" %(chrom,posA,posB)
mG.add_node(molecule, label=molecule+";"+label)
barcode = molecule.split("-")[0]
barcodes_labels[barcode] += [label]
for barcode in barcodes_labels:
#print("label",barcodes_labels[barcode])
long_label = ';'.join(barcodes_labels[barcode])
bG.add_node(barcode, label=long_label)
for i, molecule in enumerate(molecule_extents):
chrom, posA, posB = molecule_extents[molecule]
results = tree.find_overlap(posA,posB)
#print(results)
for otherposA, otherposB, other_molecule in results:
other_molecule = vals[other_molecule]
otherchrom, otherposA_2, otherposB_2 = molecule_extents[other_molecule]
assert(otherposA == otherposA_2 and otherposB == otherposB_2)
if chrom == otherchrom and overlap(posA,posB,otherposA,otherposB) > overlap_size:
#print(posA,posB,molecule,"-",otherposA, otherposB, other_molecule)
mG.add_edge(molecule,other_molecule)
barcode = molecule.split("-")[0]
otherbarcode = other_molecule.split("-")[0]
edge_label= '%s:%d-%d %s:%d-%d' %(chrom,posA,posB,otherchrom,otherposA,otherposB)
bG.add_edge(barcode,otherbarcode, label=edge_label)
if i % 1000 == 1:
sys.stderr.write(".")
sys.stderr.flush()
print("writing graphs")
print(len(bG.nodes()),"nodes",len(bG.edges()),"edges for the barcode graph")
print(len(mG.nodes()),"nodes",len(mG.edges()),"edges for the molecule graph")
nx.write_graphml(bG, sys.argv[2]+".groundtruth_barcode.graphml")
nx.write_graphml(mG, sys.argv[2]+".groundtruth_molecule.graphml")
nx.write_gpickle(bG, sys.argv[2]+".groundtruth_barcode.gpickle")
nx.write_gpickle(mG, sys.argv[2]+".groundtruth_molecule.gpickle")
print("ground truth constructed")
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment