diff --git a/medvedev_repo_scripts/evaluate_barcode_graph.py b/medvedev_repo_scripts/evaluate_barcode_graph.py new file mode 100644 index 0000000000000000000000000000000000000000..0cb25fa818be3ba5de77208b686914c676276297 --- /dev/null +++ b/medvedev_repo_scripts/evaluate_barcode_graph.py @@ -0,0 +1,36 @@ +# compares a barcode graph to ground truth +# input: [tested.gpickle] [ground_truth.gpickle] + + +import sys +from collections import Counter +from networkx import nx + +print("loading tested graph") +tested = nx.read_gpickle(sys.argv[1]) + +print("loading ground truth graph") +ground = nx.read_gpickle(sys.argv[2]) +print("graphs loaded") + +common_nodes = set(tested.nodes()) & set(ground.nodes()) + +print(len(common_nodes),"common nodes") + +tp,fp,fn = 0,0,0 + +for edge in tested.edges(): + if edge in ground.edges(): + tp += 1 + else: + fp += 1 + +for edge in ground.edges(): + if edge not in tested.edges(): + fn += 1 + +print("tested edges:",len(tested.edges())) +print("ground edges:",len(ground.edges())) +print("TP:",tp) +print("FP:",fp) +print("FN:",fn) diff --git a/medvedev_repo_scripts/ground_truth_barcode_graph.py b/medvedev_repo_scripts/ground_truth_barcode_graph.py new file mode 100644 index 0000000000000000000000000000000000000000..aa988df65fdbd7fce273d49b08905be6da975556 --- /dev/null +++ b/medvedev_repo_scripts/ground_truth_barcode_graph.py @@ -0,0 +1,185 @@ +# given a reads file produced by LRSIM and "longranger basic" and then passed through "lrsim_reinject_mi_in_barcoded_fastq.py" to get MI's +# +# construct ground truth barcode graph and molecule graph +# +# input: [reads_file.fastq.gz] [output_prefix] +# +# example: +# zcat ~/data/10x-ecoli/fastqs/ecoli.fq.gz | python ground_truth_barcode_graph.py /dev/stdin graph + +import sys, gzip +from Bio.SeqIO.QualityIO import FastqGeneralIterator +from networkx import nx +from ncls import NCLS +import numpy as np +from collections import Counter + +reads_file = sys.argv[1] +overlap_size = 9000 + +if reads_file.endswith('.gz'): + opener = gzip.open +else: + opener = open + +""" + get the molecule intervals for each barcode+MI combination +""" +counter =0 +highest_pos = 0 +molecule_extents = dict() +nb_reads_per_mol = Counter() +for title, seq, qual in FastqGeneralIterator(opener(reads_file,"rt")): + # fastq format, @[read id] [barcode] + line = title.strip().split() + read = line[0][1:].split('/')[0] #strip @ and '/1' + barcode = "NA" + mi = "NA" + for x in line: + if x.startswith('BX'): + barcode = x.strip("-1") + if x.startswith('MI'): + mi = x + + if barcode == "NA" or mi == "NA": + continue + + molecule = barcode + "-" + mi # build a molecule graph actually + + #print(read) + # WARNING: here if it crashes, just uncomment one of the two lines and comment the other + #chrom, posA, posB = read.split('_')[1:4] #specific to reference names with a "_" + chrom, posA, posB = read.split('_')[0:3] #specific to reference names without "_" + posA, posB = map(int,[posA,posB]) + #print(posA,posB,barcode) + + if posA > posB: + posA, posB = posB, posA + highest_pos = max(highest_pos,posB) + + if molecule in molecule_extents: + oldchrom, oldA, oldB = molecule_extents[molecule] + else: + oldA, oldB = posA, posB + oldchrom = chrom + if oldchrom != chrom: print("weird molecule placement",oldchrom,chrom,posA,posB,oldA,oldB) + molecule_extents[molecule] = (chrom, min(oldA,posA),max(oldB,posB)) + + nb_reads_per_mol[molecule] += 1 + + counter += 1 + if counter % 100000 == 0: + sys.stderr.write(".") + sys.stderr.flush() + +# drop molecules that have <= 3 reads or have coverage < 0.05x (usually peak is supposed to be at 0.2x as per https://www.sciencedirect.com/science/article/pii/S2001037017300855) +# assuming 150bp reads +nb_removed_low_cov_molecules = 0 +for molecule in nb_reads_per_mol: + chom, posA, posB = molecule_extents[molecule] + assert(posB >= posA) + molecule_length = posB-posA + if nb_reads_per_mol[molecule] <= 3: + del molecule_extents[molecule] + continue + if nb_reads_per_mol[molecule] * 150 <= 0.05 * molecule_length: + nb_removed_low_cov_molecules += 1 + del molecule_extents[molecule] + continue +if nb_removed_low_cov_molecules > 0: + print("removed",nb_removed_low_cov_molecules,"low-coverage molecules") + + +#build the interval tree +intervals = [] +starts = [] +ends = [] +vals = [] +for molecule in molecule_extents: + chrom, posA, posB = molecule_extents[molecule] + + intervals += [(posA,posB,molecule)] + #tree.addi(posA,posB,barcode) + starts += [posA] + ends += [posB] + vals += [molecule] + +np_starts = np.array(starts, dtype=np.long) +np_ends = np.array(ends, dtype=np.long) +np_vals = np.array(range(len(vals)), dtype=np.long) #ncls doesn't associate strings, only integers, so i'm storing indices + +tree = NCLS(np_starts,np_ends,np_vals) + +print("tree constructed") + + +# compute molecule overlap length +# assuming they already intersect +# +# a------b +# c------d +# +# or +# +# a-----------b +# c-------d +# +def overlap(a,b,c,d): + if a > c: + a,b,c,d = c,d,a,b + if c >= a and d <= b: #contained + return d-c + if c >= a and d >= b: + return b-c + else: + print("dunno how to overlap",a,b,c,d) + exit(1) + + +# construct true barcode and molecule graphs +mG = nx.Graph() +bG = nx.Graph() + +from collections import defaultdict +barcodes_labels = defaultdict(list) +for molecule in molecule_extents: + chrom, posA, posB = molecule_extents[molecule] + label = "%s:%d-%d" %(chrom,posA,posB) + mG.add_node(molecule, label=molecule+";"+label) + barcode = molecule.split("-")[0] + barcodes_labels[barcode] += [label] + +for barcode in barcodes_labels: + #print("label",barcodes_labels[barcode]) + long_label = ';'.join(barcodes_labels[barcode]) + bG.add_node(barcode, label=long_label) + +for i, molecule in enumerate(molecule_extents): + chrom, posA, posB = molecule_extents[molecule] + results = tree.find_overlap(posA,posB) + #print(results) + for otherposA, otherposB, other_molecule in results: + other_molecule = vals[other_molecule] + otherchrom, otherposA_2, otherposB_2 = molecule_extents[other_molecule] + assert(otherposA == otherposA_2 and otherposB == otherposB_2) + if chrom == otherchrom and overlap(posA,posB,otherposA,otherposB) > overlap_size: + #print(posA,posB,molecule,"-",otherposA, otherposB, other_molecule) + mG.add_edge(molecule,other_molecule) + barcode = molecule.split("-")[0] + otherbarcode = other_molecule.split("-")[0] + edge_label= '%s:%d-%d %s:%d-%d' %(chrom,posA,posB,otherchrom,otherposA,otherposB) + bG.add_edge(barcode,otherbarcode, label=edge_label) + if i % 1000 == 1: + sys.stderr.write(".") + sys.stderr.flush() + +print("writing graphs") +print(len(bG.nodes()),"nodes",len(bG.edges()),"edges for the barcode graph") +print(len(mG.nodes()),"nodes",len(mG.edges()),"edges for the molecule graph") +nx.write_graphml(bG, sys.argv[2]+".groundtruth_barcode.graphml") +nx.write_graphml(mG, sys.argv[2]+".groundtruth_molecule.graphml") + +nx.write_gpickle(bG, sys.argv[2]+".groundtruth_barcode.gpickle") +nx.write_gpickle(mG, sys.argv[2]+".groundtruth_molecule.gpickle") + +print("ground truth constructed")