Skip to content
GitLab
Menu
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Yoann DUFRESNE
linked reads molecule ordering
Commits
10497ebc
Commit
10497ebc
authored
May 05, 2020
by
rchikhi
Browse files
eval scripts
parent
4541613e
Changes
2
Hide whitespace changes
Inline
Side-by-side
medvedev_repo_scripts/evaluate_barcode_graph.py
0 → 100644
View file @
10497ebc
# compares a barcode graph to ground truth
# input: [tested.gpickle] [ground_truth.gpickle]
import
sys
from
collections
import
Counter
from
networkx
import
nx
print
(
"loading tested graph"
)
tested
=
nx
.
read_gpickle
(
sys
.
argv
[
1
])
print
(
"loading ground truth graph"
)
ground
=
nx
.
read_gpickle
(
sys
.
argv
[
2
])
print
(
"graphs loaded"
)
common_nodes
=
set
(
tested
.
nodes
())
&
set
(
ground
.
nodes
())
print
(
len
(
common_nodes
),
"common nodes"
)
tp
,
fp
,
fn
=
0
,
0
,
0
for
edge
in
tested
.
edges
():
if
edge
in
ground
.
edges
():
tp
+=
1
else
:
fp
+=
1
for
edge
in
ground
.
edges
():
if
edge
not
in
tested
.
edges
():
fn
+=
1
print
(
"tested edges:"
,
len
(
tested
.
edges
()))
print
(
"ground edges:"
,
len
(
ground
.
edges
()))
print
(
"TP:"
,
tp
)
print
(
"FP:"
,
fp
)
print
(
"FN:"
,
fn
)
medvedev_repo_scripts/ground_truth_barcode_graph.py
0 → 100644
View file @
10497ebc
# given a reads file produced by LRSIM and "longranger basic" and then passed through "lrsim_reinject_mi_in_barcoded_fastq.py" to get MI's
#
# construct ground truth barcode graph and molecule graph
#
# input: [reads_file.fastq.gz] [output_prefix]
#
# example:
# zcat ~/data/10x-ecoli/fastqs/ecoli.fq.gz | python ground_truth_barcode_graph.py /dev/stdin graph
import
sys
,
gzip
from
Bio.SeqIO.QualityIO
import
FastqGeneralIterator
from
networkx
import
nx
from
ncls
import
NCLS
import
numpy
as
np
from
collections
import
Counter
reads_file
=
sys
.
argv
[
1
]
overlap_size
=
9000
if
reads_file
.
endswith
(
'.gz'
):
opener
=
gzip
.
open
else
:
opener
=
open
"""
get the molecule intervals for each barcode+MI combination
"""
counter
=
0
highest_pos
=
0
molecule_extents
=
dict
()
nb_reads_per_mol
=
Counter
()
for
title
,
seq
,
qual
in
FastqGeneralIterator
(
opener
(
reads_file
,
"rt"
)):
# fastq format, @[read id] [barcode]
line
=
title
.
strip
().
split
()
read
=
line
[
0
][
1
:].
split
(
'/'
)[
0
]
#strip @ and '/1'
barcode
=
"NA"
mi
=
"NA"
for
x
in
line
:
if
x
.
startswith
(
'BX'
):
barcode
=
x
.
strip
(
"-1"
)
if
x
.
startswith
(
'MI'
):
mi
=
x
if
barcode
==
"NA"
or
mi
==
"NA"
:
continue
molecule
=
barcode
+
"-"
+
mi
# build a molecule graph actually
#print(read)
# WARNING: here if it crashes, just uncomment one of the two lines and comment the other
#chrom, posA, posB = read.split('_')[1:4] #specific to reference names with a "_"
chrom
,
posA
,
posB
=
read
.
split
(
'_'
)[
0
:
3
]
#specific to reference names without "_"
posA
,
posB
=
map
(
int
,[
posA
,
posB
])
#print(posA,posB,barcode)
if
posA
>
posB
:
posA
,
posB
=
posB
,
posA
highest_pos
=
max
(
highest_pos
,
posB
)
if
molecule
in
molecule_extents
:
oldchrom
,
oldA
,
oldB
=
molecule_extents
[
molecule
]
else
:
oldA
,
oldB
=
posA
,
posB
oldchrom
=
chrom
if
oldchrom
!=
chrom
:
print
(
"weird molecule placement"
,
oldchrom
,
chrom
,
posA
,
posB
,
oldA
,
oldB
)
molecule_extents
[
molecule
]
=
(
chrom
,
min
(
oldA
,
posA
),
max
(
oldB
,
posB
))
nb_reads_per_mol
[
molecule
]
+=
1
counter
+=
1
if
counter
%
100000
==
0
:
sys
.
stderr
.
write
(
"."
)
sys
.
stderr
.
flush
()
# drop molecules that have <= 3 reads or have coverage < 0.05x (usually peak is supposed to be at 0.2x as per https://www.sciencedirect.com/science/article/pii/S2001037017300855)
# assuming 150bp reads
nb_removed_low_cov_molecules
=
0
for
molecule
in
nb_reads_per_mol
:
chom
,
posA
,
posB
=
molecule_extents
[
molecule
]
assert
(
posB
>=
posA
)
molecule_length
=
posB
-
posA
if
nb_reads_per_mol
[
molecule
]
<=
3
:
del
molecule_extents
[
molecule
]
continue
if
nb_reads_per_mol
[
molecule
]
*
150
<=
0.05
*
molecule_length
:
nb_removed_low_cov_molecules
+=
1
del
molecule_extents
[
molecule
]
continue
if
nb_removed_low_cov_molecules
>
0
:
print
(
"removed"
,
nb_removed_low_cov_molecules
,
"low-coverage molecules"
)
#build the interval tree
intervals
=
[]
starts
=
[]
ends
=
[]
vals
=
[]
for
molecule
in
molecule_extents
:
chrom
,
posA
,
posB
=
molecule_extents
[
molecule
]
intervals
+=
[(
posA
,
posB
,
molecule
)]
#tree.addi(posA,posB,barcode)
starts
+=
[
posA
]
ends
+=
[
posB
]
vals
+=
[
molecule
]
np_starts
=
np
.
array
(
starts
,
dtype
=
np
.
long
)
np_ends
=
np
.
array
(
ends
,
dtype
=
np
.
long
)
np_vals
=
np
.
array
(
range
(
len
(
vals
)),
dtype
=
np
.
long
)
#ncls doesn't associate strings, only integers, so i'm storing indices
tree
=
NCLS
(
np_starts
,
np_ends
,
np_vals
)
print
(
"tree constructed"
)
# compute molecule overlap length
# assuming they already intersect
#
# a------b
# c------d
#
# or
#
# a-----------b
# c-------d
#
def
overlap
(
a
,
b
,
c
,
d
):
if
a
>
c
:
a
,
b
,
c
,
d
=
c
,
d
,
a
,
b
if
c
>=
a
and
d
<=
b
:
#contained
return
d
-
c
if
c
>=
a
and
d
>=
b
:
return
b
-
c
else
:
print
(
"dunno how to overlap"
,
a
,
b
,
c
,
d
)
exit
(
1
)
# construct true barcode and molecule graphs
mG
=
nx
.
Graph
()
bG
=
nx
.
Graph
()
from
collections
import
defaultdict
barcodes_labels
=
defaultdict
(
list
)
for
molecule
in
molecule_extents
:
chrom
,
posA
,
posB
=
molecule_extents
[
molecule
]
label
=
"%s:%d-%d"
%
(
chrom
,
posA
,
posB
)
mG
.
add_node
(
molecule
,
label
=
molecule
+
";"
+
label
)
barcode
=
molecule
.
split
(
"-"
)[
0
]
barcodes_labels
[
barcode
]
+=
[
label
]
for
barcode
in
barcodes_labels
:
#print("label",barcodes_labels[barcode])
long_label
=
';'
.
join
(
barcodes_labels
[
barcode
])
bG
.
add_node
(
barcode
,
label
=
long_label
)
for
i
,
molecule
in
enumerate
(
molecule_extents
):
chrom
,
posA
,
posB
=
molecule_extents
[
molecule
]
results
=
tree
.
find_overlap
(
posA
,
posB
)
#print(results)
for
otherposA
,
otherposB
,
other_molecule
in
results
:
other_molecule
=
vals
[
other_molecule
]
otherchrom
,
otherposA_2
,
otherposB_2
=
molecule_extents
[
other_molecule
]
assert
(
otherposA
==
otherposA_2
and
otherposB
==
otherposB_2
)
if
chrom
==
otherchrom
and
overlap
(
posA
,
posB
,
otherposA
,
otherposB
)
>
overlap_size
:
#print(posA,posB,molecule,"-",otherposA, otherposB, other_molecule)
mG
.
add_edge
(
molecule
,
other_molecule
)
barcode
=
molecule
.
split
(
"-"
)[
0
]
otherbarcode
=
other_molecule
.
split
(
"-"
)[
0
]
edge_label
=
'%s:%d-%d %s:%d-%d'
%
(
chrom
,
posA
,
posB
,
otherchrom
,
otherposA
,
otherposB
)
bG
.
add_edge
(
barcode
,
otherbarcode
,
label
=
edge_label
)
if
i
%
1000
==
1
:
sys
.
stderr
.
write
(
"."
)
sys
.
stderr
.
flush
()
print
(
"writing graphs"
)
print
(
len
(
bG
.
nodes
()),
"nodes"
,
len
(
bG
.
edges
()),
"edges for the barcode graph"
)
print
(
len
(
mG
.
nodes
()),
"nodes"
,
len
(
mG
.
edges
()),
"edges for the molecule graph"
)
nx
.
write_graphml
(
bG
,
sys
.
argv
[
2
]
+
".groundtruth_barcode.graphml"
)
nx
.
write_graphml
(
mG
,
sys
.
argv
[
2
]
+
".groundtruth_molecule.graphml"
)
nx
.
write_gpickle
(
bG
,
sys
.
argv
[
2
]
+
".groundtruth_barcode.gpickle"
)
nx
.
write_gpickle
(
mG
,
sys
.
argv
[
2
]
+
".groundtruth_molecule.gpickle"
)
print
(
"ground truth constructed"
)
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment