Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Yoann DUFRESNE
linked reads molecule ordering
Commits
d5d610e4
Commit
d5d610e4
authored
May 07, 2020
by
rchikhi
Browse files
added barcode graph from ema script
parent
f75e2095
Changes
1
Hide whitespace changes
Inline
Side-by-side
medvedev_repo_scripts/naive_contigs_barcode_graph_from_ema.py
0 → 100644
View file @
d5d610e4
#arguments: samfile produced by e.g. EMA with MI information, mapped to contigs (not unitigs)
# returns a barcode graph
# additionnally takes into account MI information, but only on molecules fully aligned inside a single contig
import
sys
import
itertools
import
gzip
import
pysam
import
operator
from
collections
import
Counter
,
defaultdict
from
networkx
import
nx
g
=
nx
.
Graph
()
from
statistics
import
mean
# parameters
min_barcode_reads
=
20
# drop barcodes having less than X reads
min_overlap_length
=
1000
# minimum overlap length between molecules
contained_threshold
=
3000
# minimum distance to contig extremity to consider a molecule to be fully within a contig (=resolved)
# -----------
molecule_span
=
dict
()
def
compute_molecule_id
(
barcode
,
mi
):
return
barcode
+
"-MI"
+
str
(
mi
)
def
update_molecule_span
(
molecule
,
read_contig
,
read_start
,
read_end
):
global
molecule_span
if
molecule
in
molecule_span
:
previous_span
=
molecule_span
[
molecule
]
pcontig
,
pstart
,
pend
=
previous_span
if
pcontig
==
"multicontig"
:
return
if
pcontig
!=
read_contig
:
molecule_span
[
molecule
]
=
(
"multicontig"
,
0
,
0
)
return
pstart
=
min
(
pstart
,
read_start
)
pend
=
max
(
pend
,
read_end
)
else
:
pcontig
,
pstart
,
pend
=
read_contig
,
read_start
,
read_end
molecule_span
[
molecule
]
=
(
pcontig
,
pstart
,
pend
)
# init some global variables
read_barcode
=
dict
()
read_mi
=
dict
()
barcode_read_count
=
Counter
()
nb_reads_inserted
,
nb_bx_key_errors
,
nb_mi_key_errors
=
0
,
0
,
0
nb_bad_mapq
,
nb_discarded
,
nb_mapped
=
0
,
0
,
0
# open sam file
sam_file
=
sys
.
argv
[
1
]
contig_dict
=
dict
()
contig_lengths
=
dict
()
if
sam_file
.
endswith
(
'.sam'
):
samfile
=
pysam
.
AlignmentFile
(
sam_file
,
"r"
)
else
:
samfile
=
pysam
.
AlignmentFile
(
sam_file
,
"rb"
)
# parse the sam header to get contig lengths
read_barcode
=
dict
()
barcode_read_count
=
Counter
()
molecule_read_count
=
Counter
()
contig_molecules
=
dict
()
for
record
in
samfile
.
header
[
'SQ'
]:
contig
=
record
[
'SN'
]
length
=
int
(
record
[
'LN'
])
contig_dict
[
contig
]
=
[]
contig_lengths
[
contig
]
=
length
contig_molecules
[
contig
]
=
set
()
print
(
"parsed"
,
len
(
contig_dict
),
"contigs"
)
# parse the reads
for
read
in
samfile
.
fetch
():
if
read
.
is_unmapped
or
read
.
is_secondary
:
nb_discarded
+=
1
continue
nb_mapped
+=
1
mapq
=
read
.
mapping_quality
if
mapq
<
50
:
#not 60, but 50
nb_discarded
+=
1
nb_bad_mapq
+=
1
continue
read_id
=
read
.
query_name
read_contig
=
read
.
reference_name
read_start
=
read
.
reference_start
read_end
=
read
.
reference_end
try
:
barcode
=
read
.
get_tag
(
'BX'
).
strip
(
"-1"
)
read_barcode
[
read_id
]
=
barcode
barcode_read_count
[
barcode
]
+=
1
except
KeyError
:
nb_bx_key_errors
+=
1
pass
try
:
if
read
.
has_tag
(
'MI'
):
mi
=
read
.
get_tag
(
'MI'
)
read_mi
[
read_id
]
=
mi
except
KeyError
:
nb_mi_key_errors
+=
1
pass
if
read_id
in
read_barcode
and
read_id
in
read_mi
:
# make sure both barcode and MI are reported for this read
# compute molecule span
molecule
=
compute_molecule_id
(
read_barcode
[
read_id
],
read_mi
[
read_id
])
update_molecule_span
(
molecule
,
read_contig
,
read_start
,
read_end
)
contig_molecules
[
read_contig
].
add
(
molecule
)
molecule_read_count
[
molecule
]
+=
1
nb_reads_inserted
print
(
"parsed %d alignments, %d discarded reads incl %d bad mapq"
%
(
nb_mapped
,
nb_discarded
,
nb_bad_mapq
))
print
(
nb_reads_inserted
,
"reads taken into account, "
,
"%d/%d"
%
(
nb_bx_key_errors
,
nb_mi_key_errors
),
"BX/MI skipped"
)
print
(
"number of barcoded reads indexed:"
,
len
(
read_barcode
))
print
(
"number of barcodes :"
,
len
(
barcode_read_count
))
# we say that a molecule is _resolved_ if it's far away from a contig extremity
# those correspond to molecules that are fully spanned in contigs
# all other molecules are unresolved and we'll just assign them together to their barcode
# i.e. all unresolved molecules are the same molecule (=the barcode)
resolved_molecules
=
set
()
for
molecule
in
molecule_span
:
mcontig
,
mstart
,
mend
=
molecule_span
[
molecule
]
contig_len
=
contig_lengths
[
mcontig
]
if
mstart
>
contained_threshold
and
mend
<
contig_len
-
contained_threshold
:
resolved_molecules
.
add
(
molecule
)
print
(
len
(
resolved_molecules
),
"/"
,
len
(
molecule_span
),
"contig-isolated molecules"
)
def
get_resolved_barcode
(
mol
):
if
mol
in
resolved_molecules
:
return
mol
else
:
#def compute_molecule_id(barcode,mi):
# return barcode+"-MI"+str(mi)
return
mol
.
split
(
"-MI"
)[
0
]
# construct graph nodes as sufficiently seen barcodes
nb_reads_per_barcode
=
[]
for
barcode
in
barcode_read_count
:
count
=
barcode_read_count
[
barcode
]
if
count
>
min_barcode_reads
:
g
.
add_node
(
barcode
)
nb_reads_per_barcode
+=
[
count
]
print
(
"mean number of reads per barcode"
,
sum
(
nb_reads_per_barcode
)
/
len
(
nb_reads_per_barcode
))
link_count
=
Counter
()
link_contigs
=
dict
()
debug_contig_mols
=
open
(
"debug_contig_mols.txt"
,
"w"
)
t
=
int
(
len
(
contig_dict
)
/
10
)
sys
.
stderr
.
write
(
'
\n
'
)
# iterate over all contigs one by one
# to construct barcode graph edges
# this code only has one constant: min_overlap_length
for
contig_number
,
contig
in
enumerate
(
contig_dict
):
barcodes
=
[]
barcodes_positions
=
defaultdict
(
list
)
active_mols
=
[]
last_endpoint
=
0
molecule_overlaps
=
set
()
# record all endpoints of molecules
molecule_endpoints
=
[]
START
,
END
=
0
,
1
for
molecule
in
contig_molecules
[
contig
]:
pcontig
,
pstart
,
pend
=
molecule_span
[
molecule
]
molecule_endpoints
+=
[(
pstart
,
START
,
molecule
)]
molecule_endpoints
+=
[(
pend
,
END
,
molecule
)]
molecule_endpoints
=
sorted
(
molecule_endpoints
)
# sliding window over all endpoints of molecules
for
endpoint_pos
,
endpoint_type
,
molecule
in
molecule_endpoints
:
print
(
"molecule"
,
molecule
,
"endpoint:"
,
endpoint_pos
,
"start"
if
endpoint_type
==
START
else
"end"
,
"active mols"
,
len
(
active_mols
))
#find all existing overlaps
molecule_overlaps
.
update
(
set
([(
x
,
y
)
for
(
ast
,
ae
,
x
)
in
active_mols
for
(
bst
,
be
,
y
)
in
active_mols
if
x
!=
y
and
ast
<
bst
and
ae
-
bst
>=
min_length
]))
# determine which molecules are now out
new_active_mols
=
[]
for
active_mol
in
active_mols
:
astart
,
aend
,
amol
=
active_mol
# keep only mols that finish after the current position
if
aend
>
endpoint_pos
:
new_active_mols
+=
[
active_mol
]
else
:
#print("endpoint_type/pos","END" if endpoint_type==1 else "START",endpoint_pos,"molecule",molecule,"astart/aend/amol",astart,aend,amol)
pass
# add maybe new active mol
if
endpoint_type
==
START
:
pcontig
,
pstart
,
pend
=
molecule_span
[
molecule
]
new_active_mols
+=
[(
pstart
,
pend
,
molecule
)]
active_mols
=
new_active_mols
# some debug
debug_advanced
=
False
if
debug_advanced
:
str_mols
=
[]
for
molecule
in
molecule_span
:
(
pcontig
,
start
,
end
)
=
molecule_span
[
molecule
]
str_mols
+=
[(
start
,
end
)
for
molecule
in
contig_molecules
[
contig
]]
str_mols
=
str
(
sorted
(
str_mols
))
debug_contig_mols
.
write
(
"contig number %d length %d, %d molecules: %s
\n
"
%
(
contig_number
,
contig_lengths
[
contig
],
len
(
molecule_endpoints
)
/
2
,
str_mols
))
else
:
debug_contig_mols
.
write
(
"contig number %d length %d, %d molecules
\n
"
%
(
contig_number
,
contig_lengths
[
contig
],
len
(
molecule_endpoints
)
/
2
))
# now iterate over all found overlaps between molecules
for
mol1
,
mol2
in
set
(
molecule_overlaps
):
# since we're constructing a barcode graph: get barcode of unresolved molecules
a
=
get_resolved_barcode
(
mol1
)
b
=
get_resolved_barcode
(
mol2
)
a
,
b
=
min
([(
a
,
b
),(
b
,
a
)])
label
=
a
+
"-"
+
b
if
label
not
in
link_count
:
link_count
[
label
]
=
0
link_contigs
[
label
]
=
""
link_count
[
label
]
+=
molecule_read_count
[
mol1
]
+
molecule_read_count
[
mol2
]
link_contigs
[
label
]
+=
" "
+
contig
g
.
add_edge
(
a
,
b
,
weight
=
link_count
[
label
],
contigs
=
link_contigs
[
label
])
if
contig_number
%
t
==
0
:
sys
.
stderr
.
write
(
'..%d%%..'
%
((
contig_number
/
t
)
*
10
))
sys
.
stderr
.
flush
()
sys
.
stderr
.
write
(
'
\n
'
)
sys
.
stderr
.
flush
()
print
(
len
(
g
.
nodes
()),
"graph nodes"
,
len
(
g
.
edges
()),
"edges"
)
# remove isolated nodes
list_isolated
=
[]
for
node
in
g
.
nodes
():
if
g
.
degree
(
node
)
==
0
:
list_isolated
+=
[
node
]
g
.
remove_nodes_from
(
list_isolated
)
print
(
len
(
list_isolated
),
"isolated nodes removed"
)
nx
.
write_graphml
(
g
,
sys
.
argv
[
1
]
+
".graphml"
)
nx
.
write_gpickle
(
g
,
sys
.
argv
[
1
]
+
".gpickle"
)
# write graph in paul's format
debug_graph
=
open
(
"debug_graph.tsv"
,
"w"
)
for
edge
in
g
.
edges
():
a
,
b
=
edge
a
,
b
=
min
([(
a
,
b
),(
b
,
a
)])
label
=
a
+
"-"
+
b
debug_graph
.
write
(
"%s %s %d
\n
"
%
(
a
,
b
,
link_count
[
label
]))
debug_graph
.
close
()
debug_contig_mols
.
close
()
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment