Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Yoann DUFRESNE
linked reads molecule ordering
Commits
3ca10697
Commit
3ca10697
authored
Jan 31, 2019
by
Yoann Dufresne
Browse files
previous scripts
parent
090d440e
Changes
6
Hide whitespace changes
Inline
Side-by-side
deconvolve.py
0 → 100644
View file @
3ca10697
# arguments: graphml file
# attempts to deconvolve a barcode graph using community detection
import
sys
import
itertools
import
operator
from
collections
import
Counter
from
networkx
import
nx
#from networkx.algorithms import community
import
community
# python-louvain
import
sys
# Path of the barcode graph to deconvolve, taken from the first CLI argument.
if len(sys.argv) < 2:
    sys.exit("usage: deconvolve.py <graph.graphml|graph.gexf>")
filename = sys.argv[1]

# Pick the networkx reader that matches the file extension.
if filename.endswith('.graphml'):
    G = nx.read_graphml(filename)
elif filename.endswith('.gexf'):
    G = nx.read_gexf(filename)
else:
    # Without this branch G would stay None and the script would only fail
    # later, on the first graph access, with an unrelated AttributeError.
    sys.exit("unsupported graph format (expected .graphml or .gexf): %s" % filename)

print(G)
def deconvolve2(G, node, debug=True):
    """Split ``node`` into one node per community found among its neighbors.

    The subgraph induced by the neighbors of ``node`` is partitioned with
    ``community.best_partition`` (python-louvain).  For every community id
    ``k`` a node named ``node + "-MI<k>"`` is added to ``G`` and linked to the
    neighbors assigned to that community; each new edge carries the
    ``contigs`` attribute of the original ``node``--neighbor edge, or ``""``
    when that attribute is absent.  ``node`` is then removed from ``G``.

    Nothing is changed when ``node`` has no neighbors or when all of its
    neighbors fall into a single community.

    Parameters:
        G: networkx graph, modified in place.
        node: name of the node to split; it is concatenated with a string
            suffix, so it must be a ``str``.
        debug: when true, the annotated neighbor subgraph (community id
            stored in the ``mi`` node attribute) is written to
            ``test.graphml``.

    Returns:
        None.
    """
    neighbors = list(G.neighbors(node))
    print("node", node, len(neighbors), "neighbors")
    if not neighbors:
        # An isolated node yields an empty subgraph: there is no component to
        # connect (indexing the component list would raise IndexError) and
        # nothing to split.
        return

    # Independent copy of the neighborhood so artificial edges and debug
    # attributes never leak into G.
    G2 = nx.Graph(G.subgraph(neighbors))

    # Make sure there is a single connected component: artificially and
    # weakly connect the first two components until only one is left.
    while True:
        ccs = list(nx.connected_components(G2))
        if len(ccs) == 1:
            break
        G2.add_edge(next(iter(ccs[0])), next(iter(ccs[1])))

    # best_partition returns a dict mapping each neighbor to a community id.
    cliques = community.best_partition(G2)
    community_ids = set(cliques.values())
    sizes = Counter(cliques.values())
    print([sizes[clique_id] for clique_id in community_ids])

    # `cliques` has one entry per neighbor, so its length is the number of
    # neighbors; the number of communities is what tells us whether there is
    # anything to deconvolve.
    if len(community_ids) <= 1:
        return

    for node2, clique_id in cliques.items():
        split_name = node + "-MI%d" % clique_id
        G.add_node(split_name)
        G.add_edge(split_name, node2,
                   contigs=G[node][node2].get('contigs', ""))
        # for debugging
        G2.nodes[node2]['mi'] = clique_id

    if debug:
        nx.write_graphml(G2, "test.graphml")

    G.remove_node(node)
# Freeze the node list before looping: deconvolve2 adds and removes nodes
# in G, so the live node view must not be iterated directly.
g_nodes = list(G.nodes())
for node in g_nodes:
    deconvolve2(G, node)

# The result is written next to the input, with a ".deconvolved.graphml"
# suffix appended to the full input path.
output_path = sys.argv[1] + ".deconvolved.graphml"
nx.write_graphml(G, output_path)
\ No newline at end of file
generate_duplicated.py
0 → 100644
View file @
3ca10697
import
networkx
as
nx
G = nx.Graph()
labels = list(range(30))

# One node per label; the "test" node attribute stores the label itself.
for lab in labels:
    G.add_node(lab)
names = dict(zip(labels, labels))
nx.set_node_attributes(G, names, "test")

# Insert a duplication: label 7 appears a second time, at position 23.
labels.insert(23, 7)
print(labels)

# Link every position of the (now duplicated) sequence to the next three
# positions; the slice is clipped automatically at the end of the list.
for i, lab in enumerate(labels):
    for following in labels[i + 1:i + 4]:
        G.add_edge(lab, following)

nx.write_graphml(G, "simple_duplicated_3links.graphml")
generate_fake_barcode_graph.py
0 → 100755
View file @
3ca10697
#!/usr/bin/env python3
import
networkx
as
nx
import
sys
import random

# Molecule graph to convert, given as the first CLI argument.
G = nx.read_graphml(sys.argv[1])

# label molecule nodes
labels = {node: str(idx) for idx, node in enumerate(G.nodes())}

# artificially make barcodes
barcodes = []
n = len(G.nodes())
available_molecules = set(G.nodes())
m = 5  # m molecules per barcode

# Group molecules by barcode
while available_molecules:
    # random.sample requires a sequence (sets are rejected since Python
    # 3.11), and the last barcode holds fewer than m molecules whenever the
    # number of molecules is not a multiple of m.
    pool = sorted(available_molecules)
    barcode = set(random.sample(pool, min(m, len(pool))))
    available_molecules -= barcode
    barcodes.append(barcode)

# Associate molecule to barcode
molecule_barcode = {}
for barcode_index, barcode in enumerate(barcodes):
    for mol in barcode:
        molecule_barcode[mol] = barcode_index
print(molecule_barcode)

# Generate barcoded graph nodes, named "<barcode index>:<mol>_<mol>_..."
G2 = nx.Graph()
g2_labels = {}
for barcode_index, barcode_molecules in enumerate(barcodes):
    bar_names = "_".join(barcode_molecules)
    g2_labels[barcode_index] = f"{barcode_index}:{bar_names}"
    G2.add_node(g2_labels[barcode_index])

# Generate barcoded graph edges: every molecule edge becomes an edge between
# the barcodes holding its two endpoints.
# NOTE(review): two linked molecules sharing a barcode produce a self-loop
# on that barcode node - confirm this is intended.
for m1, m2 in G.edges():
    b1, b2 = g2_labels[molecule_barcode[m1]], g2_labels[molecule_barcode[m2]]
    G2.add_edge(b1, b2)
# print(G2.edges)

# Output name is derived from the input name: "molecule" -> "barcodes" and
# ".graphml" -> "_<m>.gexf".
output = sys.argv[1].replace("molecule", "barcodes").replace(".graphml", f"_{m}.gexf")
nx.write_gexf(G2, output)
generate_fake_molecule_graph.py
0 → 100755
View file @
3ca10697
#!/usr/bin/env python3
# generate a bunch of molecules, in the form of python intervals (a,b), i.e. a<=i<b. here, b-a=o (5 with the values below)
# then find their overlaps >= 1 bases
# n: number of simulated molecules; o: length of every molecule interval.
n = 16
o = 5
# Molecule number `start` covers the half-open interval [start, start + o).
molecules_intervals = [(start, start + o) for start in range(n)]
#print(molecules_intervals)
def overlap(a, b):
    """Return True when the integer ranges ``a`` and ``b`` share a value.

    ``a`` and ``b`` are tuples of ``range`` arguments, e.g. ``(start, stop)``
    describing the half-open interval ``start <= i < stop``.
    """
    span_b = range(*b)
    return any(position in span_b for position in range(*a))
#print(overlap(molecules_intervals[0],molecules_intervals[2]))
#print(overlap(molecules_intervals[0],molecules_intervals[20]))
import
networkx
as
nx
G = nx.Graph()

# One node per molecule, identified by its index in molecules_intervals.
molecule_count = len(molecules_intervals)
for i in range(molecule_count):
    G.add_node(i)

# Connect every pair of distinct molecules (i < j) whose intervals overlap.
for i, a in enumerate(molecules_intervals):
    for j in range(i + 1, molecule_count):
        if overlap(a, molecules_intervals[j]):
            G.add_edge(i, j)

print(G.edges())
nx.write_graphml(G, f"data/simulated_molecules_{n}_{o}.graphml")
proxy.py
0 → 100755
View file @
3ca10697
#!/usr/bin/env python3
from
networkx
import
nx
from
copy
import
deepcopy
def compute_network_distances(graph, max_dist=3):
    """Propagate hop counts between nodes for ``max_dist`` rounds.

    Every node starts out knowing only itself, at distance 0.  In each round
    every node sends the distances it knew at the start of that round to all
    of its neighbors; a neighbor records ``distance + 1`` for each target it
    has no entry for yet.

    Parameters:
        graph: object exposing ``nodes()`` and ``neighbors(node)``.
        max_dist: number of propagation rounds to run.

    Returns:
        dict mapping each node to a dict ``{target: hop count}``.
    """
    all_nodes = list(graph.nodes())
    distances = {origin: {origin: 0} for origin in all_nodes}

    for _ in range(max_dist):
        # Senders read from a snapshot so that entries added during this
        # round are not forwarded before the next one.
        snapshot = deepcopy(distances)
        for sender in all_nodes:
            known = snapshot[sender]
            for receiver in graph.neighbors(sender):
                reached = distances[receiver]
                for target, hops in known.items():
                    # Keep the first recorded distance for a target.
                    reached.setdefault(target, hops + 1)

    return distances
def print_distances(distances):
    """Print, for each key of ``distances``, the key and the size of its value.

    ``distances`` is a dict whose values support ``len()``; one line
    ``<key> <len(value)>`` is printed per entry, in dict order.
    """
    for origin in distances:
        print(origin, len(distances[origin]))
def evolved_distances(graph, node, max_dist=3):
    """Print how distances between other nodes change when ``node`` is removed.

    Distances are computed with ``compute_network_distances`` on ``graph``
    and on a copy of ``graph`` without ``node``.  For every pair
    (origin, destination) not involving ``node`` that had a recorded distance
    before the removal, one line is printed when that distance changed:
    ``<origin>--<destination> <old>->X`` if the destination has no recorded
    distance any more, ``<origin>--<destination> <old>-><new>`` otherwise.

    Parameters:
        graph: networkx graph; it is left untouched.
        node: node whose removal is simulated.
        max_dist: number of propagation rounds passed to
            ``compute_network_distances``.

    Returns:
        None.
    """
    prev_distances = compute_network_distances(graph, max_dist)

    # Simulate the removal on a copy: removing the node from the caller's
    # graph and re-adding it with bare add_node/add_edge calls would drop the
    # attributes of the node and of all its incident edges.
    reduced = graph.copy()
    reduced.remove_node(node)
    next_distances = compute_network_distances(reduced, max_dist)

    # print distance evolutions between x and y nodes (different from node)
    for origin, destinations in prev_distances.items():
        if origin == node:
            continue
        remaining = next_distances[origin]
        for destination, distance in destinations.items():
            if destination == node:
                continue
            if destination not in remaining:
                print(f"{origin}--{destination} {distance}->X")
            elif remaining[destination] != distance:
                print(f"{origin}--{destination} {distance}->{remaining[destination]}")
# if not dest in next_distances:
# print(f"{node}--{dest} {dist}->X")
# elif next_distances[dest] == dist:
# print(f"{node}--{dest} {dist}->{next_distances[dest]}")
# Input graph (alternative input kept for reference).
graph = nx.read_graphml("simple_duplicated_3links.graphml")
# graph = nx.read_graphml("simulated_barcodes.graphml")

# Take a fixed list of nodes up front, then report for each one the effect of
# its removal on the distances between the other nodes.
nodes = list(graph.nodes())
# nodes = ["7"]
for node in nodes:
    print(f"node {node}")
    evolved_distances(graph, node)
# print_distances(distances)
simplify.py
0 → 100644
View file @
3ca10697
import
networkx
as
nx
g = nx.read_graphml("simple_duplicated_3links.graphml")

# Examine every node A together with its adjacency.
for n, nbrs in g.adj.items():
    neighbors = list(nbrs)

    # A needs to be split in A' and A'' as soon as two of its neighbors
    # B and C are not neighbors of each other.
    to_split = any(
        first not in g.adj[second]
        for position, first in enumerate(neighbors)
        for second in neighbors[position + 1:]
    )

    # Split or not split, that is the question
    print(f"{n} split ? {to_split}")
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment