Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
S
sequence-bioinformatics
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Container registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
GitLab community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Nicolas MAILLET
sequence-bioinformatics
Commits
b88e1c6e
Commit
b88e1c6e
authored
1 year ago
by
Nicolas MAILLET
Browse files
Options
Downloads
Patches
Plain Diff
Correction TP7
parent
97111e6a
No related branches found
No related tags found
No related merge requests found
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
7-Approximate_string_matching/ram.txt
+42
-0
42 additions, 0 deletions
7-Approximate_string_matching/ram.txt
7-Approximate_string_matching/tp7.py
+237
-0
237 additions, 0 deletions
7-Approximate_string_matching/tp7.py
with
279 additions
and
0 deletions
7-Approximate_string_matching/ram.txt
0 → 100644
+
42
−
0
View file @
b88e1c6e
file1 = "e_coli_IAI39.fasta"
file2 = "e_coli_UMN026.fasta"
k = 30
22.07%
Naive: 15.06s
Total allocated size: 1 171 375.6 KiB
n = 10
33.33%
MinHashBinary: 24.21s
Total allocated size: 2.0 KiB
n = 50
16.28%
MinHashBinary: 27.60s
Total allocated size: 5.5 KiB
n = 100
17.65%
MinHashBinary: 35.55s
Total allocated size: 9.8 KiB
n = 250
20.77%
MinHashBinary: 53.76s
Total allocated size: 22.8 KiB
n = 500
21.07%
MinHashBinary: 85.73s
Total allocated size: 44.0 KiB
n = 1000
21.36%
MinHashBinary: 145.13s
Total allocated size: 88.7 KiB
n = 10000
21.70%
MinHashBinary: 1472.08s
Total allocated size: 875.0 KiB
\ No newline at end of file
This diff is collapsed.
Click to expand it.
7-Approximate_string_matching/tp7.py
0 → 100644
+
237
−
0
View file @
b88e1c6e
"""
TP7 where we perform some fuzzy string comparison
"""
import
time
import
tracemalloc
def
get_all_uniq_kmers
(
genome_file
,
k
):
"""
Get each and every kmer (only one time) of a file
"""
# Open the file
with
open
(
genome_file
)
as
file
:
# Skip first line
next
(
file
)
# Get the genome WITHOUT LAST CHAR
genome
=
next
(
file
).
strip
()
# The set that will contains all uniq k-mers
k_mers
=
set
()
# K-mer decomposition
for
i
,
_
in
enumerate
(
genome
[:
len
(
genome
)
-
k
+
1
]):
# Add the current k-mer
k_mers
.
add
(
genome
[
i
:
i
+
k
])
return
k_mers
def
jaccard_index
(
k_mers_1
,
k_mers_2
):
"""
Compute Jaccard distance between two sets
"""
# Count how many k-mers are in both set1 and set2
shared_k_mers
=
0
# For each k-mer in set1
for
i
in
k_mers_1
:
# If this k-mer is in set 2
if
i
in
k_mers_2
:
# It is a shared one
shared_k_mers
+=
1
# Jaccard index: |AnB| / (|A| + |B| - |AnB|)
jaccard
=
shared_k_mers
/
(
len
(
k_mers_1
)
+
len
(
k_mers_2
)
-
shared_k_mers
)
return
jaccard
def
jaccard_index2
(
k_mers_1
,
k_mers_2
):
"""
Compute Jaccard distance between two sets
using intersection
"""
inter_len
=
len
(
k_mers_1
.
intersection
(
k_mers_2
))
return
inter_len
/
(
len
(
k_mers_1
)
+
len
(
k_mers_2
)
-
inter_len
)
def
jaccard_index3
(
k_mers_1
,
k_mers_2
):
"""
Compute Jaccard distance between two sets
using intersection and union
"""
return
len
(
k_mers_1
.
intersection
(
k_mers_2
))
/
len
(
k_mers_1
.
union
(
k_mers_2
))
def
min_hash
(
genome_file
,
k
,
n
):
"""
Get the MinHash of a file. N is the number of minimal hash we want
"""
# Open the file
with
open
(
genome_file
)
as
file
:
# Skip first line
next
(
file
)
# Get the genome
genome
=
next
(
file
)
# Initialize the array
hashes
=
[]
# Last position in the genome after the array is full
next_pos
=
0
# First step: fill the array with n element
# K-mer decomposition
for
i
,
_
in
enumerate
(
genome
[:
len
(
genome
)
-
k
]):
# Get the current hash
tmp_hash
=
hash
(
genome
[
i
:
i
+
k
])
# if it is not already in our array
if
tmp_hash
not
in
hashes
:
# If the array is not full
if
len
(
hashes
)
<
n
:
# Add the current k-mer
hashes
.
append
(
tmp_hash
)
else
:
# Sort the array
hashes
.
sort
()
# Backup next i value
next_pos
=
i
+
1
# Exit this loop
break
# Second step: add only the smallest hashes and keep the array sorted
# K-mer decomposition, from next_pos
for
i
,
_
in
enumerate
(
genome
[
next_pos
:
len
(
genome
)
-
k
]):
# Get the current hash, /!\ it starts at i+next_pos /!\
current_hash
=
hash
(
genome
[
i
+
next_pos
:
i
+
next_pos
+
k
])
# if it is not already in our array
# and it is smaller than the biggest element
if
current_hash
not
in
hashes
and
current_hash
<
hashes
[
-
1
]:
# Update the last one
hashes
[
-
1
]
=
current_hash
# Sort the array
hashes
.
sort
()
# Return the hashes array
return
hashes
def
min_hash_binary
(
genome_file
,
k
,
n
):
"""
Get the MinHash of a file. N is the number of minimal hash we want
"""
# Open the file
with
open
(
genome_file
)
as
file
:
# Skip first line
next
(
file
)
# Get the genome
genome
=
next
(
file
)
# Initialize the array
hashes
=
[]
# Last position in the genome after the array is full
next_pos
=
0
# First step: fill the array with n element
# K-mer decomposition
for
i
,
_
in
enumerate
(
genome
[:
len
(
genome
)
-
k
]):
# Get the current hash
tmp_hash
=
hash
(
genome
[
i
:
i
+
k
])
# if it is not already in our array
if
tmp_hash
not
in
hashes
:
# If the array is not full
if
len
(
hashes
)
<
n
:
# Add the current k-mer
hashes
.
append
(
tmp_hash
)
else
:
# Sort the array
hashes
.
sort
()
# Backup next i value
next_pos
=
i
+
1
# Exit this loop
break
# Second step: add only the smallest hashes and keep the array sorted
# K-mer decomposition, from next_pos
for
i
,
_
in
enumerate
(
genome
[
next_pos
:
len
(
genome
)
-
k
]):
# Get the current hash, /!\ it starts at i+next_pos /!\
current_hash
=
hash
(
genome
[
i
+
next_pos
:
i
+
next_pos
+
k
])
# if it is not already in our array
# and it is smaller than the biggest element
if
current_hash
not
in
hashes
and
current_hash
<
hashes
[
-
1
]:
# Find where we should add it (binary search)
# From beginning...
beg_pos
=
0
# To last item (we already check the last one)
end_pos
=
len
(
hashes
)
-
1
# While we are not at the correct position
while
beg_pos
<
end_pos
:
# Where to look
mid_pos
=
(
beg_pos
+
end_pos
)
//
2
# If the hash is bigger than the looked element
if
current_hash
>
hashes
[
mid_pos
]:
# Skip the current mid_pos, already tested
beg_pos
=
mid_pos
+
1
else
:
# Skip the current mid_pos
end_pos
=
mid_pos
# Insert the value in the array
hashes
.
insert
(
beg_pos
,
current_hash
)
# Remove last pos of the array
hashes
.
pop
()
# Return the hashes array
return
hashes
def
main
():
"""
The main of TP7
"""
file1
=
"
small1.fasta
"
file2
=
"
small2.fasta
"
file1
=
"
20K1.fasta
"
file2
=
"
20K2.fasta
"
file1
=
"
e_coli_IAI39.fasta
"
file2
=
"
e_coli_UMN026.fasta
"
k
=
30
n
=
1000
ram_stat
=
False
if
ram_stat
:
# Start mem trace footprint
tracemalloc
.
start
()
# Get the current time
start
=
time
.
time
()
# Naive algorithm
k_mers_1
=
get_all_uniq_kmers
(
file1
,
k
)
k_mers_2
=
get_all_uniq_kmers
(
file2
,
k
)
print
(
"
{:.2f}%
"
.
format
(
jaccard_index
(
k_mers_1
,
k_mers_2
)
*
100
))
# Get the current time
end
=
time
.
time
()
# Print the execution time (2 digits precision)
print
(
"
Naive: {:.2f}s
"
.
format
(
end
-
start
))
if
ram_stat
:
# Print the mem footprint
snapshot
=
tracemalloc
.
take_snapshot
()
display_top
(
snapshot
)
# Stop the trace
tracemalloc
.
stop
()
# Start mem trace footprint
tracemalloc
.
start
()
# Get the current time
start
=
time
.
time
()
min_hash_1
=
min_hash
(
file1
,
k
,
n
)
min_hash_2
=
min_hash
(
file2
,
k
,
n
)
print
(
"
{:.2f}%
"
.
format
(
jaccard_index
(
min_hash_1
,
min_hash_2
)
*
100
))
# Get the current time
end
=
time
.
time
()
# Print the execution time (2 digits precision)
print
(
"
MinHash: {:.2f}s
"
.
format
(
end
-
start
))
if
ram_stat
:
# Print the mem footprint
snapshot
=
tracemalloc
.
take_snapshot
()
display_top
(
snapshot
)
# Stop the trace
tracemalloc
.
stop
()
# Start mem trace footprint
tracemalloc
.
start
()
# Get the current time
start
=
time
.
time
()
min_hash_1
=
min_hash_binary
(
file1
,
k
,
n
)
min_hash_2
=
min_hash_binary
(
file2
,
k
,
n
)
print
(
"
{:.2f}%
"
.
format
(
jaccard_index
(
min_hash_1
,
min_hash_2
)
*
100
))
# Get the current time
end
=
time
.
time
()
# Print the execution time (2 digits precision)
print
(
"
MinHashBinary: {:.2f}s
"
.
format
(
end
-
start
))
if
ram_stat
:
# Print the mem footprint
snapshot
=
tracemalloc
.
take_snapshot
()
display_top
(
snapshot
)
# Stop the trace
tracemalloc
.
stop
()
def
display_top
(
snapshot
,
key_type
=
'
lineno
'
):
"""
RAM usage statistics
"""
snapshot
=
snapshot
.
filter_traces
((
tracemalloc
.
Filter
(
False
,
"
<frozen importlib._bootstrap>
"
),
tracemalloc
.
Filter
(
False
,
"
<unknown>
"
),
))
top_stats
=
snapshot
.
statistics
(
key_type
)
total
=
sum
(
stat
.
size
for
stat
in
top_stats
)
print
(
"
Total allocated size: %.1f KiB
"
%
(
total
/
1024
))
# Launch the main
main
()
# Exit without error
exit
(
0
)
# Always put one extra return line
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment