Commit c57a6c75 authored by Nicolas MAILLET

Add TP6 corrected

parent f26a5a5d
""" TP6 where we code some exact pattern matching algorithms """
import time
import random
import ahocorasick
def naive_exact_pattern_matching(pattern, text):
    """ Brute-force exact pattern matching

    Compare the pattern with the slice of text of the same length starting
    at every candidate position, and return the list of start positions
    (in increasing order) where both are equal.
    """
    width = len(pattern)
    # Number of start positions to try: the last full window begins at
    # len(text) - width. The count is capped at len(text), which only
    # matters for an empty pattern; a negative count gives an empty range.
    nb_starts = min(len(text) - width + 1, len(text))
    return [pos for pos in range(nb_starts)
            if pattern == text[pos:pos + width]]
class RollingHash:
    """ Rolling hash of a window of k nucleotides sliding over a text

    The window text[start:end] is read as a number in base 4 (A=0, C=1,
    G=2, T=3) whose leftmost character is the most significant digit.
    The value is kept exact: no modulus is applied.

    Attributes:
        text: the sequence being hashed
        k: size of the window (size of the pattern)
        alphabet: maps each character to its digit
        a: size of the alphabet, used as the base of the hash
        start: start of the current window
        end: end (exclusive) of the current window
        hash: hash of the current window
    """

    def __init__(self, text, k):
        """ Initialize all needed variables and compute the first hash

        Raises KeyError if one of the first k characters of text is not
        a key of the alphabet (A, C, G or T).
        """
        self.text = text
        self.k = k
        # Maps characters to int
        self.alphabet = {"A": 0, "C": 1, "G": 2, "T": 3}
        # Size of the alphabet
        self.a = len(self.alphabet)
        # Start of the current hash
        self.start = 0
        # End of the current hash
        self.end = k
        # Weight of the leftmost character of a window: a^(k-1).
        # It never changes, so it is computed once here instead of at
        # every call of next_hash().
        self._left_weight = self.a ** (self.k - 1)
        # The computed hash
        self.hash = 0
        # Compute the first hash
        for i, char in enumerate(text[0:k]):
            # val(current_char) * a^(k - (i+1))
            self.hash += self.alphabet[char] * (self.a ** (self.k - (i + 1)))

    def next_hash(self):
        """ Slide the window one character to the right and update the hash

        Does nothing when the window already reaches the end of the text.
        Raises KeyError if the entering character is not in the alphabet.
        """
        # Avoid error on last next_hash()
        if self.end >= len(self.text):
            return
        # Remove left character of the current hash
        self.hash -= self.alphabet[self.text[self.start]] * self._left_weight
        # Increase all power
        self.hash *= self.a
        # Add new character to the hash
        self.hash += self.alphabet[self.text[self.end]]
        # Increment start and end
        self.start += 1
        self.end += 1

    def get_string(self):
        """ Return the actual string of current hash """
        return self.text[self.start:self.end]
def rabin_karp(pattern, text):
    """ Exact pattern matching with the Rabin-Karp algorithm

    Hash the pattern once, then slide a rolling hash of the same size over
    the text. A position is reported only when the hashes are equal and
    the characters of the window are equal to the pattern.
    Return the list of start positions of the occurrences.
    """
    width = len(pattern)
    # Hash of the pattern, computed a single time
    target = RollingHash(pattern, width).hash
    # Rolling hash over the text, starting on the first window
    window = RollingHash(text, width)
    found = []
    # Number of start positions to try (careful about the end of the text);
    # capped at len(text), a negative count gives an empty range
    nb_starts = min(len(text) - width + 1, len(text))
    for pos in range(nb_starts):
        # Same hash first, then confirm by comparing the characters
        if target == window.hash and pattern == window.get_string():
            found.append(pos)
        # Move the window to the next position
        window.next_hash()
    return found
def aho_corasick(pattern, text):
    """ Aho-Corasick search of a single pattern

    Requires pypi package pyahocorasick (pip3 install pyahocorasick).
    Return the list of start positions of the pattern in the text.
    """
    automaton = ahocorasick.Automaton()
    automaton.add_word(pattern, (0, pattern))
    automaton.make_automaton()
    # The first field of each hit is used as the index of the last
    # character of the match: shift it back to the first character
    shift = len(pattern) - 1
    return [hit[0] - shift for hit in automaton.iter(text)]
def generator_of_sequence(size):
    """ Generate a random ntd sequence of length 'size' from TP2

    size is the number (integer) of nucleotides to generate; a size of 0
    or less gives an empty sequence. Each nucleotide comes from one call
    to random.randint(0, 3), so the result is reproducible after
    random.seed().
    """
    # Hash of correspondence between numbers and ntd
    rv_val = {0: "A", 1: "C", 2: "G", 3: "T"}
    # Build the sequence in one pass with join: growing a string with +=
    # in a loop may copy it at every step
    return "".join(rv_val[random.randint(0, 3)] for _ in range(size))
def main():
    """ The main of TP6

    Show the algorithms on a small example, then compare the execution
    time of the three searches on a big random sequence and check that
    they all return the same positions.
    """
    my_text = "ACGTTGATATAGCTAGCATGCATGCTA"
    my_pattern = "AG"
    # Print positions of occurrences
    matches = naive_exact_pattern_matching(my_pattern, my_text)
    print(matches)
    # Print all matches
    for start_pos_match in matches:
        print(my_text[start_pos_match:start_pos_match+len(my_pattern)])
    # Print the hash of the first two windows of size 4 of a sequence
    rh = RollingHash("ACCGTACGTTGATATAGCTAGCATGCATGCTA", 4)
    print(rh.hash)
    rh.next_hash()
    print(rh.hash)
    # Print positions of occurrences found by Rabin-Karp
    matches_rk = rabin_karp(my_pattern, my_text)
    print(matches_rk)

    # Run some time comparison between naive algo, Rabin-Karp and
    # Aho-Corasick algorithms. Naive is actually faster than Rabin-Karp,
    # mostly because the rolling hash used is not efficient.
    my_pattern = "GC"
    # Generate a big sequence
    my_text = generator_of_sequence(5000000)
    # Time each algorithm on the same pattern and text
    matches = _timed_search("Naive", naive_exact_pattern_matching,
                            my_pattern, my_text)
    matches_rk = _timed_search("Rabin-Karp", rabin_karp,
                               my_pattern, my_text)
    matches_aho = _timed_search("Aho-Corasick", aho_corasick,
                                my_pattern, my_text)
    if matches == matches_rk and matches_aho == matches:
        # Print that everything is good
        print("Results are identical: {} matches".format(len(matches)))
    else:
        # Print that there is a problem somewhere
        print("There is a bug!!")


def _timed_search(label, search, pattern, text):
    """ Run search(pattern, text), print its execution time, return its result """
    # perf_counter is a monotonic high-resolution clock, made for measuring
    # durations (time.time can jump if the system clock is adjusted)
    start = time.perf_counter()
    found = search(pattern, text)
    end = time.perf_counter()
    # Print the execution time (2 digits precision)
    print("{}: {:.2f}s".format(label, end - start))
    return found
# Launch the main only when this file is run as a script, so that importing
# it (to reuse the functions) does not start the time comparison
if __name__ == "__main__":
    main()
    # Exit without error (SystemExit is what exit(0) raises, but it does not
    # depend on the 'exit' helper added by the site module)
    raise SystemExit(0)
# Always put one extra return line
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment