""" TP6 where we code some exact pattern matching algorithms

This module was added as 6-Exact_pattern_matching/tp6.py, alongside the
handout 6-TP-Exact_pattern_matching.pdf.

Three searchers are provided, all with the signature (pattern, text) and all
returning the list of 0-based start positions of the pattern in the text:
the naive scan, Rabin-Karp (rolling hash) and Aho-Corasick (third-party
package pyahocorasick, only needed when aho_corasick() is actually called).
"""

import random
import time


def _candidate_starts(pattern, text):
    """ Return the range of start positions where the pattern may begin

    The range is empty when the pattern is longer than the text.
    For an empty pattern, one position per character of the text is
    returned (min() caps the range), which is what the searchers of this
    module have always reported for that case.
    """
    return range(min(len(text), len(text) - len(pattern) + 1))


def naive_exact_pattern_matching(pattern, text):
    """ Naive algorithm of exact pattern matching

    Compare the pattern with the text at each and every candidate position.

    pattern: the string to look for
    text: the string to search in
    Returns the list of start positions (0-based, increasing) of all
    occurrences, overlapping ones included.
    """
    # Size of pattern
    k = len(pattern)
    # Iterate over indices only: no copy of the text is made
    return [i for i in _candidate_starts(pattern, text)
            if text[i:i+k] == pattern]


class RollingHash:
    """ The main class for rolling hash, k is the size of the pattern

    The hash of a window c_0 c_1 ... c_(k-1) is the exact (unbounded) integer
    sum of val(c_i) * a^(k-(i+1)), with a the size of the alphabet.
    Only the characters A, C, G and T are supported; any other character
    raises a KeyError.
    """
    def __init__(self, text, k):
        """ Initialize all needed variables and compute the first hash

        text: the string on which the window slides
        k: the size of the window
        """
        self.text = text
        self.k = k
        # Maps characters to int
        self.alphabet = {"A": 0, "C": 1, "G": 2, "T": 3}
        # Size of the alphabet
        self.a = len(self.alphabet)
        # Weight of the leftmost character of a window, computed once here
        # instead of at every call of next_hash()
        self._leading_weight = self.a**(self.k-1)
        # Start of the current hash
        self.start = 0
        # End of the current hash (exclusive)
        self.end = k
        # The computed hash
        self.hash = 0
        # Compute the first hash
        for i, char in enumerate(text[0:self.k]):
            # val(current_char) * a^(k - (i+1))
            self.hash += self.alphabet[char] * (self.a**(self.k-(i+1)))

    def next_hash(self):
        """ Compute the next hash, i.e. slide the window one character right

        Does nothing when the window already touches the end of the text.
        """
        # Avoid error on last next_hash()
        if self.end >= len(self.text):
            return
        # Remove left character of the current hash
        self.hash -= self.alphabet[self.text[self.start]] * self._leading_weight
        # Increase all power
        self.hash *= self.a
        # Add new character to the hash
        self.hash += self.alphabet[self.text[self.end]]
        # Increment start and end
        self.start += 1
        self.end += 1

    def get_string(self):
        """ Return the actual string of current hash """
        return self.text[self.start:self.end]


def rabin_karp(pattern, text):
    """ Rabin-Karp algorithm of exact pattern matching

    Search a pattern in a text using rolling hash: characters are compared
    only at positions where the hash of the window equals the hash of the
    pattern.

    pattern: the string to look for (characters A, C, G, T only)
    text: the string to search in (characters A, C, G, T only)
    Returns the list of start positions (0-based, increasing) of all
    occurrences, overlapping ones included.
    Raises KeyError on a character unknown to RollingHash.
    """
    # Get the hash of the pattern
    pattern_hash = RollingHash(pattern, len(pattern)).hash
    # Start the rolling hash on the text
    window = RollingHash(text, len(pattern))

    # Position of found occurrences of the pattern in the text
    matches = []
    for i in _candidate_starts(pattern, text):
        # Equal hashes are only a hint: confirm by comparing characters
        if window.hash == pattern_hash and window.get_string() == pattern:
            matches.append(i)
        window.next_hash()
    # Return results
    return matches


def aho_corasick(pattern, text):
    """ Aho-Corasick implementation, requires pypi package pyahocorasick (pip3 install pyahocorasick)

    pattern: the string to look for
    text: the string to search in
    Returns the list of start positions (0-based) of all occurrences.
    Raises ImportError when pyahocorasick is not installed.
    """
    # Imported here so that the other algorithms of this module stay usable
    # when the third-party package is missing
    import ahocorasick

    ahoc = ahocorasick.Automaton()
    ahoc.add_word(pattern, (0, pattern))
    ahoc.make_automaton()

    # The automaton reports the index of the LAST character of each match
    shift = len(pattern) - 1
    return [end_index - shift for end_index, _ in ahoc.iter(text)]


def generator_of_sequence(size):
    """ Generate a random ntd sequence of length 'size' """
    # One random.choice() call per nucleotide, joined in a single pass
    return "".join(random.choice("ACGT") for _ in range(size))


def compare_running_times(pattern="GCATATTA", text_size=50000000):
    """ Run some time comparison between naive algo, Rabin-Karp and Aho-Corasick

    A random text of 'text_size' nucleotides is generated, each algorithm
    searches 'pattern' in it, the running times are printed and the three
    results are checked to be identical.
    The author of this TP observed that naive is actually faster than
    Rabin-Karp, and attributed it to the rolling hash not being efficient.
    Requires pyahocorasick. Not called by main(): the default text is big.
    """
    # Generate a big sequence
    text = generator_of_sequence(text_size)
    algorithms = (
        ("Naive", naive_exact_pattern_matching),
        ("Rabin-Karp", rabin_karp),
        ("Aho-Corasick", aho_corasick),
    )

    results = []
    for label, algorithm in algorithms:
        # Get the current time
        start = time.perf_counter()
        results.append(algorithm(pattern, text))
        # Get the current time
        end = time.perf_counter()
        # Print the execution time (2 digits precision)
        print("{}: {:.2f}s".format(label, end - start))

    if all(found == results[0] for found in results):
        # Print that everything is good
        print("Results are identical: {} matches".format(len(results[0])))
    else:
        # Print that there is a problem somewhere
        print("There is a bug!!")


def main():
    """ The main of TP6 """
    my_text = "ATAGCTAGCAT"
    my_pattern = "AG"
    # Print positions of occurrences
    matches = naive_exact_pattern_matching(my_pattern, my_text)
    print(matches)
    # Print all matches
    for start_pos_match in matches:
        print(my_text[start_pos_match:start_pos_match+len(my_pattern)])

    # Optional demos, disabled by default:
    #rh = RollingHash("ACCGTACGTTGATATAGCTAGCATGCATGCTA", 4)
    #print(rh.hash)
    #rh.next_hash()
    #print(rh.hash)

    #matches_rk = rabin_karp(my_pattern, my_text)
    #print(matches_rk)

    # Slow (50M characters by default) and requires pyahocorasick:
    #compare_running_times()


# Launch the main only when executed as a script, so that importing this
# module has no side effect; the script then exits without error (status 0)
if __name__ == "__main__":
    main()