From b4db52bf0b79d3224a8562639f3eff2132aad94e Mon Sep 17 00:00:00 2001
From: Nico Maillet <nicolas.maillet@pasteur.fr>
Date: Wed, 6 Dec 2023 12:49:05 +0100
Subject: [PATCH] CM-TP regex

---
 8-Regex/tp8.py | 202 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 202 insertions(+)
 create mode 100644 8-Regex/tp8.py

diff --git a/8-Regex/tp8.py b/8-Regex/tp8.py
new file mode 100644
index 0000000..05bf568
--- /dev/null
+++ b/8-Regex/tp8.py
@@ -0,0 +1,202 @@
+""" TP8 where we play with regex """
+# Import regex
+import re
+
+def read_file_old(file_in):
+    """ Read the file and return its content (array of tuples) """
+    content = []
+    # Open file_in in text mode; the with-block closes it once the loop is done
+    with open(file_in) as file:
+        # Header waiting for its sequence line; False means "no pending header"
+        header = False
+        # Lines are consumed in pairs: a header line, then its sequence line
+        for line in file:
+            # No pending header (False, or a header that stripped to ""): this line is a header
+            if not header:
+                # Remember the header with surrounding whitespace removed
+                header = line.strip()
+            # A header is pending, so this line is its sequence
+            else:
+                # Pair them as an immutable (header, sequence) tuple
+                tmp = (header, line.strip())
+                # Add it to the returned list
+                content.append(tmp)
+                # Reset so that the next line is read as a header
+                header = False
+    # List of (header, sequence) tuples in file order; a final header with no sequence is dropped
+    return content
+
+def read_file(file_in):
+    """ Read the file and return its content (array of tuples) """
+    content = []
+    # Open file_in in text mode; the with-block closes it once the loop is done
+    with open(file_in) as file:
+        # Each iteration consumes two lines: the loop yields one, readline() takes the next
+        for line in file:
+            # The line yielded by the loop is taken as the header
+            header = line.strip()
+            # The following line is the sequence; readline() gives "" at end of file
+            seq = file.readline().strip()
+            # Pair them as an immutable (header, sequence) tuple
+            tmp = (header, seq)
+            # Add it to the returned list
+            content.append(tmp)
+    # List of (header, sequence) tuples in file order
+    return content
+
+def main():
+    """ Run every TP8 regex exercise on "sequences.fasta" and print each result (no parameters, returns None) """
+    # The file to process (relative path, so resolved from the current working directory)
+    file_in = "sequences.fasta"
+    # Load it as a list of (header, sequence) tuples
+    content = read_file(file_in)
+
+    # Get all occurrences of 'GTA'
+    res = []
+    # For each (header, sequence) pair
+    for i in content:
+        # Regex on the sequence only; findall returns one entry per non-overlapping match
+        res += re.findall("GTA", i[1])
+    print("There is {} 'GTA' in the file".format(len(res)))
+
+
+    # Is there a sequence containing 'GTA(some characters)CT'?
+    # Not found yet
+    found = False
+    # For each (header, sequence) pair
+    for i in content:
+        # Regex on the sequence only
+        if re.search("GTA.*CT", i[1]):
+            # We found one!
+            found = True
+            # One match is enough to answer, stop scanning
+            break
+    # Did we find it?
+    if found:
+        print("\nThere is!")
+    else:
+        print("\nThere is not :(")
+
+
+    # Is there a sequence containing 'GTA(max 3 characters)CTAAT'?
+    # Not found yet
+    found = False
+    # For each (header, sequence) pair
+    for i in content:
+        # Regex on the sequence only
+        if re.search("GTA.{0,3}CTAAT", i[1]):
+            # We found one!
+            found = True
+            # One match is enough to answer, stop scanning
+            break
+    # Did we find it?
+    if found:
+        print("\nThere is!")
+    else:
+        print("\nThere is not :(")
+
+    # Is there a sequence containing 'GG T or C GG'?
+    # Not found yet
+    found = False
+    # For each (header, sequence) pair
+    for i in content:
+        # Regex on the sequence only
+        if re.search("GG[TC]GG", i[1]):
+            # We found one!
+            found = True
+            # One match is enough to answer, stop scanning
+            break
+    # Did we find it?
+    if found:
+        print("\nThere is!")
+    else:
+        print("\nThere is not :(")
+
+
+    # Is there a sequence finishing by 'ATATAT'?
+    # Not found yet
+    found = False
+    # For each (header, sequence) pair
+    for i in content:
+        # Regex on the sequence only, $ anchors the match at the end
+        if re.search("ATATAT$", i[1]):
+            # We found one!
+            found = True
+            # One match is enough to answer, stop scanning
+            break
+    # Did we find it?
+    if found:
+        print("\nThere is!")
+    else:
+        print("\nThere is not :(")
+
+    # Is there a sequence starting or finishing by 'ATATAT'?
+    # Not found yet
+    found = False
+    # For each (header, sequence) pair
+    for i in content:
+        # Regex on the sequence only, ^ anchors the first alternative at the start, $ the second at the end
+        if re.search("^ATATAT|ATATAT$", i[1]):
+            # We found one!
+            found = True
+            # One match is enough to answer, stop scanning
+            break
+    # Did we find it?
+    if found:
+        print("\nThere is!")
+    else:
+        print("\nThere is not :(")
+
+
+    # Get headers containing mmus or musm
+    res = []
+    # For each (header, sequence) pair
+    for i in content:
+        # Regex on the header only: keep the header when it contains either 'mmus' or 'musm'
+        if re.search("mmus|musm", i[0]):
+            res.append(i[0])
+    print("\nMus Musculus headers: {}".format(res))
+
+
+    # Count headers containing / or \
+    res = []
+    # For each (header, sequence) pair
+    for i in content:
+        # Regex on the header only. The regex needs \\ to match one literal \, and in a normal (non-raw) Python string each of those backslashes must be escaped again
+        if re.search("\\\\|\\/", i[0]):
+            res.append(i[0])
+    print("\nThere is {} headers containing (back)slash".format(len(res)))
+
+
+    # Find the sequence containing not only DNA
+    res = []
+    # For each (header, sequence) pair
+    for i in content:
+        # Regex on the sequence only, starting and ending by anything and containing something that is not A, C, G or T
+        res += re.findall("^.*[^ACGT].*$", i[1])
+    print("\nBuggy sequence: {}".format(res))
+
+
+    # Get the part of headers containing an id composed of 3 letters, 1 digit, 1 alphanumeric character, 1 character and surrounded by spaces
+    res = []
+    # For each (header, sequence) pair
+    for i in content:
+        # Regex on the header only; raw string so that \d, \w and the escaped spaces reach the regex engine without invalid-escape warnings
+        res += re.findall(r"\ [A-Za-z]{3}\d{1}\w{1}.{1}\ ", i[0])
+    print("\nSpecial header sequences: {}".format(res))
+
+
+    # Get the sequence where the header contains an email address (if several headers match, the last one wins).
+    res = ""
+    # For each (header, sequence) pair
+    for i in content:
+        # Regex on the header only; raw string so that \W, \. and \@ reach the regex engine without invalid-escape warnings
+        if re.search(r"[^\W][a-zA-Z0-9_]+(\.[a-zA-Z0-9_]+)*\@[a-zA-Z0-9_]+(\.[a-zA-Z0-9_]+)*\.[a-zA-Z]{2,4}", i[0]):
+            res = i[1]
+    print("\nSequence with email on the header: {}".format(res))
+
+# Run the exercises as soon as the file is executed (there is no __main__ guard, so this also runs on import)
+main()
+# Terminate with exit status 0 (success)
+exit(0)
+# Always put one extra return line
-- 
GitLab