From 291545cf5e04c5b54b2c453ed9ad9576d376b5ce Mon Sep 17 00:00:00 2001
From: Veronique Legrand <vlegrand@pasteur.fr>
Date: Tue, 4 Jun 2024 16:05:03 +0200
Subject: [PATCH] Want to test whether using regex module instead of re to
 allow 1 mismatch in the matching of the seed region impacts execution time.

---
 phagetermvirome/functions_PhageTerm.py | 52 +++++++++++++-------------
 phagetermvirome/utilities.py           | 17 ++++++++-
 unit-tests/test_utilities.py           | 31 +++++++++++++++
 3 files changed, 72 insertions(+), 28 deletions(-)
 create mode 100644 unit-tests/test_utilities.py

diff --git a/phagetermvirome/functions_PhageTerm.py b/phagetermvirome/functions_PhageTerm.py
index 79a11dd..0c1a330 100644
--- a/phagetermvirome/functions_PhageTerm.py
+++ b/phagetermvirome/functions_PhageTerm.py
@@ -100,11 +100,11 @@ def init_ws(p_res,refseq,hostseq):
         list_hybrid=p_res.interm_res.list_hybrid
         insert=p_res.interm_res.insert
         paired_missmatch=p_res.interm_res.paired_mismatch
-        k=int(p_res.interm_res.reads_tested)
+        cnt_reads_processed=int(p_res.interm_res.reads_tested)
         #count_line=p_res.count_line-1 # do that because readsCoverage will start by incrementing it of 1
         read_match=p_res.read_match
     return gen_len,host_len,termini_coverage,whole_coverage,paired_whole_coverage,phage_hybrid_coverage,host_hybrid_coverage, \
-           host_whole_coverage,list_hybrid,insert,paired_missmatch,k,count_line,read_match #TODO refactor that.
+           host_whole_coverage,list_hybrid,insert,paired_missmatch,cnt_reads_processed,count_line,read_match #TODO refactor that.
 
 
 
@@ -118,7 +118,7 @@ def readsCoverage(inRawDArgs,refseq,inDArgs,fParms,return_dict, core_id,line_sta
 
     p_res=chk_handler.load(core_id,idx_refseq)
     gen_len,host_len,termini_coverage, whole_coverage, paired_whole_coverage, phage_hybrid_coverage, host_hybrid_coverage,\
-    host_whole_coverage, list_hybrid, insert, paired_missmatch, k, count_line, read_match=init_ws(p_res, refseq, inDArgs.hostseq)
+    host_whole_coverage, list_hybrid, insert, paired_missmatch, cnt_reads_processed, count_line, read_match=init_ws(p_res, refseq, inDArgs.hostseq)
     if logger!=None:
         logger.add_rw(p_res)
     test_read_seq = match = 0
@@ -148,7 +148,7 @@ def readsCoverage(inRawDArgs,refseq,inDArgs,fParms,return_dict, core_id,line_sta
             break
 
         if test_read_seq:
-            k += 1
+            cnt_reads_processed += 1
             # Read sequence
             read = line.split("\n")[0].split("\r")[0]
             line = filin.readline()
@@ -160,17 +160,17 @@ def readsCoverage(inRawDArgs,refseq,inDArgs,fParms,return_dict, core_id,line_sta
             readlen = len(read)
             if readlen < fParms.seed:
                 if logger!=None:
-                    print("CPU rejecting read",k)
+                    print("CPU rejecting read",cnt_reads_processed)
                 continue
             corlen = readlen-fParms.seed
 
             if logger!=None:
-                print("CPU processing read: ",k,read, reverseComplement(read))
-                logger.newRmInfo(k)
+                print("CPU processing read: ",cnt_reads_processed,read, reverseComplement(read))
+                logger.newRmInfo(cnt_reads_processed)
             
             ### Match sense + (multiple, random pick one)
             #print("read[:fParms.seed]=",read[:fParms.seed])
-            matchPplus_start, matchPplus_end = applyCoverage(read[:fParms.seed], refseq)
+            matchPplus_start, matchPplus_end = applyCoverage(read[:fParms.seed], refseq,1)
             
             ## Phage
             if matchPplus_start != -1:
@@ -187,7 +187,7 @@ def readsCoverage(inRawDArgs,refseq,inDArgs,fParms,return_dict, core_id,line_sta
                 
                 # Paired-read
                 if inRawDArgs.paired != "":
-                    matchPplus_start_paired, matchPplus_end_paired = applyCoverage(reverseComplement(read_paired)[-fParms.seed:], refseq)
+                    matchPplus_start_paired, matchPplus_end_paired = applyCoverage(reverseComplement(read_paired)[-fParms.seed:], refseq,1)
                     insert_length = matchPplus_end_paired - matchPplus_start
                     if insert_length > 0 and insert_length < fParms.insert_max:
                         position_end = matchPplus_end_paired
@@ -196,13 +196,13 @@ def readsCoverage(inRawDArgs,refseq,inDArgs,fParms,return_dict, core_id,line_sta
                         paired_missmatch += 1
                         # Paired hybrid
                         if inDArgs.hostseq != "" and matchPplus_start_paired == -1:
-                            matchHplus_start, matchHplus_end = applyCoverage(read_paired[:fParms.seed], inDArgs.hostseq)
+                            matchHplus_start, matchHplus_end = applyCoverage(read_paired[:fParms.seed], inDArgs.hostseq,1)
                             if matchHplus_start != -1:
                                 list_hybrid[0] += 1
                                 phage_hybrid_coverage[0] = hybridCoverage(read, refseq, phage_hybrid_coverage[0], matchPplus_start, min(gen_len,matchPplus_end+corlen) )
                                 host_hybrid_coverage[0]  = hybridCoverage(read_paired, inDArgs.hostseq, host_hybrid_coverage[0], matchHplus_start, min(host_len,matchHplus_end+corlen) )
                             else:
-                                matchHminus_start, matchHminus_end = applyCoverage(reverseComplement(read_paired)[:fParms.seed], inDArgs.hostseq)
+                                matchHminus_start, matchHminus_end = applyCoverage(reverseComplement(read_paired)[:fParms.seed], inDArgs.hostseq,1)
                                 if matchHminus_start != -1:
                                     list_hybrid[0] += 1
                                     phage_hybrid_coverage[0] = hybridCoverage(read, refseq, phage_hybrid_coverage[0], matchPplus_start, min(gen_len,matchPplus_end+corlen) )
@@ -210,15 +210,15 @@ def readsCoverage(inRawDArgs,refseq,inDArgs,fParms,return_dict, core_id,line_sta
 
                 # Single hybrid
                 elif inDArgs.hostseq != "":
-                    matchPFplus_start, matchPFplus_end = applyCoverage(read[-fParms.seed:], refseq)
+                    matchPFplus_start, matchPFplus_end = applyCoverage(read[-fParms.seed:], refseq,1)
                     if matchPFplus_start == -1:
-                        matchHplus_start, matchHplus_end = applyCoverage(read[-fParms.seed:], inDArgs.hostseq)
+                        matchHplus_start, matchHplus_end = applyCoverage(read[-fParms.seed:], inDArgs.hostseq,1)
                         if matchHplus_start != -1:
                             list_hybrid[0] += 1
                             phage_hybrid_coverage[0] = hybridCoverage(read, refseq, phage_hybrid_coverage[0], matchPplus_start, min(gen_len,matchPplus_end+corlen) )
                             host_hybrid_coverage[0]  = hybridCoverage(read, inDArgs.hostseq, host_hybrid_coverage[0], matchHplus_start, min(host_len,matchHplus_end+corlen) )
                         else:
-                            matchHminus_start, matchHminus_end = applyCoverage(reverseComplement(read)[-fParms.seed:], inDArgs.hostseq)
+                            matchHminus_start, matchHminus_end = applyCoverage(reverseComplement(read)[-fParms.seed:], inDArgs.hostseq,1)
                             if matchHminus_start != -1:
                                 list_hybrid[0] += 1
                                 phage_hybrid_coverage[0] = hybridCoverage(read, refseq, phage_hybrid_coverage[0], matchPplus_start, min(gen_len,matchPplus_end+corlen) )
@@ -230,7 +230,7 @@ def readsCoverage(inRawDArgs,refseq,inDArgs,fParms,return_dict, core_id,line_sta
             
             ### if no match sense +, then test sense -
             if not match:
-                matchPminus_start, matchPminus_end = applyCoverage(reverseComplement(read)[-fParms.seed:], refseq)
+                matchPminus_start, matchPminus_end = applyCoverage(reverseComplement(read)[-fParms.seed:], refseq,1)
                 
                 ## Phage
                 if matchPminus_end != -1:
@@ -247,7 +247,7 @@ def readsCoverage(inRawDArgs,refseq,inDArgs,fParms,return_dict, core_id,line_sta
                     
                     # Paired-read
                     if inRawDArgs.paired != "":
-                        matchPminus_start_paired, matchPminus_end_paired = applyCoverage(read_paired[:fParms.seed], refseq)
+                        matchPminus_start_paired, matchPminus_end_paired = applyCoverage(read_paired[:fParms.seed], refseq,1)
                         insert_length = matchPminus_end - matchPminus_start_paired
                         if insert_length > 0 and insert_length < fParms.insert_max:
                             position_start = matchPminus_start_paired
@@ -256,14 +256,14 @@ def readsCoverage(inRawDArgs,refseq,inDArgs,fParms,return_dict, core_id,line_sta
                             paired_missmatch += 1
                             # Paired hybrid
                             if inDArgs.hostseq != "" and matchPminus_start_paired == -1:
-                                matchHplus_start, matchHplus_end = applyCoverage(read_paired[:fParms.seed], inDArgs.hostseq)
+                                matchHplus_start, matchHplus_end = applyCoverage(read_paired[:fParms.seed], inDArgs.hostseq,1)
                                 if matchHplus_start != -1:
                                     list_hybrid[1] += 1
                                     phage_hybrid_coverage[1] = hybridCoverage(reverseComplement(read), refseq, phage_hybrid_coverage[1], matchPminus_start, min(gen_len,matchPminus_end+corlen) )
                                     host_hybrid_coverage[0]  = hybridCoverage(read_paired, inDArgs.hostseq, host_hybrid_coverage[0], matchHplus_start, min(host_len,matchHplus_end+corlen) )
 
                                 else:
-                                    matchHminus_start, matchHminus_end = applyCoverage(reverseComplement(read_paired)[-fParms.seed:], inDArgs.hostseq)
+                                    matchHminus_start, matchHminus_end = applyCoverage(reverseComplement(read_paired)[-fParms.seed:], inDArgs.hostseq,1)
                                     if matchHminus_start != -1:
                                         list_hybrid[1] += 1
                                         phage_hybrid_coverage[1] = hybridCoverage(reverseComplement(read), refseq, phage_hybrid_coverage[1], matchPminus_start, min(gen_len,matchPminus_end+corlen) )
@@ -271,15 +271,15 @@ def readsCoverage(inRawDArgs,refseq,inDArgs,fParms,return_dict, core_id,line_sta
             
                     # Single hybrid
                     elif inDArgs.hostseq != "":
-                        matchPRplus_start, matchPRplus_end = applyCoverage(reverseComplement(read)[:fParms.seed], refseq)
+                        matchPRplus_start, matchPRplus_end = applyCoverage(reverseComplement(read)[:fParms.seed], refseq,1)
                         if matchPRplus_start == -1:
-                            matchHplus_start, matchHplus_end = applyCoverage(read[:fParms.seed], inDArgs.hostseq)
+                            matchHplus_start, matchHplus_end = applyCoverage(read[:fParms.seed], inDArgs.hostseq,1)
                             if matchHplus_start != -1:
                                 list_hybrid[1] += 1
                                 phage_hybrid_coverage[1] = hybridCoverage(reverseComplement(read), refseq, phage_hybrid_coverage[1], matchPminus_start, min(gen_len,matchPminus_end+corlen) )
                                 host_hybrid_coverage[0]  = hybridCoverage(read, inDArgs.hostseq, host_hybrid_coverage[0], matchHplus_start, min(host_len,matchHplus_end+corlen) )
                             else:
-                                matchHminus_start, matchHminus_end = applyCoverage(reverseComplement(read)[:fParms.seed], inDArgs.hostseq)
+                                matchHminus_start, matchHminus_end = applyCoverage(reverseComplement(read)[:fParms.seed], inDArgs.hostseq,1)
                                 if matchHminus_start != -1:
                                     list_hybrid[1] += 1
                                     phage_hybrid_coverage[1] = hybridCoverage(reverseComplement(read), refseq, phage_hybrid_coverage[1], matchPminus_start, min(gen_len,matchPminus_end+corlen) )
@@ -291,12 +291,12 @@ def readsCoverage(inRawDArgs,refseq,inDArgs,fParms,return_dict, core_id,line_sta
             
             ### if no match on Phage, test Host
             if not match:
-                matchHplus_start, matchHplus_end = applyCoverage(read[:fParms.seed], inDArgs.hostseq)
+                matchHplus_start, matchHplus_end = applyCoverage(read[:fParms.seed], inDArgs.hostseq,1)
                 if matchHplus_start != -1:
                     for i in range(matchHplus_start, min(host_len,matchHplus_end+corlen)):
                         host_whole_coverage[0][i]+=1
                 else:
-                    matchHminus_start, matchHminus_end = applyCoverage(reverseComplement(read)[-fParms.seed:], inDArgs.hostseq)
+                    matchHminus_start, matchHminus_end = applyCoverage(reverseComplement(read)[-fParms.seed:], inDArgs.hostseq,1)
                     if matchHminus_end != -1:
                         for i in range(max(0,matchHminus_start-corlen), matchHminus_end):
                             host_whole_coverage[1][i]+=1
@@ -307,13 +307,13 @@ def readsCoverage(inRawDArgs,refseq,inDArgs,fParms,return_dict, core_id,line_sta
             match = test_read_seq = 0
             # Timer
             if core_id == (tParms.core-1):
-                if k%1000 == 0:
+                if cnt_reads_processed%1000 == 0:
                     sys.stdout.write("\b\b\b\b\b\b\b\b\b%3.1f %%" %( float(read_match/gen_len) / tParms.limit_coverage * 100 ))
                     sys.stdout.flush()
 
             chk_handler.check(count_line,core_id,idx_refseq,termini_coverage,whole_coverage,paired_whole_coverage,\
                  phage_hybrid_coverage, host_hybrid_coverage, \
-                 host_whole_coverage,list_hybrid,insert,paired_missmatch,k,read_match) # maybe time to create checkpoint
+                 host_whole_coverage,list_hybrid,insert,paired_missmatch,cnt_reads_processed,read_match) # maybe time to create checkpoint
 
         else:
             if line[0] == "@":
@@ -357,7 +357,7 @@ def readsCoverage(inRawDArgs,refseq,inDArgs,fParms,return_dict, core_id,line_sta
         fic_name = os.path.join(tParms.dir_cov_mm, "coverage" + str(idx_refseq) + "_" + str(core_id))
         res=RCRes(termini_coverage,whole_coverage,paired_whole_coverage,\
                  phage_hybrid_coverage, host_hybrid_coverage, \
-                 host_whole_coverage,list_hybrid,insert,paired_missmatch,k)
+                 host_whole_coverage,list_hybrid,insert,paired_missmatch,cnt_reads_processed)
         res.save(fic_name)
     else:
         print("Error: readsCoverage must be used either with directory name or return_dict")
diff --git a/phagetermvirome/utilities.py b/phagetermvirome/utilities.py
index 40ddd3b..ee80af6 100644
--- a/phagetermvirome/utilities.py
+++ b/phagetermvirome/utilities.py
@@ -3,6 +3,7 @@
 # Gather here utility methods for phageterm. Used in both CPU and GPU version.
 #from string import maketrans
 import re
+import regex
 import random
 import sys
 
@@ -66,16 +67,28 @@ def hybridCoverage(read, sequence, hybrid_coverage, start, end):
         hybrid_coverage[i]+=1
     return hybrid_coverage
 
+
+
+## Determines if readPart matches against a sequence allowing 1 mistake (substitution only; no insertion or deletion).
+#def applyCoverage1ms(readPart, sequence):
+#    mismatch1_pattern = f'({readPart}){{s<=1}}'
+#    for pos in re.finditer(mismatch1_pattern,sequence):
+
+
+
 ## Determines if readPart maps against Sequence.
 #
 # @param readPart A part of a read (seed characters usually)
 # @param sequence (a contig)
 # It choses randomly a mapping position amongst all mappings found.
 # It returns 2 numbers: the start and stop position of the chosen mapping location.
-def applyCoverage(readPart, sequence):
+def applyCoverage(readPart, sequence, allow1_mismatch=0):
     """Return a random match of a read onto the sequence. """
     position = []
-    for pos in re.finditer(readPart,sequence):
+    searched_pattern=readPart
+    if (allow1_mismatch!=0) :
+        searched_pattern = f'({readPart}){{s<=1}}'
+    for pos in regex.finditer(searched_pattern,sequence):
         position.append(pos)
     if len(position) > 0:
         match = random.choice(position)
diff --git a/unit-tests/test_utilities.py b/unit-tests/test_utilities.py
new file mode 100644
index 0000000..658431f
--- /dev/null
+++ b/unit-tests/test_utilities.py
@@ -0,0 +1,31 @@
+import sys
+sys.path.append("..")
+sys.path.append("../phagetermvirome")
+
+
+import unittest
+
+from utilities import applyCoverage
+
+class MyTestCase(unittest.TestCase):
+    def test_applyCoverage(self):
+        readpart="AAAAAAAAAAAAAAAAAAAA"
+        sequence1="TAAAAAAAAAAAAAAAAAAAATTTTATCGAAAAAAAAAAAAAAAAAAAAAACTGGGGGGGG"
+        sequence2="ATCGGGGAAACTCTAGGGGACACACCCTTGATCATCGGTCC"
+        sequence3="AGGGTAAAAAAAAAAAAAAAAAAA"
+        start,stop=applyCoverage(readpart, sequence1,1)
+        self.assertIn(start,[0,1,28,29,30,31])
+        self.assertIn(stop,[19,20,47,48,49,50])
+        start,stop=applyCoverage(readpart, sequence2,1)
+        self.assertEqual(start,-1)
+        self.assertEqual(stop,-1)
+        start,stop=applyCoverage(readpart, sequence3,1)
+        self.assertIn(start, [4])
+        self.assertIn(stop, [24])
+        start, stop = applyCoverage(readpart, sequence1, 0)
+        self.assertIn(start, [1, 29, 30, 31])
+        self.assertIn(stop, [21,  49, 50, 51])
+
+
+if __name__ == '__main__':
+    unittest.main()
-- 
GitLab