From 291545cf5e04c5b54b2c453ed9ad9576d376b5ce Mon Sep 17 00:00:00 2001 From: Veronique Legrand <vlegrand@pasteur.fr> Date: Tue, 4 Jun 2024 16:05:03 +0200 Subject: [PATCH] Want to test whether using regex module instead of re to allow 1 mismatch in the matching of the seed region impacts execution time. --- phagetermvirome/functions_PhageTerm.py | 52 +++++++++++++------------- phagetermvirome/utilities.py | 17 ++++++++- unit-tests/test_utilities.py | 31 +++++++++++++++ 3 files changed, 72 insertions(+), 28 deletions(-) create mode 100644 unit-tests/test_utilities.py diff --git a/phagetermvirome/functions_PhageTerm.py b/phagetermvirome/functions_PhageTerm.py index 79a11dd..0c1a330 100644 --- a/phagetermvirome/functions_PhageTerm.py +++ b/phagetermvirome/functions_PhageTerm.py @@ -100,11 +100,11 @@ def init_ws(p_res,refseq,hostseq): list_hybrid=p_res.interm_res.list_hybrid insert=p_res.interm_res.insert paired_missmatch=p_res.interm_res.paired_mismatch - k=int(p_res.interm_res.reads_tested) + cnt_reads_processed=int(p_res.interm_res.reads_tested) #count_line=p_res.count_line-1 # do that because readsCoverage will start by incrementing it of 1 read_match=p_res.read_match return gen_len,host_len,termini_coverage,whole_coverage,paired_whole_coverage,phage_hybrid_coverage,host_hybrid_coverage, \ - host_whole_coverage,list_hybrid,insert,paired_missmatch,k,count_line,read_match #TODO refactor that. + host_whole_coverage,list_hybrid,insert,paired_missmatch,cnt_reads_processed,count_line,read_match #TODO refactor that. @@ -118,7 +118,7 @@ def readsCoverage(inRawDArgs,refseq,inDArgs,fParms,return_dict, core_id,line_sta p_res=chk_handler.load(core_id,idx_refseq) gen_len,host_len,termini_coverage, whole_coverage, paired_whole_coverage, phage_hybrid_coverage, host_hybrid_coverage,\ - host_whole_coverage, list_hybrid, insert, paired_missmatch, k, count_line, read_match=init_ws(p_res, refseq, inDArgs.hostseq) + host_whole_coverage, list_hybrid, insert, paired_missmatch, cnt_reads_processed, count_line, read_match=init_ws(p_res, refseq, inDArgs.hostseq) if logger!=None: logger.add_rw(p_res) test_read_seq = match = 0 @@ -148,7 +148,7 @@ def readsCoverage(inRawDArgs,refseq,inDArgs,fParms,return_dict, core_id,line_sta break if test_read_seq: - k += 1 + cnt_reads_processed += 1 # Read sequence read = line.split("\n")[0].split("\r")[0] line = filin.readline() @@ -160,17 +160,17 @@ def readsCoverage(inRawDArgs,refseq,inDArgs,fParms,return_dict, core_id,line_sta readlen = len(read) if readlen < fParms.seed: if logger!=None: - print("CPU rejecting read",k) + print("CPU rejecting read",cnt_reads_processed) continue corlen = readlen-fParms.seed if logger!=None: - print("CPU processing read: ",k,read, reverseComplement(read)) - logger.newRmInfo(k) + print("CPU processing read: ",cnt_reads_processed,read, reverseComplement(read)) + logger.newRmInfo(cnt_reads_processed) ### Match sense + (multiple, random pick one) #print("read[:fParms.seed]=",read[:fParms.seed]) - matchPplus_start, matchPplus_end = applyCoverage(read[:fParms.seed], refseq) + matchPplus_start, matchPplus_end = applyCoverage(read[:fParms.seed], refseq,1) ## Phage if matchPplus_start != -1: @@ -187,7 +187,7 @@ def readsCoverage(inRawDArgs,refseq,inDArgs,fParms,return_dict, core_id,line_sta # Paired-read if inRawDArgs.paired != "": - matchPplus_start_paired, matchPplus_end_paired = applyCoverage(reverseComplement(read_paired)[-fParms.seed:], refseq) + matchPplus_start_paired, matchPplus_end_paired = applyCoverage(reverseComplement(read_paired)[-fParms.seed:], refseq,1) insert_length = matchPplus_end_paired - matchPplus_start if insert_length > 0 and insert_length < fParms.insert_max: position_end = matchPplus_end_paired @@ -196,13 +196,13 @@ def readsCoverage(inRawDArgs,refseq,inDArgs,fParms,return_dict, core_id,line_sta paired_missmatch += 1 # Paired hybrid if inDArgs.hostseq != "" and matchPplus_start_paired == -1: - matchHplus_start, matchHplus_end = applyCoverage(read_paired[:fParms.seed], inDArgs.hostseq) + matchHplus_start, matchHplus_end = applyCoverage(read_paired[:fParms.seed], inDArgs.hostseq,1) if matchHplus_start != -1: list_hybrid[0] += 1 phage_hybrid_coverage[0] = hybridCoverage(read, refseq, phage_hybrid_coverage[0], matchPplus_start, min(gen_len,matchPplus_end+corlen) ) host_hybrid_coverage[0] = hybridCoverage(read_paired, inDArgs.hostseq, host_hybrid_coverage[0], matchHplus_start, min(host_len,matchHplus_end+corlen) ) else: - matchHminus_start, matchHminus_end = applyCoverage(reverseComplement(read_paired)[:fParms.seed], inDArgs.hostseq) + matchHminus_start, matchHminus_end = applyCoverage(reverseComplement(read_paired)[:fParms.seed], inDArgs.hostseq,1) if matchHminus_start != -1: list_hybrid[0] += 1 phage_hybrid_coverage[0] = hybridCoverage(read, refseq, phage_hybrid_coverage[0], matchPplus_start, min(gen_len,matchPplus_end+corlen) ) @@ -210,15 +210,15 @@ def readsCoverage(inRawDArgs,refseq,inDArgs,fParms,return_dict, core_id,line_sta # Single hybrid elif inDArgs.hostseq != "": - matchPFplus_start, matchPFplus_end = applyCoverage(read[-fParms.seed:], refseq) + matchPFplus_start, matchPFplus_end = applyCoverage(read[-fParms.seed:], refseq,1) if matchPFplus_start == -1: - matchHplus_start, matchHplus_end = applyCoverage(read[-fParms.seed:], inDArgs.hostseq) + matchHplus_start, matchHplus_end = applyCoverage(read[-fParms.seed:], inDArgs.hostseq,1) if matchHplus_start != -1: list_hybrid[0] += 1 phage_hybrid_coverage[0] = hybridCoverage(read, refseq, phage_hybrid_coverage[0], matchPplus_start, min(gen_len,matchPplus_end+corlen) ) host_hybrid_coverage[0] = hybridCoverage(read, inDArgs.hostseq, host_hybrid_coverage[0], matchHplus_start, min(host_len,matchHplus_end+corlen) ) else: - matchHminus_start, matchHminus_end = applyCoverage(reverseComplement(read)[-fParms.seed:], inDArgs.hostseq) + matchHminus_start, matchHminus_end = applyCoverage(reverseComplement(read)[-fParms.seed:], inDArgs.hostseq,1) if matchHminus_start != -1: list_hybrid[0] += 1 phage_hybrid_coverage[0] = hybridCoverage(read, refseq, phage_hybrid_coverage[0], matchPplus_start, min(gen_len,matchPplus_end+corlen) ) @@ -230,7 +230,7 @@ def readsCoverage(inRawDArgs,refseq,inDArgs,fParms,return_dict, core_id,line_sta ### if no match sense +, then test sense - if not match: - matchPminus_start, matchPminus_end = applyCoverage(reverseComplement(read)[-fParms.seed:], refseq) + matchPminus_start, matchPminus_end = applyCoverage(reverseComplement(read)[-fParms.seed:], refseq,1) ## Phage if matchPminus_end != -1: @@ -247,7 +247,7 @@ def readsCoverage(inRawDArgs,refseq,inDArgs,fParms,return_dict, core_id,line_sta # Paired-read if inRawDArgs.paired != "": - matchPminus_start_paired, matchPminus_end_paired = applyCoverage(read_paired[:fParms.seed], refseq) + matchPminus_start_paired, matchPminus_end_paired = applyCoverage(read_paired[:fParms.seed], refseq,1) insert_length = matchPminus_end - matchPminus_start_paired if insert_length > 0 and insert_length < fParms.insert_max: position_start = matchPminus_start_paired @@ -256,14 +256,14 @@ def readsCoverage(inRawDArgs,refseq,inDArgs,fParms,return_dict, core_id,line_sta paired_missmatch += 1 # Paired hybrid if inDArgs.hostseq != "" and matchPminus_start_paired == -1: - matchHplus_start, matchHplus_end = applyCoverage(read_paired[:fParms.seed], inDArgs.hostseq) + matchHplus_start, matchHplus_end = applyCoverage(read_paired[:fParms.seed], inDArgs.hostseq,1) if matchHplus_start != -1: list_hybrid[1] += 1 phage_hybrid_coverage[1] = hybridCoverage(reverseComplement(read), refseq, phage_hybrid_coverage[1], matchPminus_start, min(gen_len,matchPminus_end+corlen) ) host_hybrid_coverage[0] = hybridCoverage(read_paired, inDArgs.hostseq, host_hybrid_coverage[0], matchHplus_start, min(host_len,matchHplus_end+corlen) ) else: - matchHminus_start, matchHminus_end = applyCoverage(reverseComplement(read_paired)[-fParms.seed:], inDArgs.hostseq) + matchHminus_start, matchHminus_end = applyCoverage(reverseComplement(read_paired)[-fParms.seed:], inDArgs.hostseq,1) if matchHminus_start != -1: list_hybrid[1] += 1 phage_hybrid_coverage[1] = hybridCoverage(reverseComplement(read), refseq, phage_hybrid_coverage[1], matchPminus_start, min(gen_len,matchPminus_end+corlen) ) @@ -271,15 +271,15 @@ def readsCoverage(inRawDArgs,refseq,inDArgs,fParms,return_dict, core_id,line_sta # Single hybrid elif inDArgs.hostseq != "": - matchPRplus_start, matchPRplus_end = applyCoverage(reverseComplement(read)[:fParms.seed], refseq) + matchPRplus_start, matchPRplus_end = applyCoverage(reverseComplement(read)[:fParms.seed], refseq,1) if matchPRplus_start == -1: - matchHplus_start, matchHplus_end = applyCoverage(read[:fParms.seed], inDArgs.hostseq) + matchHplus_start, matchHplus_end = applyCoverage(read[:fParms.seed], inDArgs.hostseq,1) if matchHplus_start != -1: list_hybrid[1] += 1 phage_hybrid_coverage[1] = hybridCoverage(reverseComplement(read), refseq, phage_hybrid_coverage[1], matchPminus_start, min(gen_len,matchPminus_end+corlen) ) host_hybrid_coverage[0] = hybridCoverage(read, inDArgs.hostseq, host_hybrid_coverage[0], matchHplus_start, min(host_len,matchHplus_end+corlen) ) else: - matchHminus_start, matchHminus_end = applyCoverage(reverseComplement(read)[:fParms.seed], inDArgs.hostseq) + matchHminus_start, matchHminus_end = applyCoverage(reverseComplement(read)[:fParms.seed], inDArgs.hostseq,1) if matchHminus_start != -1: list_hybrid[1] += 1 phage_hybrid_coverage[1] = hybridCoverage(reverseComplement(read), refseq, phage_hybrid_coverage[1], matchPminus_start, min(gen_len,matchPminus_end+corlen) ) @@ -291,12 +291,12 @@ def readsCoverage(inRawDArgs,refseq,inDArgs,fParms,return_dict, core_id,line_sta ### if no match on Phage, test Host if not match: - matchHplus_start, matchHplus_end = applyCoverage(read[:fParms.seed], inDArgs.hostseq) + matchHplus_start, matchHplus_end = applyCoverage(read[:fParms.seed], inDArgs.hostseq,1) if matchHplus_start != -1: for i in range(matchHplus_start, min(host_len,matchHplus_end+corlen)): host_whole_coverage[0][i]+=1 else: - matchHminus_start, matchHminus_end = applyCoverage(reverseComplement(read)[-fParms.seed:], inDArgs.hostseq) + matchHminus_start, matchHminus_end = applyCoverage(reverseComplement(read)[-fParms.seed:], inDArgs.hostseq,1) if matchHminus_end != -1: for i in range(max(0,matchHminus_start-corlen), matchHminus_end): host_whole_coverage[1][i]+=1 @@ -307,13 +307,13 @@ def readsCoverage(inRawDArgs,refseq,inDArgs,fParms,return_dict, core_id,line_sta match = test_read_seq = 0 # Timer if core_id == (tParms.core-1): - if k%1000 == 0: + if cnt_reads_processed%1000 == 0: sys.stdout.write("\b\b\b\b\b\b\b\b\b%3.1f %%" %( float(read_match/gen_len) / tParms.limit_coverage * 100 )) sys.stdout.flush() chk_handler.check(count_line,core_id,idx_refseq,termini_coverage,whole_coverage,paired_whole_coverage,\ phage_hybrid_coverage, host_hybrid_coverage, \ - host_whole_coverage,list_hybrid,insert,paired_missmatch,k,read_match) # maybe time to create checkpoint + host_whole_coverage,list_hybrid,insert,paired_missmatch,cnt_reads_processed,read_match) # maybe time to create checkpoint else: if line[0] == "@": @@ -357,7 +357,7 @@ def readsCoverage(inRawDArgs,refseq,inDArgs,fParms,return_dict, core_id,line_sta fic_name = os.path.join(tParms.dir_cov_mm, "coverage" + str(idx_refseq) + "_" + str(core_id)) res=RCRes(termini_coverage,whole_coverage,paired_whole_coverage,\ phage_hybrid_coverage, host_hybrid_coverage, \ - host_whole_coverage,list_hybrid,insert,paired_missmatch,k) + host_whole_coverage,list_hybrid,insert,paired_missmatch,cnt_reads_processed) res.save(fic_name) else: print("Error: readsCoverage must be used either with directory name or return_dict") diff --git a/phagetermvirome/utilities.py b/phagetermvirome/utilities.py index 40ddd3b..ee80af6 100644 --- a/phagetermvirome/utilities.py +++ b/phagetermvirome/utilities.py @@ -3,6 +3,7 @@ # Gather here utility methods for phageterm. Used in both CPU and GPU version. #from string import maketrans import re +import regex import random import sys @@ -66,16 +67,28 @@ def hybridCoverage(read, sequence, hybrid_coverage, start, end): hybrid_coverage[i]+=1 return hybrid_coverage + + +## Determines if readPart matches against a sequence allowing 1 mistake (substitution only; no insertion or deletion). +#def applyCoverage1ms(readPart, sequence): +# mismatch1_pattern = f'({readPart}){{s<=1}}' +# for pos in re.finditer(mismatch1_pattern,sequence): + + + ## Determines if readPart maps against Sequence. # # @param readPart A part of a read (seed characters usually) # @param sequence (a contig) # It choses randomly a mapping position amongst all mappings found. # It returns 2 numbers: the start and stop position of the chosen mapping location. -def applyCoverage(readPart, sequence): +def applyCoverage(readPart, sequence, allow1_mismatch=0): """Return a random match of a read onto the sequence. """ position = [] - for pos in re.finditer(readPart,sequence): + searched_pattern=readPart + if (allow1_mismatch!=0) : + searched_pattern = f'({readPart}){{s<=1}}' + for pos in regex.finditer(searched_pattern,sequence): position.append(pos) if len(position) > 0: match = random.choice(position) diff --git a/unit-tests/test_utilities.py b/unit-tests/test_utilities.py new file mode 100644 index 0000000..658431f --- /dev/null +++ b/unit-tests/test_utilities.py @@ -0,0 +1,31 @@ +import sys +sys.path.append("..") +sys.path.append("../phagetermvirome") + + +import unittest + +from utilities import applyCoverage + +class MyTestCase(unittest.TestCase): + def test_applyCoverage(self): + readpart="AAAAAAAAAAAAAAAAAAAA" + sequence1="TAAAAAAAAAAAAAAAAAAAATTTTATCGAAAAAAAAAAAAAAAAAAAAAACTGGGGGGGG" + sequence2="ATCGGGGAAACTCTAGGGGACACACCCTTGATCATCGGTCC" + sequence3="AGGGTAAAAAAAAAAAAAAAAAAA" + start,stop=applyCoverage(readpart, sequence1,1) + self.assertIn(start,[0,1,28,29,30,31]) + self.assertIn(stop,[19,20,47,48,49,50]) + start,stop=applyCoverage(readpart, sequence2,1) + self.assertEqual(start,-1) + self.assertEqual(stop,-1) + start,stop=applyCoverage(readpart, sequence3,1) + self.assertIn(start, [4]) + self.assertIn(stop, [24]) + start, stop = applyCoverage(readpart, sequence1, 0) + self.assertIn(start, [1, 29, 30, 31]) + self.assertIn(stop, [21, 49, 50, 51]) + + +if __name__ == '__main__': + unittest.main() -- GitLab