diff --git a/phagetermvirome/PhageTerm.py b/phagetermvirome/PhageTerm.py index e9104cc8396894f015a16745d3cc3774182c597f..dc1593f39138ba717e688541cc5ca0effee5187b 100755 --- a/phagetermvirome/PhageTerm.py +++ b/phagetermvirome/PhageTerm.py @@ -142,7 +142,7 @@ def main(): host_whole_coverage = return_dict[core_id][5] list_hybrid = return_dict[core_id][6] insert = return_dict[core_id][7].tolist() - paired_missmatch = return_dict[core_id][8] + paired_mismatch = return_dict[core_id][8] reads_tested = return_dict[core_id][9] else: termini_coverage += return_dict[core_id][0] @@ -153,7 +153,7 @@ def main(): host_whole_coverage += return_dict[core_id][5] list_hybrid += return_dict[core_id][6] insert += return_dict[core_id][7].tolist() - paired_missmatch += return_dict[core_id][8] + paired_mismatch += return_dict[core_id][8] reads_tested += return_dict[core_id][9] termini_coverage = termini_coverage.tolist() diff --git a/phagetermvirome/functions_PhageTerm.py b/phagetermvirome/functions_PhageTerm.py index 79a11dd17df103a32ed3a69ee5345bb13dc81329..0a941a169f270e91967bf8c0eedea4503dfbc4d5 100644 --- a/phagetermvirome/functions_PhageTerm.py +++ b/phagetermvirome/functions_PhageTerm.py @@ -88,7 +88,7 @@ def init_ws(p_res,refseq,hostseq): host_whole_coverage = np.array([host_len*[0], host_len*[0]]) list_hybrid = np.array([0,0]) insert = [] - paired_missmatch = 0 + paired_mismatch = 0 read_match = 0 else: termini_coverage=p_res.interm_res.termini_coverage @@ -99,12 +99,12 @@ def init_ws(p_res,refseq,hostseq): host_whole_coverage=p_res.interm_res.host_whole_coverage list_hybrid=p_res.interm_res.list_hybrid insert=p_res.interm_res.insert - paired_missmatch=p_res.interm_res.paired_mismatch + paired_mismatch=p_res.interm_res.paired_mismatch k=int(p_res.interm_res.reads_tested) #count_line=p_res.count_line-1 # do that because readsCoverage will start by incrementing it of 1 read_match=p_res.read_match return gen_len,host_len,termini_coverage,whole_coverage,paired_whole_coverage,phage_hybrid_coverage,host_hybrid_coverage, \ - host_whole_coverage,list_hybrid,insert,paired_missmatch,k,count_line,read_match #TODO refactor that. + host_whole_coverage,list_hybrid,insert,paired_mismatch,k,count_line,read_match #TODO refactor that. @@ -118,7 +118,12 @@ def readsCoverage(inRawDArgs,refseq,inDArgs,fParms,return_dict, core_id,line_sta p_res=chk_handler.load(core_id,idx_refseq) gen_len,host_len,termini_coverage, whole_coverage, paired_whole_coverage, phage_hybrid_coverage, host_hybrid_coverage,\ - host_whole_coverage, list_hybrid, insert, paired_missmatch, k, count_line, read_match=init_ws(p_res, refseq, inDArgs.hostseq) + host_whole_coverage, list_hybrid, insert, paired_mismatch, k, count_line, read_match=init_ws(p_res, refseq, inDArgs.hostseq) + if p_res==None: + # no existing checkpoint and starting processing of a new sequence + chk_handler.start(count_line,core_id,idx_refseq,termini_coverage,whole_coverage,paired_whole_coverage,\ + phage_hybrid_coverage, host_hybrid_coverage, \ + host_whole_coverage,list_hybrid,insert,paired_mismatch,count_line,read_match) if logger!=None: logger.add_rw(p_res) test_read_seq = match = 0 @@ -193,7 +198,7 @@ def readsCoverage(inRawDArgs,refseq,inDArgs,fParms,return_dict, core_id,line_sta position_end = matchPplus_end_paired insert.append(insert_length) else: - paired_missmatch += 1 + paired_mismatch += 1 # Paired hybrid if inDArgs.hostseq != "" and matchPplus_start_paired == -1: matchHplus_start, matchHplus_end = applyCoverage(read_paired[:fParms.seed], inDArgs.hostseq) @@ -253,7 +258,7 @@ def readsCoverage(inRawDArgs,refseq,inDArgs,fParms,return_dict, core_id,line_sta position_start = matchPminus_start_paired insert.append(insert_length) else: - paired_missmatch += 1 + paired_mismatch += 1 # Paired hybrid if inDArgs.hostseq != "" and matchPminus_start_paired == -1: matchHplus_start, matchHplus_end = applyCoverage(read_paired[:fParms.seed], inDArgs.hostseq) @@ -313,7 +318,7 @@ def readsCoverage(inRawDArgs,refseq,inDArgs,fParms,return_dict, core_id,line_sta chk_handler.check(count_line,core_id,idx_refseq,termini_coverage,whole_coverage,paired_whole_coverage,\ phage_hybrid_coverage, host_hybrid_coverage, \ - host_whole_coverage,list_hybrid,insert,paired_missmatch,k,read_match) # maybe time to create checkpoint + host_whole_coverage,list_hybrid,insert,paired_mismatch,k,read_match) # maybe time to create checkpoint else: if line[0] == "@": @@ -351,13 +356,13 @@ def readsCoverage(inRawDArgs,refseq,inDArgs,fParms,return_dict, core_id,line_sta host_hybrid_coverage = correctEdge(host_hybrid_coverage, fParms.edge) if return_dict!=None and tParms.dir_cov_mm==None: - return_dict[core_id] = [termini_coverage, whole_coverage, paired_whole_coverage, phage_hybrid_coverage, host_hybrid_coverage, host_whole_coverage, list_hybrid, np.array(insert), paired_missmatch, k] + return_dict[core_id] = [termini_coverage, whole_coverage, paired_whole_coverage, phage_hybrid_coverage, host_hybrid_coverage, host_whole_coverage, list_hybrid, np.array(insert), paired_mismatch, k] elif return_dict==None and tParms.dir_cov_mm!=None: insert = np.array(insert) fic_name = os.path.join(tParms.dir_cov_mm, "coverage" + str(idx_refseq) + "_" + str(core_id)) res=RCRes(termini_coverage,whole_coverage,paired_whole_coverage,\ phage_hybrid_coverage, host_hybrid_coverage, \ - host_whole_coverage,list_hybrid,insert,paired_missmatch,k) + host_whole_coverage,list_hybrid,insert,paired_mismatch,k) res.save(fic_name) else: print("Error: readsCoverage must be used either with directory name or return_dict") @@ -1082,7 +1087,7 @@ def CreateReport(phagename, seed, added_whole_coverage, draw, Redundant, P_left, report.append(Paragraph(ptext, styles["Left"])) report.append(Spacer(1, 10)) - ptext = '<i><font size=10>PhageTerm software uses raw reads of a phage sequenced with a sequencing technology using random fragmentation and its genomic reference sequence to determine the termini position. The process starts with the alignment of NGS reads to the phage genome in order to calculate the starting position coverage (SPC), where a hit is given only to the position of the first base in a successfully aligned read (the alignment algorithm uses the lenght of the seed (default: 20) for mapping and does not accept gap or missmatch to speed up the process). Then the program apply 2 distinct scoring methods: i) a statistical approach based on the Gamma law; and ii) a method derived from LI and al. 2014 paper.</font></i>' + ptext = '<i><font size=10>PhageTerm software uses raw reads of a phage sequenced with a sequencing technology using random fragmentation and its genomic reference sequence to determine the termini position. The process starts with the alignment of NGS reads to the phage genome in order to calculate the starting position coverage (SPC), where a hit is given only to the position of the first base in a successfully aligned read (the alignment algorithm uses the lenght of the seed (default: 20) for mapping and does not accept gap or mismatch to speed up the process). Then the program apply 2 distinct scoring methods: i) a statistical approach based on the Gamma law; and ii) a method derived from LI and al. 2014 paper.</font></i>' report.append(Paragraph(ptext, styles["Justify"])) report.append(Spacer(1, 5)) diff --git a/phagetermvirome/readsCoverage_res.py b/phagetermvirome/readsCoverage_res.py index 15fce0ca079376bc5a339f7f919dbd2ae258adc9..0a989ed0688bf68abf5d3412e4364643ce39289c 100644 --- a/phagetermvirome/readsCoverage_res.py +++ b/phagetermvirome/readsCoverage_res.py @@ -291,6 +291,18 @@ class RCCheckpoint_handler: host_whole_coverage,list_hybrid,insert,paired_mismatch,reads_tested,read_match) chkp.save(self.dir_chk,core_id,idx_seq) + # When running on a cluster, ptv may be killed due to timeout. It is possible that in that case, that the processing of sequence n-1 is over + # (there is no more checkpoint for n-1) and ptv has not yet created a checkpoint for sequence n. + # The following method is used for creating a checkpoint at the beginning of processing of sequence n to avoid ptv + # having to restart from the beginning if it is killed due to timeout + def start(self,count_line,core_id,idx_seq,termini_coverage,whole_coverage,paired_whole_coverage,\ + phage_hybrid_coverage, host_hybrid_coverage, \ + host_whole_coverage,list_hybrid,insert,paired_mismatch,reads_tested,read_match): + if self.chk_freq != 0 and self.test_mode == False: + chkp = RCCheckpoint(count_line, core_id, idx_seq, termini_coverage, whole_coverage, paired_whole_coverage, \ + phage_hybrid_coverage, host_hybrid_coverage, \ + host_whole_coverage, list_hybrid, insert, paired_mismatch, reads_tested, read_match) + chkp.save(self.dir_chk, core_id, idx_seq) def end(self,core_id): if (self.test_mode==False and self.chk_freq!=0) :