diff --git a/crisprbact/off_target.py b/crisprbact/off_target.py index 39ffc9fe06eee97f33d32a3b6e6df0604b45d687..3646c4017c2ed5e7271801c2448dd9367d529412 100644 --- a/crisprbact/off_target.py +++ b/crisprbact/off_target.py @@ -12,24 +12,86 @@ def get_pos_features(position, f_df): return [] +def gen_extract_off_target_strand_plus(off_target_matches, rec, guide, seed_size): + for match in off_target_matches: + + guide_subseq = guide[: 20 - seed_size][::-1] + # extract part of the sequence + seq_extension_len = len(guide) - seed_size + start_pos_seq = match.start() - seq_extension_len + end_pos_seq = match.end() - seed_size - 3 + sub_sequence_to_match = rec.seq[start_pos_seq:end_pos_seq][::-1] + assert len(sub_sequence_to_match) == len(guide_subseq) + matching_chars = list(common_start(sub_sequence_to_match, guide_subseq)) + matching_substr = "".join(matching_chars) + yield match.span() + ( + match.end(), + "+", + rec.id, + seed_size + len(matching_chars), + matching_substr[::-1] + match.group(0), + ) + + +def common_start(seq1, seq2): + for a, b in zip(seq1, seq2): + if a == b: + yield a + else: + return + + +def gen_extract_off_target_strand_minus(off_target_matches, rec, guide, seed_size): + for match in off_target_matches: + # Extract the sequence. + # since rev_comp, extend the sequence to the end + seq_extension_len = len(guide) - seed_size + end_pos_seq = match.end() + seq_extension_len + start_pos_seq = match.start() + seed_size + 3 + sub_sequence_to_match = rec.seq[start_pos_seq:end_pos_seq] + + # Extract part of the guide to match + guide_subseq = rev_comp(guide[: 20 - seed_size]) + assert len(sub_sequence_to_match) == len(guide_subseq) + + matching_chars = list(common_start(sub_sequence_to_match, guide_subseq)) + matching_substr = "".join(matching_chars) + yield match.span() + ( + match.start(), + "-", + rec.id, + seed_size + len(matching_chars), + match.group(0) + matching_substr, + ) + + def get_off_target_pos(guide, recs, seed_size): if recs is not None: for rec in recs: - # + ori offs_plus = re.finditer(guide[-seed_size:] + "[ATGC]GG", str(rec.seq)) - offs = [match.span() + (match.end(), "+", rec.id) for match in offs_plus] - # TODO: comparer guide avec rec.seq[match.start():match.end()] + offs = list( + gen_extract_off_target_strand_plus(offs_plus, rec, guide, seed_size) + ) # - ori offs_minus = re.finditer( "CC[ATGC]" + rev_comp(guide[-seed_size:]), str(rec.seq) ) - offs += [ - match.span() + (match.start(), "-", rec.id) for match in offs_minus - ] - # comparer guide avec rec.seq[match.start():match.end()].reverse_complement() - # comparer les positions identique à partir de la fin et d'affilé. + offs += list( + gen_extract_off_target_strand_minus(offs_minus, rec, guide, seed_size) + ) offs_dict = dict( - zip(["start", "end", "pampos", "strand", "recid"], zip(*offs)) + zip( + [ + "start", + "end", + "pampos", + "strand", + "recid", + "max_matching_len", + "max_matching_seq", + ], + zip(*offs), + ) ) return pd.DataFrame(offs_dict) else: