diff --git a/crisprbact/off_target.py b/crisprbact/off_target.py index 40d5471f26eb19e08bb43b0667b7f73d05ac0395..886235f7f0d34510d10a9538f28cab429484e192 100644 --- a/crisprbact/off_target.py +++ b/crisprbact/off_target.py @@ -2,6 +2,7 @@ import re import pandas as pd from crisprbact.utils import rev_comp +from itertools import chain def compute_off_target_df(guide, seed_size, records, feature_df): @@ -11,71 +12,79 @@ def compute_off_target_df(guide, seed_size, records, feature_df): offs_df = get_off_target_pos(guide, records, seed_size) if offs_df is not None: offs_df["features"] = [ - get_pos_features(off.pampos, feature_df) for i, off in offs_df.iterrows() + get_pos_features(off.recid, off.pampos, feature_df) + for i, off in offs_df.iterrows() ] return offs_df else: return None -def get_off_target_pos(guide, recs, seed_size): - if recs is not None: - for rec in recs: - offs_plus = re.finditer(guide[-seed_size:] + "[ATGC]GG", str(rec.seq)) - offs = list( - gen_extract_off_target_strand_plus(offs_plus, rec, guide, seed_size) +def get_off_target_pos(guide, records, seed_size): + if records is not None: + offs = iter([]) + for rec in records: + sequence_str = str(rec.seq) + seqid = rec.id + offs_plus = re.finditer(guide[-seed_size:] + "[ATGC]GG", sequence_str) + offs_plus = gen_extract_off_target_strand_plus( + offs_plus, seqid, sequence_str, guide, seed_size ) + # - ori offs_minus = re.finditer( - "CC[ATGC]" + rev_comp(guide[-seed_size:]), str(rec.seq) + "CC[ATGC]" + rev_comp(guide[-seed_size:]), sequence_str ) - offs += list( - gen_extract_off_target_strand_minus(offs_minus, rec, guide, seed_size) + offs_minus = gen_extract_off_target_strand_minus( + offs_minus, seqid, sequence_str, guide, seed_size ) - offs_dict = dict( - zip( - [ - "start", - "end", - "pampos", - "strand", - "recid", - "max_matching_len", - "max_matching_seq", - "pam_seq", - ], - zip(*offs), - ) + offs = chain(offs, offs_plus, offs_minus) + offs_dict = dict( + zip( + [ + "start", + "end", + "pampos", + "strand", + "recid", + "max_matching_len", + "max_matching_seq", + "pam_seq", + ], + zip(*offs), ) - return pd.DataFrame(offs_dict) + ) + return pd.DataFrame(offs_dict) else: return None -def get_pos_features(position, f_df): +def get_pos_features(recid, position, f_df): if len(f_df) > 0: - feature_at_pos = f_df[(f_df.start < position) & (f_df.end > position)] + feature_at_pos = f_df[ + (f_df.start < position) & (f_df.end > position) & (f_df.recid == recid) + ] return feature_at_pos.feature.values else: return [] -def gen_extract_off_target_strand_plus(off_target_matches, rec, guide, seed_size): +def gen_extract_off_target_strand_plus( + off_target_matches, seqid, sequence_string, guide, seed_size +): for match in off_target_matches: - guide_subseq = guide[: 20 - seed_size][::-1] # extract part of the sequence seq_extension_len = len(guide) - seed_size start_pos_seq = match.start() - seq_extension_len end_pos_seq = match.end() - seed_size - 3 - sub_sequence_to_match = rec.seq[start_pos_seq:end_pos_seq][::-1] - assert len(sub_sequence_to_match) == len(guide_subseq) + sub_sequence_to_match = sequence_string[start_pos_seq:end_pos_seq][::-1] matching_chars = list(common_start(sub_sequence_to_match, guide_subseq)) matching_substr = "".join(matching_chars) yield match.span() + ( match.end(), "+", - rec.id, + seqid, seed_size + len(matching_chars), matching_substr[::-1] + match.group(0)[:-3], match.group(0)[-3:], @@ -90,25 +99,25 @@ def common_start(seq1, seq2): return -def gen_extract_off_target_strand_minus(off_target_matches, rec, guide, seed_size): +def gen_extract_off_target_strand_minus( + off_target_matches, seqid, sequence_string, guide, seed_size +): for match in off_target_matches: # Extract the sequence. # since rev_comp, extend the sequence to the end seq_extension_len = len(guide) - seed_size end_pos_seq = match.end() + seq_extension_len start_pos_seq = match.start() + seed_size + 3 - sub_sequence_to_match = rec.seq[start_pos_seq:end_pos_seq] + sub_sequence_to_match = sequence_string[start_pos_seq:end_pos_seq] # Extract part of the guide to match guide_subseq = rev_comp(guide[: 20 - seed_size]) - assert len(sub_sequence_to_match) == len(guide_subseq) - matching_chars = list(common_start(sub_sequence_to_match, guide_subseq)) matching_substr = "".join(matching_chars) yield match.span() + ( match.start(), "-", - rec.id, + seqid, seed_size + len(matching_chars), match.group(0)[3:] + matching_substr, match.group(0)[:3],