# NOTE: these functions rely on names provided at module level by the rest of
# the file: `re`, `pd` (pandas), `rev_comp`, `get_pos_features` and the two
# `gen_extract_off_target_strand_*` generators.

# Column names for the off-target table, in the order the fields are zipped
# from each hit tuple.
# NOTE(review): assumes both strand generators yield 8-field tuples in exactly
# this order (the last field being the PAM sequence) - confirm against them.
_OFF_TARGET_COLUMNS = (
    "start",
    "end",
    "pampos",
    "strand",
    "recid",
    "max_matching_len",
    "max_matching_seq",
    "pam_seq",
)


def compute_off_target_df(guide, seed_size, records, feature_df):
    """Return a pandas DataFrame describing the off-targets found for a guide.

    The off-target table comes from ``get_off_target_pos``; a ``features``
    column is then added holding, for every row, the result of
    ``get_pos_features(pampos, feature_df)`` (the features overlapping the
    off-target; biopython SeqFeature objects according to the original
    documentation of this function).

    Args:
        guide: guide sequence (string).
        seed_size: number of trailing guide characters used as the seed.
        records: iterable of sequence records to scan, or None.
        feature_df: feature table handed unchanged to ``get_pos_features``.

    Returns:
        The annotated DataFrame, or None when ``records`` is None.
    """
    offs_df = get_off_target_pos(guide, records, seed_size)
    if offs_df is None:
        return None
    # Only the PAM position is needed, so walk that column directly rather
    # than materialising every row with iterrows().
    offs_df["features"] = [
        get_pos_features(pampos, feature_df) for pampos in offs_df["pampos"]
    ]
    return offs_df


def get_off_target_pos(guide, recs, seed_size):
    """Locate seed + PAM matches of ``guide`` on both strands of every record.

    For each record, the last ``seed_size`` characters of the guide followed
    by ``[ATGC]GG`` are searched in the record sequence (plus orientation),
    then ``CC[ATGC]`` followed by the reverse complement of the seed (minus
    orientation). The raw regex matches are turned into hit tuples by
    ``gen_extract_off_target_strand_plus`` / ``..._minus``.

    Hits are accumulated over *all* records; previously the hit list was
    rebuilt for each record, so a multi-record input did not report every
    record's hits.

    Args:
        guide: guide sequence (string).
        recs: iterable of records exposing ``.seq`` (and whatever the strand
            generators read from them), or None.
        seed_size: number of trailing guide characters used as the seed.

    Returns:
        None when ``recs`` is None. Otherwise a DataFrame with the columns
        listed in ``_OFF_TARGET_COLUMNS``; it has zero rows when ``recs`` is
        empty or nothing matched.
    """
    if recs is None:
        return None

    seed = guide[-seed_size:]
    offs = []
    for rec in recs:
        sequence = str(rec.seq)
        # NOTE(review): re.finditer reports non-overlapping matches only, so
        # overlapping seed+PAM occurrences (e.g. in repeats) are not all
        # returned - confirm this is acceptable.
        # + ori
        offs_plus = re.finditer(seed + "[ATGC]GG", sequence)
        offs.extend(
            gen_extract_off_target_strand_plus(offs_plus, rec, guide, seed_size)
        )
        # - ori
        offs_minus = re.finditer("CC[ATGC]" + rev_comp(seed), sequence)
        offs.extend(
            gen_extract_off_target_strand_minus(offs_minus, rec, guide, seed_size)
        )

    if not offs:
        # No records or no hits: keep the column schema so callers can still
        # address columns such as "pampos" on the empty result.
        return pd.DataFrame(columns=list(_OFF_TARGET_COLUMNS))

    # Transpose the list of hit tuples into one tuple per column.
    return pd.DataFrame(dict(zip(_OFF_TARGET_COLUMNS, zip(*offs))))