Skip to content
Snippets Groups Projects
Commit 60ca2962 authored by Remi  PLANEL's avatar Remi PLANEL
Browse files

Compute the longest perfect match between the guide and the off-target sequence

parent c9f5cced
No related branches found
No related tags found
No related merge requests found
......@@ -4,6 +4,54 @@ import pandas as pd
from crisprbact.utils import rev_comp
def compute_off_target_df(guide, seed_size, records, feature_df):
    """Build the off-target table for ``guide`` and annotate it with features.

    The table itself comes from ``get_off_target_pos``. A ``features`` column
    is then added; for every row it holds the result of ``get_pos_features``
    evaluated at that row's ``pampos`` (a list of biopython SeqFeature objects
    overlapping the off-target).

    Returns the annotated pandas DataFrame, or ``None`` when
    ``get_off_target_pos`` produced no table.
    """
    off_targets = get_off_target_pos(guide, records, seed_size)
    if off_targets is None:
        return None

    # Row-wise iteration keeps this working for a table that has no rows
    # (and therefore possibly no ``pampos`` column at all).
    overlapping = []
    for _, row in off_targets.iterrows():
        overlapping.append(get_pos_features(row.pampos, feature_df))
    off_targets["features"] = overlapping
    return off_targets
def get_off_target_pos(guide, recs, seed_size):
    """Find seed-matching off-targets of ``guide`` on both strands of ``recs``.

    For every record the last ``seed_size`` bases of ``guide`` are searched
    followed by ``[ATGC]GG`` (plus orientation), and their reverse complement
    preceded by ``CC[ATGC]`` (minus orientation). Each regex match is turned
    into a tuple by ``gen_extract_off_target_strand_plus`` /
    ``gen_extract_off_target_strand_minus``.

    Args:
        guide: guide sequence as a string.
        recs: iterable of records exposing ``.seq`` (and whatever the
            extractor generators read from them), or ``None``.
        seed_size: number of trailing guide bases that must match exactly.

    Returns:
        ``None`` when ``recs`` is ``None``. Otherwise a pandas DataFrame with
        one row per off-target found in *any* record and the columns
        ``start``, ``end``, ``pampos``, ``strand``, ``recid``,
        ``max_matching_len``, ``max_matching_seq`` and ``pam_seq``. When
        nothing matches (or ``recs`` is empty) the DataFrame is empty and has
        no columns.
    """
    if recs is None:
        return None

    seed = guide[-seed_size:]

    # Collect hits from every record. Rebinding the list per record would
    # discard the hits of earlier records, and leave the name unbound when
    # ``recs`` is empty.
    offs = []
    for rec in recs:
        sequence = str(rec.seq)

        # + ori
        offs_plus = re.finditer(seed + "[ATGC]GG", sequence)
        offs.extend(
            gen_extract_off_target_strand_plus(offs_plus, rec, guide, seed_size)
        )

        # - ori
        offs_minus = re.finditer("CC[ATGC]" + rev_comp(seed), sequence)
        offs.extend(
            gen_extract_off_target_strand_minus(offs_minus, rec, guide, seed_size)
        )

    columns = [
        "start",
        "end",
        "pampos",
        "strand",
        "recid",
        "max_matching_len",
        "max_matching_seq",
        "pam_seq",
    ]
    # Transpose the row tuples into per-column tuples; with no hits the
    # transposition is empty and so is the resulting frame.
    return pd.DataFrame(dict(zip(columns, zip(*offs))))
def get_pos_features(position, f_df):
if len(f_df) > 0:
feature_at_pos = f_df[(f_df.start < position) & (f_df.end > position)]
......@@ -29,7 +77,8 @@ def gen_extract_off_target_strand_plus(off_target_matches, rec, guide, seed_size
"+",
rec.id,
seed_size + len(matching_chars),
matching_substr[::-1] + match.group(0),
matching_substr[::-1] + match.group(0)[:-3],
match.group(0)[-3:],
)
......@@ -61,43 +110,11 @@ def gen_extract_off_target_strand_minus(off_target_matches, rec, guide, seed_siz
"-",
rec.id,
seed_size + len(matching_chars),
match.group(0) + matching_substr,
match.group(0)[3:] + matching_substr,
match.group(0)[:3],
)
def get_off_target_pos(guide, recs, seed_size):
    """Find seed-matching off-targets of ``guide`` on both strands of ``recs``.

    For every record the last ``seed_size`` bases of ``guide`` are searched
    followed by ``[ATGC]GG`` (plus orientation), and their reverse complement
    preceded by ``CC[ATGC]`` (minus orientation). Each regex match is turned
    into a tuple by ``gen_extract_off_target_strand_plus`` /
    ``gen_extract_off_target_strand_minus``.

    Args:
        guide: guide sequence as a string.
        recs: iterable of records exposing ``.seq`` (and whatever the
            extractor generators read from them), or ``None``.
        seed_size: number of trailing guide bases that must match exactly.

    Returns:
        ``None`` when ``recs`` is ``None``. Otherwise a pandas DataFrame with
        one row per off-target found in *any* record. Column names are paired
        positionally with the tuple fields: ``start``, ``end``, ``pampos``,
        ``strand``, ``recid``, ``max_matching_len``, ``max_matching_seq`` and
        ``pam_seq``; if the tuples carry fewer fields, the trailing names are
        simply left out. When nothing matches (or ``recs`` is empty) the
        DataFrame is empty and has no columns.
    """
    if recs is None:
        return None

    seed = guide[-seed_size:]

    # Collect hits from every record. Rebinding the list per record would
    # discard the hits of earlier records, and leave the name unbound when
    # ``recs`` is empty.
    offs = []
    for rec in recs:
        sequence = str(rec.seq)

        # + ori
        offs_plus = re.finditer(seed + "[ATGC]GG", sequence)
        offs.extend(
            gen_extract_off_target_strand_plus(offs_plus, rec, guide, seed_size)
        )

        # - ori
        offs_minus = re.finditer("CC[ATGC]" + rev_comp(seed), sequence)
        offs.extend(
            gen_extract_off_target_strand_minus(offs_minus, rec, guide, seed_size)
        )

    columns = [
        "start",
        "end",
        "pampos",
        "strand",
        "recid",
        "max_matching_len",
        "max_matching_seq",
        # Listed so the PAM field is not silently dropped by ``zip`` when the
        # extractor tuples include it.
        "pam_seq",
    ]
    # Transpose the row tuples into per-column tuples; with no hits the
    # transposition is empty and so is the resulting frame.
    return pd.DataFrame(dict(zip(columns, zip(*offs))))
def extract_records(genome):
records = list(genome)
if records and len(records) > 0:
......@@ -132,17 +149,3 @@ def extract_features(recs):
return pd.DataFrame(f_dict)
else:
return None
def compute_off_target_df(guide, seed_size, records, feature_df):
    """Return the off-target DataFrame for ``guide`` with a ``features`` column.

    Off-target positions are obtained from ``get_off_target_pos``. Each row is
    then annotated with the result of ``get_pos_features`` at the row's
    ``pampos`` (a list of biopython SeqFeature objects that overlap with the
    off-target). ``None`` is passed through unchanged when no table is
    available.
    """
    table = get_off_target_pos(guide, records, seed_size)
    if table is None:
        return None

    # Iterate row by row so that a table without any rows needs no
    # ``pampos`` column to exist.
    per_row_features = []
    for _index, hit in table.iterrows():
        per_row_features.append(get_pos_features(hit.pampos, feature_df))
    table["features"] = per_row_features
    return table
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment