Skip to content
Snippets Groups Projects
Commit 5d74c21d authored by Remi  PLANEL's avatar Remi PLANEL
Browse files

off-target features per record id

parent 222412cd
No related branches found
No related tags found
No related merge requests found
Pipeline #26159 passed with stage
in 48 seconds
......@@ -2,6 +2,7 @@ import re
import pandas as pd
from crisprbact.utils import rev_comp
from itertools import chain
def compute_off_target_df(guide, seed_size, records, feature_df):
......@@ -11,71 +12,79 @@ def compute_off_target_df(guide, seed_size, records, feature_df):
offs_df = get_off_target_pos(guide, records, seed_size)
if offs_df is not None:
offs_df["features"] = [
get_pos_features(off.pampos, feature_df) for i, off in offs_df.iterrows()
get_pos_features(off.recid, off.pampos, feature_df)
for i, off in offs_df.iterrows()
]
return offs_df
else:
return None
def get_off_target_pos(guide, recs, seed_size):
if recs is not None:
for rec in recs:
offs_plus = re.finditer(guide[-seed_size:] + "[ATGC]GG", str(rec.seq))
offs = list(
gen_extract_off_target_strand_plus(offs_plus, rec, guide, seed_size)
def get_off_target_pos(guide, records, seed_size):
if records is not None:
offs = iter([])
for rec in records:
sequence_str = str(rec.seq)
seqid = rec.id
offs_plus = re.finditer(guide[-seed_size:] + "[ATGC]GG", sequence_str)
offs_plus = gen_extract_off_target_strand_plus(
offs_plus, seqid, sequence_str, guide, seed_size
)
# - ori
offs_minus = re.finditer(
"CC[ATGC]" + rev_comp(guide[-seed_size:]), str(rec.seq)
"CC[ATGC]" + rev_comp(guide[-seed_size:]), sequence_str
)
offs += list(
gen_extract_off_target_strand_minus(offs_minus, rec, guide, seed_size)
offs_minus = gen_extract_off_target_strand_minus(
offs_minus, seqid, sequence_str, guide, seed_size
)
offs_dict = dict(
zip(
[
"start",
"end",
"pampos",
"strand",
"recid",
"max_matching_len",
"max_matching_seq",
"pam_seq",
],
zip(*offs),
)
offs = chain(offs, offs_plus, offs_minus)
offs_dict = dict(
zip(
[
"start",
"end",
"pampos",
"strand",
"recid",
"max_matching_len",
"max_matching_seq",
"pam_seq",
],
zip(*offs),
)
return pd.DataFrame(offs_dict)
)
return pd.DataFrame(offs_dict)
else:
return None
def get_pos_features(position, f_df):
def get_pos_features(recid, position, f_df):
if len(f_df) > 0:
feature_at_pos = f_df[(f_df.start < position) & (f_df.end > position)]
feature_at_pos = f_df[
(f_df.start < position) & (f_df.end > position) & (f_df.recid == recid)
]
return feature_at_pos.feature.values
else:
return []
def gen_extract_off_target_strand_plus(off_target_matches, rec, guide, seed_size):
def gen_extract_off_target_strand_plus(
off_target_matches, seqid, sequence_string, guide, seed_size
):
for match in off_target_matches:
guide_subseq = guide[: 20 - seed_size][::-1]
# extract part of the sequence
seq_extension_len = len(guide) - seed_size
start_pos_seq = match.start() - seq_extension_len
end_pos_seq = match.end() - seed_size - 3
sub_sequence_to_match = rec.seq[start_pos_seq:end_pos_seq][::-1]
assert len(sub_sequence_to_match) == len(guide_subseq)
sub_sequence_to_match = sequence_string[start_pos_seq:end_pos_seq][::-1]
matching_chars = list(common_start(sub_sequence_to_match, guide_subseq))
matching_substr = "".join(matching_chars)
yield match.span() + (
match.end(),
"+",
rec.id,
seqid,
seed_size + len(matching_chars),
matching_substr[::-1] + match.group(0)[:-3],
match.group(0)[-3:],
......@@ -90,25 +99,25 @@ def common_start(seq1, seq2):
return
def gen_extract_off_target_strand_minus(off_target_matches, rec, guide, seed_size):
def gen_extract_off_target_strand_minus(
off_target_matches, seqid, sequence_string, guide, seed_size
):
for match in off_target_matches:
# Extract the sequence.
# since rev_comp, extend the sequence to the end
seq_extension_len = len(guide) - seed_size
end_pos_seq = match.end() + seq_extension_len
start_pos_seq = match.start() + seed_size + 3
sub_sequence_to_match = rec.seq[start_pos_seq:end_pos_seq]
sub_sequence_to_match = sequence_string[start_pos_seq:end_pos_seq]
# Extract part of the guide to match
guide_subseq = rev_comp(guide[: 20 - seed_size])
assert len(sub_sequence_to_match) == len(guide_subseq)
matching_chars = list(common_start(sub_sequence_to_match, guide_subseq))
matching_substr = "".join(matching_chars)
yield match.span() + (
match.start(),
"-",
rec.id,
seqid,
seed_size + len(matching_chars),
match.group(0)[3:] + matching_substr,
match.group(0)[:3],
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment