Skip to content
Snippets Groups Projects
Commit 8e22bb2d authored by Remi  PLANEL's avatar Remi PLANEL
Browse files

Compute the biggest match between the guide and the off-target

parent 2592f048
No related branches found
No related tags found
No related merge requests found
Pipeline #25960 passed with stage
in 57 seconds
...@@ -12,24 +12,86 @@ def get_pos_features(position, f_df): ...@@ -12,24 +12,86 @@ def get_pos_features(position, f_df):
return [] return []
def gen_extract_off_target_strand_plus(off_target_matches, rec, guide, seed_size):
for match in off_target_matches:
guide_subseq = guide[: 20 - seed_size][::-1]
# extract part of the sequence
seq_extension_len = len(guide) - seed_size
start_pos_seq = match.start() - seq_extension_len
end_pos_seq = match.end() - seed_size - 3
sub_sequence_to_match = rec.seq[start_pos_seq:end_pos_seq][::-1]
assert len(sub_sequence_to_match) == len(guide_subseq)
matching_chars = list(common_start(sub_sequence_to_match, guide_subseq))
matching_substr = "".join(matching_chars)
yield match.span() + (
match.end(),
"+",
rec.id,
seed_size + len(matching_chars),
matching_substr[::-1] + match.group(0),
)
def common_start(seq1, seq2):
for a, b in zip(seq1, seq2):
if a == b:
yield a
else:
return
def gen_extract_off_target_strand_minus(off_target_matches, rec, guide, seed_size):
for match in off_target_matches:
# Extract the sequence.
# since rev_comp, extend the sequence to the end
seq_extension_len = len(guide) - seed_size
end_pos_seq = match.end() + seq_extension_len
start_pos_seq = match.start() + seed_size + 3
sub_sequence_to_match = rec.seq[start_pos_seq:end_pos_seq]
# Extract part of the guide to match
guide_subseq = rev_comp(guide[: 20 - seed_size])
assert len(sub_sequence_to_match) == len(guide_subseq)
matching_chars = list(common_start(sub_sequence_to_match, guide_subseq))
matching_substr = "".join(matching_chars)
yield match.span() + (
match.start(),
"-",
rec.id,
seed_size + len(matching_chars),
match.group(0) + matching_substr,
)
def get_off_target_pos(guide, recs, seed_size): def get_off_target_pos(guide, recs, seed_size):
if recs is not None: if recs is not None:
for rec in recs: for rec in recs:
# + ori
offs_plus = re.finditer(guide[-seed_size:] + "[ATGC]GG", str(rec.seq)) offs_plus = re.finditer(guide[-seed_size:] + "[ATGC]GG", str(rec.seq))
offs = [match.span() + (match.end(), "+", rec.id) for match in offs_plus] offs = list(
# TODO: comparer guide avec rec.seq[match.start():match.end()] gen_extract_off_target_strand_plus(offs_plus, rec, guide, seed_size)
)
# - ori # - ori
offs_minus = re.finditer( offs_minus = re.finditer(
"CC[ATGC]" + rev_comp(guide[-seed_size:]), str(rec.seq) "CC[ATGC]" + rev_comp(guide[-seed_size:]), str(rec.seq)
) )
offs += [ offs += list(
match.span() + (match.start(), "-", rec.id) for match in offs_minus gen_extract_off_target_strand_minus(offs_minus, rec, guide, seed_size)
] )
# comparer guide avec rec.seq[match.start():match.end()].reverse_complement()
# comparer les positions identique à partir de la fin et d'affilé.
offs_dict = dict( offs_dict = dict(
zip(["start", "end", "pampos", "strand", "recid"], zip(*offs)) zip(
[
"start",
"end",
"pampos",
"strand",
"recid",
"max_matching_len",
"max_matching_seq",
],
zip(*offs),
)
) )
return pd.DataFrame(offs_dict) return pd.DataFrame(offs_dict)
else: else:
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment