Compute the longest perfect match between the guide and the off-target sequence

60ca2962 · Remi PLANEL · c9f5cced · 60ca2962
Commit 60ca2962 authored 5 years ago by Remi PLANEL
--- a/crisprbact/off_target.py
+++ b/crisprbact/off_target.py
@@ -4,6 +4,54 @@ import pandas as pd
 from crisprbact.utils import rev_comp


+def compute_off_target_df(guide, seed_size, records, feature_df):
+    """ Returns a pandas DataFrame with data about the identified off-targets.
+    The features column contains a list of biopython SeqFeature objects that overlap
+    with the off-target"""
+    offs_df = get_off_target_pos(guide, records, seed_size)
+    if offs_df is not None:
+        offs_df["features"] = [
+            get_pos_features(off.pampos, feature_df) for i, off in offs_df.iterrows()
+        ]
+        return offs_df
+    else:
+        return None
+
+
+def get_off_target_pos(guide, recs, seed_size):
+    if recs is not None:
+        for rec in recs:
+            offs_plus = re.finditer(guide[-seed_size:] + "[ATGC]GG", str(rec.seq))
+            offs = list(
+                gen_extract_off_target_strand_plus(offs_plus, rec, guide, seed_size)
+            )
+            # - ori
+            offs_minus = re.finditer(
+                "CC[ATGC]" + rev_comp(guide[-seed_size:]), str(rec.seq)
+            )
+            offs += list(
+                gen_extract_off_target_strand_minus(offs_minus, rec, guide, seed_size)
+            )
+            offs_dict = dict(
+                zip(
+                    [
+                        "start",
+                        "end",
+                        "pampos",
+                        "strand",
+                        "recid",
+                        "max_matching_len",
+                        "max_matching_seq",
+                        "pam_seq",
+                    ],
+                    zip(*offs),
+                )
+            )
+            return pd.DataFrame(offs_dict)
+    else:
+        return None
+
+
 def get_pos_features(position, f_df):
    if len(f_df) > 0:
        feature_at_pos = f_df[(f_df.start < position) & (f_df.end > position)]
@@ -29,7 +77,8 @@ def gen_extract_off_target_strand_plus(off_target_matches, rec, guide, seed_size
            "+",
            rec.id,
            seed_size + len(matching_chars),
-            matching_substr[::-1] + match.group(0),
+            matching_substr[::-1] + match.group(0)[:-3],
+            match.group(0)[-3:],
        )


@@ -61,43 +110,11 @@ def gen_extract_off_target_strand_minus(off_target_matches, rec, guide, seed_siz
            "-",
            rec.id,
            seed_size + len(matching_chars),
-            match.group(0) + matching_substr,
+            match.group(0)[3:] + matching_substr,
+            match.group(0)[:3],
        )


-def get_off_target_pos(guide, recs, seed_size):
-    if recs is not None:
-        for rec in recs:
-            offs_plus = re.finditer(guide[-seed_size:] + "[ATGC]GG", str(rec.seq))
-            offs = list(
-                gen_extract_off_target_strand_plus(offs_plus, rec, guide, seed_size)
-            )
-            # - ori
-            offs_minus = re.finditer(
-                "CC[ATGC]" + rev_comp(guide[-seed_size:]), str(rec.seq)
-            )
-            offs += list(
-                gen_extract_off_target_strand_minus(offs_minus, rec, guide, seed_size)
-            )
-            offs_dict = dict(
-                zip(
-                    [
-                        "start",
-                        "end",
-                        "pampos",
-                        "strand",
-                        "recid",
-                        "max_matching_len",
-                        "max_matching_seq",
-                    ],
-                    zip(*offs),
-                )
-            )
-            return pd.DataFrame(offs_dict)
-    else:
-        return None
-
-
 def extract_records(genome):
    records = list(genome)
    if records and len(records) > 0:
@@ -132,17 +149,3 @@ def extract_features(recs):
        return pd.DataFrame(f_dict)
    else:
        return None
-
-
-def compute_off_target_df(guide, seed_size, records, feature_df):
-    """ Returns a pandas DataFrame with data about the identified off-targets.
-    The features column contains a list of biopython SeqFeature objects that overlap
-    with the off-target"""
-    offs_df = get_off_target_pos(guide, records, seed_size)
-    if offs_df is not None:
-        offs_df["features"] = [
-            get_pos_features(off.pampos, feature_df) for i, off in offs_df.iterrows()
-        ]
-        return offs_df
-    else:
-        return None