Skip to content
Snippets Groups Projects
Commit a7fa4bae authored by Remi PLANEL
Browse files

Merge branch 'multi-seed-size' into 'master'

Raise exception on wrong off-target format. Fix #7

Closes #7

See merge request !5
parents 924f5953 6c6d982c
No related branches found
No related tags found
1 merge request: !5 Raise exception on wrong off-target format. Fix #7
Pipeline #25166 passed with stage in 49 seconds
...@@ -4,10 +4,12 @@ from crisprbact.off_target import ( ...@@ -4,10 +4,12 @@ from crisprbact.off_target import (
extract_features, extract_features,
extract_records, extract_records,
) )
from crisprbact.utils import NoRecordsException
__all__ = [ __all__ = [
"extract_records", "extract_records",
"on_target_predict", "on_target_predict",
"compute_off_target_df", "compute_off_target_df",
"extract_features", "extract_features",
"NoRecordsException",
] ]
...@@ -12,16 +12,25 @@ def get_pos_features(position, f_df): ...@@ -12,16 +12,25 @@ def get_pos_features(position, f_df):
return [] return []
def get_off_target_pos(guide, recs, seed_size):
    """Find seed matches flanked by an NGG-style motif in every record.

    The last ``seed_size`` characters of ``guide`` are searched in each
    record's sequence: followed by ``[ATGC]GG`` for the ``+`` orientation,
    and as their reverse complement preceded by ``CC[ATGC]`` for the ``-``
    orientation.

    Args:
        guide: Guide sequence; only its last ``seed_size`` characters are
            used to build the search patterns.
        recs: Iterable of records exposing ``seq`` and ``id``, or ``None``.
        seed_size: Number of trailing guide characters to match.

    Returns:
        ``None`` when ``recs`` is ``None``. Otherwise a ``pandas.DataFrame``
        with one row per match and the columns ``start``, ``end``,
        ``pampos``, ``strand`` and ``recid``. When nothing matches, the
        frame is empty and has no columns.
    """
    if recs is None:
        return None

    seed = guide[-seed_size:]
    # Hits are accumulated across all records; rebinding the list for each
    # record would silently drop the matches found in earlier records.
    offs = []
    for rec in recs:
        sequence = str(rec.seq)
        # + ori: pampos is the end of the match
        offs += [
            match.span() + (match.end(), "+", rec.id)
            for match in re.finditer(seed + "[ATGC]GG", sequence)
        ]
        # - ori: pampos is the start of the match
        offs += [
            match.span() + (match.start(), "-", rec.id)
            for match in re.finditer("CC[ATGC]" + rev_comp(seed), sequence)
        ]

    offs_dict = dict(zip(["start", "end", "pampos", "strand", "recid"], zip(*offs)))
    return pd.DataFrame(offs_dict)
def extract_records(genome): def extract_records(genome):
...@@ -33,24 +42,31 @@ def extract_records(genome): ...@@ -33,24 +42,31 @@ def extract_records(genome):
def extract_features(recs):
    """Collect the CDS, ncRNA, rRNA and tRNA features of all records.

    Args:
        recs: Iterable of records exposing ``features`` and ``id``, or
            ``None``. Each feature must expose ``type`` and a ``location``
            with ``start``, ``end`` and ``strand``.

    Returns:
        ``None`` when ``recs`` is ``None``. Otherwise a ``pandas.DataFrame``
        with the columns ``start``, ``end``, ``strand``, ``type``,
        ``feature`` and ``recid``. The frame is empty, with no columns,
        when fewer than two features are collected.
    """
    if recs is None:
        return None

    wanted_types = ("CDS", "ncRNA", "rRNA", "tRNA")
    f_list = [
        (
            # int() instead of the legacy ``.position`` attribute, so the
            # coordinates can be read from any int-like position object.
            int(f.location.start),
            int(f.location.end),
            f.location.strand,
            f.type,
            f,
            rec.id,
        )
        for rec in recs
        for f in rec.features
        if f.type in wanted_types
    ]
    # starts at 1 to get rid of the first feature which is the whole chromosome
    # NOTE(review): the type filter above already restricts the list to
    # CDS/ncRNA/rRNA/tRNA, so this slice drops the first collected feature
    # whatever it is -- confirm this is still intended.
    f_dict = dict(
        zip(
            ["start", "end", "strand", "type", "feature", "recid"],
            zip(*f_list[1:]),
        )
    )
    return pd.DataFrame(f_dict)
def compute_off_target_df(guide, seed_size, records, feature_df): def compute_off_target_df(guide, seed_size, records, feature_df):
...@@ -58,7 +74,10 @@ def compute_off_target_df(guide, seed_size, records, feature_df): ...@@ -58,7 +74,10 @@ def compute_off_target_df(guide, seed_size, records, feature_df):
The features column contains a list of biopython SeqFeature objects that overlap The features column contains a list of biopython SeqFeature objects that overlap
with the off-target""" with the off-target"""
offs_df = get_off_target_pos(guide, records, seed_size) offs_df = get_off_target_pos(guide, records, seed_size)
offs_df["features"] = [ if offs_df is not None:
get_pos_features(off.pampos, feature_df) for i, off in offs_df.iterrows() offs_df["features"] = [
] get_pos_features(off.pampos, feature_df) for i, off in offs_df.iterrows()
return offs_df ]
return offs_df
else:
return None
import numpy as np import numpy as np
import re import re
from importlib.resources import open_binary from importlib.resources import open_binary
from crisprbact.utils import rev_comp from crisprbact.utils import rev_comp, NoRecordsException
from crisprbact.off_target import ( from crisprbact.off_target import (
compute_off_target_df, compute_off_target_df,
extract_records, extract_records,
...@@ -56,7 +56,12 @@ def on_target_predict(seq, genome=None, seed_sizes=[8, 9, 10, 11, 12]): ...@@ -56,7 +56,12 @@ def on_target_predict(seq, genome=None, seed_sizes=[8, 9, 10, 11, 12]):
genome_features = None genome_features = None
if genome: if genome:
records = extract_records(genome) records = extract_records(genome)
genome_features = extract_features(records) if records is None:
raise NoRecordsException(
"No records found in sequence file. Check the sequence or the format"
)
else:
genome_features = extract_features(records)
alltargets = list(find_targets(seq)) alltargets = list(find_targets(seq))
if alltargets: if alltargets:
...@@ -81,7 +86,7 @@ def on_target_predict(seq, genome=None, seed_sizes=[8, 9, 10, 11, 12]): ...@@ -81,7 +86,7 @@ def on_target_predict(seq, genome=None, seed_sizes=[8, 9, 10, 11, 12]):
target["guide"], seed_size, records, genome_features target["guide"], seed_size, records, genome_features
) )
off_targets_list = [] off_targets_list = []
if not off_target_df.empty: if off_target_df is not None and not off_target_df.empty:
off_targets = off_target_df.loc[ off_targets = off_target_df.loc[
0:, 0:,
["start", "end", "pampos", "strand", "recid", "features"], ["start", "end", "pampos", "strand", "recid", "features"],
......
def rev_comp(seq):
    """Return the reverse complement of a DNA string.

    Only the upper-case characters ``A``, ``T``, ``G`` and ``C`` are
    complemented; any other character is kept as is (but still reversed).
    """
    comp = str.maketrans("ATGC", "TACG")
    return seq.translate(comp)[::-1]
class NoRecordsException(Exception):
    """No Record found in the sequence file"""
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment