NOTE(review): line breaks and indentation were reconstructed from a whitespace-collapsed copy of this patch; hunk line counts match the @@ headers, but nesting depth of some lines is inferred - compare with the original commit before applying.
diff --git a/crisprbact/__init__.py b/crisprbact/__init__.py
index 7f60791b466ebe85a087307e7ba0cbdd0f83db58..77e1df9fd51cfe42059c18b14f3545f008187835 100644
--- a/crisprbact/__init__.py
+++ b/crisprbact/__init__.py
@@ -4,10 +4,12 @@ from crisprbact.off_target import (
     extract_features,
     extract_records,
 )
+from crisprbact.utils import NoRecordsException
 
 __all__ = [
     "extract_records",
     "on_target_predict",
     "compute_off_target_df",
     "extract_features",
+    "NoRecordsException",
 ]
diff --git a/crisprbact/off_target.py b/crisprbact/off_target.py
index 5d711d032969f43a386a0c38f9956c9671464f0c..eb5ed88f8b5c68d029dc40ad56d13541fa5edf85 100644
--- a/crisprbact/off_target.py
+++ b/crisprbact/off_target.py
@@ -12,16 +12,25 @@ def get_pos_features(position, f_df):
         return []
 
 
-def get_off_target_pos(guide, recs, records):
-    for rec in recs:
-        # + ori
-        offs_plus = re.finditer(guide[-records:] + "[ATGC]GG", str(rec.seq))
-        offs = [match.span() + (match.end(), "+", rec.id) for match in offs_plus]
-        # - ori
-        offs_minus = re.finditer("CC[ATGC]" + rev_comp(guide[-records:]), str(rec.seq))
-        offs += [match.span() + (match.start(), "-", rec.id) for match in offs_minus]
-        offs_dict = dict(zip(["start", "end", "pampos", "strand", "recid"], zip(*offs)))
-        return pd.DataFrame(offs_dict)
+def get_off_target_pos(guide, recs, seed_size):
+    if recs is not None:
+        for rec in recs:
+            # + ori
+            offs_plus = re.finditer(guide[-seed_size:] + "[ATGC]GG", str(rec.seq))
+            offs = [match.span() + (match.end(), "+", rec.id) for match in offs_plus]
+            # - ori
+            offs_minus = re.finditer(
+                "CC[ATGC]" + rev_comp(guide[-seed_size:]), str(rec.seq)
+            )
+            offs += [
+                match.span() + (match.start(), "-", rec.id) for match in offs_minus
+            ]
+            offs_dict = dict(
+                zip(["start", "end", "pampos", "strand", "recid"], zip(*offs))
+            )
+            return pd.DataFrame(offs_dict)
+    else:
+        return None
 
 
 def extract_records(genome):
@@ -33,24 +42,31 @@ def extract_records(genome):
 
 
 def extract_features(recs):
-    f_list = []
-    for rec in recs:
-        for f in rec.features:
-            if f.type in ["CDS", "ncRNA", "rRNA", "tRNA"]:
-                f_list.append(
-                    (
-                        f.location.start.position,
-                        f.location.end.position,
-                        f.location.strand,
-                        f.type,
-                        f,
-                        rec.id,
+
+    if recs is not None:
+        f_list = []
+        for rec in recs:
+            for f in rec.features:
+                if f.type in ["CDS", "ncRNA", "rRNA", "tRNA"]:
+                    f_list.append(
+                        (
+                            f.location.start.position,
+                            f.location.end.position,
+                            f.location.strand,
+                            f.type,
+                            f,
+                            rec.id,
+                        )
                     )
-                )
-    f_dict = dict(
-        zip(["start", "end", "strand", "type", "feature", "recid"], zip(*f_list[1:]),)
-    )  # starts at 1 to get rid of the first feature which is the whole chromosome
-    return pd.DataFrame(f_dict)
+        f_dict = dict(
+            zip(
+                ["start", "end", "strand", "type", "feature", "recid"],
+                zip(*f_list[1:]),
+            )
+        )  # starts at 1 to get rid of the first feature which is the whole chromosome
+        return pd.DataFrame(f_dict)
+    else:
+        return None
 
 
 def compute_off_target_df(guide, seed_size, records, feature_df):
@@ -58,7 +74,10 @@ def compute_off_target_df(guide, seed_size, records, feature_df):
     The features column contains a list of biopython SeqFeature objects that overlap
     with the off-target"""
     offs_df = get_off_target_pos(guide, records, seed_size)
-    offs_df["features"] = [
-        get_pos_features(off.pampos, feature_df) for i, off in offs_df.iterrows()
-    ]
-    return offs_df
+    if offs_df is not None:
+        offs_df["features"] = [
+            get_pos_features(off.pampos, feature_df) for i, off in offs_df.iterrows()
+        ]
+        return offs_df
+    else:
+        return None
diff --git a/crisprbact/predict.py b/crisprbact/predict.py
index 1dd657cb1bafbf30fc2981cdfff1d76c17c793bd..aee0aaa6596ba563b37bb49f98b263be7cd12c5d 100644
--- a/crisprbact/predict.py
+++ b/crisprbact/predict.py
@@ -1,7 +1,7 @@
 import numpy as np
 import re
 from importlib.resources import open_binary
-from crisprbact.utils import rev_comp
+from crisprbact.utils import rev_comp, NoRecordsException
 from crisprbact.off_target import (
     compute_off_target_df,
     extract_records,
@@ -56,7 +56,12 @@ def on_target_predict(seq, genome=None, seed_sizes=[8, 9, 10, 11, 12]):
     genome_features = None
     if genome:
         records = extract_records(genome)
-        genome_features = extract_features(records)
+        if records is None:
+            raise NoRecordsException(
+                "No records found in sequence file. Check the sequence or the format"
+            )
+        else:
+            genome_features = extract_features(records)
 
     alltargets = list(find_targets(seq))
     if alltargets:
@@ -81,7 +86,7 @@ def on_target_predict(seq, genome=None, seed_sizes=[8, 9, 10, 11, 12]):
                         target["guide"], seed_size, records, genome_features
                     )
                     off_targets_list = []
-                    if not off_target_df.empty:
+                    if off_target_df is not None and not off_target_df.empty:
                         off_targets = off_target_df.loc[
                             0:,
                             ["start", "end", "pampos", "strand", "recid", "features"],
diff --git a/crisprbact/utils.py b/crisprbact/utils.py
index 96557dbe7d9d9449f9f34cf30c67b4e4f201fc6f..a1bb0482da05ad3e25da09ecc6ac68cc961913d8 100644
--- a/crisprbact/utils.py
+++ b/crisprbact/utils.py
@@ -1,3 +1,9 @@
 def rev_comp(seq):
     comp = str.maketrans("ATGC", "TACG")
     return seq.translate(comp)[::-1]
+
+
+class NoRecordsException(Exception):
+    """No Record found in the sequence file"""
+
+    pass