# NOTE(review): this is a git unified diff for crisprbact/predict.py, not a
# Python module. Its line breaks and indentation appear to have been collapsed,
# so a " + " or " - " may be either a diff marker or a Python operator; read
# the hunks with care. The patch text below is left untouched.
#
# Summary of the change, as far as the hunks show:
# - find_targets: removes the commented-out ("guide", matching_target[:20])
#   entry from the yielded dict.
# - on_target_predict: the inline list comprehension that sliced and encoded
#   each target is replaced by remove_GG_of_PAM() followed by
#   reshape_targets(); the column selection on off_target_df is moved into
#   slice_off_targets_results().
# - on_target_predict: the loop over every feature of an off-target is
#   replaced by code that reads only the first feature (off_t[7][0]) and
#   delegates to is_good_orientation() and get_off_target_feature(). A new
#   condition `seed_size == GUIDE_LEN or
#   off_target_dict["off_target_longest_perfect_match"] != GUIDE_LEN` is
#   added in front of it.
#   NOTE(review): which statements that condition and the re-added `else:`
#   enclose cannot be determined here because indentation is lost - confirm
#   against the real file.
# - Appends five module-level helpers: remove_GG_of_PAM, reshape_targets,
#   slice_off_targets_results, is_good_orientation, get_off_target_feature.
#
# NOTE(review): reshape_targets keeps the trailing comment "encode and remove
#   GG of PAM", but in the new code the slicing (target[:7] + target[9:]) is
#   done in remove_GG_of_PAM; reshape_targets only encodes and reshapes.
# NOTE(review): get_off_target_feature reuses the name `feat` as the loop
#   variable of `for k, feat in feat.qualifiers.items()`, rebinding the
#   parameter inside the loop.
# NOTE(review): the last added line is a commented-out stub,
#   `# def gen_off_target_per_seed_size(seed_sizes):`.
diff --git a/crisprbact/predict.py b/crisprbact/predict.py index 0c395ebefe923bf5f408259311d96f355ea2d55b..3a10c6f7e6a40fe73b8a9b31032b1bda7d4c7d7f 100644 --- a/crisprbact/predict.py +++ b/crisprbact/predict.py @@ -48,7 +48,6 @@ def find_targets(seq): yield dict( [ ("target", matching_target), - # ("guide", matching_target[:20]), ("guide", guide), ("start", pos_seq_start), ("stop", pos_seq_stop), @@ -79,13 +78,9 @@ def on_target_predict(seq, genome=None, seed_sizes=[8, 9, 10, 11, 12, GUIDE_LEN] genome_features = extract_features(records) alltargets = list(find_targets(seq)) if alltargets: - X = np.array( - [ - encode(target["target"][:7] + target["target"][9:]) - for target in alltargets - ] # encode and remove GG of PAM - ) - X = X.reshape(X.shape[0], -1) + gen_targets = (target["target"] for target in alltargets) + gen_clean_targets = remove_GG_of_PAM(gen_targets) + X = reshape_targets(gen_clean_targets) preds = predict(X) for i, target in enumerate(alltargets): target_id = i + 1 @@ -100,20 +95,8 @@ def on_target_predict(seq, genome=None, seed_sizes=[8, 9, 10, 11, 12, GUIDE_LEN] ) off_targets_list = [] if off_target_df is not None and not off_target_df.empty: - off_targets = off_target_df.loc[ - 0:, - [ - "start", - "end", - "pampos", - "strand", - "recid", - "longest_perfect_match", - "pam_seq", - "features", - ], - ] - for j, off_t in enumerate(off_targets.values.tolist()): + off_targets = slice_off_targets_results(off_target_df) + for j, off_t in enumerate(off_targets): off_target_dict = { "off_target_id": str(target_id) + "-" @@ -129,38 +112,28 @@ def on_target_predict(seq, genome=None, seed_sizes=[8, 9, 10, 11, 12, GUIDE_LEN] "off_target_pam_seq": off_t[6], "off_target_good_orientation": None, } - index_features = 7 - # Loop through features associated to an off-target position - if len(off_t[index_features]) > 0: - # Loop for each feature - for feat in off_t[index_features]: - if get_strand_value( - off_target_dict["off_target_strand"] - ) != 
get_strand_value(feat.location.strand): - off_target_dict[ - "off_target_good_orientation" - ] = True - else: - off_target_dict[ - "off_target_good_orientation" - ] = False - - feature_dict = { - "off_target_feat_strand": feat.location.strand, - "off_target_feat_start": feat.location.start, - "off_target_feat_end": feat.location.end, - "off_target_feat_type": feat.type, - } - for k, feat in feat.qualifiers.items(): - if k != "translation": - feature_dict["off_target_" + k] = "::".join( - feat - ) + if ( + seed_size == GUIDE_LEN + or off_target_dict["off_target_longest_perfect_match"] + != GUIDE_LEN + ): + index_features = 7 + # Get feature details associated + # to an off-target position + if len(off_t[index_features]) > 0: + feat = off_t[index_features][0] + off_target_dict[ + "off_target_good_orientation" + ] = is_good_orientation( + feat, off_target_dict["off_target_strand"] + ) + off_target_feature = get_off_target_feature(feat) off_targets_list.append( - {**feature_dict, **off_target_dict} + {**off_target_feature, **off_target_dict} ) - else: - off_targets_list.append(off_target_dict) + else: + off_targets_list.append(off_target_dict) + off_targets_per_seed.append( { "id": str(i) + "-" + str(seed_size), @@ -182,3 +155,53 @@ def on_target_predict(seq, genome=None, seed_sizes=[8, 9, 10, 11, 12, GUIDE_LEN] return alltargets else: return [] + + +def remove_GG_of_PAM(targets): + for target in targets: + yield target[:7] + target[9:] + + +def reshape_targets(targets): + X = np.array([encode(target) for target in targets]) # encode and remove GG of PAM + return X.reshape(X.shape[0], -1) + + +def slice_off_targets_results(off_target_df): + return off_target_df.loc[ + 0:, + [ + "start", + "end", + "pampos", + "strand", + "recid", + "longest_perfect_match", + "pam_seq", + "features", + ], + ].values.tolist() + + +def is_good_orientation(feat, off_target_strand): + if get_strand_value(off_target_strand) != get_strand_value(feat.location.strand): + return True + else: + 
return False + + +def get_off_target_feature(feat): + + feature_dict = { + "off_target_feat_strand": feat.location.strand, + "off_target_feat_start": feat.location.start, + "off_target_feat_end": feat.location.end, + "off_target_feat_type": feat.type, + } + for k, feat in feat.qualifiers.items(): + if k != "translation": + feature_dict["off_target_" + k] = "::".join(feat) + return feature_dict + + +# def gen_off_target_per_seed_size(seed_sizes):