diff --git a/crisprbact/predict.py b/crisprbact/predict.py index 20e97ddbe977b7aff70b432d9dd3247319b95e5a..00f3aeabcba89cb2d18245d2620d84d21aa270bf 100644 --- a/crisprbact/predict.py +++ b/crisprbact/predict.py @@ -43,7 +43,12 @@ def find_targets(seq): ) -def on_target_predict(seq, genome=None, seed_size=7): +def get_strand_value(value): + strand_dict = {"+": 1, "1": 1, "-": -1, "-1": -1} + return strand_dict[str(value)] + + +def on_target_predict(seq, genome=None, seed_sizes=[8, 9, 10, 11, 12]): seq = seq.upper() # make uppercase seq = re.sub(r"\s", "", seq) # removes white space @@ -65,47 +70,80 @@ def on_target_predict(seq, genome=None, seed_size=7): preds = predict(X) for i, target in enumerate(alltargets): + target_id = i + 1 + target.update({"id": target_id}) target.update({"pred": preds[i]}) - target.update({"seed_size": seed_size}) if genome: - off_target_df = compute_off_target_df( - target["guide"], seed_size, records, genome_features - ) - off_targets_list = [] - if not off_target_df.empty: - off_targets = off_target_df.loc[ - 0:, ["start", "end", "pampos", "strand", "recid", "features"] - ] - for index, off_t in enumerate(off_targets.values.tolist()): - off_target_dict = { - "off_target_start": off_t[0], - "off_target_end": off_t[1], - "pampos": off_t[2], - "strand": off_t[3], - "recid": off_t[4], - } - if len(off_t[5]) > 0: - # Loop for each feature - for feat in off_t[5]: - feature_dict = { - "feat_strand": feat.location.strand, - "feat_start": feat.location.start, - "feat_end": feat.location.end, - "feat_type": feat.type, - } - for k, feat in feat.qualifiers.items(): - if k != "translation": - feature_dict[k] = "::".join(feat) - off_targets_list.append( - {**feature_dict, **off_target_dict} + off_targets_per_seed = [] + for seed_size in seed_sizes: + off_target_df = compute_off_target_df( + target["guide"], seed_size, records, genome_features + ) + off_targets_list = [] + if not off_target_df.empty: + off_targets = off_target_df.loc[ + 0:, + ["start", "end", "pampos", "strand", "recid", "features"], + ] + for j, off_t in enumerate(off_targets.values.tolist()): + off_target_dict = { + "id": str(target_id) + + "-" + + str(seed_size) + + "-" + + str(j), + "off_target_start": off_t[0], + "off_target_end": off_t[1], + "off_target_pampos": off_t[2], + "off_target_strand": off_t[3], + "off_target_recid": off_t[4], + } + + off_t[5] = list( + filter( + lambda feat: get_strand_value( + off_target_dict["off_target_strand"] + ) + != get_strand_value(feat.location.strand), + off_t[5], ) - else: - off_targets_list.append(off_target_dict) - target.update({"off_targets": off_targets_list}) - else: - target.update({"off_targets": off_targets_list}) + ) + if len(off_t[5]) > 0: + # Loop for each feature + for feat in off_t[5]: + feature_dict = { + "off_target_feat_strand": feat.location.strand, + "off_target_feat_start": feat.location.start, + "off_target_feat_end": feat.location.end, + "off_target_feat_type": feat.type, + } + for k, feat in feat.qualifiers.items(): + if k != "translation": + feature_dict[k] = "::".join(feat) + off_targets_list.append( + {**feature_dict, **off_target_dict} + ) + else: + off_targets_list.append(off_target_dict) + off_targets_per_seed.append( + { + "id": str(i) + "-" + str(seed_size), + "seed_size": seed_size, + "off_targets": off_targets_list, + } + ) + else: + off_targets_per_seed.append( + { + "id": str(i) + "-" + str(seed_size), + "seed_size": seed_size, + "off_targets": off_targets_list, + } + ) + # target.update({"off_targets": off_targets_list}) + target.update({"off_targets_per_seed": off_targets_per_seed}) else: - target.update({"off_targets": []}) + target.update({"off_targets_per_seed": []}) return alltargets else: return []