diff --git a/CHANGELOG b/CHANGELOG index 9e5c7de098ee5f5ae441850e7492cc81d7836ada..6bc3da3acd9df30c8d0dcc1106afd4640b4043a4 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,3 +1,9 @@ +# 0.3.3 +- Compute prediction for seeds 8-12 +- Prefix off-target feature key result with "off_target" +- Change header of output format +- Change some of the command lines : -g => -s and -gf => -w + # 0.3.2 - Fix bugg when no off-target feature - Handle genome in fasta file diff --git a/README.md b/README.md index 0a6acf83b6dd28f0c3c3c6ed108095aa2c6d07e0..c5c5bdee463f6f1e62143582527f878a90dce1cf 100644 --- a/README.md +++ b/README.md @@ -46,10 +46,10 @@ for guide_rna in guide_rnas: _output :_ ``` -{'target': 'TCATCACGGGCCTTCGCCACGCGCG', 'guide': 'TCATCACGGGCCTTCGCCAC', 'start': 82, 'stop': 102, 'pam': 80, 'ori': '-', 'pred': -0.4719254873780802} -{'target': 'CATCACGGGCCTTCGCCACGCGCGC', 'guide': 'CATCACGGGCCTTCGCCACG', 'start': 81, 'stop': 101, 'pam': 79, 'ori': '-', 'pred': 1.0491308060379676} -{'target': 'CGCGCGCGGCAAACAATCACAAACA', 'guide': 'CGCGCGCGGCAAACAATCAC', 'start': 63, 'stop': 83, 'pam': 61, 'ori': '-', 'pred': -0.9021152826078697} -{'target': 'CCTGATCGGTATTGAACAGCATCTG', 'guide': 'CCTGATCGGTATTGAACAGC', 'start': 29, 'stop': 49, 'pam': 27, 'ori': '-', 'pred': 0.23853258873311955} +{'target': 'TCATCACGGGCCTTCGCCACGCGCG', 'guide': 'TCATCACGGGCCTTCGCCAC', 'start': 82, 'stop': 102, 'pam': 80, 'ori': '-', 'target_id': 1, 'pred': -0.4719254873780802, 'off_targets_per_seed': []} +{'target': 'CATCACGGGCCTTCGCCACGCGCGC', 'guide': 'CATCACGGGCCTTCGCCACG', 'start': 81, 'stop': 101, 'pam': 79, 'ori': '-', 'target_id': 2, 'pred': 1.0491308060379676, 'off_targets_per_seed': []} +{'target': 'CGCGCGCGGCAAACAATCACAAACA', 'guide': 'CGCGCGCGGCAAACAATCAC', 'start': 63, 'stop': 83, 'pam': 61, 'ori': '-', 'target_id': 3, 'pred': -0.9021152826078697, 'off_targets_per_seed': []} +{'target': 'CCTGATCGGTATTGAACAGCATCTG', 'guide': 'CCTGATCGGTATTGAACAGC', 'start': 29, 'stop': 49, 'pam': 27, 'ori': '-', 'target_id': 4, 'pred': 0.23853258873311955, 'off_targets_per_seed': []} ``` ### Command line interface @@ -82,7 +82,7 @@ $ crisprbact predict from-str --help ``` ``` -Usage: crisprbact predict from-str [OPTIONS] [OUTPUT_FILE] +Usage: cli.py predict from-str [OPTIONS] [OUTPUT_FILE] Outputs candidate guide RNAs for the S. pyogenes dCas9 with predicted on- target activity from a target gene. @@ -91,8 +91,15 @@ Usage: crisprbact predict from-str [OPTIONS] [OUTPUT_FILE] "stdout" Options: - -t, --target TEXT [required] - --help Show this message and exit. + -t, --target TEXT Sequence file to target [required] + -s, --off-target-sequence FILENAME + Sequence in which you want to find off- + targets + -w, --off-target-sequence-format [fasta|gb|genbank] + Sequence in which you want to find off- + targets format [default: genbank] + --help Show this message and exit. + ``` ```console @@ -124,7 +131,7 @@ $ crisprbact predict from-seq --help ``` ``` -Usage: crisprbact predict from-seq [OPTIONS] [OUTPUT_FILE] +Usage: cli.py predict from-seq [OPTIONS] [OUTPUT_FILE] Outputs candidate guide RNAs for the S. pyogenes dCas9 with predicted on- target activity from a target gene. @@ -133,9 +140,16 @@ Usage: crisprbact predict from-seq [OPTIONS] [OUTPUT_FILE] "stdout" Options: - -t, --target FILENAME Sequence file [required] - -f, --seq-format [fasta|fa|gb|genbank] - Sequence file format [default: fasta] + -t, --target FILENAME Sequence file to target [required] + -f, --seq-format [fasta|gb|genbank] + Sequence file to target format [default: + fasta] + -s, --off-target-sequence FILENAME + Sequence in which you want to find off- + targets + -w, --off-target-sequence-format [fasta|gb|genbank] + Sequence in which you want to find off- + targets format [default: genbank] --help Show this message and exit. ``` @@ -151,17 +165,22 @@ $ crisprbact predict from-seq -t /tmp/seq.fasta guide-rnas.tsv $ crisprbact predict from-seq -t /tmp/seq.gb -f gb guide-rnas.tsv ``` +- Off-targets + +```console +predict from-seq -t data-test/sequence.fasta -s data-test/sequence.gb guide-rnas.tsv +``` + ##### Output file ``` -target PAM position prediction input_id -ATTTGTTGGCAACCCAGCCAGCCTT 855 -0.7310112260341689 CP027060.1 -CACGTCCGGCAATATTTCCGCGAAC 830 0.14773859036983505 CP027060.1 -TCCGAGCGGCAACGTCTCTGATAAC 799 -0.4922487382950619 CP027060.1 -GCTTAAAGGGCAAAATGTCACGCCT 769 -1.814666749464254 CP027060.1 -CTTAAAGGGCAAAATGTCACGCCTT 768 -0.4285147731290152 CP027060.1 -CGTTTGAGGAGATCCACAAAATTAT 732 -1.2437430146548256 CP027060.1 -CATGAATGGTATAAACCGGCGTGCC 680 -0.8043242669169294 CP027060.1 +target_id target PAM position prediction target_seq_id seed_size off_target_recid off_target_start off_target_end off_target_pampos off_target_strand off_target_feat_type off_target_feat_start off_target_feat_end off_target_feat_strand off_target_locus_tag off_target_gene off_target_note off_target_product off_target_protein_id +1 TGATCCAGGCATTTTTTAGCTTCAT 835 0.47949500169043713 NC_017634.1:2547433-2548329 8 NC_017634.1 1388198 1388209 1388209 + +1 TGATCCAGGCATTTTTTAGCTTCAT 835 0.47949500169043713 NC_017634.1:2547433-2548329 8 NC_017634.1 2244514 2244525 2244525 + CDS 2243562 2244720 -1 NRG857_10810 COG1174 ABC-type proline/glycine betaine transport systems, permease component putative transport system permease YP_006120510.1 +1 TGATCCAGGCATTTTTTAGCTTCAT 835 0.47949500169043713 NC_017634.1:2547433-2548329 8 NC_017634.1 4160984 4160995 4160995 + CDS 4160074 4161406 -1 NRG857_19625 hslU COG1220 ATP-dependent protease HslVU (ClpYQ), ATPase subunit ATP-dependent protease ATP-binding subunit HslU YP_006122267.1 +1 TGATCCAGGCATTTTTTAGCTTCAT 835 0.47949500169043713 NC_017634.1:2547433-2548329 8 NC_017634.1 4534189 4534200 4534200 + +1 TGATCCAGGCATTTTTTAGCTTCAT 835 0.47949500169043713 NC_017634.1:2547433-2548329 8 NC_017634.1 548804 548815 548804 - +1 TGATCCAGGCATTTTTTAGCTTCAT 835 0.47949500169043713 NC_017634.1:2547433-2548329 8 NC_017634.1 786462 786473 786462 - CDS 785384 786470 1 NRG857_03580 COG2055 Malate/L-lactate dehydrogenases hypothetical protein YP_006119079.1 ``` diff --git a/crisprbact/cli.py b/crisprbact/cli.py index 4eeaebd2e2700a0dd9450dcef52143d77f21264b..8f7552fd8f56106baefabc0faaf0ba7831e70bdf 100644 --- a/crisprbact/cli.py +++ b/crisprbact/cli.py @@ -10,7 +10,32 @@ class Config(object): pass_config = click.make_pass_decorator(Config, ensure=True) -HEADER = ["target", "PAM position", "prediction", "seq_id"] +OFF_TARGET_DETAILS = [ + "off_target_recid", + "off_target_start", + "off_target_end", + "off_target_pampos", + "off_target_strand", + "off_target_feat_type", + "off_target_feat_start", + "off_target_feat_end", + "off_target_feat_strand", + "off_target_locus_tag", + "off_target_gene", + "off_target_note", + "off_target_product", + "off_target_protein_id", +] +HEADER = [ + "target_id", + "target", + "PAM position", + "prediction", + "target_seq_id", + "seed_size", +] + OFF_TARGET_DETAILS +SEED_SIZE = 8 +GENOME_FORMAT = "genbank" @click.group() @@ -27,10 +52,28 @@ def predict(config): @predict.command() -@click.option("-t", "--target", type=str, required=True) +@click.option( + "-t", "--target", type=str, required=True, help="Sequence file to target", +) +@click.option( + "-s", + "--off-target-sequence", + type=click.File("rU"), + help="Sequence in which you want to find off-targets", +) +@click.option( + "-w", + "--off-target-sequence-format", + type=click.Choice(["fasta", "gb", "genbank"]), + default=GENOME_FORMAT, + show_default=True, + help="Sequence in which you want to find off-targets format", +) @click.argument("output-file", type=click.File("w"), default="-") @pass_config -def from_str(config, target, output_file): +def from_str( + config, target, off_target_sequence, off_target_sequence_format, output_file +): """ Outputs candidate guide RNAs for the S. pyogenes dCas9 with predicted on-target activity from a target gene. @@ -40,28 +83,59 @@ def from_str(config, target, output_file): """ if config.verbose: print_parameters(target) + if off_target_sequence: + genome_fh = SeqIO.parse(off_target_sequence, off_target_sequence_format) + else: + genome_fh = None + guide_rnas = on_target_predict(target, genome_fh) - guide_rnas = on_target_predict(target) click.echo("\t".join(HEADER), file=output_file) write_guide_rnas(guide_rnas, output_file) @predict.command() @click.option( - "-t", "--target", type=click.File("rU"), required=True, help="Sequence file", + "-t", + "--target", + type=click.File("rU"), + required=True, + help="Sequence file to target", ) @click.option( "-f", "--seq-format", - type=click.Choice(["fasta", "fa", "gb", "genbank"]), - help="Sequence file format", + type=click.Choice(["fasta", "gb", "genbank"]), + help="Sequence file to target format", default="fasta", show_default=True, ) -# @click.option("-g", "--genome", type=click.File("rU"), required=True, help="Genome") +# @click.option( +# "-s", "--seed-size", type=click.IntRange(8, 15, clamp=True), +# ) +@click.option( + "-s", + "--off-target-sequence", + type=click.File("rU"), + help="Sequence in which you want to find off-targets", +) +@click.option( + "-w", + "--off-target-sequence-format", + type=click.Choice(["fasta", "gb", "genbank"]), + default=GENOME_FORMAT, + show_default=True, + help="Sequence in which you want to find off-targets format", +) @click.argument("output-file", type=click.File("w"), default="-") @pass_config -def from_seq(config, target, seq_format, output_file): # genome, +def from_seq( + config, + target, + seq_format, + off_target_sequence, + off_target_sequence_format, + output_file, +): """ Outputs candidate guide RNAs for the S. pyogenes dCas9 with predicted on-target activity from a target gene. @@ -76,9 +150,12 @@ def from_seq(config, target, seq_format, output_file): # genome, for record in SeqIO.parse(target, seq_format): if config.verbose: click.secho(" - search guide RNAs for %s " % record.id, fg=fg) - # g = SeqIO.parse(genome, "genbank") - guide_rnas = on_target_predict(str(record.seq)) - + if off_target_sequence: + genome_fh = SeqIO.parse(off_target_sequence, off_target_sequence_format) + else: + genome_fh = None + guide_rnas = on_target_predict(str(record.seq), genome_fh) + # print(guide_rnas) write_guide_rnas(guide_rnas, output_file, record.id) @@ -89,17 +166,34 @@ def print_parameters(target, fg="blue"): def write_guide_rnas(guide_rnas, output_file, seq_id="N/A"): for guide_rna in guide_rnas: - click.echo( - "\t".join( - [ - guide_rna["target"], - str(guide_rna["pam"]), - str(guide_rna["pred"]), - seq_id, - ] - ), - file=output_file, - ) + row = [ + str(guide_rna["target_id"]), + guide_rna["target"], + str(guide_rna["pam"]), + str(guide_rna["pred"]), + seq_id, + ] + if len(guide_rna["off_targets_per_seed"]) > 0: + for off_target_per_seed in guide_rna["off_targets_per_seed"]: + for off_target in off_target_per_seed["off_targets"]: + seed_size = off_target_per_seed["seed_size"] + + def extract_off_target_detail(key): + if key in off_target: + return str(off_target[key]) + else: + return "" + + details = map(extract_off_target_detail, OFF_TARGET_DETAILS) + click.echo( + "\t".join(row + [str(seed_size)] + list(details)), + file=output_file, + ) + else: + click.echo( + "\t".join(row + list(map(lambda x: "", OFF_TARGET_DETAILS))), + file=output_file, + ) if __name__ == "__main__": diff --git a/crisprbact/predict.py b/crisprbact/predict.py index 20e97ddbe977b7aff70b432d9dd3247319b95e5a..1dd657cb1bafbf30fc2981cdfff1d76c17c793bd 100644 --- a/crisprbact/predict.py +++ b/crisprbact/predict.py @@ -43,7 +43,12 @@ def find_targets(seq): ) -def on_target_predict(seq, genome=None, seed_size=7): +def get_strand_value(value): + strand_dict = {"+": 1, "1": 1, "-": -1, "-1": -1} + return strand_dict[str(value)] + + +def on_target_predict(seq, genome=None, seed_sizes=[8, 9, 10, 11, 12]): seq = seq.upper() # make uppercase seq = re.sub(r"\s", "", seq) # removes white space @@ -65,47 +70,85 @@ def on_target_predict(seq, genome=None, seed_size=7): preds = predict(X) for i, target in enumerate(alltargets): + target_id = i + 1 + target.update({"target_id": target_id}) target.update({"pred": preds[i]}) - target.update({"seed_size": seed_size}) if genome: - off_target_df = compute_off_target_df( - target["guide"], seed_size, records, genome_features - ) - off_targets_list = [] - if not off_target_df.empty: - off_targets = off_target_df.loc[ - 0:, ["start", "end", "pampos", "strand", "recid", "features"] - ] - for index, off_t in enumerate(off_targets.values.tolist()): - off_target_dict = { - "off_target_start": off_t[0], - "off_target_end": off_t[1], - "pampos": off_t[2], - "strand": off_t[3], - "recid": off_t[4], - } - if len(off_t[5]) > 0: - # Loop for each feature - for feat in off_t[5]: - feature_dict = { - "feat_strand": feat.location.strand, - "feat_start": feat.location.start, - "feat_end": feat.location.end, - "feat_type": feat.type, - } - for k, feat in feat.qualifiers.items(): - if k != "translation": - feature_dict[k] = "::".join(feat) - off_targets_list.append( - {**feature_dict, **off_target_dict} + off_targets_per_seed = [] + for seed_size in seed_sizes: + # off-target found for a guide + off_target_df = compute_off_target_df( + target["guide"], seed_size, records, genome_features + ) + off_targets_list = [] + if not off_target_df.empty: + off_targets = off_target_df.loc[ + 0:, + ["start", "end", "pampos", "strand", "recid", "features"], + ] + for j, off_t in enumerate(off_targets.values.tolist()): + off_target_dict = { + "off_target_id": str(target_id) + + "-" + + str(seed_size) + + "-" + + str(j), + "off_target_start": off_t[0], + "off_target_end": off_t[1], + "off_target_pampos": off_t[2], + "off_target_strand": off_t[3], + "off_target_recid": off_t[4], + } + # Filter the off targets if the strand + # is not the opposite of the feature + off_t[5] = list( + filter( + lambda feat: get_strand_value( + off_target_dict["off_target_strand"] + ) + != get_strand_value(feat.location.strand), + off_t[5], ) - else: - off_targets_list.append(off_target_dict) - target.update({"off_targets": off_targets_list}) - else: - target.update({"off_targets": off_targets_list}) + ) + # Loop through features associated to an off-target position + if len(off_t[5]) > 0: + # Loop for each feature + for feat in off_t[5]: + feature_dict = { + "off_target_feat_strand": feat.location.strand, + "off_target_feat_start": feat.location.start, + "off_target_feat_end": feat.location.end, + "off_target_feat_type": feat.type, + } + for k, feat in feat.qualifiers.items(): + if k != "translation": + feature_dict["off_target_" + k] = "::".join( + feat + ) + off_targets_list.append( + {**feature_dict, **off_target_dict} + ) + else: + off_targets_list.append(off_target_dict) + off_targets_per_seed.append( + { + "id": str(i) + "-" + str(seed_size), + "seed_size": seed_size, + "off_targets": off_targets_list, + } + ) + else: + off_targets_per_seed.append( + { + "id": str(i) + "-" + str(seed_size), + "seed_size": seed_size, + "off_targets": [], + } + ) + # target.update({"off_targets": off_targets_list}) + target.update({"off_targets_per_seed": off_targets_per_seed}) else: - target.update({"off_targets": []}) + target.update({"off_targets_per_seed": []}) return alltargets else: return [] diff --git a/pyproject.toml b/pyproject.toml index 6bab28dbacede95d8211712738043a5ae34e9a20..ad1f1868cb6f10c2b23ee673b6256c005b7ce18d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "crisprbact" -version = "0.3.2" +version = "0.3.3" license = "GPL-3.0" description = "Tools to design and analyse CRISPRi experiments" authors = ["David Bikard <david.bikard@pasteur.fr>", "Remi Planel <rplanel@pasteur.fr>"]