diff --git a/.gitignore b/.gitignore index c40ef5a8060c03793fa50865996f141d8f1e3ac5..2cf279d7d9e07546491b2c66882aa64923b8bb8b 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,4 @@ dist *.egg-info __pycache__ +data-test/ diff --git a/crisprbact/__init__.py b/crisprbact/__init__.py index ed62d5fa657fefca074e246eff6e9ba5dcd4aa0d..7f60791b466ebe85a087307e7ba0cbdd0f83db58 100644 --- a/crisprbact/__init__.py +++ b/crisprbact/__init__.py @@ -1,3 +1,13 @@ -from .predict import on_target_predict +from crisprbact.predict import on_target_predict +from crisprbact.off_target import ( + compute_off_target_df, + extract_features, + extract_records, +) -__all__ = ["on_target_predict"] +__all__ = [ + "extract_records", + "on_target_predict", + "compute_off_target_df", + "extract_features", +] diff --git a/crisprbact/cli.py b/crisprbact/cli.py index e89b02c2e80fe006ff08c958e3bd3cbf20609f85..594206bbceb3007aaba36a2936d3e70be723eaac 100644 --- a/crisprbact/cli.py +++ b/crisprbact/cli.py @@ -1,4 +1,4 @@ -from crisprbact import on_target_predict +from crisprbact.predict import on_target_predict from Bio import SeqIO import click @@ -48,7 +48,7 @@ def from_str(config, target, output_file): @predict.command() @click.option( - "-t", "--target", type=click.File("rU"), required=True, help="Sequence file" + "-t", "--target", type=click.File("rU"), required=True, help="Sequence file", ) @click.option( "-f", @@ -58,9 +58,10 @@ def from_str(config, target, output_file): default="fasta", show_default=True, ) +# @click.option("-g", "--genome", type=click.File("rU"), required=True, help="Genome") @click.argument("output-file", type=click.File("w"), default="-") @pass_config -def from_seq(config, target, seq_format, output_file): +def from_seq(config, target, seq_format, output_file): # genome, """ Outputs candidate guide RNAs for the S. pyogenes dCas9 with predicted on-target activity from a target gene. @@ -76,6 +77,7 @@ def from_seq(config, target, seq_format, output_file): if config.verbose: click.secho(" - search guide RNAs for %s " % record.id, fg=fg) guide_rnas = on_target_predict(str(record.seq)) + write_guide_rnas(guide_rnas, output_file, record.id) @@ -85,9 +87,7 @@ def print_parameters(target, fg="blue"): def write_guide_rnas(guide_rnas, output_file, seq_id="N/A"): - for guide_rna in guide_rnas: - # click.echo(guide_rna) click.echo( "\t".join( [ diff --git a/crisprbact/off_target.py b/crisprbact/off_target.py new file mode 100644 index 0000000000000000000000000000000000000000..5d711d032969f43a386a0c38f9956c9671464f0c --- /dev/null +++ b/crisprbact/off_target.py @@ -0,0 +1,64 @@ +import re +import pandas as pd + +from crisprbact.utils import rev_comp + + +def get_pos_features(position, f_df): + if len(f_df) > 0: + feature_at_pos = f_df[(f_df.start < position) & (f_df.end > position)] + return feature_at_pos.feature.values + else: + return [] + + +def get_off_target_pos(guide, recs, records): + for rec in recs: + # + ori + offs_plus = re.finditer(guide[-records:] + "[ATGC]GG", str(rec.seq)) + offs = [match.span() + (match.end(), "+", rec.id) for match in offs_plus] + # - ori + offs_minus = re.finditer("CC[ATGC]" + rev_comp(guide[-records:]), str(rec.seq)) + offs += [match.span() + (match.start(), "-", rec.id) for match in offs_minus] + offs_dict = dict(zip(["start", "end", "pampos", "strand", "recid"], zip(*offs))) + return pd.DataFrame(offs_dict) + + +def extract_records(genome): + records = list(genome) + if records and len(records) > 0: + return records + else: + return None + + +def extract_features(recs): + f_list = [] + for rec in recs: + for f in rec.features: + if f.type in ["CDS", "ncRNA", "rRNA", "tRNA"]: + f_list.append( + ( + f.location.start.position, + f.location.end.position, + f.location.strand, + f.type, + f, + rec.id, + ) + ) + f_dict = dict( + zip(["start", "end", "strand", "type", "feature", "recid"], zip(*f_list[1:]),) + ) # starts at 1 to get rid of the first feature which is the whole chromosome + return pd.DataFrame(f_dict) + + +def compute_off_target_df(guide, seed_size, records, feature_df): + """ Returns a pandas DataFrame with data about the identified off-targets. + The features column contains a list of biopython SeqFeature objects that overlap + with the off-target""" + offs_df = get_off_target_pos(guide, records, seed_size) + offs_df["features"] = [ + get_pos_features(off.pampos, feature_df) for i, off in offs_df.iterrows() + ] + return offs_df diff --git a/crisprbact/predict.py b/crisprbact/predict.py index 70b41df5d3e55888e410ac0604f7de71f9761870..a34429ac266b8304282837c3c3d38fff6e90ea42 100644 --- a/crisprbact/predict.py +++ b/crisprbact/predict.py @@ -1,6 +1,12 @@ import numpy as np import re from importlib.resources import open_binary +from crisprbact.utils import rev_comp +from crisprbact.off_target import ( + compute_off_target_df, + extract_records, + extract_features, +) with open_binary("crisprbact", "reg_coef.pkl") as handle: coef = np.load(handle, allow_pickle=True) @@ -19,45 +25,62 @@ def predict(X): return [np.sum(x * coef) for x in X] -def rev_comp(seq): - comp = str.maketrans("ATGC", "TACG") - return seq.translate(comp)[::-1] - - def find_targets(seq): repam = "[ATGC]GG" L = len(seq) seq_revcomp = rev_comp(seq) - return ( - dict( + matching_target = re.finditer("(?=([ATGC]{6}" + repam + "[ATGC]{16}))", seq_revcomp) + for target in matching_target: + yield dict( [ - ("target", m.group(1)), - ("guide", m.group(1)[:20]), - ("start", L - m.start() - 20), - ("stop", L - m.start()), - ("pam", L - m.start() - 22), + ("target", target.group(1)), + ("guide", target.group(1)[:20]), + ("start", L - target.start() - 20), + ("stop", L - target.start()), + ("pam", L - target.start() - 22), ("ori", "-"), ] ) - for m in re.finditer("(?=([ATGC]{6}" + repam + "[ATGC]{16}))", seq_revcomp) - ) -def on_target_predict(seq): +def on_target_predict(seq, genome=None): seq = seq.upper() # make uppercase seq = re.sub(r"\s", "", seq) # removes white space + records = None + genome_features = None + if genome: + records = extract_records(genome) + genome_features = extract_features(records) + alltargets = list(find_targets(seq)) if alltargets: X = np.array( [ - encode(tar["target"][:7] + tar["target"][9:]) for tar in alltargets + encode(target["target"][:7] + target["target"][9:]) + for target in alltargets ] # encode and remove GG of PAM ) X = X.reshape(X.shape[0], -1) preds = predict(X) for i, target in enumerate(alltargets): target.update({"pred": preds[i]}) + if genome: + off_target_df = compute_off_target_df( + target["guide"], 7, records, genome_features + ) + off_target_list = [] + features = off_target_df.loc[0:, "features"] + for feat in features: + features_list = [] + for x in feat: + for k, feat in x.qualifiers.items(): + if k != "translation": + features_list.append((k, feat)) + off_target_list.append(features_list) + target.update({"off_targets": off_target_list}) + else: + target.update({"off_targets": []}) return alltargets else: return [] diff --git a/crisprbact/utils.py b/crisprbact/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..96557dbe7d9d9449f9f34cf30c67b4e4f201fc6f --- /dev/null +++ b/crisprbact/utils.py @@ -0,0 +1,3 @@ +def rev_comp(seq): + comp = str.maketrans("ATGC", "TACG") + return seq.translate(comp)[::-1] diff --git a/poetry.lock b/poetry.lock index 3086dbfbf04d34e99839e7c5869b7b920b66d252..efeb490b88eedf09a2f364bdf43e10a97869c849 100644 --- a/poetry.lock +++ b/poetry.lock @@ -191,6 +191,22 @@ version = "19.2" pyparsing = ">=2.0.2" six = "*" +[[package]] +category = "main" +description = "Powerful data structures for data analysis, time series, and statistics" +name = "pandas" +optional = false +python-versions = ">=3.5.3" +version = "0.25.3" + +[package.dependencies] +numpy = ">=1.13.3" +python-dateutil = ">=2.6.1" +pytz = ">=2017.2" + +[package.extras] +test = ["pytest (>=4.0.2)", "pytest-xdist", "hypothesis (>=3.58)"] + [[package]] category = "dev" description = "Utility library for gitignore style pattern matching of file paths." @@ -294,13 +310,32 @@ version = ">=0.12" [package.extras] testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "requests", "xmlschema"] +[[package]] +category = "main" +description = "Extensions to the standard Python datetime module" +name = "python-dateutil" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" +version = "2.8.1" + +[package.dependencies] +six = ">=1.5" + +[[package]] +category = "main" +description = "World timezone definitions, modern and historical" +name = "pytz" +optional = false +python-versions = "*" +version = "2019.3" + [[package]] category = "dev" description = "YAML parser and emitter for Python" name = "pyyaml" optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" -version = "5.2" +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +version = "5.3" [[package]] category = "dev" @@ -311,7 +346,18 @@ python-versions = "*" version = "2019.12.20" [[package]] -category = "dev" +category = "main" +description = "a python refactoring library..." +name = "rope" +optional = false +python-versions = "*" +version = "0.16.0" + +[package.extras] +dev = ["pytest"] + +[[package]] +category = "main" description = "Python 2 and 3 compatibility utilities" name = "six" optional = false @@ -371,7 +417,7 @@ docs = ["sphinx", "jaraco.packaging (>=3.2)", "rst.linker (>=1.9)"] testing = ["pathlib2", "contextlib2", "unittest2"] [metadata] -content-hash = "74295930225fa16a026b2f57e2f50e2bb3919702d8845b1f834a5217b8d655a4" +content-hash = "fd6a383dbb81068f75ead2da55eccaea5f426d79758f22ebaf6eef7ac197234a" python-versions = "^3.7" [metadata.files] @@ -489,6 +535,27 @@ packaging = [ {file = "packaging-19.2-py2.py3-none-any.whl", hash = "sha256:d9551545c6d761f3def1677baf08ab2a3ca17c56879e70fecba2fc4dde4ed108"}, {file = "packaging-19.2.tar.gz", hash = "sha256:28b924174df7a2fa32c1953825ff29c61e2f5e082343165438812f00d3a7fc47"}, ] +pandas = [ + {file = "pandas-0.25.3-cp35-cp35m-macosx_10_6_intel.whl", hash = "sha256:df8864824b1fe488cf778c3650ee59c3a0d8f42e53707de167ba6b4f7d35f133"}, + {file = "pandas-0.25.3-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:7458c48e3d15b8aaa7d575be60e1e4dd70348efcd9376656b72fecd55c59a4c3"}, + {file = "pandas-0.25.3-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:61741f5aeb252f39c3031d11405305b6d10ce663c53bc3112705d7ad66c013d0"}, + {file = "pandas-0.25.3-cp35-cp35m-win32.whl", hash = "sha256:adc3d3a3f9e59a38d923e90e20c4922fc62d1e5a03d083440468c6d8f3f1ae0a"}, + {file = "pandas-0.25.3-cp35-cp35m-win_amd64.whl", hash = "sha256:975c461accd14e89d71772e89108a050fa824c0b87a67d34cedf245f6681fc17"}, + {file = "pandas-0.25.3-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:ee50c2142cdcf41995655d499a157d0a812fce55c97d9aad13bc1eef837ed36c"}, + {file = "pandas-0.25.3-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:4545467a637e0e1393f7d05d61dace89689ad6d6f66f267f86fff737b702cce9"}, + {file = "pandas-0.25.3-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:bbe3eb765a0b1e578833d243e2814b60c825b7fdbf4cdfe8e8aae8a08ed56ecf"}, + {file = "pandas-0.25.3-cp36-cp36m-win32.whl", hash = "sha256:8153705d6545fd9eb6dd2bc79301bff08825d2e2f716d5dced48daafc2d0b81f"}, + {file = "pandas-0.25.3-cp36-cp36m-win_amd64.whl", hash = "sha256:26382aab9c119735908d94d2c5c08020a4a0a82969b7e5eefb92f902b3b30ad7"}, + {file = "pandas-0.25.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:00dff3a8e337f5ed7ad295d98a31821d3d0fe7792da82d78d7fd79b89c03ea9d"}, + {file = "pandas-0.25.3-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:e45055c30a608076e31a9fcd780a956ed3b1fa20db61561b8d88b79259f526f7"}, + {file = "pandas-0.25.3-cp37-cp37m-win32.whl", hash = "sha256:255920e63850dc512ce356233081098554d641ba99c3767dde9e9f35630f994b"}, + {file = "pandas-0.25.3-cp37-cp37m-win_amd64.whl", hash = "sha256:22361b1597c8c2ffd697aa9bf85423afa9e1fcfa6b1ea821054a244d5f24d75e"}, + {file = "pandas-0.25.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9962957a27bfb70ab64103d0a7b42fa59c642fb4ed4cb75d0227b7bb9228535d"}, + {file = "pandas-0.25.3-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:78bf638993219311377ce9836b3dc05f627a666d0dbc8cec37c0ff3c9ada673b"}, + {file = "pandas-0.25.3-cp38-cp38-win32.whl", hash = "sha256:6a3ac2c87e4e32a969921d1428525f09462770c349147aa8e9ab95f88c71ec71"}, + {file = "pandas-0.25.3-cp38-cp38-win_amd64.whl", hash = "sha256:33970f4cacdd9a0ddb8f21e151bfb9f178afb7c36eb7c25b9094c02876f385c2"}, + {file = "pandas-0.25.3.tar.gz", hash = "sha256:52da74df8a9c9a103af0a72c9d5fdc8e0183a90884278db7f386b5692a2220a4"}, +] pathspec = [ {file = "pathspec-0.7.0-py2.py3-none-any.whl", hash = "sha256:163b0632d4e31cef212976cf57b43d9fd6b0bac6e67c26015d611a647d5e7424"}, {file = "pathspec-0.7.0.tar.gz", hash = "sha256:562aa70af2e0d434367d9790ad37aed893de47f1693e4201fd1d3dca15d19b96"}, @@ -521,18 +588,26 @@ pytest = [ {file = "pytest-5.2.2-py3-none-any.whl", hash = "sha256:58cee9e09242937e136dbb3dab466116ba20d6b7828c7620f23947f37eb4dae4"}, {file = "pytest-5.2.2.tar.gz", hash = "sha256:27abc3fef618a01bebb1f0d6d303d2816a99aa87a5968ebc32fe971be91eb1e6"}, ] +python-dateutil = [ + {file = "python-dateutil-2.8.1.tar.gz", hash = "sha256:73ebfe9dbf22e832286dafa60473e4cd239f8592f699aa5adaf10050e6e1823c"}, + {file = "python_dateutil-2.8.1-py2.py3-none-any.whl", hash = "sha256:75bb3f31ea686f1197762692a9ee6a7550b59fc6ca3a1f4b5d7e32fb98e2da2a"}, +] +pytz = [ + {file = "pytz-2019.3-py2.py3-none-any.whl", hash = "sha256:1c557d7d0e871de1f5ccd5833f60fb2550652da6be2693c1e02300743d21500d"}, + {file = "pytz-2019.3.tar.gz", hash = "sha256:b02c06db6cf09c12dd25137e563b31700d3b80fcc4ad23abb7a315f2789819be"}, +] pyyaml = [ - {file = "PyYAML-5.2-cp27-cp27m-win32.whl", hash = "sha256:35ace9b4147848cafac3db142795ee42deebe9d0dad885ce643928e88daebdcc"}, - {file = "PyYAML-5.2-cp27-cp27m-win_amd64.whl", hash = "sha256:ebc4ed52dcc93eeebeae5cf5deb2ae4347b3a81c3fa12b0b8c976544829396a4"}, - {file = "PyYAML-5.2-cp35-cp35m-win32.whl", hash = "sha256:38a4f0d114101c58c0f3a88aeaa44d63efd588845c5a2df5290b73db8f246d15"}, - {file = "PyYAML-5.2-cp35-cp35m-win_amd64.whl", hash = "sha256:483eb6a33b671408c8529106df3707270bfacb2447bf8ad856a4b4f57f6e3075"}, - {file = "PyYAML-5.2-cp36-cp36m-win32.whl", hash = "sha256:7f38e35c00e160db592091751d385cd7b3046d6d51f578b29943225178257b31"}, - {file = "PyYAML-5.2-cp36-cp36m-win_amd64.whl", hash = "sha256:0e7f69397d53155e55d10ff68fdfb2cf630a35e6daf65cf0bdeaf04f127c09dc"}, - {file = "PyYAML-5.2-cp37-cp37m-win32.whl", hash = "sha256:e4c015484ff0ff197564917b4b4246ca03f411b9bd7f16e02a2f586eb48b6d04"}, - {file = "PyYAML-5.2-cp37-cp37m-win_amd64.whl", hash = "sha256:4b6be5edb9f6bb73680f5bf4ee08ff25416d1400fbd4535fe0069b2994da07cd"}, - {file = "PyYAML-5.2-cp38-cp38-win32.whl", hash = "sha256:8100c896ecb361794d8bfdb9c11fce618c7cf83d624d73d5ab38aef3bc82d43f"}, - {file = "PyYAML-5.2-cp38-cp38-win_amd64.whl", hash = "sha256:2e9f0b7c5914367b0916c3c104a024bb68f269a486b9d04a2e8ac6f6597b7803"}, - {file = "PyYAML-5.2.tar.gz", hash = "sha256:c0ee8eca2c582d29c3c2ec6e2c4f703d1b7f1fb10bc72317355a746057e7346c"}, + {file = "PyYAML-5.3-cp27-cp27m-win32.whl", hash = "sha256:940532b111b1952befd7db542c370887a8611660d2b9becff75d39355303d82d"}, + {file = "PyYAML-5.3-cp27-cp27m-win_amd64.whl", hash = "sha256:059b2ee3194d718896c0ad077dd8c043e5e909d9180f387ce42012662a4946d6"}, + {file = "PyYAML-5.3-cp35-cp35m-win32.whl", hash = "sha256:4fee71aa5bc6ed9d5f116327c04273e25ae31a3020386916905767ec4fc5317e"}, + {file = "PyYAML-5.3-cp35-cp35m-win_amd64.whl", hash = "sha256:dbbb2379c19ed6042e8f11f2a2c66d39cceb8aeace421bfc29d085d93eda3689"}, + {file = "PyYAML-5.3-cp36-cp36m-win32.whl", hash = "sha256:e3a057b7a64f1222b56e47bcff5e4b94c4f61faac04c7c4ecb1985e18caa3994"}, + {file = "PyYAML-5.3-cp36-cp36m-win_amd64.whl", hash = "sha256:74782fbd4d4f87ff04159e986886931456a1894c61229be9eaf4de6f6e44b99e"}, + {file = "PyYAML-5.3-cp37-cp37m-win32.whl", hash = "sha256:24521fa2890642614558b492b473bee0ac1f8057a7263156b02e8b14c88ce6f5"}, + {file = "PyYAML-5.3-cp37-cp37m-win_amd64.whl", hash = "sha256:1cf708e2ac57f3aabc87405f04b86354f66799c8e62c28c5fc5f88b5521b2dbf"}, + {file = "PyYAML-5.3-cp38-cp38-win32.whl", hash = "sha256:70024e02197337533eef7b85b068212420f950319cc8c580261963aefc75f811"}, + {file = "PyYAML-5.3-cp38-cp38-win_amd64.whl", hash = "sha256:cb1f2f5e426dc9f07a7681419fe39cee823bb74f723f36f70399123f439e9b20"}, + {file = "PyYAML-5.3.tar.gz", hash = "sha256:e9f45bd5b92c7974e59bcd2dcc8631a6b6cc380a904725fce7bc08872e691615"}, ] regex = [ {file = "regex-2019.12.20-cp27-cp27m-win32.whl", hash = "sha256:7bbbdbada3078dc360d4692a9b28479f569db7fc7f304b668787afc9feb38ec8"}, @@ -557,6 +632,11 @@ regex = [ {file = "regex-2019.12.20-cp38-cp38-win_amd64.whl", hash = "sha256:d3ee0b035816e0520fac928de31b6572106f0d75597f6fa3206969a02baba06f"}, {file = "regex-2019.12.20.tar.gz", hash = "sha256:106e25a841921d8259dcef2a42786caae35bc750fb996f830065b3dfaa67b77e"}, ] +rope = [ + {file = "rope-0.16.0-py2-none-any.whl", hash = "sha256:ae1fa2fd56f64f4cc9be46493ce54bed0dd12dee03980c61a4393d89d84029ad"}, + {file = "rope-0.16.0-py3-none-any.whl", hash = "sha256:52423a7eebb5306a6d63bdc91a7c657db51ac9babfb8341c9a1440831ecf3203"}, + {file = "rope-0.16.0.tar.gz", hash = "sha256:d2830142c2e046f5fc26a022fe680675b6f48f81c7fc1f03a950706e746e9dfe"}, +] six = [ {file = "six-1.13.0-py2.py3-none-any.whl", hash = "sha256:1f1b7d42e254082a9db6279deae68afb421ceba6158efa6131de7b3003ee93fd"}, {file = "six-1.13.0.tar.gz", hash = "sha256:30f610279e8b2578cab6db20741130331735c781b56053c59c4076da27f06b66"}, diff --git a/pyproject.toml b/pyproject.toml index d5450c547b6de0a99216d6bba982b321dc14ed7b..003f0355af653abc60f819747e8cc33e544a0864 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,6 +22,8 @@ python = "^3.7" numpy = "^1.17" click = "^7.0" biopython = "^1.75" +rope = "^0.16.0" +pandas = "^0.25.3" [tool.poetry.dev-dependencies] pytest = "^5.2" @@ -32,10 +34,6 @@ black = "^19.10b0" [tool.poetry.scripts] crisprbact= "crisprbact.cli:main" -[build-system] -requires = ["poetry>=0.12"] -build-backend = "poetry.masonry.api" - [tool.black] target-version = ['py37'] include = '\.pyi?$' @@ -60,6 +58,10 @@ exclude = ''' ) ''' +[build-system] +requires = ["poetry>=0.12"] +build-backend = "poetry.masonry.api" + [flake8] max-line-length = 89 max-complexity = 18