Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
C
CRISPRbact
Manage
Activity
Members
Labels
Plan
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Model registry
Operate
Environments
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
David BIKARD
CRISPRbact
Commits
5d74c21d
Commit
5d74c21d
authored
5 years ago
by
Remi PLANEL
Browse files
Options
Downloads
Patches
Plain Diff
off-target features per record id
parent
222412cd
No related branches found
No related tags found
No related merge requests found
Pipeline
#26159
passed
5 years ago
Stage: test
Changes
1
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
crisprbact/off_target.py
+46
-37
46 additions, 37 deletions
crisprbact/off_target.py
with
46 additions
and
37 deletions
crisprbact/off_target.py
+
46
−
37
View file @
5d74c21d
...
...
@@ -2,6 +2,7 @@ import re
import
pandas
as
pd
from
crisprbact.utils
import
rev_comp
from
itertools
import
chain
def
compute_off_target_df
(
guide
,
seed_size
,
records
,
feature_df
):
...
...
@@ -11,71 +12,79 @@ def compute_off_target_df(guide, seed_size, records, feature_df):
offs_df
=
get_off_target_pos
(
guide
,
records
,
seed_size
)
if
offs_df
is
not
None
:
offs_df
[
"
features
"
]
=
[
get_pos_features
(
off
.
pampos
,
feature_df
)
for
i
,
off
in
offs_df
.
iterrows
()
get_pos_features
(
off
.
recid
,
off
.
pampos
,
feature_df
)
for
i
,
off
in
offs_df
.
iterrows
()
]
return
offs_df
else
:
return
None
def
get_off_target_pos
(
guide
,
recs
,
seed_size
):
if
recs
is
not
None
:
for
rec
in
recs
:
offs_plus
=
re
.
finditer
(
guide
[
-
seed_size
:]
+
"
[ATGC]GG
"
,
str
(
rec
.
seq
))
offs
=
list
(
gen_extract_off_target_strand_plus
(
offs_plus
,
rec
,
guide
,
seed_size
)
def
get_off_target_pos
(
guide
,
records
,
seed_size
):
if
records
is
not
None
:
offs
=
iter
([])
for
rec
in
records
:
sequence_str
=
str
(
rec
.
seq
)
seqid
=
rec
.
id
offs_plus
=
re
.
finditer
(
guide
[
-
seed_size
:]
+
"
[ATGC]GG
"
,
sequence_str
)
offs_plus
=
gen_extract_off_target_strand_plus
(
offs_plus
,
seqid
,
sequence_str
,
guide
,
seed_size
)
# - ori
offs_minus
=
re
.
finditer
(
"
CC[ATGC]
"
+
rev_comp
(
guide
[
-
seed_size
:]),
s
tr
(
rec
.
seq
)
"
CC[ATGC]
"
+
rev_comp
(
guide
[
-
seed_size
:]),
s
equence_str
)
offs
+=
list
(
gen_extract_off_target_strand_minus
(
offs_minus
,
rec
,
guide
,
seed_size
)
offs
_minus
=
gen_extract_off_target_strand_minus
(
offs_minus
,
seqid
,
sequence_str
,
guide
,
seed_size
)
offs
_dict
=
dict
(
zip
(
[
"
start
"
,
"
end
"
,
"
pampos
"
,
"
strand
"
,
"
reci
d
"
,
"
max_matching_len
"
,
"
max_matching_
seq
"
,
"
pam
_seq
"
,
]
,
zip
(
*
offs
)
,
)
offs
=
chain
(
offs
,
offs_plus
,
offs_minus
)
offs_dict
=
dict
(
zip
(
[
"
start
"
,
"
end
"
,
"
pampos
"
,
"
stran
d
"
,
"
recid
"
,
"
max_matching_
len
"
,
"
max_matching
_seq
"
,
"
pam_seq
"
,
]
,
zip
(
*
offs
),
)
return
pd
.
DataFrame
(
offs_dict
)
)
return
pd
.
DataFrame
(
offs_dict
)
else
:
return
None
def
get_pos_features
(
position
,
f_df
):
def
get_pos_features
(
recid
,
position
,
f_df
):
if
len
(
f_df
)
>
0
:
feature_at_pos
=
f_df
[(
f_df
.
start
<
position
)
&
(
f_df
.
end
>
position
)]
feature_at_pos
=
f_df
[
(
f_df
.
start
<
position
)
&
(
f_df
.
end
>
position
)
&
(
f_df
.
recid
==
recid
)
]
return
feature_at_pos
.
feature
.
values
else
:
return
[]
def
gen_extract_off_target_strand_plus
(
off_target_matches
,
rec
,
guide
,
seed_size
):
def
gen_extract_off_target_strand_plus
(
off_target_matches
,
seqid
,
sequence_string
,
guide
,
seed_size
):
for
match
in
off_target_matches
:
guide_subseq
=
guide
[:
20
-
seed_size
][::
-
1
]
# extract part of the sequence
seq_extension_len
=
len
(
guide
)
-
seed_size
start_pos_seq
=
match
.
start
()
-
seq_extension_len
end_pos_seq
=
match
.
end
()
-
seed_size
-
3
sub_sequence_to_match
=
rec
.
seq
[
start_pos_seq
:
end_pos_seq
][::
-
1
]
assert
len
(
sub_sequence_to_match
)
==
len
(
guide_subseq
)
sub_sequence_to_match
=
sequence_string
[
start_pos_seq
:
end_pos_seq
][::
-
1
]
matching_chars
=
list
(
common_start
(
sub_sequence_to_match
,
guide_subseq
))
matching_substr
=
""
.
join
(
matching_chars
)
yield
match
.
span
()
+
(
match
.
end
(),
"
+
"
,
rec
.
id
,
seq
id
,
seed_size
+
len
(
matching_chars
),
matching_substr
[::
-
1
]
+
match
.
group
(
0
)[:
-
3
],
match
.
group
(
0
)[
-
3
:],
...
...
@@ -90,25 +99,25 @@ def common_start(seq1, seq2):
return
def
gen_extract_off_target_strand_minus
(
off_target_matches
,
rec
,
guide
,
seed_size
):
def
gen_extract_off_target_strand_minus
(
off_target_matches
,
seqid
,
sequence_string
,
guide
,
seed_size
):
for
match
in
off_target_matches
:
# Extract the sequence.
# since rev_comp, extend the sequence to the end
seq_extension_len
=
len
(
guide
)
-
seed_size
end_pos_seq
=
match
.
end
()
+
seq_extension_len
start_pos_seq
=
match
.
start
()
+
seed_size
+
3
sub_sequence_to_match
=
rec
.
seq
[
start_pos_seq
:
end_pos_seq
]
sub_sequence_to_match
=
sequence_string
[
start_pos_seq
:
end_pos_seq
]
# Extract part of the guide to match
guide_subseq
=
rev_comp
(
guide
[:
20
-
seed_size
])
assert
len
(
sub_sequence_to_match
)
==
len
(
guide_subseq
)
matching_chars
=
list
(
common_start
(
sub_sequence_to_match
,
guide_subseq
))
matching_substr
=
""
.
join
(
matching_chars
)
yield
match
.
span
()
+
(
match
.
start
(),
"
-
"
,
rec
.
id
,
seq
id
,
seed_size
+
len
(
matching_chars
),
match
.
group
(
0
)[
3
:]
+
matching_substr
,
match
.
group
(
0
)[:
3
],
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment