Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
R
rpg
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Nicolas MAILLET
rpg
Commits
dae3e53a
Commit
dae3e53a
authored
4 years ago
by
Nicolas MAILLET
Browse files
Options
Downloads
Patches
Plain Diff
Add parallel execution
parent
2c070bd1
No related branches found
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
rpg/digest.py
+87
-59
87 additions, 59 deletions
rpg/digest.py
with
87 additions
and
59 deletions
rpg/digest.py
+
87
−
59
View file @
dae3e53a
...
@@ -25,6 +25,9 @@
...
@@ -25,6 +25,9 @@
"""
Contains class and function needed to perform a digestion
"""
"""
Contains class and function needed to perform a digestion
"""
import
os
import
os
import
random
import
random
import
sys
from
multiprocessing
import
Pool
from
functools
import
partial
from
rpg
import
core
from
rpg
import
core
from
rpg
import
rule
from
rpg
import
rule
from
rpg
import
sequence
from
rpg
import
sequence
...
@@ -68,10 +71,6 @@ class ResultOneDigestion:
...
@@ -68,10 +71,6 @@ class ResultOneDigestion:
return
self
.
__dict__
==
other
.
__dict__
return
self
.
__dict__
==
other
.
__dict__
return
False
return
False
# Needed with __eq__ to make it hashable
def
__hash__
(
self
):
return
hash
(
self
.
__dict__
.
values
())
# Create a clean output according to fmt
# Create a clean output according to fmt
def
__format__
(
self
,
fmt
):
def
__format__
(
self
,
fmt
):
ret
=
""
ret
=
""
...
@@ -409,20 +408,56 @@ def concurrent_digest(seq, enz, aa_pka):
...
@@ -409,20 +408,56 @@ def concurrent_digest(seq, enz, aa_pka):
# it will be one result by enzyme
# it will be one result by enzyme
return
[
result
]
return
[
result
]
def
digest_from_input
(
input_data
,
input_type
,
enz
,
mode
,
aa_pka
):
def
digest_part
(
offset_start
,
offset_end
,
file
,
enz
,
mode
,
aa_pka
):
"""
Digest all sequences of input data with
"""
Main parallelized function that digest each sequence of a file
selected enzymes and mode.
in an offset range.
:param offset_start: where to start taking sequences in the file
:param offset_end: where to stop taking sequences in the file
:param file: the filename of the file where to take sequences from
:param enz: enzymes to digest with
:param mode: digestion mode (concurrent / sequential)
:param aa_pka: pKa values (IPC / Stryer)
:type offset_start: int
:type offset_end: int
:type file: string
:type enz: list(:py:class:`~rpg.enzyme.Enzyme`)
:type mode: str
:type aa_pka: str
"""
# Resulting digestions of current offset range
results_digestion
=
[]
try
:
# Query each sequence, one by one, in the offset range
for
header
,
seq
in
core
.
next_read
(
file
,
offset_start
,
offset_end
):
# Construct the Sequence to digest (remove first char of header)
tmp_seq
=
sequence
.
Sequence
(
header
[
1
:],
sequence
.
check_sequence
(
seq
))
# Digest it
results_digestion
.
append
(
digest_one_sequence
(
tmp_seq
,
enz
,
mode
,
aa_pka
))
except
ValueError
as
exc
:
raise
exc
# Add the global result into the queue
return
results_digestion
def
digest_from_input
(
input_data
,
input_type
,
enz
,
mode
,
aa_pka
,
nb_proc
=
1
):
"""
Digest all sequences of input data according to selected enzymes
and mode. Can be done in parallel using nb_proc argument.
:param input_data: either a sequence or the path of a file of sequence (fasta/fastq)
:param input_data: either a sequence or the path of a file of sequence (fasta/fastq
, gzipped or not
)
:param input_type: either
'
sequence
'
or
'
file
'
:param input_type: either
'
sequence
'
or
'
file
'
:param enz: enzymes to digest with
:param enz: enzymes to digest with
:param mode: digestion mode (concurrent / sequential)
:param mode: digestion mode (concurrent / sequential)
:param aa_pka: pKa values (IPC / Stryer)
:param aa_pka: pKa values (IPC / Stryer)
:param nb_proc: number of process to run in parallel
:type input_data: str
:type input_data: str
:type input_type: str
:type input_type: str
:type enz: list(:py:class:`~rpg.enzyme.Enzyme`)
:type enz: list(:py:class:`~rpg.enzyme.Enzyme`)
:type mode: str
:type mode: str
:type aa_pka: str
:type aa_pka: str
:type nb_proc: int (default: 1)
:return: result of digestions
:return: result of digestions
:rtype: list(list(:py:class:`ResultOneDigestion`))
:rtype: list(list(:py:class:`ResultOneDigestion`))
...
@@ -431,59 +466,52 @@ def digest_from_input(input_data, input_type, enz, mode, aa_pka):
...
@@ -431,59 +466,52 @@ def digest_from_input(input_data, input_type, enz, mode, aa_pka):
results_digestion
=
[]
results_digestion
=
[]
# Input is a file?
# Input is a file?
if
input_type
==
"
file
"
:
if
input_type
==
"
file
"
:
with
open
(
input_data
)
as
in_file
:
# Get the size of the file
header_first_car
=
in_file
.
read
(
1
)
total_size
=
os
.
path
.
getsize
(
input_data
)
in_file
.
seek
(
0
)
# Size of what to read
# Fasta file, can be multi-line
chunk_size
=
total_size
//
nb_proc
if
header_first_car
==
"
>
"
:
# Starting offset
# First header
offset_start
=
0
header
=
in_file
.
readline
().
strip
()
try
:
# First line
# Create the pool of process
tmp_line
=
in_file
.
readline
().
strip
()
pool
=
Pool
()
seq
=
""
# Partial function to fix all but firsts arguments
while
tmp_line
:
prod_digest
=
partial
(
digest_part
,
file
=
input_data
,
enz
=
enz
,
mode
=
mode
,
if
not
tmp_line
.
startswith
(
"
>
"
):
aa_pka
=
aa_pka
)
seq
+=
tmp_line
# All tuples of offset_start, offset_end
tmp_line
=
in_file
.
readline
().
strip
()
all_offsets
=
[]
else
:
# For each thread/chunk
# Create a Sequence
for
_
in
range
(
nb_proc
-
1
):
tmp_seq
=
sequence
.
Sequence
(
header
[
1
:],
# Compute the ending offset for this chunk
sequence
.
check_sequence
(
seq
))
offset_end
=
offset_start
+
chunk_size
# Digest sequence
# Add this couple of start/end
results_digestion
.
append
(
digest_one_sequence
all_offsets
.
append
((
offset_start
,
offset_end
))
(
tmp_seq
,
enz
,
mode
,
aa_pka
))
# Next start is where it stops
seq
=
""
offset_start
=
offset_start
+
chunk_size
header
=
tmp_line
# Add the last chunk
tmp_line
=
in_file
.
readline
().
strip
()
all_offsets
.
append
((
offset_start
,
total_size
))
# Last sequence to digest
tmp_seq
=
sequence
.
Sequence
(
header
[
1
:],
# Launch all process (Results is a list of list)
sequence
.
check_sequence
(
seq
))
results
=
pool
.
starmap
(
prod_digest
,
all_offsets
)
# Digest it
except
ValueError
as
exc
:
results_digestion
.
append
(
digest_one_sequence
(
tmp_seq
,
enz
,
pool
.
terminate
()
mode
,
aa_pka
))
core
.
handle_errors
(
str
(
exc
),
0
,
"
Input
"
)
# Fastq file
pool
.
terminate
()
elif
header_first_car
==
"
@
"
:
header
=
in_file
.
readline
().
strip
()
# Get a flatten list
while
header
:
for
i
in
results
:
seq
=
in_file
.
readline
().
strip
()
results_digestion
+=
i
tmp_seq
=
sequence
.
Sequence
(
header
[
1
:],
sequence
.
check_sequence
(
seq
))
# Digest sequence
results_digestion
.
append
(
digest_one_sequence
(
tmp_seq
,
enz
,
mode
,
aa_pka
))
in_file
.
readline
()
in_file
.
readline
()
header
=
in_file
.
readline
().
strip
()
else
:
core
.
handle_errors
(
"
input file format not recognized (%s).
"
%
header_first_car
,
0
,
"
Input
"
)
# input is a single sequence
# input is a single sequence
elif
input_type
==
"
sequence
"
:
elif
input_type
==
"
sequence
"
:
tmp_seq
=
sequence
.
Sequence
(
"
Input
"
,
try
:
sequence
.
check_sequence
(
input_data
))
tmp_seq
=
sequence
.
Sequence
(
"
Input
"
,
# Digest the sequence
sequence
.
check_sequence
(
input_data
))
results_digestion
.
append
(
digest_one_sequence
(
tmp_seq
,
enz
,
mode
,
# Digest the sequence
aa_pka
))
results_digestion
.
append
(
digest_one_sequence
(
tmp_seq
,
enz
,
mode
,
aa_pka
))
except
ValueError
as
exc
:
core
.
handle_errors
(
str
(
exc
),
0
,
"
Input
"
)
# bad input
# bad input
else
:
else
:
core
.
handle_errors
(
"
input type not recognized (%s).
"
%
core
.
handle_errors
(
"
input type not recognized (%s).
"
%
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment