Skip to content
Snippets Groups Projects
Commit dae3e53a authored by Nicolas MAILLET's avatar Nicolas MAILLET
Browse files

Add parallel execution

parent 2c070bd1
No related branches found
No related tags found
No related merge requests found
...@@ -25,6 +25,9 @@ ...@@ -25,6 +25,9 @@
"""Contains class and function needed to perform a digestion""" """Contains class and function needed to perform a digestion"""
import os import os
import random import random
import sys
from multiprocessing import Pool
from functools import partial
from rpg import core from rpg import core
from rpg import rule from rpg import rule
from rpg import sequence from rpg import sequence
...@@ -68,10 +71,6 @@ class ResultOneDigestion: ...@@ -68,10 +71,6 @@ class ResultOneDigestion:
return self.__dict__ == other.__dict__ return self.__dict__ == other.__dict__
return False return False
# Needed with __eq__ to make it hashable
def __hash__(self):
return hash(self.__dict__.values())
# Create a clean output according to fmt # Create a clean output according to fmt
def __format__(self, fmt): def __format__(self, fmt):
ret = "" ret = ""
...@@ -409,20 +408,56 @@ def concurrent_digest(seq, enz, aa_pka): ...@@ -409,20 +408,56 @@ def concurrent_digest(seq, enz, aa_pka):
# it will be one result by enzyme # it will be one result by enzyme
return [result] return [result]
def digest_part(offset_start, offset_end, file, enz, mode, aa_pka):
    """Digest each sequence of a file located in a byte-offset range.

    Worker function run by each process of the multiprocessing pool:
    it reads the records found in ``[offset_start, offset_end)`` of
    ``file`` and digests them one by one.

    :param offset_start: where to start taking sequences in the file
    :param offset_end: where to stop taking sequences in the file
    :param file: the filename of the file where to take sequences from
    :param enz: enzymes to digest with
    :param mode: digestion mode (concurrent / sequential)
    :param aa_pka: pKa values (IPC / Stryer)

    :type offset_start: int
    :type offset_end: int
    :type file: str
    :type enz: list(:py:class:`~rpg.enzyme.Enzyme`)
    :type mode: str
    :type aa_pka: str

    :return: digestion results of this offset range, one entry per sequence
    :rtype: list(list(:py:class:`ResultOneDigestion`))
    """
    # Resulting digestions of the current offset range
    results_digestion = []
    # Query each sequence, one by one, in the offset range.
    # NOTE: a ValueError raised while parsing/digesting (e.g. malformed
    # sequence) is deliberately left to propagate to the caller, which
    # reports it to the user. (The original wrapped this loop in
    # ``try/except ValueError as exc: raise exc`` — a no-op handler that
    # re-raised the same exception unchanged, so it was removed.)
    for header, seq in core.next_read(file, offset_start, offset_end):
        # Construct the Sequence to digest (remove first char of header,
        # i.e. the leading '>' or '@')
        tmp_seq = sequence.Sequence(header[1:],
                                    sequence.check_sequence(seq))
        # Digest it
        results_digestion.append(digest_one_sequence(tmp_seq, enz, mode,
                                                     aa_pka))
    # Return the global result for this chunk
    return results_digestion
def digest_from_input(input_data, input_type, enz, mode, aa_pka, nb_proc=1):
"""Digest all sequences of input data according to selected enzymes
and mode. Can be done in parallel using nb_proc argument.
:param input_data: either a sequence or the path of a file of sequence (fasta/fastq) :param input_data: either a sequence or the path of a file of sequence (fasta/fastq, gzipped or not)
:param input_type: either 'sequence' or 'file' :param input_type: either 'sequence' or 'file'
:param enz: enzymes to digest with :param enz: enzymes to digest with
:param mode: digestion mode (concurrent / sequential) :param mode: digestion mode (concurrent / sequential)
:param aa_pka: pKa values (IPC / Stryer) :param aa_pka: pKa values (IPC / Stryer)
:param nb_proc: number of process to run in parallel
:type input_data: str :type input_data: str
:type input_type: str :type input_type: str
:type enz: list(:py:class:`~rpg.enzyme.Enzyme`) :type enz: list(:py:class:`~rpg.enzyme.Enzyme`)
:type mode: str :type mode: str
:type aa_pka: str :type aa_pka: str
:type nb_proc: int (default: 1)
:return: result of digestions :return: result of digestions
:rtype: list(list(:py:class:`ResultOneDigestion`)) :rtype: list(list(:py:class:`ResultOneDigestion`))
...@@ -431,59 +466,52 @@ def digest_from_input(input_data, input_type, enz, mode, aa_pka): ...@@ -431,59 +466,52 @@ def digest_from_input(input_data, input_type, enz, mode, aa_pka):
results_digestion = [] results_digestion = []
# Input is a file? # Input is a file?
if input_type == "file": if input_type == "file":
with open(input_data) as in_file: # Get the size of the file
header_first_car = in_file.read(1) total_size = os.path.getsize(input_data)
in_file.seek(0) # Size of what to read
# Fasta file, can be multi-line chunk_size = total_size // nb_proc
if header_first_car == ">": # Starting offset
# First header offset_start = 0
header = in_file.readline().strip() try:
# First line # Create the pool of process
tmp_line = in_file.readline().strip() pool = Pool()
seq = "" # Partial function to fix all but firsts arguments
while tmp_line: prod_digest=partial(digest_part, file=input_data, enz=enz, mode=mode,
if not tmp_line.startswith(">"): aa_pka=aa_pka)
seq += tmp_line # All tuples of offset_start, offset_end
tmp_line = in_file.readline().strip() all_offsets = []
else: # For each thread/chunk
# Create a Sequence for _ in range(nb_proc - 1):
tmp_seq = sequence.Sequence(header[1:], # Compute the ending offset for this chunk
sequence.check_sequence(seq)) offset_end = offset_start + chunk_size
# Digest sequence # Add this couple of start/end
results_digestion.append(digest_one_sequence all_offsets.append((offset_start, offset_end))
(tmp_seq, enz, mode, aa_pka)) # Next start is where it stops
seq = "" offset_start = offset_start + chunk_size
header = tmp_line # Add the last chunk
tmp_line = in_file.readline().strip() all_offsets.append((offset_start, total_size))
# Last sequence to digest
tmp_seq = sequence.Sequence(header[1:], # Launch all process (Results is a list of list)
sequence.check_sequence(seq)) results = pool.starmap(prod_digest, all_offsets)
# Digest it except ValueError as exc:
results_digestion.append(digest_one_sequence(tmp_seq, enz, pool.terminate()
mode, aa_pka)) core.handle_errors(str(exc), 0, "Input ")
# Fastq file pool.terminate()
elif header_first_car == "@":
header = in_file.readline().strip() # Get a flatten list
while header: for i in results:
seq = in_file.readline().strip() results_digestion += i
tmp_seq = sequence.Sequence(header[1:],
sequence.check_sequence(seq))
# Digest sequence
results_digestion.append(digest_one_sequence(tmp_seq, enz,
mode, aa_pka))
in_file.readline()
in_file.readline()
header = in_file.readline().strip()
else:
core.handle_errors("input file format not recognized (%s)." %
header_first_car, 0, "Input ")
# input is a single sequence # input is a single sequence
elif input_type == "sequence": elif input_type == "sequence":
tmp_seq = sequence.Sequence("Input", try:
sequence.check_sequence(input_data)) tmp_seq = sequence.Sequence("Input",
# Digest the sequence sequence.check_sequence(input_data))
results_digestion.append(digest_one_sequence(tmp_seq, enz, mode, # Digest the sequence
aa_pka)) results_digestion.append(digest_one_sequence(tmp_seq, enz, mode,
aa_pka))
except ValueError as exc:
core.handle_errors(str(exc), 0, "Input ")
# bad input # bad input
else: else:
core.handle_errors("input type not recognized (%s)." % core.handle_errors("input type not recognized (%s)." %
......
Loading…
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment