Skip to content
Snippets Groups Projects
Commit dae3e53a authored by Nicolas MAILLET's avatar Nicolas MAILLET
Browse files

Add parallel execution

parent 2c070bd1
No related branches found
No related tags found
No related merge requests found
...@@ -25,6 +25,9 @@ ...@@ -25,6 +25,9 @@
"""Contains class and function needed to perform a digestion""" """Contains class and function needed to perform a digestion"""
import os import os
import random import random
import sys
from multiprocessing import Pool
from functools import partial
from rpg import core from rpg import core
from rpg import rule from rpg import rule
from rpg import sequence from rpg import sequence
...@@ -68,10 +71,6 @@ class ResultOneDigestion: ...@@ -68,10 +71,6 @@ class ResultOneDigestion:
return self.__dict__ == other.__dict__ return self.__dict__ == other.__dict__
return False return False
# Needed with __eq__ to make it hashable
def __hash__(self):
return hash(self.__dict__.values())
# Create a clean output according to fmt # Create a clean output according to fmt
def __format__(self, fmt): def __format__(self, fmt):
ret = "" ret = ""
...@@ -409,20 +408,56 @@ def concurrent_digest(seq, enz, aa_pka): ...@@ -409,20 +408,56 @@ def concurrent_digest(seq, enz, aa_pka):
# it will be one result by enzyme # it will be one result by enzyme
return [result] return [result]
def digest_part(offset_start, offset_end, file, enz, mode, aa_pka):
    """Digest each sequence of a file located in a byte-offset range.

    Worker function run by each process of the multiprocessing pool:
    it reads the records found in ``[offset_start, offset_end)`` of
    ``file`` and digests them one by one.

    :param offset_start: where to start taking sequences in the file
    :param offset_end: where to stop taking sequences in the file
    :param file: the filename of the file where to take sequences from
    :param enz: enzymes to digest with
    :param mode: digestion mode (concurrent / sequential)
    :param aa_pka: pKa values (IPC / Stryer)

    :type offset_start: int
    :type offset_end: int
    :type file: str
    :type enz: list(:py:class:`~rpg.enzyme.Enzyme`)
    :type mode: str
    :type aa_pka: str

    :return: digestion results of this offset range, one entry per sequence
    :rtype: list(list(:py:class:`ResultOneDigestion`))
    """
    # Resulting digestions of the current offset range
    results_digestion = []
    # Query each sequence, one by one, in the offset range.
    # NOTE: a ValueError raised while parsing/digesting (e.g. malformed
    # sequence) is deliberately left to propagate to the caller, which
    # reports it to the user. (The original wrapped this loop in
    # ``try/except ValueError as exc: raise exc`` — a no-op handler that
    # re-raised the same exception unchanged, so it was removed.)
    for header, seq in core.next_read(file, offset_start, offset_end):
        # Construct the Sequence to digest (remove first char of header,
        # i.e. the leading '>' or '@')
        tmp_seq = sequence.Sequence(header[1:],
                                    sequence.check_sequence(seq))
        # Digest it
        results_digestion.append(digest_one_sequence(tmp_seq, enz, mode,
                                                     aa_pka))
    # Return the global result for this chunk
    return results_digestion
def digest_from_input(input_data, input_type, enz, mode, aa_pka, nb_proc=1):
"""Digest all sequences of input data according to selected enzymes
and mode. Can be done in parallel using nb_proc argument.
:param input_data: either a sequence or the path of a file of sequence (fasta/fastq) :param input_data: either a sequence or the path of a file of sequence (fasta/fastq, gzipped or not)
:param input_type: either 'sequence' or 'file' :param input_type: either 'sequence' or 'file'
:param enz: enzymes to digest with :param enz: enzymes to digest with
:param mode: digestion mode (concurrent / sequential) :param mode: digestion mode (concurrent / sequential)
:param aa_pka: pKa values (IPC / Stryer) :param aa_pka: pKa values (IPC / Stryer)
:param nb_proc: number of process to run in parallel
:type input_data: str :type input_data: str
:type input_type: str :type input_type: str
:type enz: list(:py:class:`~rpg.enzyme.Enzyme`) :type enz: list(:py:class:`~rpg.enzyme.Enzyme`)
:type mode: str :type mode: str
:type aa_pka: str :type aa_pka: str
:type nb_proc: int (default: 1)
:return: result of digestions :return: result of digestions
:rtype: list(list(:py:class:`ResultOneDigestion`)) :rtype: list(list(:py:class:`ResultOneDigestion`))
...@@ -431,59 +466,52 @@ def digest_from_input(input_data, input_type, enz, mode, aa_pka): ...@@ -431,59 +466,52 @@ def digest_from_input(input_data, input_type, enz, mode, aa_pka):
results_digestion = [] results_digestion = []
# Input is a file? # Input is a file?
if input_type == "file": if input_type == "file":
with open(input_data) as in_file: # Get the size of the file
header_first_car = in_file.read(1) total_size = os.path.getsize(input_data)
in_file.seek(0) # Size of what to read
# Fasta file, can be multi-line chunk_size = total_size // nb_proc
if header_first_car == ">": # Starting offset
# First header offset_start = 0
header = in_file.readline().strip() try:
# First line # Create the pool of process
tmp_line = in_file.readline().strip() pool = Pool()
seq = "" # Partial function to fix all but firsts arguments
while tmp_line: prod_digest=partial(digest_part, file=input_data, enz=enz, mode=mode,
if not tmp_line.startswith(">"): aa_pka=aa_pka)
seq += tmp_line # All tuples of offset_start, offset_end
tmp_line = in_file.readline().strip() all_offsets = []
else: # For each thread/chunk
# Create a Sequence for _ in range(nb_proc - 1):
tmp_seq = sequence.Sequence(header[1:], # Compute the ending offset for this chunk
sequence.check_sequence(seq)) offset_end = offset_start + chunk_size
# Digest sequence # Add this couple of start/end
results_digestion.append(digest_one_sequence all_offsets.append((offset_start, offset_end))
(tmp_seq, enz, mode, aa_pka)) # Next start is where it stops
seq = "" offset_start = offset_start + chunk_size
header = tmp_line # Add the last chunk
tmp_line = in_file.readline().strip() all_offsets.append((offset_start, total_size))
# Last sequence to digest
tmp_seq = sequence.Sequence(header[1:], # Launch all process (Results is a list of list)
sequence.check_sequence(seq)) results = pool.starmap(prod_digest, all_offsets)
# Digest it except ValueError as exc:
results_digestion.append(digest_one_sequence(tmp_seq, enz, pool.terminate()
mode, aa_pka)) core.handle_errors(str(exc), 0, "Input ")
# Fastq file pool.terminate()
elif header_first_car == "@":
header = in_file.readline().strip() # Get a flatten list
while header: for i in results:
seq = in_file.readline().strip() results_digestion += i
tmp_seq = sequence.Sequence(header[1:],
sequence.check_sequence(seq))
# Digest sequence
results_digestion.append(digest_one_sequence(tmp_seq, enz,
mode, aa_pka))
in_file.readline()
in_file.readline()
header = in_file.readline().strip()
else:
core.handle_errors("input file format not recognized (%s)." %
header_first_car, 0, "Input ")
# input is a single sequence # input is a single sequence
elif input_type == "sequence": elif input_type == "sequence":
tmp_seq = sequence.Sequence("Input", try:
sequence.check_sequence(input_data)) tmp_seq = sequence.Sequence("Input",
# Digest the sequence sequence.check_sequence(input_data))
results_digestion.append(digest_one_sequence(tmp_seq, enz, mode, # Digest the sequence
aa_pka)) results_digestion.append(digest_one_sequence(tmp_seq, enz, mode,
aa_pka))
except ValueError as exc:
core.handle_errors(str(exc), 0, "Input ")
# bad input # bad input
else: else:
core.handle_errors("input type not recognized (%s)." % core.handle_errors("input type not recognized (%s)." %
......
Loading…
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment