Add parallel execution

2c070bd1 · Nicolas MAILLET · 94c9225e · 2c070bd1 · 2c070bd1
Commit 2c070bd1 authored Feb 3, 2021 by Nicolas MAILLET
--- a/rpg/RapidPeptidesGenerator.py
+++ b/rpg/RapidPeptidesGenerator.py
@@ -163,8 +163,6 @@ def get_enzymes_to_use(mode, id_enz_selected, miscleavage):
    :return: list of enzyme's id with associated miscleavage values
    :rtype: list(int)
-    .. warning:: Not tested
    """
    # Get the correct Enzymes inputed
@@ -217,10 +215,7 @@ def get_enzymes_to_use(mode, id_enz_selected, miscleavage):
    return enzymes_to_use
 # Not tested
 def main():
-    """Launcher of RapidPeptidesGenerator
+    """Launcher of RapidPeptidesGenerator"""
-    .. warning:: Not tested
-    """
    parser = argparse.ArgumentParser(description="This software takes protein "
                                                 "sequences as input (-i optio"
                                                 "n). All sequences will be cl"
@@ -285,6 +280,8 @@ def main():
                              "to output result peptides.")
    group_output.add_argument("-r", "--randomname", action="store_true",
                              help="Random (not used) output file name")
+    parser.add_argument("-c", "--processes", type=int, metavar="", default=1,
+                       help="Number of parallel processes to use (default: 1)")
    group_verbose = parser.add_mutually_exclusive_group()
    group_verbose.add_argument("-q", "--quiet", action="store_true",
                               help="No standard output, only error(s)")
@@ -322,6 +319,10 @@ def main():
        args.quiet = 1
        args.verbose = 0
+    # Be sure to have at least 1 process
+    if args.processes <= 0:
+        parser.error("argument -c/--processes should be greater than 0")
    # input data
    input_data = None
    input_type = None
@@ -394,13 +395,13 @@ def main():
    # Make the actual digestion of input data
    results_digestion = digest.digest_from_input(input_data, input_type,
-                                                 enzymes_to_use, mode, aa_pka)
+                                                 enzymes_to_use, mode, aa_pka,
+                                                 args.processes)
    # Output results
    core.output_results(output_file, results_digestion, args.fmt, args.quiet,
                        args.verbose)
 ### Let'z go ###
 if __name__ == '__main__':
    main()


--- a/rpg/core.py
+++ b/rpg/core.py
@@ -24,6 +24,7 @@
 """Contains generic functions and global variables used by RPG"""
 import sys
+import gzip
 AMINOACIDS = ["A", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N",
              "O", "P", "Q", "R", "S", "T", "U", "V", "W", "Y", "B", "X", "Z",
@@ -204,3 +205,96 @@ def output_results(output_file, all_seq_digested, fmt, quiet, verbose):
        except IOError:
            handle_errors(output_file + " can't be open in 'w' mode", 0,
                          "File ")
+def next_read(file, offset_start, offset_end):
+    """Yield every sequence whose header starts inside an offset range.
+    The input can be fasta or fastq, gzipped or not. The format is
+    detected from the first character of the file (">" or "@").
+    Each record is yielded as a tuple (header, seq): the header is the
+    stripped header line (marker included) and the sequence is
+    upper-cased. Multi-line fasta sequences are concatenated.
+    A record is returned when the offset of the beginning of its header
+    line is lower than ``offset_end``; it is then read entirely, even if
+    it ends after ``offset_end``.
+    :param file: fasta/fastq file to read
+    :param offset_start: offset in the file from where to read
+    :param offset_end: offset in the file until where to read (excluded)
+    :type file: str
+    :type offset_start: int
+    :type offset_end: int
+    :return: generator of (header, seq) tuples
+    :rtype: generator(tuple(str, str))
+    :raises ValueError: if the file is empty or does not start with
+        ">" or "@" (raised when the generator is first consumed)
+    .. note:: In a fastq file, any line starting with "@" met by the
+        reading loop is taken as a header, so ``offset_start`` should be
+        the beginning of a record.
+    """
+    # Is it a GZIP file? Read the two magic bytes; the context manager
+    # closes the probe handle even if the read fails
+    with open(file, "rb") as test_file:
+        magic = test_file.read(2)
+    # Open the file, GZIP or not
+    # NOTE(review): for a gzipped file, seek()/tell() presumably work on
+    # the uncompressed stream, so offsets must be uncompressed ones
+    with (gzip.open(file, "rb") if magic == b"\x1f\x8b"
+          else open(file, "rb")) as in_file:
+        first_line = in_file.readline().decode('utf-8')
+        # FASTQ file
+        if first_line.startswith("@"):
+            # Go to starting offset
+            in_file.seek(offset_start)
+            # Offset of the beginning of the line about to be read
+            beg_line_offset = offset_start
+            # Read each line from this point
+            for line in in_file:
+                # Consider this line as a header
+                header = line.decode('utf-8').strip()
+                # It is a proper fastq header
+                if header.startswith("@"):
+                    # The header begins out of the offset range, stop here
+                    if beg_line_offset >= offset_end:
+                        break
+                    # Get the sequence
+                    sequence = in_file.readline().decode('utf-8').strip()
+                    # Skip the two next lines ("+" separator and qualities)
+                    in_file.readline()
+                    in_file.readline()
+                    # Return header and sequence and wait for the next one
+                    yield (header, sequence.upper())
+                # Current offset, i.e. beginning of the next line
+                beg_line_offset = in_file.tell()
+        # (multi?)FASTA file
+        elif first_line.startswith(">"):
+            # Go to starting offset
+            in_file.seek(offset_start)
+            # Offset of the beginning of the line about to be read
+            beg_line_offset = offset_start
+            # Read each line from this point
+            for line in in_file:
+                # Consider this line as a header
+                header = line.decode('utf-8').strip()
+                # It is a proper fasta header
+                if header.startswith(">"):
+                    # The header begins out of the offset range, stop here
+                    if beg_line_offset >= offset_end:
+                        break
+                    # Get the first line of the sequence
+                    sequence = in_file.readline().decode('utf-8').strip()
+                    # Remember where the next line begins
+                    current_offset = in_file.tell()
+                    # Get next line
+                    next_l = in_file.readline().decode('utf-8').strip()
+                    # While this next line is not empty nor a fasta header...
+                    while next_l and not next_l.startswith(">"):
+                        # Add this to the sequence
+                        sequence += next_l
+                        # Remember where the next line begins
+                        current_offset = in_file.tell()
+                        # Get next line
+                        next_l = in_file.readline().decode('utf-8').strip()
+                    # Go back to the beginning of the line that ended the
+                    # sequence, so the main loop reads it again as a header
+                    in_file.seek(current_offset)
+                    # Return header and sequence and wait for the next one
+                    yield (header, sequence.upper())
+                # Current offset, i.e. beginning of the next line
+                beg_line_offset = in_file.tell()
+        # Not a valid file
+        else:
+            # Stop the generator with the error to show. Slicing (and not
+            # indexing) keeps an empty file from raising an IndexError
+            raise ValueError("input file format not recognized (%s)"\
+                             "." % first_line[:1])