diff --git a/rpg/RapidPeptidesGenerator.py b/rpg/RapidPeptidesGenerator.py index de07794dd5c93bcf9b5d395836a8dee777384ebc..378b897f004397f3fc5f194fd0c8ef6a1c7a1942 100644 --- a/rpg/RapidPeptidesGenerator.py +++ b/rpg/RapidPeptidesGenerator.py @@ -163,8 +163,6 @@ def get_enzymes_to_use(mode, id_enz_selected, miscleavage): :return: list of enzyme's id with associated miscleavage values :rtype: list(int) - - .. warning:: Not tested """ # Get the correct Enzymes inputed @@ -217,10 +215,7 @@ def get_enzymes_to_use(mode, id_enz_selected, miscleavage): return enzymes_to_use # Not tested def main(): - """Launcher of RapidPeptidesGenerator - - .. warning:: Not tested - """ + """Launcher of RapidPeptidesGenerator""" parser = argparse.ArgumentParser(description="This software takes protein " "sequences as input (-i optio" "n). All sequences will be cl" @@ -285,6 +280,8 @@ def main(): "to output result peptides.") group_output.add_argument("-r", "--randomname", action="store_true", help="Random (not used) output file name") + parser.add_argument("-c", "--processes", type=int, metavar="", default=1, + help="Number of parallel processes to use (default: 1)") group_verbose = parser.add_mutually_exclusive_group() group_verbose.add_argument("-q", "--quiet", action="store_true", help="No standard output, only error(s)") @@ -322,6 +319,10 @@ def main(): args.quiet = 1 args.verbose = 0 + # Be sure to have at least 1 process + if args.processes <= 0: + parser.error("argument -c/--processes should be greater than 0") + # input data input_data = None input_type = None @@ -394,13 +395,13 @@ def main(): # Make the actual digestion of input data results_digestion = digest.digest_from_input(input_data, input_type, - enzymes_to_use, mode, aa_pka) + enzymes_to_use, mode, aa_pka, + args.processes) # Output results core.output_results(output_file, results_digestion, args.fmt, args.quiet, args.verbose) - ### Let'z go ### if __name__ == '__main__': main() diff --git a/rpg/core.py b/rpg/core.py 
def next_read(file, offset_start, offset_end):
    """Yield each sequence of a fasta/fastq file whose header line starts
    inside a given offset range, as a tuple (header, seq).

    The file can be gzipped or not; for a gzipped file the offsets are
    positions in the uncompressed stream. Reading starts at
    ``offset_start``. Each header line beginning before ``offset_end``
    produces one record; the first header line beginning at or after
    ``offset_end`` stops the generator. Lines met before the first header
    (for example when ``offset_start`` falls in the middle of a record)
    are ignored. FASTQ records are read as four lines (header, sequence
    and two skipped lines); FASTA sequences may span several lines.

    :param file: fasta/fastq file to read
    :param offset_start: offset in the file from where to read
    :param offset_end: offset in the file until where to read
    :type file: str
    :type offset_start: int
    :type offset_end: int

    :return: generator of (header, upper-cased sequence) tuples
    :rtype: generator(tuple(str, str))

    :raises ValueError: if the first line of the file starts with neither
        "@" (fastq) nor ">" (fasta); this includes an empty file
    """
    # Imported locally so this function does not depend on a
    # module-level import being present
    import gzip

    # Is it a GZIP file? Check the two-byte magic number; the context
    # manager closes the probe handle even if the read fails
    with open(file, "rb") as probe:
        is_gzip = probe.read(2) == b"\x1f\x8b"
    opener = gzip.open if is_gzip else open

    # Open the file, GZIP or not
    with opener(file, "rb") as in_file:
        # The first character of the file gives the format. Slicing
        # (instead of indexing) yields "" on an empty file, so an empty
        # input ends in the ValueError below and not in an IndexError
        marker = in_file.readline().decode('utf-8')[:1]
        if marker == "@":
            # FASTQ file
            read_sequence = _read_fastq_sequence
        elif marker == ">":
            # (multi?)FASTA file
            read_sequence = _read_fasta_sequence
        else:
            # Not a valid file, stop the generator with the error to show
            raise ValueError("input file format not recognized (%s)"
                             "." % marker)

        # Go to starting offset
        in_file.seek(offset_start)
        # Offset of the beginning of the line about to be read
        beg_line_offset = offset_start
        # Read each line from this point
        for line in in_file:
            # Consider this line as a header
            header = line.decode('utf-8').strip()
            # It is a proper header for this format
            if header.startswith(marker):
                # The header begins out of the offset range, stop here
                if beg_line_offset >= offset_end:
                    break
                # Get the sequence, leaving the file on the next record
                sequence = read_sequence(in_file)
                # Return header and sequence and wait for the next one
                yield (header, sequence.upper())
            # Current offset
            beg_line_offset = in_file.tell()


def _read_fastq_sequence(in_file):
    """Read the sequence line of a fastq record and skip its two last lines."""
    sequence = in_file.readline().decode('utf-8').strip()
    # Skip the two next lines
    in_file.readline()
    in_file.readline()
    return sequence


def _read_fasta_sequence(in_file):
    """Read a (possibly multi-line) fasta sequence, stopping before the
    next header, an empty line or the end of the file."""
    chunks = [in_file.readline().decode('utf-8').strip()]
    # Offset of the beginning of the line about to be examined
    resume_offset = in_file.tell()
    next_l = in_file.readline().decode('utf-8').strip()
    # While this next line is not a fasta header...
    while next_l and not next_l.startswith(">"):
        # Add this to the sequence
        chunks.append(next_l)
        resume_offset = in_file.tell()
        next_l = in_file.readline().decode('utf-8').strip()
    # Go back to the beginning of the line that ended the sequence, so
    # the caller sees it (typically the next header) on its next read
    in_file.seek(resume_offset)
    return "".join(chunks)