# -*- coding: utf-8 -*-
#!/usr/bin/env python3.6

########################################################################
# Rapid Peptide Generator (RPG) is a software dedicated to predict     #
# cleavage sites of proteases. User can create his own enzyme,         #
# following a simple grammar.                                          #
#                                                                      #
# Author: Nicolas Maillet                                              #
# Copyright © 2018 Institut Pasteur, Paris.                            #
# See the COPYRIGHT file for details                                   #
#                                                                      #
# RPG is free software: you can redistribute it and/or modify          #
# it under the terms of the GNU General Public License as published by #
# the Free Software Foundation, either version 3 of the License, or    #
# any later version.                                                   #
#                                                                      #
# RPG is distributed in the hope that it will be useful,               #
# but WITHOUT ANY WARRANTY; without even the implied warranty of       #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the        #
# GNU General Public License for more details.                         #
#                                                                      #
# You should have received a copy of the GNU General Public license    #
# along with RPG (LICENSE file).                                       #
# If not, see <http://www.gnu.org/licenses/>.                          #
########################################################################

"""Main file of RPG software, handle input/output and launch
necessary functions
"""

# Version and authorship metadata of the software
__version_info__ = ('1', '2', '2')
__version__ = '.'.join(__version_info__)
__revision_date__ = "2021-02-03"
__author__ = "Nicolas Maillet"

import argparse
import os
import sys
import uuid
from pathlib import Path
#from context import rpg
from rpg import core
from rpg import digest
from rpg import enzyme
from rpg.enzymes_definition import AVAILABLE_ENZYMES
# The user's home directory is searched first, so that the `rpg_user`
# module imported below can be found there
sys.path.insert(0, str(Path.home())) # Home path
from rpg_user import AVAILABLE_ENZYMES_USER

# Enzymes shipped with RPG followed by the ones listed in `rpg_user`
ALL_ENZYMES = AVAILABLE_ENZYMES + AVAILABLE_ENZYMES_USER
"""All available enzymes in RPG."""

def restricted_float(mc_val):
    """Restrict input miscleavage value to a float between 0 and 100.

    Invalid values are reported through ``core.handle_errors`` with
    level ``0``: prefix ``"Type "`` when the value can not be converted
    to a float, prefix ``"Value "`` when it is outside of [0, 100].
    NOTE(review): level 0 is presumably fatal (program exits) -- confirm
    in ``rpg.core``.

    :param mc_val: value to test
    :type mc_val: str or float

    :return: the inputed value converted to float if correct
    :rtype: float
    """
    # Only the conversion itself can raise the ValueError handled here
    try:
        mc_val = float(mc_val)
    except ValueError:
        # Throw an error
        core.handle_errors("miscleavage value should be a float between 0 "
                           "and 100.", 0, "Type ")
        # Only reached if handle_errors returns
        return None
    # Negated chained comparison: NaN compares false to everything, so
    # it is rejected here too (it passed `< 0 or > 100` unnoticed)
    if not 0 <= mc_val <= 100:
        core.handle_errors("miscleavage value should be between 0 and "
                           "100.", 0, "Value ")
    return mc_val

def restricted_enzyme_id(enz_id):
    """Restrict input enzyme id to an int corresponding to an enzyme.

    Problems are reported through ``core.handle_errors`` with level
    ``0``: prefix ``"Input "`` when no enzyme of ``ALL_ENZYMES`` has
    this id, prefix ``"Type "`` when the value is not an integer.
    NOTE(review): level 0 is presumably fatal -- confirm in ``rpg.core``.

    :param enz_id: value to test
    :type enz_id: str or int

    :return: the inputed enzyme id, converted to int
    :rtype: int
    """
    try:
        enz_id = int(enz_id)
        # Ids of every known enzyme (built-in and user-defined)
        known_ids = [known.id_ for known in ALL_ENZYMES]
        if enz_id not in known_ids:
            message = ("id %s does not correspond to any enzyme. "
                       "Use -l to get enzyme ids." % enz_id)
            core.handle_errors(message, 0, "Input ")
        return enz_id
    except ValueError:
        # Not an integer: throw an error
        core.handle_errors("Enzyme id should be an integer.", 0, "Type ")

def list_enzyme():
    """Print the id and name of each enzyme of ``ALL_ENZYMES``, one
    per line, as ``<id>: <name>``.
    """
    for current in ALL_ENZYMES:
        line = "%i: %s" % (current.id_, current.name)
        print(line)

def create_enzymes_to_use(enzymes, miscleavage):
    """Create the list of chosen :py:class:`~rpg.enzyme.Enzyme` to use.
    Each enzyme can be associated to a miscleavage value.

    Miscleavage values are matched to enzyme ids by position. If there
    are more values than ids, a warning is emitted and the extra values
    are ignored. Enzymes without an associated value keep their current
    ``ratio_miscleavage``.

    The returned objects are the ones stored in ``ALL_ENZYMES``: setting
    their ``ratio_miscleavage`` modifies them in place, and an id given
    twice yields the same object twice.

    :param enzymes: enzymes ids chosen by user
    :param miscleavage: associated miscleavage values
    :type enzymes: list(int)
    :type miscleavage: list(float)

    :return: enzymes matching the ids, in the order of ``enzymes``
    :rtype: list(:py:class:`~rpg.enzyme.Enzyme`)
    """
    # Complete Enzymes to use (return)
    enzymes_to_use = []
    # Nothing chosen: nothing to look for
    if not enzymes:
        return enzymes_to_use
    # Too much miscleavage values
    if len(miscleavage) > len(enzymes):
        core.handle_errors("Too much miscleavage values. Last values"
                           " will be ignored.")
        # Get only the first ones
        miscleavage = miscleavage[:len(enzymes)]
    nb_ratios = len(miscleavage)
    for pos, enz_id in enumerate(enzymes):
        # In all available enzymes
        for enz in ALL_ENZYMES:
            # Get the current one
            if enz.id_ != enz_id:
                continue
            # Change miscleavage ratio when one was given for this position
            if pos < nb_ratios:
                enz.ratio_miscleavage = miscleavage[pos]
            # Add it
            enzymes_to_use.append(enz)
    # Return enzymes to use
    return enzymes_to_use
def _parse_user_values(raw, convert, warning):
    """Convert each whitespace-separated token of ``raw`` with
    ``convert``. Tokens raising ValueError are reported with ``warning``
    (a ``%s`` template receiving the token) and skipped.
    """
    values = []
    # split() without argument ignores repeated/leading/trailing blanks,
    # so no empty token is reported as an invalid value
    for token in raw.split():
        try:
            values.append(convert(token))
        except ValueError:
            # Throw an error
            core.handle_errors(warning % token)
    return values

# Not tested
def get_enzymes_to_use(mode, id_enz_selected, miscleavage):
    """Get the list of chosen :py:class:`~rpg.enzyme.Enzyme` to use.
    Each enzyme (and associated miscleavage value) are inputed by
    user. If there is a problem, user is interrogated again.

    When ``id_enz_selected`` matches no enzyme, available enzymes are
    listed and the user is asked for ids (and, in 'sequential' mode,
    for miscleavage values) until at least one enzyme matches. Typing
    anything containing ``q`` at the id prompt quits the program.

    :param mode: Digestion mode. Miscleavage values are only asked to the user in 'sequential' mode
    :param id_enz_selected: enzyme's ids chosen by user
    :param miscleavage: associated miscleavage values
    :type mode: str
    :type id_enz_selected: list(int)
    :type miscleavage: list(float)

    :return: enzymes to use, as returned by ``create_enzymes_to_use``
    :rtype: list(:py:class:`~rpg.enzyme.Enzyme`)
    """
    # Get the correct Enzymes inputed
    enzymes_to_use = create_enzymes_to_use(id_enz_selected, miscleavage)
    if enzymes_to_use:
        return enzymes_to_use
    # No good Enzymes inputed, let user choose
    # Print all available enzymes
    list_enzyme()
    # Ask user to give correct enzymes ids
    while not enzymes_to_use:
        id_enz_inp = input("Choose which enzyme(s) to use, separated by"
                           " space (example: 1 5 6). (q) to quit:\n")
        # Quit
        if "q" in id_enz_inp:
            sys.exit(0)
        # Ids are rebuilt at every attempt: keeping rejected ids from a
        # previous attempt would shift the ids of this attempt away from
        # their miscleavage values, which are matched by position
        id_enz_inputed = _parse_user_values(
            id_enz_inp, int,
            "'%s' should be an integer value. This"
            " values will be ignored.")
        mc_enz_inputed = []
        if mode == "sequential":
            mc_enz_inp = input("Percentage of miscleavage per inputed"
                               " enzyme (default 0), separated by sapce"
                               " (example: 1.2 5 6):\n")
            # NOTE(review): values typed here are not checked against
            # the 0-100 range, unlike the -m command line option
            mc_enz_inputed = _parse_user_values(
                mc_enz_inp, float,
                "'%s' should be an floating"
                " value. This values will be"
                " ignored.")
        # Get the correct Enzyme if enzymes inputed
        enzymes_to_use = create_enzymes_to_use(id_enz_inputed,
                                               mc_enz_inputed)
    # Return Enzymes to use
    return enzymes_to_use
def _build_parser():
    """Build the command line parser of RPG (options, help, groups)."""
    parser = argparse.ArgumentParser(
        description="This software takes protein sequences as input (-i "
                    "option). All sequences will be cleaved according to "
                    "selected enzymes (-e option) and given miscleavage "
                    "percentage (-m option). Cleaving can be sequential or "
                    "concurrent (-d option). Resulting peptides are "
                    "outputted in a file (-o option) in fasta, csv or tsv "
                    "format (-f option). Classical enzymes are included (-l "
                    "option to print available enzymes) but it is possible "
                    "to define other enzymes (-a option). See "
                    "https://gitlab.pasteur.fr/nmaillet/rpg/ and "
                    "https://rapid-peptide-generator.readthedocs.io for "
                    "more informations.")
    # Exactly one of -a / -i / -l / -s has to be given
    group_launch = parser.add_mutually_exclusive_group(required=True)
    group_launch.add_argument("-a", "--addenzyme", action="store_true",
                              help="Create a new enzyme. See "
                              "https://rapid-peptide-generator.readthedocs.io"
                              " for more informations")
    parser.add_argument("-d", "--digest", metavar="",
                        choices=['s', 'sequential', 'c', 'concurrent'],
                        default="s",
                        help="Digestion mode. Either 's', 'sequential', "
                        "'c' or 'concurrent' (default: s)")
    parser.add_argument("-e", "--enzymes", metavar="", default=[],
                        nargs='+', type=restricted_enzyme_id,
                        help="Id of enzyme(s) to use (i.e. -e 0 5 10 to use "
                        "enzymes 0, 5 and 10). Use -l first to get enzyme ids")
    parser.add_argument("-f", "--fmt", metavar="",
                        choices=['fasta', 'csv', 'tsv'], default="fasta",
                        help="Output file format. Either 'fasta', 'csv', or "
                        "'tsv' (default: fasta)")
    group_launch.add_argument("-i", "--inputdata", metavar="",
                              help="Input file, in fasta / fastq format "
                              "(gzipped or not)")
    group_launch.add_argument("-l", "--list", action="store_true",
                              help="Display the list of available enzymes")
    group_launch.add_argument("-s", "--sequence", metavar="",
                              help="Input a single protein sequence without "
                              "commentary")
    parser.add_argument("-m", "--miscleavage", metavar="", default=[],
                        nargs='+', type=restricted_float,
                        help="Percentage of miscleavage, between 0 and 100,"
                        " by enzyme(s). It should be in the same order than "
                        "-enzymes options (i.e. -m 15 5 10). Only for "
                        "sequential digestion (default: 0)")
    parser.add_argument("-n", "--noninteractive", action='store_true',
                        help="Non-interactive mode. No standard output, only "
                        "error(s) (--quiet enable, overwrite -v). If output "
                        "filename already exists, output file will be "
                        "overwritten.")
    parser.add_argument("-p", "--pka", metavar="", choices=['ipc', 'stryer'],
                        default="ipc", help="Define pKa values. Either 'ipc' "
                        "or 'stryer' (default: ipc)")
    # -o and -r can not be used together
    group_output = parser.add_mutually_exclusive_group()
    group_output.add_argument("-o", "--outputfile", type=str, metavar="",
                              default="", help="Optional result file "
                              "to output result peptides.")
    group_output.add_argument("-r", "--randomname", action="store_true",
                              help="Random (not used) output file name")
    parser.add_argument("-c", "--processes", type=int, metavar="", default=1,
                        help="Number of parallel processes to use (default: 1)")
    # -q and -v can not be used together
    group_verbose = parser.add_mutually_exclusive_group()
    group_verbose.add_argument("-q", "--quiet", action="store_true",
                               help="No standard output, only error(s)")
    group_verbose.add_argument("-v", "--verbose", action="count", default=0,
                               help="Increase output verbosity. -vv will "
                               "increase more than -v")
    parser.add_argument("--version", action="version",
                        version="%(prog)s " + __version__ + " from " +
                        __revision_date__)
    return parser

def _add_default_extension(filename, fmt):
    """Return ``filename``, followed by ``"." + fmt`` if its last path
    component contains no dot.
    """
    # Only the file name is inspected: a dot in a directory part
    # ("./out", "../res/out") is not an extension
    if "." not in os.path.basename(filename):
        return filename + "." + fmt
    return filename

def _choose_output_file(args):
    """Return the output file name chosen through --randomname or
    --outputfile, or an empty string when none of them is used.
    """
    if args.randomname:
        # Generate a random file name
        output_file = uuid.uuid4().hex + "." + args.fmt
        # Ensure the name is unique
        while os.path.isfile(output_file):
            output_file = uuid.uuid4().hex + "." + args.fmt
        return output_file
    # No output file (default)
    if not args.outputfile:
        return ""
    # Given name, with the default extension of the format if needed
    output_file = _add_default_extension(str(args.outputfile), args.fmt)
    # In non-interactive mode an existing file is overwritten silently
    if args.noninteractive:
        return output_file
    # This file already exist?
    while os.path.isfile(output_file):
        core.handle_errors("File '%s' already exit!" % output_file)
        # Overwrite it
        if input("Overwrite it? (y/n)\n") == "y":
            break
        # Don't overwrite it: ask for another name and check it again
        output_file = _add_default_extension(input("Output filename?\n"),
                                             args.fmt)
    return output_file

# Not tested
def main():
    """Launcher of RapidPeptidesGenerator

    Parse the command line, then either create a new enzyme (-a), list
    available enzymes (-l), or digest the input (-i file or -s sequence)
    with the chosen enzymes and output the resulting peptides.
    """
    parser = _build_parser()
    args = parser.parse_args()

    # --addenzyme option
    if args.addenzyme:
        enzyme.user_creation_enzyme(ALL_ENZYMES)
        sys.exit(0)

    # --digest option
    mode = "sequential"
    if args.digest in ("c", "concurrent"):
        mode = "concurrent"
        args.miscleavage = []  # No miscleavage on concurrent, infinite time

    # --pka option
    aa_pka = core.AA_PKA_IPC
    if args.pka == "stryer":
        aa_pka = core.AA_PKA_S

    # --list option
    if args.list:
        list_enzyme()
        sys.exit(0)

    # --nointeractive option
    if args.noninteractive:
        args.quiet = 1
        args.verbose = 0

    # Be sure to have at least 1 process
    if args.processes <= 0:
        parser.error("argument -c/--processes should be greater than 0")

    # input data
    input_data = None
    input_type = None
    if args.inputdata:
        if os.path.isfile(args.inputdata):
            input_data = args.inputdata
            input_type = "file"
        else:
            core.handle_errors("file not found (%s)." % args.inputdata, 0,
                               "Input ")
    elif args.sequence:
        input_data = args.sequence
        input_type = "sequence"

    # --outputfile / --randomname options
    output_file = _choose_output_file(args)

    # More mis cleavage than enzyme
    if len(args.miscleavage) > len(args.enzymes):
        core.handle_errors("Too much miscleavage values. Last values will "
                           "be ignored.")
        args.miscleavage = args.miscleavage[:len(args.enzymes)]

    # Get all enzymes inputed
    enzymes_to_use = get_enzymes_to_use(mode, args.enzymes, args.miscleavage)

    # Output options
    if args.verbose:
        # str(): input_data is still None when no input was resolved
        # (e.g. empty -s value); concatenating None would raise TypeError
        print("Input: " + str(input_data))
        print("Enzyme(s) used: " + str([enz.name for enz in enzymes_to_use]))
        print("Mode: " + mode)
        print("miscleavage ratio: " +
              str([enz.ratio_miscleavage for enz in enzymes_to_use]))
        if output_file:
            print("Output file: " + os.path.abspath(output_file))

    # Make the actual digestion of input data
    results_digestion = digest.digest_from_input(input_data, input_type,
                                                 enzymes_to_use, mode, aa_pka,
                                                 args.processes)

    # Output results
    core.output_results(output_file, results_digestion, args.fmt, args.quiet,
                        args.verbose)
### Script entry point ###
if __name__ == "__main__":
    # Run the launcher
    main()
    # main() returned normally: exit with a success status
    sys.exit(0)