core.py 12 KB
Newer Older
Nicolas  MAILLET's avatar
Nicolas MAILLET committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
# -*- coding: utf-8 -*-

########################################################################
# Author: Nicolas Maillet                                              #
# Copyright © 2018 Institut Pasteur, Paris.                            #
# See the COPYRIGHT file for details                                   #
#                                                                      #
# This file is part of Rapid Peptide Generator (RPG) software.         #
#                                                                      #
# RPG is free software: you can redistribute it and/or modify          #
# it under the terms of the GNU General Public License as published by #
# the Free Software Foundation, either version 3 of the License, or    #
# any later version.                                                   #
#                                                                      #
# RPG is distributed in the hope that it will be useful,               #
# but WITHOUT ANY WARRANTY; without even the implied warranty of       #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the        #
# GNU General Public License for more details.                         #
#                                                                      #
# You should have received a copy of the GNU General Public license    #
# along with RPG (LICENSE file).                                       #
# If not, see <http://www.gnu.org/licenses/>.                          #
########################################################################

"""Contains generic functions and global variables used by RPG"""
import sys
Nicolas  MAILLET's avatar
Nicolas MAILLET committed
27
import gzip
Nicolas  MAILLET's avatar
Nicolas MAILLET committed
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66

AMINOACIDS = ["A", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N",
              "O", "P", "Q", "R", "S", "T", "U", "V", "W", "Y", "B", "X", "Z",
              "#", "*"]
"""All character accepted in a peptide."""

AA_MASS_AVERAGE = {"*" : 0.0,
                   "A" : 71.0788,
                   "C" : 103.1388,
                   "D" : 115.0886,
                   "E" : 129.1155,
                   "F" : 147.1766,
                   "G" : 57.0519,
                   "H" : 137.1411,
                   "I" : 113.1594,
                   "J" : 113.1594,
                   "K" : 128.1741,
                   "L" : 113.1594,
                   "M" : 131.1926,
                   "N" : 114.1038,
                   "O" : 237.3018,
                   "P" : 97.1167,
                   "Q" : 128.1307,
                   "R" : 156.1875,
                   "S" : 87.0782,
                   "T" : 101.1051,
                   "U" : 150.0388,
                   "V" : 99.1326,
                   "W" : 186.2132,
                   "Y" : 163.1760,
                   "B" : 0.0,
                   "X" : 0.0,
                   "Z" : 0.0,
                   "#" : 0.0}
"""Mass of all amino acids."""

WATER_MASS = 18.01528
"""Mass of a water molecule."""

Nicolas  MAILLET's avatar
Nicolas MAILLET committed
67
# Biochemistry Stryer 7th
68
69
70
71
72
73
74
75
76
77
AA_PKA_S = {"Nterm" : 8.0,
            "C" : 8.3,
            "D" : 4.1,
            "E" : 4.1,
            "H" : 6.0,
            "K" : 10.8,
            "R" : 12.5,
            "Y" : 10.9,
            "Cterm" : 3.1}
"""pKa of important amino acid to compute pI (from Stryer)."""
Nicolas  MAILLET's avatar
Nicolas MAILLET committed
78
# IPC_peptide
79
80
81
82
83
84
85
86
87
AA_PKA_IPC = {"Nterm" : 9.564,
              "C" : 8.297,
              "D" : 3.887,
              "E" : 4.317,
              "H" : 6.018,
              "K" : 10.517,
              "R" : 12.503,
              "Y" : 10.071,
              "Cterm" : 2.383}
88
"""pKa of important amino acid to compute pI (from IPC_peptide. See http://isoelectric.org/theory.html for details)."""
89
90
91
92
93
94
95
96
97
98
99
# IPC_peptide2
AA_PKA_IPC_2 = {"Nterm" : 7.947,
              "C" : 9.454,
              "D" : 3.969,
              "E" : 4.507,
              "H" : 6.439,
              "K" : 8.165,
              "R" : 11.493,
              "Y" : 9.153,
              "Cterm" : 2.977}
"""pKa of important amino acid to compute pI (from IPC_peptide2. See http://www.ipc2-isoelectric-point.org/ for details)."""
Nicolas  MAILLET's avatar
Nicolas MAILLET committed
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153

def handle_errors(message="", err=1, error_type=""):
    """Custom handling of errors and warnings.

    :param message: error message to print
    :param err: Type of message
    :param error_type: header of error to print
    :type message: str
    :type err: int
    :type error_type: str

    *Type of message* is:\n
    - **0** for critical error (exit)\n
    - **1** for warning (no exit, default)\n
    - **2** for print in stderr

    """

    if err == 0:
        print(error_type + "Error: " + message, file=sys.stderr)
        sys.exit(1)
    elif err == 2:
        print(error_type + message, file=sys.stderr)
    else:
        print(error_type + "Warning: " + message, file=sys.stderr)

def get_header(fmt="fasta"):
    """Construct a header for output file in `csv` or `tsv`.

    :param fmt: format of header
    :type fmt: str

    :return: formatted header
    :rtype: str or None

    Informations on the header are:\n
    Original_header No_pep Enzyme Cleav_pos Pep_size Pep_mass pI Seq\n
    No header for `fasta` or other format.
    """
    ret = None
    if fmt == "csv":
        separator = ","
    elif fmt == "tsv":
        separator = "\t"
    if fmt == "csv" or fmt == "tsv":
        ret = "Original_header" + separator + "No_peptide" + separator + \
              "Enzyme" + separator + "Cleaving_pos" + separator + \
              "Peptide_size" + separator + "Peptide_mass" + separator + "pI" +\
              separator + "Sequence"
    return ret

def output_results(output_file, all_seq_digested, fmt, quiet, verbose):
    """Output results of digestion in file and optionally in `stdout`.

154
    :param output_file: the file where to print results, if exist
Nicolas  MAILLET's avatar
Nicolas MAILLET committed
155
156
157
158
159
160
161
162
163
164
165
    :param all_seq_digested: results of digestions
    :param fmt: output format (`csv`, `tsv` or `fasta`)
    :param quiet: quiet mode, no `stdout` message
    :param verbose: verbosity level
    :type output_file: str
    :type all_seq_digested: list(list(:py:class:`~rpg.digest.ResultOneDigestion`))
    :type fmt: str
    :type quiet: bool
    :type verbose: int
    """

166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
    # Not output file
    if not output_file:
        # Header
        header = get_header(fmt)
        # If we have a header to print (csv/tsv)
        if header:
            # Stdout if small verbose
            if verbose < 2 and not quiet:
                print(header)
        # Print all peptides
        for one_seq in all_seq_digested:  # list of list of ResultOneDig
            # For all ResultOneDigestion
            for one_enz_res in one_seq:
                # Print on stdout
                if verbose >= 2:
                    # Big verbose
                    print(one_enz_res.get_more_info())
                    if header:
                        print(header)
                # Default stdout
                if not quiet:
                    print(format(one_enz_res, fmt), end='')
    # Output file exist
    else:
        try:
            with open(output_file, 'w') as outfile:
                # Header
                header = get_header(fmt)
                # If we have a header to print (csv/tsv)
                if header:
                    # Print it in file
                    outfile.write(header + "\n")
                    # Stdout if small verbose
                    if verbose < 2 and not quiet:
                        print(header)
                # Print all peptides
                for one_seq in all_seq_digested:  # list of list of ResultOneDi
                    # For all ResultOneDigestion
                    for one_enz_res in one_seq:
                        # Print results in file
                        outfile.write(format(one_enz_res, fmt))
                        # Print on stdout
                        if verbose >= 2:
                            # Big verbose
                            print(one_enz_res.get_more_info())
                            if header:
                                print(header)
                        # Default stdout
                        if not quiet:
                            print(format(one_enz_res, fmt), end='')
        except IOError:
            handle_errors(output_file + " can't be open in 'w' mode", 0,
                          "File ")
Nicolas  MAILLET's avatar
Nicolas MAILLET committed
219
220
221
222
223
224
225
226
227
228
229
230
231
232

def next_read(file, offset_start, offset_end):
    """ Return each sequence between offsets range of a file
        as a tuple (header, seq) using a generator.
        Can be fasta or fastq, gzipped or not.

    :param file: fasta/fastq file to read
    :param offset_start: offset in the file from where to read
    :param offset_end: offset in the file until where to read
    :type file: str
    :type offset_start: int
    :type offset_end: int
    """
    # Is it a GZIP file?
233
    test_file = open(file, "rb")
Nicolas  MAILLET's avatar
Nicolas MAILLET committed
234
235
236
237
238
239
240
    # Get the first values
    magic = test_file.read(2)
    # Close the file
    test_file.close()

    # Open the file, GZIP or not
    with (gzip.open(file, "rb") if magic == b"\x1f\x8b"
241
          else open(file, "rb")) as in_file:
Nicolas  MAILLET's avatar
Nicolas MAILLET committed
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
        first_line = in_file.readline().decode('utf-8')
        # FASTQ file
        if first_line.startswith("@"):
            # Go to starting offset
            in_file.seek(offset_start)
            # Set current offset
            beg_line_offset = offset_start
            # Read each line from this point
            for line in in_file:
                # Consider this line as a header
                header = line.decode('utf-8').strip()
                # It is a proper fastq header
                if header.startswith("@"):
                    # The beginning of header is in the offset range
                    if beg_line_offset < offset_end:
                        # Get the sequence
                        sequence = in_file.readline().decode('utf-8').strip()
                        # Skip the two next lines
                        in_file.readline()
                        in_file.readline()
                        # Return header and sequence and wait for the next one
                        yield (header, sequence.upper())
                    # Out of offset, stop this loop
                    else:
                        break
                # Current offset
                beg_line_offset = in_file.tell()

        # (multi?)FASTA file
        elif first_line.startswith(">"):
            # Go to starting offset
            in_file.seek(offset_start)
            # Set current offset
            beg_line_offset = offset_start
            # Read each line from this point
            for line in in_file:
                # Consider this line as a header
                header = line.decode('utf-8').strip()
                # It is a proper fasta header
                if header.startswith(">"):
                    # The beginning of header is in the offset range
                    if beg_line_offset < offset_end:
                        # Get the sequence
                        sequence = in_file.readline().decode('utf-8').strip()
                        # Get current offset
                        current_offset = in_file.tell()
                        # Get next line
                        next_l = in_file.readline().decode('utf-8').strip()
                        # While this next line is not a fasta header...
                        while next_l and not next_l.startswith(">"):
                            # Add this to the Sequence
                            sequence += next_l
                            # Get current offset
                            current_offset = in_file.tell()
                            # Get next line
                            next_l = in_file.readline().decode('utf-8').strip()
                        # Next line is a fasta header, go back to its beginning
                        in_file.seek(current_offset)
                        # Return header and sequence and wait for the next one
                        yield (header, sequence.upper())
                    # Out of offset, stop this loop
                    else:
                        break
                # Current offset
                beg_line_offset = in_file.tell()
        # Not a valid file
        else:
            # Stop the generator with the error to show
            raise ValueError("input file format not recognized (%s)"\
                             "." % first_line[0])