Commit 8aa1b417 authored by Nicolas  MAILLET's avatar Nicolas MAILLET

Initial commit

parents
=========
CHANGELOG
=========
\ No newline at end of file
This diff is collapsed.
==========
User Guide
==========
Todo
\ No newline at end of file
# Minimal makefile for Sphinx documentation
#
# You can set these variables from the command line.
SPHINXOPTS =
SPHINXBUILD = sphinx-build
SPHINXPROJ = RapidPeptidesGenerator
SOURCEDIR = ./source
BUILDDIR = _build
# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
.PHONY: help Makefile
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
\ No newline at end of file
.. include:: ../../CHANGELOG.rst
\ No newline at end of file
# -*- coding: utf-8 -*-
#
# Configuration file for the Sphinx documentation builder.
#
# This file does only contain a selection of the most common options. For a
# full list see the documentation:
# http://www.sphinx-doc.org/en/stable/config
# -- Path setup --------------------------------------------------------------
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
import os
import sys
# Put the ``rpg`` source directory (two levels above this file) first on
# sys.path so that its modules can be imported by their bare name, which is
# how the package's own modules import each other (e.g. ``import core``).
# NOTE(review): the automodule directives target ``rpg.<module>``; that also
# needs the directory *containing* ``rpg`` to be importable -- confirm the
# package is installed in the docs build environment.
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__),
                                                '../../rpg')))
# -- Project information -----------------------------------------------------
project = 'RapidPeptidesGenerator'
copyright = '2018, Nicolas Maillet'
author = 'Nicolas Maillet'
# The short X.Y version
version = ''
# The full version, including alpha/beta/rc tags
release = '0.5.1'
# -- General configuration ---------------------------------------------------
# If your documentation needs a minimal Sphinx version, state it here.
#
# needs_sphinx = '1.0'
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
'sphinx.ext.autodoc',
'sphinx.ext.coverage',
'sphinx.ext.viewcode',
'sphinx.ext.githubpages',
'sphinx.ext.napoleon',
]
napoleon_google_docstring = False
napoleon_use_param = False
napoleon_use_ivar = True
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
# The suffix(es) of source filenames.
# You can specify multiple suffix as a list of string:
#
# source_suffix = ['.rst', '.md']
source_suffix = '.rst'
# The master toctree document.
master_doc = 'index'
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
#
# Set explicitly to English: recent Sphinx releases (>= 5.0) reject
# ``language = None`` with a warning and fall back to 'en' anyway, so this
# keeps the same output without the warning.
language = 'en'
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path .
exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'sphinx'
# -- Options for HTML output -------------------------------------------------
# The theme to use for HTML and HTML Help pages. Major themes that come with
# Sphinx are currently 'default' and 'sphinxdoc'.
html_theme = 'sphinx_rtd_theme'
# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
# documentation.
#
# html_theme_options = {}
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
# html_static_path = ['_static']
# Custom sidebar templates, must be a dictionary that maps document names
# to template names.
#
# The default sidebars (for documents that don't match any pattern) are
# defined by theme itself. Builtin themes are using these templates by
# default: ``['localtoc.html', 'relations.html', 'sourcelink.html',
# 'searchbox.html']``.
#
# html_sidebars = {}
# -- Options for HTMLHelp output ---------------------------------------------
# Output file base name for HTML help builder.
htmlhelp_basename = 'RapidPeptidesGeneratordoc'
html_short_title = 'RPG'
# -- Options for LaTeX output ------------------------------------------------
latex_elements = {
# The paper size ('letterpaper' or 'a4paper').
#
# 'papersize': 'letterpaper',
# The font size ('10pt', '11pt' or '12pt').
#
# 'pointsize': '10pt',
# Additional stuff for the LaTeX preamble.
#
# 'preamble': '',
# Latex figure (float) alignment
#
# 'figure_align': 'htbp',
}
# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
# author, documentclass [howto, manual, or own class]).
latex_documents = [
(master_doc, 'RapidPeptidesGenerator.tex', 'RapidPeptidesGenerator Documentation',
'Nicolas Maillet', 'manual'),
]
# -- Options for manual page output ------------------------------------------
# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
(master_doc, 'rapidpeptidesgenerator', 'RapidPeptidesGenerator Documentation',
[author], 1)
]
# -- Options for Texinfo output ----------------------------------------------
# Grouping the document tree into Texinfo files. List of tuples
# (source start file, target name, title, author,
# dir menu entry, description, category)
texinfo_documents = [
(master_doc, 'RapidPeptidesGenerator', 'RapidPeptidesGenerator Documentation',
author, 'RapidPeptidesGenerator', 'One line description of project.',
'Miscellaneous'),
]
# -- Extension configuration -------------------------------------------------
\ No newline at end of file
This diff is collapsed.
.. RapidPeptidesGenerator documentation master file, created by
sphinx-quickstart on Thu Mar 1 16:20:03 2018.
You can adapt this file completely to your liking, but it should at least
contain the root `toctree` directive.
Welcome to RapidPeptidesGenerator's documentation!
==================================================
In silico protein digestion
Proteases, also known as proteolytic enzymes, have been studied for more than 80 years. (1)
Those enzymes are widely used in industry, medicine and as a biological research tool, for example in protein characterization or more generally in proteomics and proteogenomics. (2)
Recently, interest in proteases has gained importance due to advancements in mass spectrometry techniques used in proteomics and proteogenomics. In "bottom-up" analysis, using tandem mass spectrometry (MS/MS), the optimal peptide size range is 600–5,000 Da (3). Therefore, for bottom-up approaches, protein digestions are required. Digestions are performed using one or several proteases, like trypsin, pepsin or thrombin. Each enzyme has specific cleavage sites relying on solvent accessibility, pH, temperature, etc. Using different proteases individually or in combination creates a unique set of peptides. Performing multiple digestions can increase protein coverage and overall confidence in protein identification if the cleavage sites are different. It is not always easy to determine which combination of enzymes will lead to a set of peptides suitable for MS/MS analysis. Moreover, the cost of some enzymes makes it difficult to try several combinations in order to avoid redundant cleavage sites. Few software tools exist to predict cleavage sites of proteases in protein sequences. Among those, the most commonly used are PeptideCutter from the ExPASy server (4), and a module integrated in MaxQuant. (5)
PeptideCutter includes 38 enzymes and chemicals. The digestion of one sequence by one or several enzymes leads to detailed results including the positions of cleavage sites, the resulting peptide sequences, peptide lengths (in amino acids) and peptide masses (in daltons). Three main features are missing.
The first one is the impossibility of digesting more than one sequence at a time. In order to thoroughly analyze the behavior of a specific combination of enzymes, it is important to try this combination on many different proteins. Using PeptideCutter for this is time-consuming and inefficient.
The second drawback of this tool is how combinations of enzymes are used. In PeptideCutter, all selected enzymes are supposed to be present at the same time during digestion. It is therefore difficult to simulate distinct digestions. One has to run the software as many times as the number of distinct digestions, multiplied by the number of sequences concerned.
The last issue is the impossibility for the user to define a new enzyme. As previously mentioned, interest in proteases is quickly growing, and new or more specific enzymes (denoted as "Sequencing Grade", or SG) are being developed. Depending on the company manufacturing those SG enzymes, specificity can change. It is important for users to be able to easily adapt the software to the products they may already have.
MaxQuant partially overcomes those issues. Users can create new enzymes by specifying between which amino acids cleavages occur. Unfortunately, this definition is not sufficient to properly define several enzymes. For example, the definition of trypsin in MaxQuant lacks some exceptions. It is defined as cleaving after K or R but not before P. However, it has been reported (6) that although P usually blocks the action when found after K, this is not true when K is preceded by W: a cleavage then occurs after K in the WKP motif. Currently, it is not possible to create such rules in MaxQuant.
This poster presents Rapid Peptides Generator (RPG), a new software dedicated to predicting cleavage sites of proteases. RPG is a standalone Python tool taking a (multi-)fasta/fastq file of proteins as input and digesting each of them. The digestion mode can be either concurrent, where all enzymes are present at the same time during digestion, or sequential. In sequential mode, each protein is digested by each enzyme, one by one. Resulting peptides carry the same information as in PeptideCutter, as well as an estimate of the isoelectric point (pI) of each peptide. The isoelectric point is the pH at which a peptide carries no net electrical charge, and a good approximation can be computed for small molecules. Results are output in a multi-fasta, CSV or TSV file. Currently, there are 42 enzymes and chemicals in RPG. Users can easily design new enzymes, using a simple yet powerful grammar. This grammar allows users to design complex enzymes like trypsin or thrombin, including many exceptions and different cleavage sites. User-defined enzymes are then interpreted by RPG and can be used like the regular ones already available in the software.
Cleavage results are identical for all enzymes available in both PeptideCutter and RPG, with the exception of enzymes for which PeptideCutter cannot be as specific as RPG.
1. Neurath, H. Proteolytic enzymes, past and future. Proc Natl Acad Sci USA 96, 10962–10963 (1999).
2. Nesvizhskii, A. I. Proteogenomics: concepts, applications and computational strategies. Nature Methods 11, 1114–1125 (2014).
3. Engel, L., Saveliev, S., Urh, M., Simpson, D., Jones, R. and Wood, K. Using Endoproteinases Asp-N and Glu-C to Improve Protein Characterization. [Internet] . [cited: 2018, 04, 20]. Available from: http://france.promega.com/resources/pubhub/using-endoproteinases-asp-n-and-glu-c-to-improve-protein-characterization/
4. Gasteiger E. et al. (2005) Protein Identification and Analysis Tools on the ExPASy Server. In: Walker J.M. (eds) The Proteomics Protocols Handbook. Humana Press
5. Cox, J. & Mann, M. MaxQuant enables high peptide identification rates, individualized p.p.b.-range mass accuracies and proteome-wide protein quantification. Nature Biotechnology 26, 1367–1372 (2008).
6. Keil, Borivoj. Specificity of Proteolysis. 10.1007/978-3-642-48380-6. (1992)
.. toctree::
:maxdepth: 2
:caption: Contents:
:numbered:
readme.rst
enzymes.rst
modules.rst
changelog.rst
Indices and tables
==================
* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`
===
API
===
.. toctree::
:maxdepth: 4
rpg
.. include:: ../../README.rst
\ No newline at end of file
rpg package
===========
Submodules
----------
rpg.RapidPeptidesGenerator module
---------------------------------
.. automodule:: rpg.RapidPeptidesGenerator
:members:
:undoc-members:
:show-inheritance:
rpg.core module
---------------
.. automodule:: rpg.core
:members:
:undoc-members:
:show-inheritance:
rpg.digest module
-----------------
.. automodule:: rpg.digest
:members:
:undoc-members:
:show-inheritance:
rpg.enzyme module
-----------------
.. automodule:: rpg.enzyme
:members:
:undoc-members:
:show-inheritance:
rpg.enzymes\_definition module
------------------------------
.. automodule:: rpg.enzymes_definition
:members:
:undoc-members:
:show-inheritance:
rpg.rule module
---------------
.. automodule:: rpg.rule
:members:
:undoc-members:
:show-inheritance:
rpg.sequence module
-------------------
.. automodule:: rpg.sequence
:members:
:undoc-members:
:show-inheritance:
Module contents
---------------
.. automodule:: rpg
:members:
:undoc-members:
:show-inheritance:
This diff is collapsed.
"""Contains everything related to RPG software"""
# -*- coding: utf-8 -*-
########################################################################
# Author: Nicolas Maillet #
# Copyright © 2018 Institut Pasteur, Paris. #
# See the COPYRIGHT file for details #
# #
# This file is part of Rapid Peptide Generator (RPG) software. #
# #
# RPG is free software: you can redistribute it and/or modify #
# it under the terms of the GNU General Public License as published by #
# the Free Software Foundation, either version 3 of the License, or #
# any later version. #
# #
# RPG is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
# GNU General Public License for more details. #
# #
# You should have received a copy of the GNU General Public license #
# along with RPG (LICENSE file). #
# If not, see <http://www.gnu.org/licenses/>. #
########################################################################
"""Contains generic functions and global variables used by RPG"""
import sys
# One-letter codes accepted in a sequence: the 20 standard amino acids plus
# J, O, U, the ambiguity codes B, X, Z and the symbols "#" and "*".
AMINOACIDS = ["A", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N",
              "O", "P", "Q", "R", "S", "T", "U", "V", "W", "Y", "B", "X", "Z",
              "#", "*"]
"""All character accepted in a peptide."""
# Per-residue mass, keyed by the same characters as AMINOACIDS.
# "B", "X", "Z", "#" and "*" map to 0.0, so they add nothing when the masses
# of a peptide's characters are summed; "I", "J" and "L" share one value.
# NOTE(review): values look like average residue masses in daltons -- confirm.
AA_MASS_AVERAGE = {"*" : 0.0,
                   "A" : 71.0788,
                   "C" : 103.1388,
                   "D" : 115.0886,
                   "E" : 129.1155,
                   "F" : 147.1766,
                   "G" : 57.0519,
                   "H" : 137.1411,
                   "I" : 113.1594,
                   "J" : 113.1594,
                   "K" : 128.1741,
                   "L" : 113.1594,
                   "M" : 131.1926,
                   "N" : 114.1038,
                   "O" : 237.3018,
                   "P" : 97.1167,
                   "Q" : 128.1307,
                   "R" : 156.1875,
                   "S" : 87.0782,
                   "T" : 101.1051,
                   "U" : 150.0388,
                   "V" : 99.1326,
                   "W" : 186.2132,
                   "Y" : 163.1760,
                   "B" : 0.0,
                   "X" : 0.0,
                   "Z" : 0.0,
                   "#" : 0.0}
"""Mass of all amino acids."""
# Added once per peptide, on top of the summed residue masses, when the
# Peptide class computes a peptide mass.
WATER_MASS = 18.01528
"""Mass of a water molecule."""
# pKa values read by the isoelectric point computation of the Peptide class:
# the two termini ("Nterm", "Cterm") and the seven residues listed here.
# Residues absent from this table are not taken into account for the pI.
AA_PKA = {"Nterm" : 8.0,
          "C" : 8.3,
          "D" : 4.1,
          "E" : 4.1,
          "H" : 6.0,
          "K" : 10.8,
          "R" : 12.5,
          "Y" : 10.9,
          "Cterm" : 3.1}
"""pKa of important amino acid to compute pI."""
def handle_errors(message="", err=1, error_type=""):
    """Report an error, a warning or a plain message on `stderr`.

    :param message: text to print
    :param err: kind of message (see below)
    :param error_type: prefix printed in front of the message
    :type message: str
    :type err: int
    :type error_type: str

    Behaviour according to `err`:

    - **0**: critical error, prints ``<error_type>Error: <message>`` then
      exits with status 1
    - **2**: prints ``<error_type><message>`` without any label
    - any other value (default **1**): warning, prints
      ``<error_type>Warning: <message>`` and returns
    """
    if err == 2:
        # Raw message: no "Error: " / "Warning: " label
        print(error_type + message, file=sys.stderr)
        return
    is_fatal = err == 0
    label = "Error: " if is_fatal else "Warning: "
    print(error_type + label + message, file=sys.stderr)
    if is_fatal:
        # Critical errors stop the program
        sys.exit(1)
def get_header(fmt="fasta"):
    """Build the column header line for `csv` or `tsv` output files.

    :param fmt: output format
    :type fmt: str

    :return: header line without trailing newline, or None
    :rtype: str or None

    Columns are, in order: Original_header, No_peptide, Enzyme,
    Cleaving_pos, Peptide_size, Peptide_mass, pI and Sequence, separated
    by a comma (`csv`) or a tabulation (`tsv`).

    There is no header for `fasta` or any other format: None is returned.
    """
    if fmt == "csv":
        delimiter = ","
    elif fmt == "tsv":
        delimiter = "\t"
    else:
        # fasta or unknown format: nothing to print
        return None
    columns = ("Original_header", "No_peptide", "Enzyme", "Cleaving_pos",
               "Peptide_size", "Peptide_mass", "pI", "Sequence")
    return delimiter.join(columns)
def output_results(output_file, all_seq_digested, fmt, quiet, verbose):
    """Write digestion results to a file and, optionally, to `stdout`.

    :param output_file: path of the file to write
    :param all_seq_digested: results of digestions
    :param fmt: output format (`csv`, `tsv` or `fasta`)
    :param quiet: quiet mode, no regular `stdout` output
    :param verbose: verbosity level
    :type output_file: str
    :type all_seq_digested: list(list(:py:class:`~rpg.digest.ResultOneDigestion`))
    :type fmt: str
    :type quiet: bool
    :type verbose: int

    The header (`csv`/`tsv` only) is written once at the top of the file.
    On `stdout` it is printed once when `verbose` is below 2 and `quiet`
    is not set, or before each result when `verbose` is 2 or more.
    If the file can not be written, a critical error is reported through
    :py:func:`handle_errors`.
    """
    try:
        with open(output_file, 'w') as outfile:
            header = get_header(fmt)
            if header:
                # File always gets the header; stdout only in normal mode
                outfile.write(header + "\n")
                if not quiet and verbose < 2:
                    print(header)
            # Walk every ResultOneDigestion of every sequence, in order
            for digestion in (res for per_seq in all_seq_digested
                              for res in per_seq):
                outfile.write(format(digestion, fmt))
                if verbose >= 2:
                    # Detailed mode: extra information, then the header
                    # again so each result is preceded by its columns
                    print(digestion.get_more_info())
                    if header:
                        print(header)
                if not quiet:
                    # Regular stdout output mirrors the file content
                    print(format(digestion, fmt), end='')
    except IOError:
        handle_errors(output_file + " can't be open in 'w' mode", 0, "File ")
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
# -*- coding: utf-8 -*-
########################################################################
# Author: Nicolas Maillet #
# Copyright © 2018 Institut Pasteur, Paris. #
# See the COPYRIGHT file for details #
# #
# This file is part of Rapid Peptide Generator (RPG) software. #
# #
# RPG is free software: you can redistribute it and/or modify #
# it under the terms of the GNU General Public License as published by #
# the Free Software Foundation, either version 3 of the License, or #
# any later version. #
# #
# RPG is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
# GNU General Public License for more details. #
# #
# You should have received a copy of the GNU General Public license #
# along with RPG (LICENSE file). #
# If not, see <http://www.gnu.org/licenses/>. #
########################################################################
"""Contains classes and function related to sequences"""
import core
class Peptide:
    """Definition of a peptide, containing the header of its original
    sequence, an amino acid sequence, the name of the enzyme used to
    produce it and more information.

    :param header: header of the peptide
    :param sequence: sequence in amino acids
    :param enzyme_name: name of the enzyme used
    :param nb_peptide: number of this peptide (default: 0)
    :param position: position of cleavage on the original sequence (default: 0)
    :type header: str
    :type sequence: str
    :type enzyme_name: str
    :type nb_peptide: int
    :type position: int

    :var size: size of the peptide
    :var mass: mass of the peptide
    :var p_i: pI of the peptide
    :vartype size: int
    :vartype mass: float
    :vartype p_i: float

    :note: every character of `sequence` must be a key of
           :py:const:`~rpg.core.AA_MASS_AVERAGE`, otherwise a `KeyError`
           is raised while computing the mass.
    """
    def __init__(self, header, sequence, enzyme_name, nb_peptide=0,
                 position=0):
        self.header = header  # header of this peptide
        self.sequence = sequence  # peptide sequence
        self.enzyme_name = enzyme_name  # name of the enzyme used
        self.nb_peptide = nb_peptide  # number of this peptide
        self.position = position  # position of cleavage
        self.size = len(sequence)  # size of the peptide
        # Mass of the peptide: one water molecule plus every residue.
        # Kept as an explicit loop so the float accumulation order (and
        # therefore the rounded result) never depends on sum() internals.
        tmp_mass = core.WATER_MASS
        for residue in sequence:
            tmp_mass += core.AA_MASS_AVERAGE[residue]
        self.mass = round(tmp_mass, 5)  # mass of the peptide
        self.p_i = self.get_isoelectric_point()

    # self representation for print
    def __repr__(self):
        return "".join(["Original header: ", self.header,
                        "\nNo. peptide: ", str(self.nb_peptide),
                        "\nEnzyme: ", self.enzyme_name,
                        "\nCleav. pos: ", str(self.position),
                        "\nPep. size: ", str(self.size),
                        "\nPep. mass: ", str(self.mass),
                        "\nPep. pI: ", str(self.p_i),
                        "\nSequence: ", self.sequence, "\n"])

    # Equality between two Peptides
    def __eq__(self, other):
        # Same type test as before (self must be an instance of other's
        # class). Objects without a __dict__ (e.g. a bare object()) can
        # never be equal: return False instead of raising AttributeError.
        if isinstance(self, other.__class__) and hasattr(other, "__dict__"):
            return self.__dict__ == other.__dict__
        return False

    # Create a clean output according to fmt
    def __format__(self, fmt):
        """Format the peptide as `fasta`, `csv` or, for any other value
        of `fmt`, tab-separated text. No trailing newline is added.
        """
        fields = [self.header, str(self.nb_peptide), self.enzyme_name,
                  str(self.position), str(self.size), str(self.mass),
                  str(self.p_i)]
        if fmt == "fasta":
            # ">" + description line, then the sequence on its own line
            return ">" + "_".join(fields) + "\n" + self.sequence
        separator = "," if fmt == "csv" else "\t"
        # Sequence is simply the last column
        return separator.join(fields + [self.sequence])

    def get_isoelectric_point(self):
        """Compute isoelectric point (pI) of the peptide using
        binary search.

        The search runs on pH between 0 and 14, starting at 7, and stops
        once the current value is within 0.01 of both bounds.

        :return: computed pI, rounded to 2 decimals
        :rtype: float

        :note: This function uses :py:const:`~rpg.core.AA_PKA`
        """
        # Residue counts and pKa values do not change during the search:
        # read them once instead of at every iteration.
        pka = core.AA_PKA
        seq = self.sequence
        nb_d = seq.count('D')
        nb_e = seq.count('E')
        nb_c = seq.count('C')
        nb_y = seq.count('Y')
        nb_h = seq.count('H')
        nb_k = seq.count('K')
        nb_r = seq.count('R')

        ph_val = 7  # Neutral pH, starting point of binary search
        ph_min = 0.0  # Minimal pH
        ph_max = 14.0  # Maximal pH
        precision = 0.01
        # While we are not precise enough
        while (ph_val - ph_min > precision) or (ph_max - ph_val > precision):
            # Negative charges: C-terminus, D, E, C, Y
            qn1 = -1.0 / (1.0 + pow(10, (pka["Cterm"] - ph_val)))
            qn2 = -nb_d / (1.0 + pow(10, (pka["D"] - ph_val)))
            qn3 = -nb_e / (1.0 + pow(10, (pka["E"] - ph_val)))
            qn4 = -nb_c / (1.0 + pow(10, (pka["C"] - ph_val)))
            qn5 = -nb_y / (1.0 + pow(10, (pka["Y"] - ph_val)))
            # Positive charges: H, N-terminus, K, R
            qp1 = nb_h / (1.0 + pow(10, (ph_val - pka["H"])))
            qp2 = 1.0 / (1.0 + pow(10, (ph_val - pka["Nterm"])))
            qp3 = nb_k / (1.0 + pow(10, (ph_val - pka["K"])))
            qp4 = nb_r / (1.0 + pow(10, (ph_val - pka["R"])))
            # Net charge at this pH (summation order kept unchanged)
            nq_final = qn1 + qn2 + qn3 + qn4 + qn5 + qp1 + qp2 + qp3 + qp4
            # Net charge is negative: good pH value must be smaller
            if nq_final < 0.0:
                ph_max = ph_val
                ph_val -= (ph_max - ph_min) / 2
            # Net charge is positive or null: good pH value must be bigger
            else:
                ph_min = ph_val
                ph_val += (ph_max - ph_min) / 2
        # We got a good enough pH value
        return round(ph_val, 2)
class Sequence:
    """Definition of an amino acid sequence to digest.

    :param header: header of the sequence
    :param sequence: sequence itself
    :type header: str
    :type sequence: str
    """
    def __init__(self, header, sequence):
        self.header = header  # header of this sequence
        self.sequence = sequence  # amino acid sequence

    # self representation for print
    def __repr__(self):
        return "Header: " + self.header + "\nSequence: " + self.sequence + "\n"

    # Equality between two Sequences
    def __eq__(self, other):
        # Same type test as before (self must be an instance of other's
        # class). Objects without a __dict__ (e.g. a bare object()) can
        # never be equal: return False instead of raising AttributeError.
        if isinstance(self, other.__class__) and hasattr(other, "__dict__"):
            return self.__dict__ == other.__dict__
        return False
def check_sequence(seq):
"""Validate an input sequence. Each amino acid should be in
:py:const:`~rpg.core.AMINOACIDS`.
:param seq: the sequence to check
:type seq: str