Source code for maspy.peptidemethods

"""
Provides functions to work with peptide sequences, mass to charge ratios and
modifications, and for the calculation of masses.
"""

#  Copyright 2015-2017 David M. Hollenstein, Jakob J. Hollenstein
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.

######################## Python 2 and 3 compatibility #########################
from __future__ import absolute_import, division, print_function
from __future__ import unicode_literals
from future.utils import viewitems, viewkeys, viewvalues, listitems, listvalues

try:
    #python 2.7
    from itertools import izip as zip
except ImportError:
    #python 3 series
    pass
################################################################################

import itertools
import re

import pyteomics.mass

import maspy.constants


def digestInSilico(proteinSequence, cleavageRule='[KR]', missedCleavage=0,
                   removeNtermM=True, minLength=5, maxLength=55):
    """Returns a list of peptide sequences and cleavage information derived
    from an in silico digestion of a polypeptide.

    :param proteinSequence: amino acid sequence of the polypeptide to be
        digested; an empty sequence yields an empty list
    :param cleavageRule: cleavage rule expressed in a regular expression, see
        :attr:`maspy.constants.expasy_rules`
    :param missedCleavage: number of allowed missed cleavage sites
    :param removeNtermM: bool, True to consider also peptides with the
        N-terminal methionine of the protein removed
    :param minLength: int, only yield peptides with length >= minLength
    :param maxLength: int, only yield peptides with length <= maxLength

    :returns: a list of resulting peptide entries. Protein positions start with
        ``1`` and end with ``len(proteinSequence)``. ::

            [(peptide amino acid sequence,
              {'startPos': int, 'endPos': int, 'missedCleavage': int}
              ), ...
             ]

    .. note::
        This is a regex example for specifying N-terminal cleavage at lysine
        sites ``\\w(?=[K])``
    """
    digestionresults = list()

    def addPeptide(startPos, endPos, numMissed):
        #Store the peptide spanning the 0-based, end-exclusive slice
        #[startPos:endPos] if it passes the length filter. Reported positions
        #are 1-based and inclusive, hence only startPos is shifted.
        if minLength <= endPos - startPos <= maxLength:
            info = {'startPos': startPos + 1,
                    'endPos': endPos,
                    'missedCleavage': numMissed
                    }
            digestionresults.append((proteinSequence[startPos:endPos], info))

    #The end of the protein is always added as a cleavage site; using a set
    #avoids a duplicate entry if the protein already ends with a cleavage site
    cleavagePositions = set(
        match.end() for match in re.finditer(cleavageRule, proteinSequence)
    )
    cleavagePositions.add(len(proteinSequence))
    cleavagePosList = sorted(cleavagePositions)
    numCleavageSites = len(cleavagePosList)

    #Not more cleavage sites can be missed than are present. A separate name
    #is used so the ``missedCleavage`` argument is never overwritten.
    maxMissed = min(missedCleavage, numCleavageSites - 1)

    #Generate protein n-terminal peptides after methionine removal. Slicing
    #instead of indexing keeps an empty protein sequence from raising an
    #IndexError.
    if removeNtermM and proteinSequence[:1] == 'M':
        for numMissed in range(maxMissed + 1):
            addPeptide(1, cleavagePosList[numMissed], numMissed)

    #Generate protein n-terminal peptides. If position 0 itself is a cleavage
    #site, these peptides are already produced by the loop below.
    if cleavagePosList[0] != 0:
        for numMissed in range(maxMissed + 1):
            addPeptide(0, cleavagePosList[numMissed], numMissed)

    #Generate all remaining peptides, including the c-terminal peptides
    for siteIndex in range(numCleavageSites):
        for numMissed in range(maxMissed + 1):
            nextSiteIndex = siteIndex + numMissed + 1
            if nextSiteIndex < numCleavageSites:
                addPeptide(cleavagePosList[siteIndex],
                           cleavagePosList[nextSiteIndex],
                           numMissed
                           )

    return digestionresults
# --- Functions to work with peptide sequences --- #
def calcPeptideMass(peptide, **kwargs):
    """Calculate the mass of a peptide.

    :param aaMass: A dictionary with the monoisotopic masses of amino acid
        residues, by default :attr:`maspy.constants.aaMass`
    :param aaModMass: A dictionary with the monoisotopic mass changes of
        modifications, by default :attr:`maspy.constants.aaModMass`
    :param elementMass: A dictionary with the masses of chemical elements, by
        default ``pyteomics.mass.nist_mass``
    :param peptide: peptide sequence, modifications have to be written in the
        format "[modificationId]" and "modificationId" has to be present in
        ``aaModMass``

    :returns: mass of the peptide, i.e. the sum of all residue masses, the
        mass changes of all modifications and the mass of one water (two
        hydrogen and one oxygen, taken from ``elementMass``)

    :raises ValueError: if the peptide still contains a "[" after all
        modifications known to ``aaModMass`` have been removed

    #TODO: change to a more efficient way of calculating the modified mass, by
    first extracting all present modifications and then looking up their masses.
    """
    #The module level defaults are only looked up if the respective keyword
    #argument was not supplied by the caller.
    aaMass = (kwargs['aaMass'] if 'aaMass' in kwargs
              else maspy.constants.aaMass)
    aaModMass = (kwargs['aaModMass'] if 'aaModMass' in kwargs
                 else maspy.constants.aaModMass)
    elementMass = (kwargs['elementMass'] if 'elementMass' in kwargs
                   else pyteomics.mass.nist_mass)

    addModMass = float()
    unmodPeptide = peptide
    for modId in aaModMass:
        modSymbol = '[' + modId + ']'
        numMod = peptide.count(modSymbol)
        if numMod > 0:
            unmodPeptide = unmodPeptide.replace(modSymbol, '')
            addModMass += aaModMass[modId] * numMod

    #Any bracket that is left over belongs to an unknown modification. The
    #remaining sequence is reported in the error instead of being printed.
    if '[' in unmodPeptide:
        raise ValueError('The peptide contains a modification not present in '
                         'aaModMass: ' + unmodPeptide
                         )

    unmodPeptideMass = sum(aaMass[i] for i in unmodPeptide)
    #Add the mass of one water for the free peptide termini
    unmodPeptideMass += elementMass['H'][0][0]*2 + elementMass['O'][0][0]
    return unmodPeptideMass + addModMass
def removeModifications(peptide):
    """Removes all modifications from a peptide string and return the plain
    amino acid sequence.

    Every "[" is paired with the next following "]" and everything from the
    opening to the closing bracket is removed. A "]" without a preceding "["
    is left untouched.

    :param peptide: str, peptide sequence, modifications have to be written in
        the format "[modificationName]"

    :returns: amino acid sequence of ``peptide`` without any modifications

    :raises IndexError: if a "[" is not followed by a closing "]"
    """
    #A single left-to-right pass; a stray "]" in front of a modification can
    #therefore not cause the sequence to be processed over and over again.
    plainSequence = re.sub(r'\[[^\]]*\]', '', peptide)

    #IndexError is kept as exception type for backwards compatibility
    if '[' in plainSequence:
        raise IndexError('The peptide contains a "[" without a closing "]": '
                         + peptide
                         )
    return plainSequence
def returnModPositions(peptide, indexStart=1, removeModString='UNIMOD:'):
    """Determines the amino acid positions of all present modifications.

    :param peptide: peptide sequence, modifications have to be written in the
        format "[modificationName]"
    :param indexStart: returned amino acids positions of the peptide start with
        this number (first amino acid position = indexStart)
    :param removeModString: string to remove from the returned modification
        name; nothing is removed if it is empty or None

    :return: {modificationName:[position1, position2, ...], ...}

    :raises ValueError: if a "[" is not closed by a "]" before the next "[" or
        the end of the peptide

    #TODO: adapt removeModString to the new unimod ids in
    #maspy.constants.aaModComp ("UNIMOD:X" -> "u:X") -> also change unit tests.
    """
    unidmodPositionDict = dict()
    numResidues = 0 #number of characters outside of brackets seen so far
    cursor = 0 #index in peptide from where the next "[" is searched
    while True:
        openPos = peptide.find('[', cursor)
        if openPos == -1:
            break

        closePos = peptide.find(']', openPos)
        currModification = peptide[openPos+1:closePos]
        #An unclosed or nested bracket cannot be assigned to a position
        if closePos == -1 or '[' in currModification:
            raise ValueError('The peptide contains a "[" without a matching '
                             '"]": ' + peptide
                             )

        #A modification belongs to the amino acid directly in front of it;
        #n-terminal modifications are moved to the first position
        numResidues += openPos - cursor
        currPosition = max(numResidues - 1, 0) + indexStart

        if removeModString:
            currModification = currModification.replace(removeModString, '')
        unidmodPositionDict.setdefault(currModification, list()
                                       ).append(currPosition)
        cursor = closePos + 1
    return unidmodPositionDict
# --- Functions to transform mass to mz values --- #
def calcMhFromMz(mz, charge):
    """Convert an observed mz value into the corresponding MH+ value.

    :param mz: float, mass to charge ratio (Dalton / charge)
    :param charge: int, charge state of the ion observed at ``mz``

    :returns: mass to charge ratio of the mono protonated ion (charge = 1)
    """
    protonMass = maspy.constants.atomicMassProton
    #Total mass of the ion minus all but one of its protons
    return mz * charge - protonMass * (charge - 1)
def calcMzFromMh(mh, charge):
    """Convert an MH+ value into the mz value of a given charge state.

    :param mh: float, mass to charge ratio (Dalton / charge) of the mono
        protonated ion
    :param charge: int, charge state for which the mz value is calculated

    :returns: mass to charge ratio of the specified charge state
    """
    protonMass = maspy.constants.atomicMassProton
    #Add the protons missing for the requested charge state, then divide the
    #total mass by the charge
    return (mh + protonMass * (charge - 1)) / charge
def calcMzFromMass(mass, charge):
    """Convert the mass of a peptide into the mz value of a given charge state.

    :param mass: float, exact non protonated mass
    :param charge: int, charge state for which the mz value is calculated

    :returns: mass to charge ratio of the specified charge state
    """
    protonMass = maspy.constants.atomicMassProton
    #One proton is added per charge before dividing by the charge
    return (mass + protonMass * charge) / charge
def calcMassFromMz(mz, charge):
    """Convert an observed mz value into the mass of the uncharged peptide.

    :param mz: float, mass to charge ratio (Dalton / charge)
    :param charge: int, charge state of the ion observed at ``mz``

    :returns: non protonated mass (charge = 0)
    """
    protonMass = maspy.constants.atomicMassProton
    #Remove the proton contribution per charge, then scale by the charge
    return (mz - protonMass) * charge