Source code for maspy.peptidemethods

"""
Provides functions to work with peptide sequences, mass to charge ratios,
modifications and the calculation of masses.
"""

#  Copyright 2015-2017 David M. Hollenstein, Jakob J. Hollenstein
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.

######################## Python 2 and 3 compatibility #########################
from __future__ import absolute_import, division, print_function
from __future__ import unicode_literals
from future.utils import viewitems, viewkeys, viewvalues, listitems, listvalues

try:
    #python 2.7
    from itertools import izip as zip
except ImportError:
    #python 3 series
    pass
################################################################################

import itertools
import re

import pyteomics.mass

import maspy.constants


def digestInSilico(proteinSequence, cleavageRule='[KR]', missedCleavage=0,
                   removeNtermM=True, minLength=5, maxLength=55):
    """Returns a list of peptide sequences and cleavage information derived
    from an in silico digestion of a polypeptide.

    :param proteinSequence: amino acid sequence of the polypeptide to be
        digested; an empty sequence yields no peptides with length > 0
    :param cleavageRule: cleavage rule expressed in a regular expression, see
        :attr:`maspy.constants.expasy_rules`. The end position of each regex
        match is used as a cleavage position.
    :param missedCleavage: number of allowed missed cleavage sites
    :param removeNtermM: bool, True to consider also peptides with the
        N-terminal methionine of the protein removed
    :param minLength: int, only yield peptides with length >= minLength
    :param maxLength: int, only yield peptides with length <= maxLength

    :returns: a list of resulting peptide entries. Protein positions start
        with ``1`` and end with ``len(proteinSequence)``. ::

            [(peptide amino acid sequence,
              {'startPos': int, 'endPos': int, 'missedCleavage': int}
              ), ...
             ]

    .. note::
        This is a regex example for specifying N-terminal cleavage at lysine
        sites ``\\w(?=[K])``
    """
    digestionresults = list()

    def addPeptide(startPos, endPos, numMissed):
        """Append the peptide spanning ``proteinSequence[startPos:endPos]``
        to the results if its length passes the length filter."""
        if minLength <= endPos - startPos <= maxLength:
            info = {'startPos': startPos + 1, 'endPos': endPos,
                    'missedCleavage': numMissed
                    }
            digestionresults.append((proteinSequence[startPos:endPos], info))

    cleavagePositions = set(
        match.end() for match in re.finditer(cleavageRule, proteinSequence)
    )
    #Add end of protein as cleavage site if protein doesn't end with a
    #specified cleavage position
    cleavagePositions.add(len(proteinSequence))
    cleavagePosList = sorted(cleavagePositions)
    numCleavageSites = len(cleavagePosList)

    #A peptide cannot span more cleavage sites than are present
    maxMissed = min(missedCleavage, numCleavageSites - 1)

    #Generate protein n-terminal peptides after methionine removal; slicing
    #instead of indexing keeps an empty protein sequence from raising
    if removeNtermM and proteinSequence[:1] == 'M':
        for numMissed in range(maxMissed + 1):
            addPeptide(1, cleavagePosList[numMissed], numMissed)

    #Generate protein n-terminal peptides
    if cleavagePosList[0] != 0:
        for numMissed in range(maxMissed + 1):
            addPeptide(0, cleavagePosList[numMissed], numMissed)

    #Generate all remaining peptides, including the c-terminal peptides
    for siteIndex in range(numCleavageSites):
        for numMissed in range(maxMissed + 1):
            nextSiteIndex = siteIndex + numMissed + 1
            if nextSiteIndex < numCleavageSites:
                addPeptide(cleavagePosList[siteIndex],
                           cleavagePosList[nextSiteIndex], numMissed
                           )

    return digestionresults


# --- Functions to work with peptide sequences --- #
def calcPeptideMass(peptide, **kwargs):
    """Calculate the mass of a peptide.

    :param aaMass: A dictionary with the monoisotopic masses of amino acid
        residues, by default :attr:`maspy.constants.aaMass`
    :param aaModMass: A dictionary with the monoisotopic mass changes of
        modifications, by default :attr:`maspy.constants.aaModMass`
    :param elementMass: A dictionary with the masses of chemical elements, by
        default ``pyteomics.mass.nist_mass``
    :param peptide: peptide sequence, modifications have to be written in the
        format "[modificationId]" and "modificationId" has to be present in
        ``aaModMass``

    :returns: sum of the residue masses, the mass of two hydrogens and one
        oxygen (taken from ``elementMass``) and the mass changes of all
        modifications
    :raises ValueError: if the peptide still contains a "[" after all
        modifications known to ``aaModMass`` have been removed
    :raises KeyError: if a remaining character is not a key of ``aaMass``

    #TODO: change to a more efficient way of calculating the modified mass, by
    first extracting all present modifications and then looking up their masses.
    """
    # Defaults are only looked up when the caller did not supply a table.
    if 'aaMass' in kwargs:
        aaMass = kwargs['aaMass']
    else:
        aaMass = maspy.constants.aaMass
    if 'aaModMass' in kwargs:
        aaModMass = kwargs['aaModMass']
    else:
        aaModMass = maspy.constants.aaModMass
    if 'elementMass' in kwargs:
        elementMass = kwargs['elementMass']
    else:
        elementMass = pyteomics.mass.nist_mass

    addModMass = 0.0
    unmodPeptide = peptide
    for modId, modMass in aaModMass.items():
        modSymbol = '[' + modId + ']'
        numMod = peptide.count(modSymbol)
        if numMod > 0:
            unmodPeptide = unmodPeptide.replace(modSymbol, '')
            addModMass += modMass * numMod

    # Any bracket left over belongs to a modification without a known mass.
    # The offending sequence is reported in the exception itself instead of
    # being printed to stdout.
    if '[' in unmodPeptide:
        raise ValueError('The peptide contains modification, '
                         'not present in maspy.constants.aaModMass: '
                         '{0}'.format(unmodPeptide)
                         )

    unmodPeptideMass = sum(aaMass[i] for i in unmodPeptide)
    # Add the mass of H2O (two hydrogens and one oxygen) to the residue sum.
    unmodPeptideMass += elementMass['H'][0][0]*2 + elementMass['O'][0][0]
    modPeptideMass = unmodPeptideMass + addModMass
    return modPeptideMass


def removeModifications(peptide):
    """Removes all modifications from a peptide string and return the plain
    amino acid sequence.

    :param peptide: peptide sequence, modifications have to be written in the
        format "[modificationName]"
    :type peptide: str

    :returns: amino acid sequence of ``peptide`` without any modifications
    :raises ValueError: if an opening bracket has no matching closing bracket
    """
    # A modification spans from an opening bracket to the next closing one.
    # Removing all of them in a single pass always terminates, also when a
    # stray "]" occurs in front of a modification.
    plainSequence = re.sub(r'\[[^\]]*\]', '', peptide)
    if '[' in plainSequence:
        raise ValueError('The peptide contains an unclosed modification '
                         'bracket: {0}'.format(peptide)
                         )
    return plainSequence


def returnModPositions(peptide, indexStart=1, removeModString='UNIMOD:'):
    """Determines the amino acid positions of all present modifications.

    :param peptide: peptide sequence, modifications have to be written in the
        format "[modificationName]"
    :param indexStart: returned amino acids positions of the peptide start with
        this number (first amino acid position = indexStart)
    :param removeModString: string to remove from the returned modification
        name; all occurrences are removed. Use an empty string or ``None`` to
        keep the names unchanged.

    :return: {modificationName:[position1, position2, ...], ...}
    :raises ValueError: if a modification bracket is not closed or contains
        another opening bracket

    #TODO: adapt removeModString to the new unimod ids in
    #maspy.constants.aaModComp ("UNIMOD:X" -> "u:X") -> also change unit tests.
    """
    modPositionDict = dict()
    bracketStart = peptide.find('[')
    while bracketStart != -1:
        bracketEnd = peptide.find(']', bracketStart)
        currModification = peptide[bracketStart+1:bracketEnd]
        # Malformed brackets can never be stripped from the sequence, so fail
        # explicitly instead of looping forever.
        if bracketEnd == -1 or '[' in currModification:
            raise ValueError('The peptide contains a malformed modification '
                             'bracket: {0}'.format(peptide)
                             )

        # The modification belongs to the amino acid in front of the bracket;
        # n-terminal modifications are moved to the first position.
        currPosition = max(bracketStart - 1, 0) + indexStart

        # Strip the processed modification so that the following bracket
        # positions refer to the plain amino acid sequence.
        peptide = peptide[:bracketStart] + peptide[bracketEnd+1:]

        if removeModString:
            currModification = currModification.replace(removeModString, '')
        modPositionDict.setdefault(currModification, list()).append(
            currPosition
        )
        bracketStart = peptide.find('[')
    return modPositionDict


# --- Functions to transform mass to mz values --- #
def calcMhFromMz(mz, charge):
    """Convert an m/z value of a given charge state into its MH+ value.

    :param mz: float, mass to charge ratio (Dalton / charge)
    :param charge: int, charge state

    :returns: mass to charge ratio of the mono protonated ion (charge = 1)
    """
    protonMass = maspy.constants.atomicMassProton
    # Scale m/z up to the total ion mass, then take away every proton except
    # one.
    surplusProtons = charge - 1
    return (mz * charge) - (protonMass * surplusProtons)


def calcMzFromMh(mh, charge):
    """Convert an MH+ value into the m/z value of a given charge state.

    :param mh: float, mass to charge ratio (Dalton / charge) of the mono
        protonated ion
    :param charge: int, charge state

    :returns: mass to charge ratio of the specified charge state
    """
    protonMass = maspy.constants.atomicMassProton
    # MH+ already carries one proton, so only the additional ones are added
    # before dividing by the charge.
    additionalProtons = charge - 1
    return (mh + (protonMass * additionalProtons)) / charge


def calcMzFromMass(mass, charge):
    """Convert the mass of a peptide into the m/z value of a given charge
    state.

    :param mass: float, exact non protonated mass
    :param charge: int, charge state

    :returns: mass to charge ratio of the specified charge state
    """
    protonMass = maspy.constants.atomicMassProton
    # One proton is added per charge before dividing by the charge.
    return (mass + (protonMass * charge)) / charge


def calcMassFromMz(mz, charge):
    """Convert the m/z value of a given charge state into the peptide mass.

    :param mz: float, mass to charge ratio (Dalton / charge)
    :param charge: int, charge state

    :returns: non protonated mass (charge = 0)
    """
    protonMass = maspy.constants.atomicMassProton
    # Take one proton mass off the m/z value, then scale by the charge.
    return (mz - protonMass) * charge