Source code for maspy.proteindb

"""
The protein database module allows the import of protein sequences from fasta
files, the parsing of fasta entry headers, and an in silico digestion with
specified cleavage rules to generate peptides.
"""

#  Copyright 2015-2017 David M. Hollenstein, Jakob J. Hollenstein
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.

######################## Python 2 and 3 compatibility #########################
from __future__ import absolute_import, division, print_function
from __future__ import unicode_literals
from future.utils import viewitems, viewkeys, viewvalues, listitems, listvalues

try:
    #python 2.7
    from itertools import izip as zip
except ImportError:
    #python 3 series
    pass
################################################################################

import io
import itertools
import json
import re
import zipfile

import numpy
import pyteomics
import pyteomics.fasta

import maspy.auxiliary as aux
import maspy.errors
import maspy.peptidemethods


# --- Protein and peptide related classes --- #
class PeptideSequence(object):
    """Describes a peptide as derived by digestion of one or multiple proteins,
    can't contain any modified amino acids.

    :param sequence: amino acid sequence of the peptide
    :param missedCleavage: number of missed cleavages, depends on enzyme
        specificity
    :param proteins: protein ids that generate this peptide under certain
        digest conditions
    :param proteinPositions: start position and end position of a peptide in a
        protein sequence. One based index, ie the first protein position is
        "1". ``{proteinId: (startPosition, endPosition), ...}``
    """
    __slots__ = ['sequence', 'missedCleavage', 'isUnique', 'proteins',
                 'proteinPositions']

    def __init__(self, sequence, mc=None):
        self.sequence = sequence
        self.missedCleavage = mc
        self.isUnique = None
        self.proteins = set()
        self.proteinPositions = dict()

    def length(self):
        """Returns the number of amino acids of the polypeptide sequence."""
        return len(self.sequence)

    def mass(self):
        """Returns the mass of the polypeptide sequence in Dalton."""
        return maspy.peptidemethods.calcPeptideMass(self.sequence)

    def _reprJSON(self):
        """Returns a JSON serializable representation of a ``PeptideSequence``
        class instance. Use :func:`maspy.proteindb.PeptideSequence._fromJSON()`
        to generate a new ``PeptideSequence`` instance from the return value.

        :returns: a JSON serializable python object
        """
        return {'__PepSeq__': [self.sequence, self.missedCleavage,
                               self.isUnique, list(self.proteins),
                               self.proteinPositions]}

    @classmethod
    def _fromJSON(cls, jsonobject):
        """Generates a new instance of :class:`maspy.proteindb.PeptideSequence`
        from a decoded JSON object (as generated by
        :func:`maspy.proteindb.PeptideSequence._reprJSON()`).

        :param jsonobject: decoded JSON object

        :returns: a new instance of :class:`PeptideSequence`
        """
        newInstance = cls(jsonobject[0], jsonobject[1])
        newInstance.isUnique = jsonobject[2]
        newInstance.proteins = set(jsonobject[3])
        newInstance.proteinPositions = jsonobject[4]
        return newInstance

    @staticmethod
    def jsonHook(encoded):
        """Custom JSON decoder that allows construction of a new
        ``PeptideSequence`` instance from a decoded JSON object.

        :param encoded: a JSON decoded object literal (a dict)

        :returns: "encoded" or :class:`PeptideSequence`
        """
        if '__PepSeq__' in encoded:
            return PeptideSequence._fromJSON(encoded['__PepSeq__'])
        else:
            return encoded
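
# Illustrative sketch (not part of maspy): a JSON round trip for a
# ``PeptideSequence`` using ``_reprJSON()`` and ``jsonHook``. The peptide
# sequence and protein id below are made up for the example.
#
#     import json
#     pep = PeptideSequence('PEPTIDER', mc=0)
#     pep.proteins.add('proteinA')
#     pep.proteinPositions['proteinA'] = (1, 8)
#     serialized = json.dumps(pep._reprJSON())
#     restored = json.loads(serialized, object_hook=PeptideSequence.jsonHook)
#     assert restored.sequence == pep.sequence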

class ProteinSequence(object):
    """Describes a protein.

    :ivar id: identifier of the protein, for example a uniprot id
    :ivar name: name of the protein
    :ivar sequence: amino acid sequence of the protein
    :ivar fastaHeader: str(), the protein's fasta header line
    :ivar fastaInfo: dict(), the interpreted fasta header as generated when
        using a fasta header parsing function, see :func:`fastaParseSgd()`
    :ivar isUnique: bool, True if at least one unique peptide can be assigned
        to the protein
    :ivar uniquePeptides: a set of peptides which can be unambiguously assigned
        to this protein
    :ivar sharedPeptides: a set of peptides which are shared between different
        proteins
    :ivar coverageUnique: the number of amino acids in the protein sequence
        that are covered by unique peptides
    :ivar coverageShared: the number of amino acids in the protein sequence
        that are covered by unique or shared peptides
    """
    def __init__(self, identifier, sequence, name=str()):
        self.id = identifier
        self.name = name
        self.sequence = sequence
        self.isUnique = None
        self.uniquePeptides = set()
        self.sharedPeptides = set()

    def mass(self):
        """Returns the mass of the polypeptide sequence in Dalton."""
        return maspy.peptidemethods.calcPeptideMass(self.sequence)

    def length(self):
        """Returns the number of amino acids of the polypeptide sequence."""
        return len(self.sequence)

    def _reprJSON(self):
        """Returns a JSON serializable representation of a ``ProteinSequence``
        class instance. Use :func:`maspy.proteindb.ProteinSequence._fromJSON()`
        to generate a new ``ProteinSequence`` instance from the return value.

        :returns: a JSON serializable python object
        """
        #Work on a copy so the attributes of the live instance are not changed
        jsonDict = dict(self.__dict__)
        jsonDict['uniquePeptides'] = list(jsonDict['uniquePeptides'])
        jsonDict['sharedPeptides'] = list(jsonDict['sharedPeptides'])
        return {'__ProtSeq__': jsonDict}

    @classmethod
    def _fromJSON(cls, jsonobject):
        """Generates a new instance of :class:`maspy.proteindb.ProteinSequence`
        from a decoded JSON object (as generated by
        :func:`maspy.proteindb.ProteinSequence._reprJSON()`).

        :param jsonobject: decoded JSON object

        :returns: a new instance of :class:`ProteinSequence`
        """
        newInstance = cls(None, None)
        newInstance.__dict__.update(jsonobject)
        newInstance.uniquePeptides = set(newInstance.uniquePeptides)
        newInstance.sharedPeptides = set(newInstance.sharedPeptides)
        return newInstance

    @staticmethod
    def jsonHook(encoded):
        """Custom JSON decoder that allows construction of a new
        ``ProteinSequence`` instance from a decoded JSON object.

        :param encoded: a JSON decoded object literal (a dict)

        :returns: "encoded" or :class:`ProteinSequence`
        """
        if '__ProtSeq__' in encoded:
            return ProteinSequence._fromJSON(encoded['__ProtSeq__'])
        else:
            return encoded

class ProteinDatabase(object):
    """Describes proteins and peptides generated by an in silico digestion of
    proteins.

    :ivar peptides: {sequence: PeptideSequence(), ...} contains elements of
        :class:`PeptideSequence` derived by an in silico digest of the proteins
    :ivar proteins: {proteinId: ProteinSequence(), ...}, used to access
        :class:`ProteinSequence` elements by their id
    :ivar proteinNames: {proteinName: ProteinSequence(), ...}, alternative way
        to access :class:`ProteinSequence` elements by their names. Must be
        populated manually
    :ivar info: a dictionary containing information about the protein database
        and the parameters specified for the in silico digestion of the protein
        entries. ::

            {'name': str, 'mc': int, 'cleavageRule': str, 'minLength': int,
             'maxLength': int, 'ignoreIsoleucine': bool, 'removeNtermM': bool
             }

        **name**: a descriptive name of the protein database, used as the file
        name when saving the protein database to the hard disk

        **mc**: number of allowed missed cleavage sites

        **cleavageRule**: cleavage rule expressed as a regular expression

        **minLength**: minimal peptide length

        **maxLength**: maximal peptide length

        **ignoreIsoleucine**: if True, isoleucine and leucine in peptide
        sequences are treated as indistinguishable

        **removeNtermM**: if True, peptides with the N-terminal methionine of
        the protein removed are also considered
    """
    def __init__(self):
        self.peptides = dict()
        self.proteins = dict()
        self.proteinNames = dict()
        self.info = {'name': '', 'mc': 0, 'cleavageRule': '', 'minLength': 0,
                     'maxLength': 0, 'ignoreIsoleucine': False,
                     'removeNtermM': False}

    def __getitem__(self, key):
        """Uses key to return protein entries.

        :param key: either a protein id or a protein name

        :returns: :class:`ProteinSequence`
        """
        if key in self.proteins:
            return self.proteins[key]
        elif key in self.proteinNames:
            return self.proteinNames[key]
        else:
            raise KeyError(key)

    def save(self, path, compress=True):
        """Writes the ``.proteins`` and ``.peptides`` entries to the hard disk
        as a ``proteindb`` file.

        .. note::
            If ``.save()`` is called and no ``proteindb`` file is present in
            the specified path a new file is generated, otherwise the old file
            is replaced.

        :param path: filedirectory to which the ``proteindb`` file is written.
            The output file name is specified by ``self.info['name']``
        :param compress: bool, True to use zip file compression
        """
        with aux.PartiallySafeReplace() as msr:
            filename = self.info['name'] + '.proteindb'
            filepath = aux.joinpath(path, filename)
            with msr.open(filepath, mode='w+b') as openfile:
                self._writeContainer(openfile, compress=compress)

    def _writeContainer(self, filelike, compress=True):
        """Writes the ``.proteins`` and ``.peptides`` entries to the
        ``proteindb`` format. In addition it also dumps the ``self.info`` entry
        to the zipfile with the filename ``info``. For details see
        :func:`maspy.auxiliary.writeJsonZipfile()`

        :param filelike: path to a file (str) or a file-like object
        :param compress: bool, True to use zip file compression
        """
        aux.writeJsonZipfile(filelike, self.proteins, compress, 'w', 'proteins')
        aux.writeJsonZipfile(filelike, self.peptides, compress, 'a', 'peptides')
        zipcomp = zipfile.ZIP_DEFLATED if compress else zipfile.ZIP_STORED
        with zipfile.ZipFile(filelike, 'a', allowZip64=True) as containerFile:
            infodata = {key: value for key, value in viewitems(self.info)
                        if key != 'path'
                        }
            #The compression type is passed to writestr(), not to json.dumps()
            containerFile.writestr('info', json.dumps(infodata), zipcomp)

    @classmethod
    def load(cls, path, name):
        """Imports the specified ``proteindb`` file from the hard disk.

        :param path: filedirectory of the ``proteindb`` file
        :param name: filename without the file extension ".proteindb"

        .. note::
            this generates rather large files, which actually take longer to
            import than to newly generate. Maybe saving / loading should be
            limited to the protein database without in silico digestion
            information.
        """
        filepath = aux.joinpath(path, name + '.proteindb')
        with zipfile.ZipFile(filepath, 'r', allowZip64=True) as containerZip:
            #Convert the zipfile data into a str object, necessary since
            #containerZip.read() returns a bytes object.
            proteinsString = io.TextIOWrapper(containerZip.open('proteins'),
                                              encoding='utf-8'
                                              ).read()
            peptidesString = io.TextIOWrapper(containerZip.open('peptides'),
                                              encoding='utf-8'
                                              ).read()
            infoString = io.TextIOWrapper(containerZip.open('info'),
                                          encoding='utf-8'
                                          ).read()
        newInstance = cls()
        newInstance.proteins = json.loads(proteinsString,
                                          object_hook=ProteinSequence.jsonHook)
        newInstance.peptides = json.loads(peptidesString,
                                          object_hook=PeptideSequence.jsonHook)
        newInstance.info.update(json.loads(infoString))
        return newInstance

    def calculateCoverage(self):
        """Calculate the sequence coverage masks for all protein entries.

        For a detailed description see :func:`_calculateCoverageMasks()`
        """
        self._calculateCoverageMasks(self.proteins, self.peptides)

    @staticmethod
    def _calculateCoverageMasks(proteindb, peptidedb):
        """Calculate the sequence coverage masks for all proteindb elements.
        Private method used by :class:`ProteinDatabase`.

        A coverage mask is a numpy boolean array with the length of the protein
        sequence. Each protein position that has been covered in at least one
        peptide is set to True. Coverage masks are calculated for unique and
        for shared peptides. Peptides are matched to proteins according to
        positions derived by the digestion of the FASTA file.

        Alternatively peptides could also be matched to proteins just by
        sequence as it is done in :func:`pyteomics.parser.coverage`, but this
        is not the case here.

        :param proteindb: a dictionary containing :class:`ProteinSequence`
            entries, for example ``ProteinDatabase.proteins``
        :param peptidedb: a dictionary containing :class:`PeptideSequence`
            entries, for example ``ProteinDatabase.peptides``

        Sets two attributes for each ``ProteinSequence`` entry:
            ``.coverageMaskUnique`` = coverage mask of unique peptides
            ``.coverageMaskShared`` = coverage mask of shared peptides
        """
        for proteinId, proteinEntry in viewitems(proteindb):
            coverageMaskUnique = numpy.zeros(proteinEntry.length(),
                                             dtype='bool')
            for peptide in proteinEntry.uniquePeptides:
                startPos, endPos = peptidedb[peptide].proteinPositions[proteinId]
                coverageMaskUnique[startPos-1:endPos] = True
            coverageMaskShared = numpy.zeros(proteinEntry.length(),
                                             dtype='bool')
            for peptide in proteinEntry.sharedPeptides:
                startPos, endPos = peptidedb[peptide].proteinPositions[proteinId]
                coverageMaskShared[startPos-1:endPos] = True
            setattr(proteinEntry, 'coverageMaskUnique', coverageMaskUnique)
            setattr(proteinEntry, 'coverageMaskShared', coverageMaskShared)
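
# Illustrative sketch (not part of maspy): deriving percent sequence coverage
# from the boolean masks set by "calculateCoverage()". "db" stands for any
# populated ProteinDatabase instance.
#
#     db.calculateCoverage()
#     for proteinId, protein in db.proteins.items():
#         uniqueCoverage = 100.0 * protein.coverageMaskUnique.mean()
#         totalCoverage = 100.0 * (protein.coverageMaskUnique |
#                                  protein.coverageMaskShared).mean()
#         print(proteinId, round(uniqueCoverage, 1), round(totalCoverage, 1))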
# --- import for ProteinDatabase class --- #
def importProteinDatabase(filePath, proteindb=None, decoyTag='[decoy]',
                          contaminationTag='[cont]', headerParser=None,
                          forceId=False, cleavageRule='[KR]', minLength=5,
                          maxLength=40, missedCleavage=2,
                          ignoreIsoleucine=False, removeNtermM=True):
    """Generates a :class:`ProteinDatabase` by in silico digestion of proteins
    from a fasta file.

    :param filePath: File path
    :param proteindb: optional, an existing :class:`ProteinDatabase` can be
        specified, otherwise a new instance is generated and returned
    :param decoyTag: If a fasta file contains decoy protein entries, they
        should be specified with a sequence tag
    :param contaminationTag: If a fasta file contains contamination protein
        entries, they should be specified with a sequence tag
    :param headerParser: optional, a headerParser can be specified
        #TODO: describe how a parser looks like
    :param forceId: bool, if True and no id can be extracted from the fasta
        header the whole header sequence is used as a protein id instead of
        raising an exception.
    :param cleavageRule: cleavage rule expressed in a regular expression, see
        :attr:`maspy.constants.expasy_rules`
    :param missedCleavage: number of allowed missed cleavage sites
    :param removeNtermM: bool, True to consider also peptides with the
        N-terminal Methionine of the protein removed
    :param minLength: int, only yield peptides with length >= minLength
    :param maxLength: int, only yield peptides with length <= maxLength
    :param ignoreIsoleucine: bool, if True treat isoleucine and leucine in
        peptide sequences as indistinguishable

    See also :func:`maspy.peptidemethods.digestInSilico`
    """
    proteindb = ProteinDatabase() if proteindb is None else proteindb
    fastaRead = _readFastaFile(filePath)

    for header, sequence in fastaRead:
        proteinTags = list()
        if header.startswith(decoyTag):
            isDecoy = True
            header = header.replace(decoyTag, '')
            proteinTags.append(decoyTag)
        else:
            isDecoy = False

        if header.startswith(contaminationTag):
            isCont = True
            header = header.replace(contaminationTag, '')
            proteinTags.append(contaminationTag)
        else:
            isCont = False

        headerInfo = _extractFastaHeader(header, headerParser, forceId)
        proteinId = ''.join(itertools.chain(proteinTags, [headerInfo['id']]))
        if 'name' in headerInfo:
            proteinName = ''.join(itertools.chain(proteinTags,
                                                  [headerInfo['name']]
                                                  )
                                  )
        else:
            proteinName = proteinId

        if proteinId not in proteindb.proteins:
            protein = ProteinSequence(proteinId, sequence)
            protein.name = proteinName
            protein.fastaHeader = header
            protein.fastaInfo = headerInfo
            proteindb.proteins[protein.id] = protein

        #Perform the in silico digestion
        _digestion = maspy.peptidemethods.digestInSilico(sequence, cleavageRule,
                                                         missedCleavage,
                                                         removeNtermM,
                                                         minLength, maxLength
                                                         )

        #Add peptides to the protein database
        for unmodPeptide, info in _digestion:
            if ignoreIsoleucine:
                unmodPeptideNoIsoleucine = unmodPeptide.replace('I', 'L')
                if unmodPeptideNoIsoleucine in proteindb.peptides:
                    currPeptide = proteindb.peptides[unmodPeptideNoIsoleucine]
                else:
                    currPeptide = PeptideSequence(unmodPeptideNoIsoleucine,
                                                  mc=info['missedCleavage']
                                                  )
                    proteindb.peptides[unmodPeptideNoIsoleucine] = currPeptide

                if unmodPeptide not in proteindb.peptides:
                    proteindb.peptides[unmodPeptide] = currPeptide
            else:
                if unmodPeptide in proteindb.peptides:
                    currPeptide = proteindb.peptides[unmodPeptide]
                else:
                    currPeptide = PeptideSequence(unmodPeptide,
                                                  mc=info['missedCleavage']
                                                  )
                    proteindb.peptides[unmodPeptide] = currPeptide

            if proteinId not in currPeptide.proteins:
                currPeptide.proteins.add(proteinId)
                #TODO: change that a peptide can appear multiple times in a
                #   protein sequence.
                currPeptide.proteinPositions[proteinId] = (info['startPos'],
                                                           info['endPos']
                                                           )

    #Add peptide entries to the protein entries, define whether a peptide can
    #be uniquely assigned to a single protein (.isUnique = True).
    for peptide, peptideEntry in viewitems(proteindb.peptides):
        numProteinMatches = len(peptideEntry.proteins)
        if numProteinMatches == 1:
            peptideEntry.isUnique = True
        elif numProteinMatches > 1:
            peptideEntry.isUnique = False
        else:
            raise Exception('No protein matches in proteindb for peptide ' +
                            'sequence: ' + peptide)

        for proteinId in peptideEntry.proteins:
            if peptideEntry.isUnique:
                proteindb.proteins[proteinId].uniquePeptides.add(peptide)
            else:
                proteindb.proteins[proteinId].sharedPeptides.add(peptide)

    #Check protein entries if the digestion generated at least one peptide that
    #is uniquely assigned to the protein (.isUnique = True)
    for proteinEntry in viewvalues(proteindb.proteins):
        if len(proteinEntry.uniquePeptides) > 0:
            proteinEntry.isUnique = True
        else:
            proteinEntry.isUnique = False
    #Note: TODO, although isoleucine is ignored, the protein entry should only
    #   show the actually present ILE / LEU occurrence, not any possibilities

    return proteindb
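
# Illustrative sketch (not part of maspy): typical use of
# "importProteinDatabase()" followed by saving and reloading the database.
# The fasta file path, output directory and database name are hypothetical.
#
#     db = importProteinDatabase('/path/to/proteome.fasta',
#                                cleavageRule='[KR]', missedCleavage=2,
#                                minLength=7, maxLength=40)
#     db.info['name'] = 'proteome_digest'
#     db.save('/path/to/output')
#     db = ProteinDatabase.load('/path/to/output', 'proteome_digest')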

def _readFastaFile(filepath):
    """Reads a FASTA file and yields tuples of 'header' and 'sequence' entries.

    :param filepath: file path of the FASTA file

    :yields: FASTA entries in the format ('header', 'sequence'). The 'header'
        string does not contain the '>' and trailing white spaces. The
        'sequence' string does not contain trailing white spaces, a '*' at the
        end of the sequence is removed.

    See also :func:`importProteinDatabase` and
    :func:`maspy.peptidemethods.digestInSilico`.
    """
    processSequences = lambda i: ''.join([s.rstrip() for s in i]).rstrip('*')
    processHeaderLine = lambda line: line[1:].rstrip()
    with io.open(filepath) as openfile:
        #Iterate through lines until the first header is encountered
        try:
            line = next(openfile)
            while line[0] != '>':
                line = next(openfile)
            header = processHeaderLine(line)
            sequences = list()
        except StopIteration:
            errorText = 'File does not contain fasta entries.'
            raise maspy.errors.FileFormatError(errorText)

        for line in openfile:
            if line[0] == '>':
                yield header, processSequences(sequences)
                header = processHeaderLine(line)
                sequences = list()
            else:
                sequences.append(line)

        #Yield the last entry
        if sequences:
            yield header, processSequences(sequences)


def _extractFastaHeader(fastaHeader, parser=None, forceId=False):
    """Parses a fasta header and returns extracted information in a dictionary.

    Unless a custom parser is specified, a ``Pyteomics`` function is used,
    which provides parsers for the formats of UniProtKB, UniRef, UniParc and
    UniMES (UniProt Metagenomic and Environmental Sequences), described at
    `www.uniprot.org <http://www.uniprot.org/help/fasta-headers>`_.

    :param fastaHeader: str, protein entry header from a fasta file
    :param parser: is a function that takes a fastaHeader string and returns a
        dictionary, containing at least the key "id". If None the parser
        function from pyteomics ``pyteomics.fasta.parse()`` is used.
    :param forceId: bool, if True and no id can be extracted from the fasta
        header the whole header sequence is used as a protein id instead of
        raising an exception.

    :returns: dict, describing a fasta header
    """
    if parser is None:
        try:
            headerInfo = pyteomics.fasta.parse(fastaHeader)
        except pyteomics.auxiliary.PyteomicsError as pyteomicsError:
            #If forceId is True, the whole header is used as the id
            if forceId:
                headerInfo = {'id': fastaHeader}
            else:
                raise pyteomicsError
    else:
        headerInfo = parser(fastaHeader)
    return headerInfo

def fastaParseSgd(header):
    """Custom parser for fasta headers in the SGD format, see
    www.yeastgenome.org.

    :param header: str, protein entry header from a fasta file

    :returns: dict, parsed header
    """
    rePattern = r'([\S]+)\s([\S]+).+(\".+\")'
    ID, name, description = re.match(rePattern, header).groups()
    info = {'id': ID, 'name': name, 'description': description}
    return info
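
# Illustrative sketch (not part of maspy): parsing an SGD style header with
# "fastaParseSgd()" and passing it as the "headerParser" of
# "importProteinDatabase()". The header string and file path are made up.
#
#     sgdHeader = 'YGR192C TDH3 SGDID:S000003424 "glyceraldehyde dehydrogenase"'
#     info = fastaParseSgd(sgdHeader)
#     # info['id'] == 'YGR192C', info['name'] == 'TDH3'
#     db = importProteinDatabase('/path/to/sgd_proteome.fasta',
#                                headerParser=fastaParseSgd)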