Source code for maspy.reader

"""
This module provides functions to import various data types as maspy objects,
which are associated with analysis workflows of mass spectrometry data. This
currently comprises the mzML format, results of the percolator software and to
some extent mzIdentML files, and file formats representing peptide LC-MS feature
".featureXML" and ".features.tsv".
"""

#  Copyright 2015-2017 David M. Hollenstein, Jakob J. Hollenstein
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.

######################## Python 2 and 3 compatibility #########################
from __future__ import absolute_import, division, print_function
from __future__ import unicode_literals
from future.utils import viewitems, viewkeys, viewvalues, listitems, listvalues

try:
    #python 2.7
    from itertools import izip as zip
except ImportError:
    #python 3 series
    pass
################################################################################

from collections import defaultdict as ddict
import io
from lxml import etree as ETREE
from operator import itemgetter as ITEMGETTER
import os
import warnings

import numpy
import pyteomics.mzid
import pyteomics.mass

import maspy.auxiliary as aux
import maspy.core
import maspy.peptidemethods
import maspy.xml


#####################################################################
### mzml import methods                       #######################
#####################################################################
[docs]def importMzml(filepath, msrunContainer=None, siAttrFromSmi=None, specfilename=None):
    """Performs a complete import of a mzml file into a maspy MsrunContainer.

    :paramsiAttrFromSmi: allow here to specify a custom function that extracts params a from spectrumMetadataItem
    :param specfilename: by default the filename will be used as the specfilename in the MsrunContainer and all
        mzML item instances, specify here an alternative specfilename to override the default one
    """
    #TODO: docstring
    siAttrFromSmi = defaultFetchSiAttrFromSmi if siAttrFromSmi is None else siAttrFromSmi
    if msrunContainer is None:
        msrunContainer = maspy.core.MsrunContainer()

    basename = os.path.basename(filepath)
    dirname = os.path.dirname(filepath)
    filename, extension = os.path.splitext(basename)
    specfilename = filename if specfilename is None else specfilename

    #Check if the specified file is valid for an import
    if not os.path.isfile(filepath):
        raise IOError('File does not exist: %s' % filepath)
    elif extension.lower() != '.mzml':
        raise IOError('Filetype is not "mzml": %s' % filepath)
    elif specfilename in msrunContainer.info:
        print(specfilename, 'already present in the msrunContainer, aborting import.')
        return None

    mzmlReader = maspy.xml.MzmlReader(filepath)
    masterContainer = {'rm': str(), 'ci': {}, 'si': {}, 'sai': {}, 'smi': {}}
    #Dictionary recording which MS2 scans follow a MS1 scan
    ms1Record = ddict(list)

    for xmlSpectrum in mzmlReader.parseSpectra():
        smi, binaryDataArrayList = smiFromXmlSpectrum(xmlSpectrum, specfilename)
        #Generate SpectrumItem
        si = maspy.core.Si(smi.id, smi.specfile)
        si.isValid = True
        siAttrFromSmi(smi, si)
        if si.msLevel > 1:
            si.precursorId = si.precursorId.split('scan=')[1] #TODO: change to use regex to extract from known vendor format
            ms1Record[si.precursorId].append(si.id)
        else:
            ms1Record[si.id] #Touch the ddict to add the MS1 id, if it is not already present
        #Generate SpectrumArrayItem
        sai = maspy.core.Sai(smi.id, smi.specfile)
        sai.arrays, sai.arrayInfo = maspy.xml.extractBinaries(binaryDataArrayList,
                                                              smi.attributes['defaultArrayLength'])
        #Store all items in the appropriate containers
        masterContainer['smi'][smi.id] = smi
        masterContainer['si'][smi.id] = si
        masterContainer['sai'][smi.id] = sai

    for siId, msnIdList in viewitems(ms1Record):
        #Ignore KeyError if the spectrum is not present in the mzML file for whatever reason
        try:
            setattr(masterContainer['si'][siId], 'msnIdList', msnIdList)
        except KeyError:
            pass

    for xmlChromatogram in mzmlReader.chromatogramList:
        ci = ciFromXml(xmlChromatogram, specfilename)
        masterContainer['ci'][ci.id] = ci
    masterContainer['rm'] = mzmlReader.metadataNode

    msrunContainer._addSpecfile(specfilename, dirname)
    msrunContainer.rmc[specfilename] = masterContainer['rm']
    msrunContainer.info[specfilename]['status']['rm'] = True
    msrunContainer.smic[specfilename] = masterContainer['smi']
    msrunContainer.info[specfilename]['status']['smi'] = True
    msrunContainer.sic[specfilename] = masterContainer['si']
    msrunContainer.info[specfilename]['status']['si'] = True
    msrunContainer.saic[specfilename] = masterContainer['sai']
    msrunContainer.info[specfilename]['status']['sai'] = True
    msrunContainer.cic[specfilename] = masterContainer['ci']
    msrunContainer.info[specfilename]['status']['ci'] = True

    return msrunContainer


# --- generate python objects from mzML xml elements --- #
[docs]def ciFromXml(xmlelement, specfile):
    #TODO: docstring
    ciId = xmlelement.attrib['id']
    ci = maspy.core.Ci(ciId, specfile)
    for key in ['dataProcessingRef']:
        if key in xmlelement.attrib:
            setattr(ci, key, xmlelement.attrib[key])
    ci.params, children = maspy.xml.extractParams(xmlelement)
    arrayLength = xmlelement.attrib['defaultArrayLength']

    for child in children:
        childElements, childParams = maspy.xml.sublistReader(child)
        childTag = maspy.xml.clearTag(child.tag)
        if childTag == 'precursor':
            #TODO: THIS HAS TO BE ADAPTED, REPLACE "**element", ADD A LOOP
            newElement = MzmlPrecursor(**element)
        elif childTag == 'product':
            #TODO: THIS HAS TO BE ADAPTED
            newElement = MzmlProduct(**element)
        elif childTag == 'binaryDataArrayList':
            for childElement in childElements:
                dataType, dataTypeParam = maspy.xml.findBinaryDataType(childElement['params'])
                ci.arrayInfo[dataType] = {'dataProcessingRef': None, 'params': childElement['params']}
                if 'dataProcessingRef' in childElement:
                    ci.arrayInfo[dataType]['dataProcessingRef'] = childElement['dataProcessingRef']
            ci.arrays, ci.arrayInfo = maspy.xml.extractBinaries(childElements, arrayLength)
    return ci


[docs]def smiFromXmlSpectrum(xmlelement, specfile):
    #TODO: docstring
    scanId = xmlelement.attrib['id'].split('scan=')[1] #TODO: change to use regex to extract from known vendor format
    smi = maspy.core.Smi(scanId, specfile)
    smi.params, spectrumChildren = maspy.xml.extractParams(xmlelement)
    smi.attributes.update(xmlelement.attrib)
    binaryDataArrayList = list()

    for spectrumChild in spectrumChildren:
        spectrumChildTag = maspy.xml.clearTag(spectrumChild.tag)
        elements, params = maspy.xml.sublistReader(spectrumChild)
        if params:
            #Define scanListParams here
            setattr(smi, spectrumChildTag+'Params', params)
        for element in elements:
            if spectrumChildTag == 'scanList':
                newElement = maspy.core.MzmlScan(**element)
            elif spectrumChildTag == 'precursorList':
                newElement = maspy.core.MzmlPrecursor(**element)
            elif spectrumChildTag == 'productList':
                newElement = maspy.core.MzmlProduct(**element)
            elif spectrumChildTag == 'binaryDataArrayList':
                binaryDataArrayList.append(element)
                continue
            else:
                raise Exception('unexpected spectrum xml child tag!')
            getattr(smi, spectrumChildTag).append(newElement)
    return smi, binaryDataArrayList


# --- extract spectrum attributes from the mzml like structure of a Smi object --- #
[docs]def fetchSpectrumInfo(smi):
    #TODO: docstring
    attributes = dict()
    for key, accession, dtype in [('msLevel', 'MS:1000511', int),
                                  ('tic', 'MS:1000285', float),
                                  ('basepeakMz', 'MS:1000504', float),
                                  ('basepeakI', 'MS:1000505', float)
                                  ]:
        param = maspy.xml.findParam(smi.params, accession)
        if param is not None:
            attributes[key] = dtype(param[1])
        else:
            #Or don't add key to attributes?
            attributes[key] = None
    return attributes


[docs]def fetchScanInfo(smi):
    #TODO: docstring
    if smi.scanList:
        attributes = dict()
        for key, accession, dtype in [('rt', 'MS:1000016', float),
                                      ('iit', 'MS:1000927', float),
                                      ]:
            param = maspy.xml.findParam(smi.scanList[0]['params'], accession)
            if param is not None:
                attributes[key] = dtype(param[1])
                #TODO: raise a warning if time is neither in minutes nor in seconds
                if key == 'rt' and param[2] == 'UO:0000031':
                    #convert retention time from minutes to seconds
                    attributes['rt'] *= 60
            else:
                attributes[key] = None
    else:
        attributes = None
    return attributes


[docs]def fetchParentIon(smi):
    #TODO: docstring
    if smi.precursorList:
        attributes = dict()
        attributes['precursorId'] = smi.precursorList[0].spectrumRef
        selectedIon = smi.precursorList[0]['selectedIonList'][0]
        for key, accession, dtype in [('obsMz', 'MS:1000744', float),
                                      ('i', 'MS:1000042', float),
                                      ('charge', 'MS:1000041', int)
                                      ]:
            param = maspy.xml.findParam(selectedIon, accession)
            if param is not None:
                attributes[key] = dtype(param[1])
            else:
                attributes[key] = None
    else:
        attributes = None
    return attributes


[docs]def defaultFetchSiAttrFromSmi(smi, si):
    """Default method to extract attributes from a spectrum metadata item (sai) and
    adding them to a spectrum item (si)."""
    for key, value in viewitems(fetchSpectrumInfo(smi)):
        setattr(si, key, value)
    for key, value in viewitems(fetchScanInfo(smi)):
        setattr(si, key, value)
    if si.msLevel > 1:
        for key, value in viewitems(fetchParentIon(smi)):
            setattr(si, key, value)


[docs]def convertMzml(mzmlPath, outputDirectory=None):
    """Imports an mzml file and converts it to a MsrunContainer file

    :param mzmlPath: path of the mzml file
    :param outputDirectory: directory where the MsrunContainer file should be written
    if it is not specified, the output directory is set to the mzml files directory.
    """
    outputDirectory = outputDirectory if outputDirectory is not None else os.path.dirname(mzmlPath)
    msrunContainer = importMzml(mzmlPath)
    msrunContainer.setPath(outputDirectory)
    msrunContainer.save()


#####################################################################
### import for SiiContainer class             #######################
#####################################################################
[docs]def prepareSiiImport(siiContainer, specfile, path, qcAttr, qcLargerBetter,
                     qcCutoff, rankAttr, rankLargerBetter):
    """Prepares the ``siiContainer`` for the import of peptide spectrum matching
    results. Adds entries to ``siiContainer.container`` and to
    ``siiContainer.info``.

    :param siiContainer: instance of :class:`maspy.core.SiiContainer`
    :param specfile: unambiguous identifier of a ms-run file. Is also used as
        a reference to other MasPy file containers.
    :param path: folder location used by the ``SiiContainer`` to save and load
        data to the hard disk.
    :param qcAttr: name of the parameter to define a ``Sii`` quality cut off.
        Typically this is some sort of a global false positive estimator,
        for example a 'false discovery rate' (FDR).
    :param qcLargerBetter: bool, True if a large value for the ``.qcAttr`` means
        a higher confidence.
    :param qcCutOff: float, the quality threshold for the specifed ``.qcAttr``
    :param rankAttr: name of the parameter used for ranking ``Sii`` according
        to how well they match to a fragment ion spectrum, in the case when
        their are multiple ``Sii`` present for the same spectrum.
    :param rankLargerBetter: bool, True if a large value for the ``.rankAttr``
        means a better match to the fragment ion spectrum.

    For details on ``Sii`` ranking see :func:`applySiiRanking()`

    For details on ``Sii`` quality validation see :func:`applySiiQcValidation()`
    """
    if specfile not in siiContainer.info:
        siiContainer.addSpecfile(specfile, path)
    else:
        raise Exception('...')

    siiContainer.info[specfile]['qcAttr'] = qcAttr
    siiContainer.info[specfile]['qcLargerBetter'] = qcLargerBetter
    siiContainer.info[specfile]['qcCutoff'] = qcCutoff
    siiContainer.info[specfile]['rankAttr'] = rankAttr
    siiContainer.info[specfile]['rankLargerBetter'] = rankLargerBetter


[docs]def addSiiToContainer(siiContainer, specfile, siiList):
    """Adds the ``Sii`` elements contained in the siiList to the appropriate
    list in ``siiContainer.container[specfile]``.

    :param siiContainer: instance of :class:`maspy.core.SiiContainer`
    :param specfile: unambiguous identifier of a ms-run file. Is also used as
        a reference to other MasPy file containers.
    :param siiList: a list of ``Sii`` elements imported from any PSM search
        engine results
    """
    for sii in siiList:
        if sii.id not in siiContainer.container[specfile]:
            siiContainer.container[specfile][sii.id] = list()
        siiContainer.container[specfile][sii.id].append(sii)


[docs]def applySiiRanking(siiContainer, specfile):
    """Iterates over all Sii entries of a specfile in siiContainer and sorts Sii
    elements of the same spectrum according to the score attribute specified in
    ``siiContainer.info[specfile]['rankAttr']``. Sorted Sii elements are then
    ranked  according to their sorted position, if multiple Sii have the same
    score, all get the same rank and the next entries rank is its list position.

    :param siiContainer: instance of :class:`maspy.core.SiiContainer`
    :param specfile: unambiguous identifier of a ms-run file. Is also used as
        a reference to other MasPy file containers.
    """
    attr = siiContainer.info[specfile]['rankAttr']
    reverse = siiContainer.info[specfile]['rankLargerBetter']
    for itemList in listvalues(siiContainer.container[specfile]):
        sortList = [(getattr(sii, attr), sii) for sii in itemList]
        itemList = [sii for score, sii in sorted(sortList, reverse=reverse)]

        #Rank Sii according to their position
        lastValue = None
        for itemPosition, item in enumerate(itemList, 1):
            if getattr(item, attr) != lastValue:
                rank = itemPosition
            item.rank = rank
            lastValue = getattr(item, attr)


[docs]def applySiiQcValidation(siiContainer, specfile):
    """Iterates over all Sii entries of a specfile in siiContainer and validates
    if they surpass a user defined quality threshold. The parameters for
    validation are defined in ``siiContainer.info[specfile]``:

        - ``qcAttr``, ``qcCutoff`` and ``qcLargerBetter``

    In addition to passing this validation a ``Sii`` has also to be at the first
    list position in the ``siiContainer.container``. If both criteria are met
    the attribute ``Sii.isValid`` is set to ``True``.

    :param siiContainer: instance of :class:`maspy.core.SiiContainer`
    :param specfile: unambiguous identifier of a ms-run file. Is also used as
        a reference to other MasPy file containers.
    """
    attr = siiContainer.info[specfile]['qcAttr']
    cutOff = siiContainer.info[specfile]['qcCutoff']
    if siiContainer.info[specfile]['qcLargerBetter']:
        evaluator = lambda sii: getattr(sii, attr) >= cutOff and sii.rank == 1
    else:
        evaluator = lambda sii: getattr(sii, attr) <= cutOff and sii.rank == 1

    for itemList in listvalues(siiContainer.container[specfile]):
        #Set the .isValid attribute of all Sii to False
        for sii in itemList:
            sii.isValid = False

        #Validate the first Sii
        sii = itemList[0]
        if evaluator(sii):
            sii.isValid = True


[docs]def readPercolatorResults(filelocation, specfile, psmEngine):
    """Reads percolator PSM results from a txt file and returns a list of
    :class:`Sii <maspy.core.Sii>` elements.

    :param filelocation: file path of the percolator result file
    :param specfile: unambiguous identifier of a ms-run file. Is also used as
        a reference to other MasPy file containers.
    :param psmEngine: PSM PSM search engine used for peptide spectrum matching
        before percolator. This is important to specify, since the scanNr
        information is written in a different format by some engines. It might
        be necessary to adjust the settings for different versions of percolator
        or the PSM search engines used.

        Possible values are 'comet', 'xtandem', 'msgf'.

    :returns: [sii, sii, sii, ...]
    """
    if psmEngine not in ['comet', 'msgf', 'xtandem']:
        raise Exception('PSM search engine not supported: ', psmEngine)
    itemList = list()

    #Note: regarding headerline, xtandem seperates proteins with ';',
    #msgf separates proteins with a tab
    with io.open(filelocation, 'r', encoding='utf-8') as openfile:
        lines = openfile.readlines()

    headerDict = dict([[y,x] for (x,y) in
                       enumerate(lines[0].strip().split('\t'))
                       ])
    scanEntryList = list()
    for line in lines[1:]:
        if len(line.strip()) == 0:
            continue
        fields = line.strip().split('\t')

        if psmEngine in ['comet', 'msgf']:
            scanNr = fields[headerDict['PSMId']].split('_')[-3]
        elif psmEngine in ['xtandem']:
            scanNr = fields[headerDict['PSMId']].split('_')[-2]

        peptide = fields[headerDict['peptide']]
        if peptide.find('.') != -1:
            peptide = peptide.split('.')[1]
        #Change to the new unimod syntax
        peptide = peptide.replace('[UNIMOD:', '[u:')
        sequence = maspy.peptidemethods.removeModifications(peptide)

        qValue = fields[headerDict['q-value']]
        score = fields[headerDict['score']]
        pep = fields[headerDict['posterior_error_prob']]

        sii = maspy.core.Sii(scanNr, specfile)
        sii.peptide = peptide
        sii.sequence = sequence
        sii.qValue = float(qValue)
        sii.score = float(score)
        sii.pep = float(pep)
        sii.isValid = False

        itemList.append(sii)
    return itemList


[docs]def importPercolatorResults(siiContainer, filelocation, specfile, psmEngine,
                            qcAttr='qValue', qcLargerBetter=False,
                            qcCutoff=0.01, rankAttr='score',
                            rankLargerBetter=True):
    """Import peptide spectrum matches (PSMs) from a percolator result file,
    generate :class:`Sii <maspy.core.Sii>` elements and store them in the
    specified :class:`siiContainer <maspy.core.SiiContainer>`. Imported ``Sii``
    are ranked according to a specified attribute and validated if they surpass
    a specified quality threshold.

    :param siiContainer: imported PSM results are added to this instance of
        :class:`siiContainer <maspy.core.SiiContainer>`
    :param filelocation: file path of the percolator result file
    :param specfile: unambiguous identifier of a ms-run file. Is also used as
        a reference to other MasPy file containers.
    :param psmEngine: PSM search engine used for peptide spectrum matching
        before percolator. For details see :func:`readPercolatorResults()`.
        Possible values are 'comet', 'xtandem', 'msgf'.
    :param qcAttr: name of the parameter to define a quality cut off. Typically
        this is some sort of a global false positive estimator (eg FDR)
    :param qcLargerBetter: bool, True if a large value for the ``.qcAttr`` means
        a higher confidence.
    :param qcCutOff: float, the quality threshold for the specifed ``.qcAttr``
    :param rankAttr: name of the parameter used for ranking ``Sii`` according
        to how well they match to a fragment ion spectrum, in the case when
        their are multiple ``Sii`` present for the same spectrum.
    :param rankLargerBetter: bool, True if a large value for the ``.rankAttr``
        means a better match to the fragment ion spectrum

    For details on ``Sii`` ranking see :func:`applySiiRanking()`

    For details on ``Sii`` quality validation see :func:`applySiiQcValidation()`
    """

    path = os.path.dirname(filelocation)
    siiList = readPercolatorResults(filelocation, specfile, psmEngine)
    prepareSiiImport(siiContainer, specfile, path, qcAttr, qcLargerBetter,
                     qcCutoff, rankAttr, rankLargerBetter)
    addSiiToContainer(siiContainer, specfile, siiList)
    applySiiRanking(siiContainer, specfile)
    applySiiQcValidation(siiContainer, specfile)


[docs]def readMsgfMzidResults(filelocation, specfile=None):
    """Reads MS-GF+ PSM results from a mzIdentML file and returns a list of
    :class:`Sii <maspy.core.Sii>` elements.

    :param filelocation: file path of the percolator result file
    :param specfile: optional, unambiguous identifier of a ms-run file. Is also
        used as a reference to other MasPy file containers. If specified all
        the ``.specfile`` attribute of all ``Sii`` are set to this value, else
        it is read from the mzIdentML file.

    :returns: [sii, sii, sii, ...]
    """
    readSpecfile = True if specfile is None else False

    unimod = pyteomics.mass.mass.Unimod()
    _tempMods = dict()
    mzid_refs = pyteomics.mzid.read(filelocation, retrieve_refs=True,
                                    iterative=False)

    siiList = list()
    for mzidEntry in mzid_refs:
        mzidSii = mzidEntry['SpectrumIdentificationItem'][0]
        scanNumber = str(int(mzidEntry['scan number(s)']))
        if readSpecfile:
            specfile = os.path.splitext(mzidEntry['name'])[0]

        sii = maspy.core.Sii(scanNumber, specfile)
        sii.isValid = mzidSii['passThreshold']
        sii.rank = mzidSii['rank']
        sii.eValue = mzidSii['MS-GF:EValue']
        sii.charge = mzidSii['chargeState']
        sii.sequence = mzidSii['PeptideSequence']
        sii.specEValue = mzidSii['MS-GF:SpecEValue']
        sii.score = numpy.log10(sii.eValue)*-1

        if 'Modification' in mzidSii:
            modifications = list()
            for modEntry in mzidSii['Modification']:
                try:
                    modSymbolMaspy = _tempMods[modEntry['name']]
                except KeyError:
                    unimodEntry = unimod.by_title(modEntry['name'])
                    if len(unimodEntry) != 0:
                        modSymbol = 'u:'+str(unimodEntry['record_id'])
                    else:
                        modSymbol = modEntry['name']
                    modSymbolMaspy = '[' + modSymbol + ']'
                    _tempMods[modEntry['name']] = modSymbolMaspy
                modifications.append((modEntry['location'], modSymbolMaspy))
            modifications.sort(key=ITEMGETTER(0))

            _lastPos = 0
            _peptide = list()
            for pos, mod in modifications:
                 _peptide.extend((sii.sequence[_lastPos:pos], mod))
                 _lastPos = pos
            _peptide.append(sii.sequence[_lastPos:])

            sii.peptide = ''.join(_peptide)
        else:
            sii.peptide = sii.sequence
        siiList.append(sii)
    return siiList


[docs]def importMsgfMzidResults(siiContainer, filelocation, specfile=None,
                          qcAttr='eValue', qcLargerBetter=False, qcCutoff=0.01,
                          rankAttr='score', rankLargerBetter=True):
    """Import peptide spectrum matches (PSMs) from a MS-GF+ mzIdentML file,
    generate :class:`Sii <maspy.core.Sii>` elements and store them in the
    specified :class:`siiContainer <maspy.core.SiiContainer>`. Imported ``Sii``
    are ranked according to a specified attribute and validated if they surpass
    a specified quality threshold.

    :param siiContainer: imported PSM results are added to this instance of
        :class:`siiContainer <maspy.core.SiiContainer>`
    :param filelocation: file path of the percolator result file
    :param specfile: optional, unambiguous identifier of a ms-run file. Is also
        used as a reference to other MasPy file containers. If specified the
        attribute ``.specfile`` of all ``Sii`` is set to this value, else
        it is read from the mzIdentML file.
    :param qcAttr: name of the parameter to define a quality cut off. Typically
        this is some sort of a global false positive estimator (eg FDR)
    :param qcLargerBetter: bool, True if a large value for the ``.qcAttr`` means
        a higher confidence.
    :param qcCutOff: float, the quality threshold for the specifed ``.qcAttr``
    :param rankAttr: name of the parameter used for ranking ``Sii`` according
        to how well they match to a fragment ion spectrum, in the case when
        their are multiple ``Sii`` present for the same spectrum.
    :param rankLargerBetter: bool, True if a large value for the ``.rankAttr``
        means a better match to the fragment ion spectrum

    For details on ``Sii`` ranking see :func:`applySiiRanking()`

    For details on ``Sii`` quality validation see :func:`applySiiQcValidation()`
    """
    path = os.path.dirname(filelocation)
    siiList = readMsgfMzidResults(filelocation, specfile)

    #If the mzIdentML file contains multiple specfiles, split the sii elements
    # up according to their specified "specfile" attribute.
    specfiles = ddict(list)
    if specfile is None:
        for sii in siiList:
            specfiles[sii.specfile].append(sii)
    else:
        specfiles[specfile] = siiList

    for specfile in specfiles:
        _siiList = specfiles[specfile]

        prepareSiiImport(siiContainer, specfile, path, qcAttr, qcLargerBetter,
                         qcCutoff, rankAttr, rankLargerBetter)
        addSiiToContainer(siiContainer, specfile, _siiList)
        applySiiRanking(siiContainer, specfile)
        applySiiQcValidation(siiContainer, specfile)


#####################################################################
### import for FeatureContainer class         #######################
#####################################################################
[docs]def importPeptideFeatures(fiContainer, filelocation, specfile):
    """ Import peptide features from a featureXml file, as generated for example
    by the OpenMS node featureFinderCentroided, or a features.tsv file by the
    Dinosaur command line tool.

    :param fiContainer: imported features are added to this instance of
        :class:`FeatureContainer <maspy.core.FeatureContainer>`.
    :param filelocation: Actual file path
    :param specfile: Keyword (filename) to represent file in the
        :class:`FeatureContainer`. Each filename can only occure once, therefore
        importing the same filename again is prevented.
    """
    if not os.path.isfile(filelocation):
        warnings.warn('The specified file does not exist %s' %(filelocation, ))
        return None
    elif (not filelocation.lower().endswith('.featurexml') and
          not filelocation.lower().endswith('.features.tsv')
          ):
        #TODO: this is depricated as importPeptideFeatues
        #is not longer be used solely for featurexml
        print('Wrong file extension, %s' %(filelocation, ))
    elif specfile in fiContainer.info:
        print('%s is already present in the SiContainer, import interrupted.'
              %(specfile, )
              )
        return None

    #Prepare the file container for the import
    fiContainer.addSpecfile(specfile, os.path.dirname(filelocation))

    #import featurexml file
    if filelocation.lower().endswith('.featurexml'):
        featureDict = _importFeatureXml(filelocation)
        for featureId, featureEntryDict in viewitems(featureDict):
            rtArea = set()
            for convexHullEntry in featureEntryDict['convexHullDict']['0']:
                rtArea.update([convexHullEntry[0]])

            fi = maspy.core.Fi(featureId, specfile)
            fi.rt = featureEntryDict['rt']
            fi.rtArea = max(rtArea) - min(rtArea)
            fi.rtLow = min(rtArea)
            fi.rtHigh = max(rtArea)
            fi.charge = featureEntryDict['charge']
            fi.mz = featureEntryDict['mz']
            fi.mh = maspy.peptidemethods.calcMhFromMz(featureEntryDict['mz'],
                                                      featureEntryDict['charge'])
            fi.intensity = featureEntryDict['intensity']
            fi.quality = featureEntryDict['overallquality']
            fi.isMatched = False
            fi.isAnnotated = False
            fi.isValid = True

            fiContainer.container[specfile][featureId] = fi

    #import dinosaur tsv file
    elif filelocation.lower().endswith('.features.tsv'):
        featureDict = _importDinosaurTsv(filelocation)
        for featureId, featureEntryDict in viewitems(featureDict):
            fi = maspy.core.Fi(featureId, specfile)
            fi.rt = featureEntryDict['rtApex']
            fi.rtArea = featureEntryDict['rtEnd'] - featureEntryDict['rtStart']
            fi.rtFwhm = featureEntryDict['fwhm']
            fi.rtLow = featureEntryDict['rtStart']
            fi.rtHigh = featureEntryDict['rtEnd']
            fi.charge = featureEntryDict['charge']
            fi.numScans  = featureEntryDict['nScans']
            fi.mz = featureEntryDict['mz']
            fi.mh = maspy.peptidemethods.calcMhFromMz(featureEntryDict['mz'],
                                                      featureEntryDict['charge'])
            fi.intensity = featureEntryDict['intensitySum']
            fi.intensityApex = featureEntryDict['intensityApex']
            #Note: not used keys:
            #mostAbundantMz nIsotopes nScans averagineCorr mass massCalib

            fi.isMatched = False
            fi.isAnnotated = False
            fi.isValid = True

            fiContainer.container[specfile][featureId] = fi


def _importFeatureXml(filelocation):
    """Reads a featureXml file.

    :param filelocation: #TODO: docstring
    :returns: {featureKey1: {attribute1:value1, attribute2:value2, ...}, ...}

    See also :func:`importPeptideFeatures`
    """
    with io.open(filelocation, 'r', encoding='utf-8') as openFile:
        readingFeature = False
        readingHull = False
        featureDict = dict()

        for i, line in enumerate(openFile):
            line = line.strip()
            if readingFeature == True:
                if line.find('<convexhull') != -1:
                    readingHull = True
                    hullNr = line.split('<convexhull nr=\"')[1].split('\">')[0]
                    hullList = list()
                elif readingHull == True:
                    if line.find('<pt') != -1:
                        x = float(line.split('x=\"')[1].split('\"')[0])
                        y = float(line.split('y=\"')[1].split('\"')[0])
                        # x = retentiontime, y = m/z
                        #retentionTimeList.append(x)
                        hullList.append([x,y])
                    elif line.find('</convexhull>') != -1:
                        featureDict[featureKey]['convexHullDict'][hullNr] = hullList
                        readingHull = False

                elif line.find('<position dim=\"0\">') != -1:
                    featureDict[featureKey]['dim0'] = float(line.split('<position dim=\"0\">')[1].split('</position>')[0])
                elif line.find('<position dim=\"1\">') != -1:
                    featureDict[featureKey]['dim1'] = float(line.split('<position dim=\"1\">')[1].split('</position>')[0])
                elif line.find('<intensity>') != -1:
                    featureDict[featureKey]['intensity'] = float(line.split('<intensity>')[1].split('</intensity>')[0])
                elif line.find('<overallquality>') != -1:
                    featureDict[featureKey]['overallquality'] = float(line.split('<overallquality>')[1].split('</overallquality>')[0])
                elif line.find('<charge>') != -1:
                    featureDict[featureKey]['charge'] = int( line.split('<charge>')[1].split('</charge>')[0] )

                elif line.find('<userParam') != -1:
                    if line.find('name=\"label\"') != -1:
                        featureDict[featureKey]['label'] = line.split('value=\"')[1].split('\"/>')[0]
                    elif line.find('name=\"score_fit\"') != -1:
                        featureDict[featureKey]['score_fit'] = float(line.split('value=\"')[1].split('\"/>')[0])
                    elif line.find('name=\"score_correlation\"') != -1:
                        featureDict[featureKey]['score_correlation'] = float(line.split('value=\"')[1].split('\"/>')[0])
                    elif line.find('name=\"FWHM\"') != -1:
                        featureDict[featureKey]['FWHM'] = float(line.split('value=\"')[1].split('\"/>')[0])
                    elif line.find('name=\"spectrum_index\"') != -1:
                        featureDict[featureKey]['spectrum_index'] = line.split('value=\"')[1].split('\"/>')[0]
                    elif line.find('name=\"spectrum_native_id\"') != -1:
                        featureDict[featureKey]['spectrum_native_id'] = line.split('value=\"')[1].split('\"/>')[0]

                elif line.find('</feature>') != -1:
                    #mzList = list()
                    #for retentionTime,mz in featureDict[featureKey]['convexHullDict']['0']:
                    #    mzList.append(mz)
                    featureDict[featureKey]['rt'] = featureDict[featureKey]['dim0']#numpy.median(retentionTimeList)
                    featureDict[featureKey]['mz'] = featureDict[featureKey]['dim1']#numpy.median(mzList)

                    readingFeature == False

            if line.find('<feature id') != -1:
                readingFeature = True
                featureKey = line.split('<feature id=\"')[1].split('\">')[0]
                featureDict[featureKey] = dict()
                featureDict[featureKey]['convexHullDict'] = dict()
                #retentionTimeList = list()
    return featureDict


def _importDinosaurTsv(filelocation):
    """Reads a Dinosaur tsv file.

    :returns: {featureKey1: {attribute1:value1, attribute2:value2, ...}, ...}

    See also :func:`importPeptideFeatures`
    """
    with io.open(filelocation, 'r', encoding='utf-8') as openFile:
        #NOTE: this is pretty similar to importing percolator results, maybe unify in a common function
        lines = openFile.readlines()
        headerDict = dict([[y,x] for (x,y) in enumerate(lines[0].strip().split('\t'))])
        featureDict = dict()
        for linePos, line in enumerate(lines[1:]):
            featureId = str(linePos)
            fields = line.strip().split('\t')
            entryDict = dict()
            for headerName, headerPos in viewitems(headerDict):
                entryDict[headerName] = float(fields[headerPos])
                if headerName in ['rtApex', 'rtEnd', 'rtStart', 'fwhm']:
                    #Covnert to seconds
                    entryDict[headerName] *= 60
                elif headerName in ['charge', 'intensitySum', 'nIsotopes', 'nScans', 'intensityApex']:
                    entryDict[headerName] = int(entryDict[headerName])
            featureDict[featureId] = entryDict
    return featureDict