"""
This module provides functions to import various data types as maspy objects,
which are associated with analysis workflows of mass spectrometry data. This
currently comprises the mzML format, results of the percolator software and to
some extent mzIdentML files, and file formats representing peptide LC-MS feature
".featureXML" and ".features.tsv".
"""
# Copyright 2015-2017 David M. Hollenstein, Jakob J. Hollenstein
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
######################## Python 2 and 3 compatibility #########################
from __future__ import absolute_import, division, print_function
from __future__ import unicode_literals
from future.utils import viewitems, viewkeys, viewvalues, listitems, listvalues
try:
#python 2.7
from itertools import izip as zip
except ImportError:
#python 3 series
pass
################################################################################
from collections import defaultdict as ddict
import io
from lxml import etree as ETREE
from operator import itemgetter as ITEMGETTER
import os
import warnings
import numpy
import pyteomics.mzid
import pyteomics.mass
import maspy.auxiliary as aux
import maspy.core
import maspy.peptidemethods
import maspy.xml
#####################################################################
### mzml import methods #######################
#####################################################################
[docs]def importMzml(filepath, msrunContainer=None, siAttrFromSmi=None, specfilename=None):
"""Performs a complete import of a mzml file into a maspy MsrunContainer.
:paramsiAttrFromSmi: allow here to specify a custom function that extracts params a from spectrumMetadataItem
:param specfilename: by default the filename will be used as the specfilename in the MsrunContainer and all
mzML item instances, specify here an alternative specfilename to override the default one
"""
#TODO: docstring
siAttrFromSmi = defaultFetchSiAttrFromSmi if siAttrFromSmi is None else siAttrFromSmi
if msrunContainer is None:
msrunContainer = maspy.core.MsrunContainer()
basename = os.path.basename(filepath)
dirname = os.path.dirname(filepath)
filename, extension = os.path.splitext(basename)
specfilename = filename if specfilename is None else specfilename
#Check if the specified file is valid for an import
if not os.path.isfile(filepath):
raise IOError('File does not exist: %s' % filepath)
elif extension.lower() != '.mzml':
raise IOError('Filetype is not "mzml": %s' % filepath)
elif specfilename in msrunContainer.info:
print(specfilename, 'already present in the msrunContainer, aborting import.')
return None
mzmlReader = maspy.xml.MzmlReader(filepath)
masterContainer = {'rm': str(), 'ci': {}, 'si': {}, 'sai': {}, 'smi': {}}
#Dictionary recording which MS2 scans follow a MS1 scan
ms1Record = ddict(list)
for xmlSpectrum in mzmlReader.parseSpectra():
smi, binaryDataArrayList = smiFromXmlSpectrum(xmlSpectrum, specfilename)
#Generate SpectrumItem
si = maspy.core.Si(smi.id, smi.specfile)
si.isValid = True
siAttrFromSmi(smi, si)
if si.msLevel > 1:
si.precursorId = si.precursorId.split('scan=')[1] #TODO: change to use regex to extract from known vendor format
ms1Record[si.precursorId].append(si.id)
else:
ms1Record[si.id] #Touch the ddict to add the MS1 id, if it is not already present
#Generate SpectrumArrayItem
sai = maspy.core.Sai(smi.id, smi.specfile)
sai.arrays, sai.arrayInfo = maspy.xml.extractBinaries(binaryDataArrayList,
smi.attributes['defaultArrayLength'])
#Store all items in the appropriate containers
masterContainer['smi'][smi.id] = smi
masterContainer['si'][smi.id] = si
masterContainer['sai'][smi.id] = sai
for siId, msnIdList in viewitems(ms1Record):
#Ignore KeyError if the spectrum is not present in the mzML file for whatever reason
try:
setattr(masterContainer['si'][siId], 'msnIdList', msnIdList)
except KeyError:
pass
for xmlChromatogram in mzmlReader.chromatogramList:
ci = ciFromXml(xmlChromatogram, specfilename)
masterContainer['ci'][ci.id] = ci
masterContainer['rm'] = mzmlReader.metadataNode
msrunContainer._addSpecfile(specfilename, dirname)
msrunContainer.rmc[specfilename] = masterContainer['rm']
msrunContainer.info[specfilename]['status']['rm'] = True
msrunContainer.smic[specfilename] = masterContainer['smi']
msrunContainer.info[specfilename]['status']['smi'] = True
msrunContainer.sic[specfilename] = masterContainer['si']
msrunContainer.info[specfilename]['status']['si'] = True
msrunContainer.saic[specfilename] = masterContainer['sai']
msrunContainer.info[specfilename]['status']['sai'] = True
msrunContainer.cic[specfilename] = masterContainer['ci']
msrunContainer.info[specfilename]['status']['ci'] = True
return msrunContainer
# --- generate python objects from mzML xml elements --- #
[docs]def ciFromXml(xmlelement, specfile):
#TODO: docstring
ciId = xmlelement.attrib['id']
ci = maspy.core.Ci(ciId, specfile)
for key in ['dataProcessingRef']:
if key in xmlelement.attrib:
setattr(ci, key, xmlelement.attrib[key])
ci.params, children = maspy.xml.extractParams(xmlelement)
arrayLength = xmlelement.attrib['defaultArrayLength']
for child in children:
childElements, childParams = maspy.xml.sublistReader(child)
childTag = maspy.xml.clearTag(child.tag)
if childTag == 'precursor':
#TODO: THIS HAS TO BE ADAPTED, REPLACE "**element", ADD A LOOP
newElement = MzmlPrecursor(**element)
elif childTag == 'product':
#TODO: THIS HAS TO BE ADAPTED
newElement = MzmlProduct(**element)
elif childTag == 'binaryDataArrayList':
for childElement in childElements:
dataType, dataTypeParam = maspy.xml.findBinaryDataType(childElement['params'])
ci.arrayInfo[dataType] = {'dataProcessingRef': None, 'params': childElement['params']}
if 'dataProcessingRef' in childElement:
ci.arrayInfo[dataType]['dataProcessingRef'] = childElement['dataProcessingRef']
ci.arrays, ci.arrayInfo = maspy.xml.extractBinaries(childElements, arrayLength)
return ci
[docs]def smiFromXmlSpectrum(xmlelement, specfile):
#TODO: docstring
scanId = xmlelement.attrib['id'].split('scan=')[1] #TODO: change to use regex to extract from known vendor format
smi = maspy.core.Smi(scanId, specfile)
smi.params, spectrumChildren = maspy.xml.extractParams(xmlelement)
smi.attributes.update(xmlelement.attrib)
binaryDataArrayList = list()
for spectrumChild in spectrumChildren:
spectrumChildTag = maspy.xml.clearTag(spectrumChild.tag)
elements, params = maspy.xml.sublistReader(spectrumChild)
if params:
#Define scanListParams here
setattr(smi, spectrumChildTag+'Params', params)
for element in elements:
if spectrumChildTag == 'scanList':
newElement = maspy.core.MzmlScan(**element)
elif spectrumChildTag == 'precursorList':
newElement = maspy.core.MzmlPrecursor(**element)
elif spectrumChildTag == 'productList':
newElement = maspy.core.MzmlProduct(**element)
elif spectrumChildTag == 'binaryDataArrayList':
binaryDataArrayList.append(element)
continue
else:
raise Exception('unexpected spectrum xml child tag!')
getattr(smi, spectrumChildTag).append(newElement)
return smi, binaryDataArrayList
# --- extract spectrum attributes from the mzml like structure of a Smi object --- #
[docs]def fetchSpectrumInfo(smi):
#TODO: docstring
attributes = dict()
for key, accession, dtype in [('msLevel', 'MS:1000511', int),
('tic', 'MS:1000285', float),
('basepeakMz', 'MS:1000504', float),
('basepeakI', 'MS:1000505', float)
]:
param = maspy.xml.findParam(smi.params, accession)
if param is not None:
attributes[key] = dtype(param[1])
else:
#Or don't add key to attributes?
attributes[key] = None
return attributes
[docs]def fetchScanInfo(smi):
#TODO: docstring
if smi.scanList:
attributes = dict()
for key, accession, dtype in [('rt', 'MS:1000016', float),
('iit', 'MS:1000927', float),
]:
param = maspy.xml.findParam(smi.scanList[0]['params'], accession)
if param is not None:
attributes[key] = dtype(param[1])
#TODO: raise a warning if time is neither in minutes nor in seconds
if key == 'rt' and param[2] == 'UO:0000031':
#convert retention time from minutes to seconds
attributes['rt'] *= 60
else:
attributes[key] = None
else:
attributes = None
return attributes
[docs]def fetchParentIon(smi):
#TODO: docstring
if smi.precursorList:
attributes = dict()
attributes['precursorId'] = smi.precursorList[0].spectrumRef
selectedIon = smi.precursorList[0]['selectedIonList'][0]
for key, accession, dtype in [('obsMz', 'MS:1000744', float),
('i', 'MS:1000042', float),
('charge', 'MS:1000041', int)
]:
param = maspy.xml.findParam(selectedIon, accession)
if param is not None:
attributes[key] = dtype(param[1])
else:
attributes[key] = None
else:
attributes = None
return attributes
[docs]def defaultFetchSiAttrFromSmi(smi, si):
"""Default method to extract attributes from a spectrum metadata item (sai) and
adding them to a spectrum item (si)."""
for key, value in viewitems(fetchSpectrumInfo(smi)):
setattr(si, key, value)
for key, value in viewitems(fetchScanInfo(smi)):
setattr(si, key, value)
if si.msLevel > 1:
for key, value in viewitems(fetchParentIon(smi)):
setattr(si, key, value)
[docs]def convertMzml(mzmlPath, outputDirectory=None):
"""Imports an mzml file and converts it to a MsrunContainer file
:param mzmlPath: path of the mzml file
:param outputDirectory: directory where the MsrunContainer file should be written
if it is not specified, the output directory is set to the mzml files directory.
"""
outputDirectory = outputDirectory if outputDirectory is not None else os.path.dirname(mzmlPath)
msrunContainer = importMzml(mzmlPath)
msrunContainer.setPath(outputDirectory)
msrunContainer.save()
#####################################################################
### import for SiiContainer class #######################
#####################################################################
[docs]def prepareSiiImport(siiContainer, specfile, path, qcAttr, qcLargerBetter,
qcCutoff, rankAttr, rankLargerBetter):
"""Prepares the ``siiContainer`` for the import of peptide spectrum matching
results. Adds entries to ``siiContainer.container`` and to
``siiContainer.info``.
:param siiContainer: instance of :class:`maspy.core.SiiContainer`
:param specfile: unambiguous identifier of a ms-run file. Is also used as
a reference to other MasPy file containers.
:param path: folder location used by the ``SiiContainer`` to save and load
data to the hard disk.
:param qcAttr: name of the parameter to define a ``Sii`` quality cut off.
Typically this is some sort of a global false positive estimator,
for example a 'false discovery rate' (FDR).
:param qcLargerBetter: bool, True if a large value for the ``.qcAttr`` means
a higher confidence.
:param qcCutOff: float, the quality threshold for the specifed ``.qcAttr``
:param rankAttr: name of the parameter used for ranking ``Sii`` according
to how well they match to a fragment ion spectrum, in the case when
their are multiple ``Sii`` present for the same spectrum.
:param rankLargerBetter: bool, True if a large value for the ``.rankAttr``
means a better match to the fragment ion spectrum.
For details on ``Sii`` ranking see :func:`applySiiRanking()`
For details on ``Sii`` quality validation see :func:`applySiiQcValidation()`
"""
if specfile not in siiContainer.info:
siiContainer.addSpecfile(specfile, path)
else:
raise Exception('...')
siiContainer.info[specfile]['qcAttr'] = qcAttr
siiContainer.info[specfile]['qcLargerBetter'] = qcLargerBetter
siiContainer.info[specfile]['qcCutoff'] = qcCutoff
siiContainer.info[specfile]['rankAttr'] = rankAttr
siiContainer.info[specfile]['rankLargerBetter'] = rankLargerBetter
[docs]def addSiiToContainer(siiContainer, specfile, siiList):
"""Adds the ``Sii`` elements contained in the siiList to the appropriate
list in ``siiContainer.container[specfile]``.
:param siiContainer: instance of :class:`maspy.core.SiiContainer`
:param specfile: unambiguous identifier of a ms-run file. Is also used as
a reference to other MasPy file containers.
:param siiList: a list of ``Sii`` elements imported from any PSM search
engine results
"""
for sii in siiList:
if sii.id not in siiContainer.container[specfile]:
siiContainer.container[specfile][sii.id] = list()
siiContainer.container[specfile][sii.id].append(sii)
[docs]def applySiiRanking(siiContainer, specfile):
"""Iterates over all Sii entries of a specfile in siiContainer and sorts Sii
elements of the same spectrum according to the score attribute specified in
``siiContainer.info[specfile]['rankAttr']``. Sorted Sii elements are then
ranked according to their sorted position, if multiple Sii have the same
score, all get the same rank and the next entries rank is its list position.
:param siiContainer: instance of :class:`maspy.core.SiiContainer`
:param specfile: unambiguous identifier of a ms-run file. Is also used as
a reference to other MasPy file containers.
"""
attr = siiContainer.info[specfile]['rankAttr']
reverse = siiContainer.info[specfile]['rankLargerBetter']
for itemList in listvalues(siiContainer.container[specfile]):
sortList = [(getattr(sii, attr), sii) for sii in itemList]
itemList = [sii for score, sii in sorted(sortList, reverse=reverse)]
#Rank Sii according to their position
lastValue = None
for itemPosition, item in enumerate(itemList, 1):
if getattr(item, attr) != lastValue:
rank = itemPosition
item.rank = rank
lastValue = getattr(item, attr)
[docs]def applySiiQcValidation(siiContainer, specfile):
"""Iterates over all Sii entries of a specfile in siiContainer and validates
if they surpass a user defined quality threshold. The parameters for
validation are defined in ``siiContainer.info[specfile]``:
- ``qcAttr``, ``qcCutoff`` and ``qcLargerBetter``
In addition to passing this validation a ``Sii`` has also to be at the first
list position in the ``siiContainer.container``. If both criteria are met
the attribute ``Sii.isValid`` is set to ``True``.
:param siiContainer: instance of :class:`maspy.core.SiiContainer`
:param specfile: unambiguous identifier of a ms-run file. Is also used as
a reference to other MasPy file containers.
"""
attr = siiContainer.info[specfile]['qcAttr']
cutOff = siiContainer.info[specfile]['qcCutoff']
if siiContainer.info[specfile]['qcLargerBetter']:
evaluator = lambda sii: getattr(sii, attr) >= cutOff and sii.rank == 1
else:
evaluator = lambda sii: getattr(sii, attr) <= cutOff and sii.rank == 1
for itemList in listvalues(siiContainer.container[specfile]):
#Set the .isValid attribute of all Sii to False
for sii in itemList:
sii.isValid = False
#Validate the first Sii
sii = itemList[0]
if evaluator(sii):
sii.isValid = True
[docs]def readPercolatorResults(filelocation, specfile, psmEngine):
"""Reads percolator PSM results from a txt file and returns a list of
:class:`Sii <maspy.core.Sii>` elements.
:param filelocation: file path of the percolator result file
:param specfile: unambiguous identifier of a ms-run file. Is also used as
a reference to other MasPy file containers.
:param psmEngine: PSM PSM search engine used for peptide spectrum matching
before percolator. This is important to specify, since the scanNr
information is written in a different format by some engines. It might
be necessary to adjust the settings for different versions of percolator
or the PSM search engines used.
Possible values are 'comet', 'xtandem', 'msgf'.
:returns: [sii, sii, sii, ...]
"""
if psmEngine not in ['comet', 'msgf', 'xtandem']:
raise Exception('PSM search engine not supported: ', psmEngine)
itemList = list()
#Note: regarding headerline, xtandem seperates proteins with ';',
#msgf separates proteins with a tab
with io.open(filelocation, 'r', encoding='utf-8') as openfile:
lines = openfile.readlines()
headerDict = dict([[y,x] for (x,y) in
enumerate(lines[0].strip().split('\t'))
])
scanEntryList = list()
for line in lines[1:]:
if len(line.strip()) == 0:
continue
fields = line.strip().split('\t')
if psmEngine in ['comet', 'msgf']:
scanNr = fields[headerDict['PSMId']].split('_')[-3]
elif psmEngine in ['xtandem']:
scanNr = fields[headerDict['PSMId']].split('_')[-2]
peptide = fields[headerDict['peptide']]
if peptide.find('.') != -1:
peptide = peptide.split('.')[1]
#Change to the new unimod syntax
peptide = peptide.replace('[UNIMOD:', '[u:')
sequence = maspy.peptidemethods.removeModifications(peptide)
qValue = fields[headerDict['q-value']]
score = fields[headerDict['score']]
pep = fields[headerDict['posterior_error_prob']]
sii = maspy.core.Sii(scanNr, specfile)
sii.peptide = peptide
sii.sequence = sequence
sii.qValue = float(qValue)
sii.score = float(score)
sii.pep = float(pep)
sii.isValid = False
itemList.append(sii)
return itemList
[docs]def importPercolatorResults(siiContainer, filelocation, specfile, psmEngine,
qcAttr='qValue', qcLargerBetter=False,
qcCutoff=0.01, rankAttr='score',
rankLargerBetter=True):
"""Import peptide spectrum matches (PSMs) from a percolator result file,
generate :class:`Sii <maspy.core.Sii>` elements and store them in the
specified :class:`siiContainer <maspy.core.SiiContainer>`. Imported ``Sii``
are ranked according to a specified attribute and validated if they surpass
a specified quality threshold.
:param siiContainer: imported PSM results are added to this instance of
:class:`siiContainer <maspy.core.SiiContainer>`
:param filelocation: file path of the percolator result file
:param specfile: unambiguous identifier of a ms-run file. Is also used as
a reference to other MasPy file containers.
:param psmEngine: PSM search engine used for peptide spectrum matching
before percolator. For details see :func:`readPercolatorResults()`.
Possible values are 'comet', 'xtandem', 'msgf'.
:param qcAttr: name of the parameter to define a quality cut off. Typically
this is some sort of a global false positive estimator (eg FDR)
:param qcLargerBetter: bool, True if a large value for the ``.qcAttr`` means
a higher confidence.
:param qcCutOff: float, the quality threshold for the specifed ``.qcAttr``
:param rankAttr: name of the parameter used for ranking ``Sii`` according
to how well they match to a fragment ion spectrum, in the case when
their are multiple ``Sii`` present for the same spectrum.
:param rankLargerBetter: bool, True if a large value for the ``.rankAttr``
means a better match to the fragment ion spectrum
For details on ``Sii`` ranking see :func:`applySiiRanking()`
For details on ``Sii`` quality validation see :func:`applySiiQcValidation()`
"""
path = os.path.dirname(filelocation)
siiList = readPercolatorResults(filelocation, specfile, psmEngine)
prepareSiiImport(siiContainer, specfile, path, qcAttr, qcLargerBetter,
qcCutoff, rankAttr, rankLargerBetter)
addSiiToContainer(siiContainer, specfile, siiList)
applySiiRanking(siiContainer, specfile)
applySiiQcValidation(siiContainer, specfile)
[docs]def readMsgfMzidResults(filelocation, specfile=None):
"""Reads MS-GF+ PSM results from a mzIdentML file and returns a list of
:class:`Sii <maspy.core.Sii>` elements.
:param filelocation: file path of the percolator result file
:param specfile: optional, unambiguous identifier of a ms-run file. Is also
used as a reference to other MasPy file containers. If specified all
the ``.specfile`` attribute of all ``Sii`` are set to this value, else
it is read from the mzIdentML file.
:returns: [sii, sii, sii, ...]
"""
readSpecfile = True if specfile is None else False
unimod = pyteomics.mass.mass.Unimod()
_tempMods = dict()
mzid_refs = pyteomics.mzid.read(filelocation, retrieve_refs=True,
iterative=False)
siiList = list()
for mzidEntry in mzid_refs:
mzidSii = mzidEntry['SpectrumIdentificationItem'][0]
scanNumber = str(int(mzidEntry['scan number(s)']))
if readSpecfile:
specfile = os.path.splitext(mzidEntry['name'])[0]
sii = maspy.core.Sii(scanNumber, specfile)
sii.isValid = mzidSii['passThreshold']
sii.rank = mzidSii['rank']
sii.eValue = mzidSii['MS-GF:EValue']
sii.charge = mzidSii['chargeState']
sii.sequence = mzidSii['PeptideSequence']
sii.specEValue = mzidSii['MS-GF:SpecEValue']
sii.score = numpy.log10(sii.eValue)*-1
if 'Modification' in mzidSii:
modifications = list()
for modEntry in mzidSii['Modification']:
try:
modSymbolMaspy = _tempMods[modEntry['name']]
except KeyError:
unimodEntry = unimod.by_title(modEntry['name'])
if len(unimodEntry) != 0:
modSymbol = 'u:'+str(unimodEntry['record_id'])
else:
modSymbol = modEntry['name']
modSymbolMaspy = '[' + modSymbol + ']'
_tempMods[modEntry['name']] = modSymbolMaspy
modifications.append((modEntry['location'], modSymbolMaspy))
modifications.sort(key=ITEMGETTER(0))
_lastPos = 0
_peptide = list()
for pos, mod in modifications:
_peptide.extend((sii.sequence[_lastPos:pos], mod))
_lastPos = pos
_peptide.append(sii.sequence[_lastPos:])
sii.peptide = ''.join(_peptide)
else:
sii.peptide = sii.sequence
siiList.append(sii)
return siiList
[docs]def importMsgfMzidResults(siiContainer, filelocation, specfile=None,
qcAttr='eValue', qcLargerBetter=False, qcCutoff=0.01,
rankAttr='score', rankLargerBetter=True):
"""Import peptide spectrum matches (PSMs) from a MS-GF+ mzIdentML file,
generate :class:`Sii <maspy.core.Sii>` elements and store them in the
specified :class:`siiContainer <maspy.core.SiiContainer>`. Imported ``Sii``
are ranked according to a specified attribute and validated if they surpass
a specified quality threshold.
:param siiContainer: imported PSM results are added to this instance of
:class:`siiContainer <maspy.core.SiiContainer>`
:param filelocation: file path of the percolator result file
:param specfile: optional, unambiguous identifier of a ms-run file. Is also
used as a reference to other MasPy file containers. If specified the
attribute ``.specfile`` of all ``Sii`` is set to this value, else
it is read from the mzIdentML file.
:param qcAttr: name of the parameter to define a quality cut off. Typically
this is some sort of a global false positive estimator (eg FDR)
:param qcLargerBetter: bool, True if a large value for the ``.qcAttr`` means
a higher confidence.
:param qcCutOff: float, the quality threshold for the specifed ``.qcAttr``
:param rankAttr: name of the parameter used for ranking ``Sii`` according
to how well they match to a fragment ion spectrum, in the case when
their are multiple ``Sii`` present for the same spectrum.
:param rankLargerBetter: bool, True if a large value for the ``.rankAttr``
means a better match to the fragment ion spectrum
For details on ``Sii`` ranking see :func:`applySiiRanking()`
For details on ``Sii`` quality validation see :func:`applySiiQcValidation()`
"""
path = os.path.dirname(filelocation)
siiList = readMsgfMzidResults(filelocation, specfile)
#If the mzIdentML file contains multiple specfiles, split the sii elements
# up according to their specified "specfile" attribute.
specfiles = ddict(list)
if specfile is None:
for sii in siiList:
specfiles[sii.specfile].append(sii)
else:
specfiles[specfile] = siiList
for specfile in specfiles:
_siiList = specfiles[specfile]
prepareSiiImport(siiContainer, specfile, path, qcAttr, qcLargerBetter,
qcCutoff, rankAttr, rankLargerBetter)
addSiiToContainer(siiContainer, specfile, _siiList)
applySiiRanking(siiContainer, specfile)
applySiiQcValidation(siiContainer, specfile)
#####################################################################
### import for FeatureContainer class #######################
#####################################################################
[docs]def importPeptideFeatures(fiContainer, filelocation, specfile):
""" Import peptide features from a featureXml file, as generated for example
by the OpenMS node featureFinderCentroided, or a features.tsv file by the
Dinosaur command line tool.
:param fiContainer: imported features are added to this instance of
:class:`FeatureContainer <maspy.core.FeatureContainer>`.
:param filelocation: Actual file path
:param specfile: Keyword (filename) to represent file in the
:class:`FeatureContainer`. Each filename can only occure once, therefore
importing the same filename again is prevented.
"""
if not os.path.isfile(filelocation):
warnings.warn('The specified file does not exist %s' %(filelocation, ))
return None
elif (not filelocation.lower().endswith('.featurexml') and
not filelocation.lower().endswith('.features.tsv')
):
#TODO: this is depricated as importPeptideFeatues
#is not longer be used solely for featurexml
print('Wrong file extension, %s' %(filelocation, ))
elif specfile in fiContainer.info:
print('%s is already present in the SiContainer, import interrupted.'
%(specfile, )
)
return None
#Prepare the file container for the import
fiContainer.addSpecfile(specfile, os.path.dirname(filelocation))
#import featurexml file
if filelocation.lower().endswith('.featurexml'):
featureDict = _importFeatureXml(filelocation)
for featureId, featureEntryDict in viewitems(featureDict):
rtArea = set()
for convexHullEntry in featureEntryDict['convexHullDict']['0']:
rtArea.update([convexHullEntry[0]])
fi = maspy.core.Fi(featureId, specfile)
fi.rt = featureEntryDict['rt']
fi.rtArea = max(rtArea) - min(rtArea)
fi.rtLow = min(rtArea)
fi.rtHigh = max(rtArea)
fi.charge = featureEntryDict['charge']
fi.mz = featureEntryDict['mz']
fi.mh = maspy.peptidemethods.calcMhFromMz(featureEntryDict['mz'],
featureEntryDict['charge'])
fi.intensity = featureEntryDict['intensity']
fi.quality = featureEntryDict['overallquality']
fi.isMatched = False
fi.isAnnotated = False
fi.isValid = True
fiContainer.container[specfile][featureId] = fi
#import dinosaur tsv file
elif filelocation.lower().endswith('.features.tsv'):
featureDict = _importDinosaurTsv(filelocation)
for featureId, featureEntryDict in viewitems(featureDict):
fi = maspy.core.Fi(featureId, specfile)
fi.rt = featureEntryDict['rtApex']
fi.rtArea = featureEntryDict['rtEnd'] - featureEntryDict['rtStart']
fi.rtFwhm = featureEntryDict['fwhm']
fi.rtLow = featureEntryDict['rtStart']
fi.rtHigh = featureEntryDict['rtEnd']
fi.charge = featureEntryDict['charge']
fi.numScans = featureEntryDict['nScans']
fi.mz = featureEntryDict['mz']
fi.mh = maspy.peptidemethods.calcMhFromMz(featureEntryDict['mz'],
featureEntryDict['charge'])
fi.intensity = featureEntryDict['intensitySum']
fi.intensityApex = featureEntryDict['intensityApex']
#Note: not used keys:
#mostAbundantMz nIsotopes nScans averagineCorr mass massCalib
fi.isMatched = False
fi.isAnnotated = False
fi.isValid = True
fiContainer.container[specfile][featureId] = fi
def _importFeatureXml(filelocation):
"""Reads a featureXml file.
:param filelocation: #TODO: docstring
:returns: {featureKey1: {attribute1:value1, attribute2:value2, ...}, ...}
See also :func:`importPeptideFeatures`
"""
with io.open(filelocation, 'r', encoding='utf-8') as openFile:
readingFeature = False
readingHull = False
featureDict = dict()
for i, line in enumerate(openFile):
line = line.strip()
if readingFeature == True:
if line.find('<convexhull') != -1:
readingHull = True
hullNr = line.split('<convexhull nr=\"')[1].split('\">')[0]
hullList = list()
elif readingHull == True:
if line.find('<pt') != -1:
x = float(line.split('x=\"')[1].split('\"')[0])
y = float(line.split('y=\"')[1].split('\"')[0])
# x = retentiontime, y = m/z
#retentionTimeList.append(x)
hullList.append([x,y])
elif line.find('</convexhull>') != -1:
featureDict[featureKey]['convexHullDict'][hullNr] = hullList
readingHull = False
elif line.find('<position dim=\"0\">') != -1:
featureDict[featureKey]['dim0'] = float(line.split('<position dim=\"0\">')[1].split('</position>')[0])
elif line.find('<position dim=\"1\">') != -1:
featureDict[featureKey]['dim1'] = float(line.split('<position dim=\"1\">')[1].split('</position>')[0])
elif line.find('<intensity>') != -1:
featureDict[featureKey]['intensity'] = float(line.split('<intensity>')[1].split('</intensity>')[0])
elif line.find('<overallquality>') != -1:
featureDict[featureKey]['overallquality'] = float(line.split('<overallquality>')[1].split('</overallquality>')[0])
elif line.find('<charge>') != -1:
featureDict[featureKey]['charge'] = int( line.split('<charge>')[1].split('</charge>')[0] )
elif line.find('<userParam') != -1:
if line.find('name=\"label\"') != -1:
featureDict[featureKey]['label'] = line.split('value=\"')[1].split('\"/>')[0]
elif line.find('name=\"score_fit\"') != -1:
featureDict[featureKey]['score_fit'] = float(line.split('value=\"')[1].split('\"/>')[0])
elif line.find('name=\"score_correlation\"') != -1:
featureDict[featureKey]['score_correlation'] = float(line.split('value=\"')[1].split('\"/>')[0])
elif line.find('name=\"FWHM\"') != -1:
featureDict[featureKey]['FWHM'] = float(line.split('value=\"')[1].split('\"/>')[0])
elif line.find('name=\"spectrum_index\"') != -1:
featureDict[featureKey]['spectrum_index'] = line.split('value=\"')[1].split('\"/>')[0]
elif line.find('name=\"spectrum_native_id\"') != -1:
featureDict[featureKey]['spectrum_native_id'] = line.split('value=\"')[1].split('\"/>')[0]
elif line.find('</feature>') != -1:
#mzList = list()
#for retentionTime,mz in featureDict[featureKey]['convexHullDict']['0']:
# mzList.append(mz)
featureDict[featureKey]['rt'] = featureDict[featureKey]['dim0']#numpy.median(retentionTimeList)
featureDict[featureKey]['mz'] = featureDict[featureKey]['dim1']#numpy.median(mzList)
readingFeature == False
if line.find('<feature id') != -1:
readingFeature = True
featureKey = line.split('<feature id=\"')[1].split('\">')[0]
featureDict[featureKey] = dict()
featureDict[featureKey]['convexHullDict'] = dict()
#retentionTimeList = list()
return featureDict
def _importDinosaurTsv(filelocation):
"""Reads a Dinosaur tsv file.
:returns: {featureKey1: {attribute1:value1, attribute2:value2, ...}, ...}
See also :func:`importPeptideFeatures`
"""
with io.open(filelocation, 'r', encoding='utf-8') as openFile:
#NOTE: this is pretty similar to importing percolator results, maybe unify in a common function
lines = openFile.readlines()
headerDict = dict([[y,x] for (x,y) in enumerate(lines[0].strip().split('\t'))])
featureDict = dict()
for linePos, line in enumerate(lines[1:]):
featureId = str(linePos)
fields = line.strip().split('\t')
entryDict = dict()
for headerName, headerPos in viewitems(headerDict):
entryDict[headerName] = float(fields[headerPos])
if headerName in ['rtApex', 'rtEnd', 'rtStart', 'fwhm']:
#Covnert to seconds
entryDict[headerName] *= 60
elif headerName in ['charge', 'intensitySum', 'nIsotopes', 'nScans', 'intensityApex']:
entryDict[headerName] = int(entryDict[headerName])
featureDict[featureId] = entryDict
return featureDict