/*
* Copyright (c) 2003-2012 Fred Hutchinson Cancer Research Center
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.fhcrc.cpl.toolbox.proteomics.filehandler;
import java.io.*;
import java.math.BigInteger;
import java.util.List;
import org.apache.log4j.Logger;
import net.systemsbiology.regisWeb.pepXML.*;
import org.w3c.dom.Node;
import org.w3c.dom.Element;
import org.w3c.dom.Attr;
import org.apache.xmlbeans.XmlOptions;
import org.apache.xmlbeans.XmlAnySimpleType;
import org.apache.xmlbeans.xml.stream.Attribute;
import org.fhcrc.cpl.toolbox.proteomics.MS2Modification;
import org.fhcrc.cpl.toolbox.proteomics.ModifiedAminoAcid;
/**
* An abstract base class for use as a wrapper for writing PepXml files. We take advantage of XmlBeans to build
* the structure of the pepxml file, and search_summaries (for modifications), but we stitch the XmlBeans XML
* output for features together by hand, writing out to a file as we go, so that we don't have to hold the whole
* structure in memory.
*
* This class is abstract because different implementations are likely to want to use different objects to
* populate the individual spectrum_queries that are written out. The only implementing class at the time of
* this comment is viewer.feature.FeaturePepXmlWriter, which uses an array of viewer.Feature.Feature to
* populate the spectrum_queries.
*
* I can foresee at least two issues arising when another implementation needs to be created (likely on the
* CPAS side):
* 1. More file-level data may need to be written out at the top of the file. In that case, please discuss
* with me (Damon) and we'll make sure that anything that's useful for everybody gets stuck in this abstract
* base class
* 2. Something other than MS2Modifications might be desired to populate the modifications. In that case,
* this abstract base class can turn into two levels of abstract base classes. :)
*
* Also I think there may be some weirdness with where the MS2Modifications are declared, in the case that there's
* more than one fraction. We'll burn that bridge when we come to it.
*/
public abstract class BasePepXmlWriter
{
    static Logger _log = Logger.getLogger(BasePepXmlWriter.class);

    //root node
    protected MsmsPipelineAnalysisDocument _xmlBeansPepXmlDoc = null;
    //analysis node, one per doc
    protected MsmsPipelineAnalysisDocument.MsmsPipelineAnalysis _xmlBeansAnalysis = null;
    //run summaries
    protected MsmsPipelineAnalysisDocument.MsmsPipelineAnalysis.MsmsRunSummary[] _xmlBeansRunSummaryArray = null;
    //first run summary
    protected MsmsPipelineAnalysisDocument.MsmsPipelineAnalysis.MsmsRunSummary _firstRunSummary = null;
    //search summary, created lazily by getSearchSummary()
    protected MsmsPipelineAnalysisDocument.MsmsPipelineAnalysis.MsmsRunSummary.SearchSummary _searchSummary = null;

    //Strings of xml representing the structure before and after the feature content
    protected String _documentPrefix = null;
    protected String _documentPostscript = null;

    //encapsulates printing options for all fragments
    protected XmlOptions _optionsForPrinting = null;

    //Modification objects to populate the AminoAcidModifications and terminal mods
    protected MS2Modification[] _modifications = null;

    //String to begin all spectrum attributes with. Empty means "derive from the output filename in write()"
    protected String _spectrumBaseString = "";

    //source file's base name
    protected String _baseName = null;

    public static final String PRECURSOR_MASS_TYPE_MONOISOTOPIC = "monoisotopic";
    //precursor mass type. Must be an allowed value
    protected String _precursorMassType = PRECURSOR_MASS_TYPE_MONOISOTOPIC;

    public static final String SEARCH_ENGINE_XTANDEM_COMET = "X! Tandem (comet)";
    public static final String DEFAULT_SEARCH_ENGINE = SEARCH_ENGINE_XTANDEM_COMET;
    //search engine
    protected String _searchEngine = DEFAULT_SEARCH_ENGINE;

    //dhmay adding two strings that must be pasted into the output file.
    //These will need to be updated if schema version changes
    public static final String XSI_URL = "http://www.w3.org/2001/XMLSchema-instance";
    public static final String XSD_URL = "http://sashimi.sourceforge.net/schema_revision/pepXML/pepXML_v18.xsd";

    //name of the attribute that carries the search engine on the search_summary element
    private static final String SEARCH_ENGINE_ATTRIBUTE = "search_engine";
    //encoding declared in the XML prolog; the file is written with the same encoding
    private static final String OUTPUT_ENCODING = "UTF-8";

    /**
     * Constructor creates the XmlBeans representing the shell of a PepXml document
     * (root document, analysis node and a first run summary) and sets up the options
     * used for printing xml fragments.
     */
    public BasePepXmlWriter()
    {
        //Construct generic document structure
        _xmlBeansPepXmlDoc = MsmsPipelineAnalysisDocument.Factory.newInstance();
        _xmlBeansAnalysis = _xmlBeansPepXmlDoc.addNewMsmsPipelineAnalysis();

        //dhmay adding two attributes required for TPP to parse these files
        setAnalysisAttribute("xmlns:xsi", XSI_URL);
        setAnalysisAttribute("xsi:schemaLocation", XSD_URL);

        _firstRunSummary = _xmlBeansAnalysis.addNewMsmsRunSummary();
        _xmlBeansRunSummaryArray = _xmlBeansAnalysis.getMsmsRunSummaryArray();

        //set printing options for xml fragments
        _optionsForPrinting = new XmlOptions();
        _optionsForPrinting.setSaveOuter();
        _optionsForPrinting.setSavePrettyPrint();
        _optionsForPrinting.setSavePrettyPrintOffset(0);
    }

    /**
     * Create doc structure and remember the modifications to be declared in the output
     * @param modifications modifications to declare; may be null
     */
    public BasePepXmlWriter(MS2Modification[] modifications)
    {
        this();
        setModifications(modifications);
    }

    /**
     * Sets (or replaces) an attribute directly on the DOM node of the analysis element.
     * @param name attribute name, e.g. "xmlns:xsi"
     * @param value attribute value
     */
    private void setAnalysisAttribute(String name, String value)
    {
        Node analysisNode = _xmlBeansAnalysis.getDomNode();
        Attr attribute = analysisNode.getOwnerDocument().createAttribute(name);
        attribute.setValue(value);
        analysisNode.getAttributes().setNamedItem(attribute);
    }

    /**
     * Creates the search_engine attribute on the current search summary from _searchEngine.
     * Has to be done through the DOM because the schema isn't aware of any useful search engines.
     */
    private void createSearchEngineAttribute()
    {
        Node summaryNode = _searchSummary.getDomNode();
        Node engineAttribute = summaryNode.getOwnerDocument().createAttribute(SEARCH_ENGINE_ATTRIBUTE);
        engineAttribute.setNodeValue(_searchEngine);
        summaryNode.getAttributes().setNamedItem(engineAttribute);
    }

    /**
     * Returns the search summary of the first run summary, creating it on first use.
     * When the summary is first resolved, the current precursor mass type and search engine
     * are applied to it.
     * @return the (possibly newly created) search summary; never null
     */
    public MsmsPipelineAnalysisDocument.MsmsPipelineAnalysis.MsmsRunSummary.SearchSummary getSearchSummary()
    {
        if (_searchSummary != null)
            return _searchSummary;

        MsmsPipelineAnalysisDocument.MsmsPipelineAnalysis.MsmsRunSummary.SearchSummary[] existingSummaries =
                _firstRunSummary.getSearchSummaryArray();
        if (existingSummaries != null && existingSummaries.length > 0)
        {
            _searchSummary = existingSummaries[0];
        }
        else
        {
            _log.debug("Adding search summary");
            _searchSummary = _firstRunSummary.addNewSearchSummary();
        }

        _searchSummary.setPrecursorMassType(MassType.Enum.forString(_precursorMassType));
        createSearchEngineAttribute();
        return _searchSummary;
    }

    /**
     * setter for modifications
     * @param modifications modifications to declare in the search summary; may be null
     */
    public void setModifications(MS2Modification[] modifications)
    {
        _modifications = modifications;
    }

    /**
     * setter for basename
     * @param baseName base name applied to the run summary and search summary at write time
     */
    public void setBaseName(String baseName)
    {
        _baseName = baseName;
    }

    /**
     * Add modifications to the xml output.
     * We assume a reasonably small number of modifications. No need to write them each out
     * individually and then dispose of the java object.
     *
     * Modifications whose amino acid is "n" or "c" are declared as terminal modifications,
     * everything else as amino acid modifications. Every call appends new declarations, so
     * calling this more than once declares the modifications more than once.
     */
    protected void addModificationsToXML()
    {
        if (_modifications == null || _modifications.length == 0)
            return;

        for (MS2Modification modification : _modifications)
        {
            String aminoacid = modification.getAminoAcid();
            String variableFlag = modification.getVariable() ? "Y" : "N";

            if (aminoacid.equals("n") || aminoacid.equals("c"))
            {
                MsmsPipelineAnalysisDocument.MsmsPipelineAnalysis.MsmsRunSummary.SearchSummary.TerminalModification
                        xmlTerminalMod = getSearchSummary().addNewTerminalModification();
                xmlTerminalMod.setMass(modification.getMass());
                xmlTerminalMod.setMassdiff(Float.toString(modification.getMassDiff()));
                xmlTerminalMod.setVariable(variableFlag);
                xmlTerminalMod.setTerminus(aminoacid);
                //TODO: this is a hack. Not sure if it matters
                xmlTerminalMod.setProteinTerminus("N");
                continue;
            }

            MsmsPipelineAnalysisDocument.MsmsPipelineAnalysis.MsmsRunSummary.SearchSummary.AminoacidModification
                    xmlModification = getSearchSummary().addNewAminoacidModification();
            xmlModification.setAminoacid(aminoacid);
            xmlModification.setMassdiff(Float.toString(modification.getMassDiff()));
            //dhmay removing logic that set mass only if it were !=0, after finding Mascot
            //results with 0-mass modifications declared
            xmlModification.setMass(modification.getMass());
            xmlModification.setVariable(variableFlag);

            String symbol = modification.getSymbol();
            boolean hasUsableSymbol = modification.getVariable()
                    && symbol != null
                    && symbol.length() > 0
                    && !"'".equals(symbol);
            if (!hasUsableSymbol)
                continue;

            AaSymbolType.Enum xmlSymbol = AaSymbolType.Enum.forString(symbol);
            //problems with " as a symbol. xml doesn't like that. No time to fix right now
            //For now, just not setting it.
            //TODO: carry forward " as a symbol correctly
            if (xmlSymbol != null)
            {
                xmlModification.setSymbol(xmlSymbol);
                _log.debug("Adding symbol for mod on var " + modification.getAminoAcid() + ". getSymbol: " + modification.getSymbol() + ", xml Symbol: " + xmlSymbol);
            }
            else
            {
                _log.debug("Not adding symbol for null symbol. Var=" + modification.getAminoAcid() + ", input symbol=" + modification.getSymbol());
            }
        }
    }

    /**
     * Write out all features immediately
     * @param pw writer positioned between the document prefix and postscript
     */
    protected abstract void writeSpectrumQueries(PrintWriter pw);

    /**
     * Adds an AnalysisResult representing the peptide prophet score
     * @param searchHit search hit to attach the analysis result to
     * @param pprophet peptide prophet probability
     * @param allNttProb value of the all_ntt_prob attribute; if null, a triple repeating
     *        pprophet is generated
     * @param fval 0 is a sentinel value meaning absent
     */
    protected void addPeptideProphet(MsmsPipelineAnalysisDocument.MsmsPipelineAnalysis.MsmsRunSummary.SpectrumQuery.SearchResult.SearchHit searchHit,
                                     float pprophet, String allNttProb, float fval)
    {
        MsmsPipelineAnalysisDocument.MsmsPipelineAnalysis.MsmsRunSummary.SpectrumQuery.SearchResult.SearchHit.AnalysisResult analysisResult =
                searchHit.addNewAnalysisResult();
        analysisResult.setAnalysis("peptideprophet");

        //the peptideprophet_result content is built directly on the DOM
        Node analysisNode = analysisResult.getDomNode();
        Element resultElement = analysisNode.getOwnerDocument().createElement("peptideprophet_result");
        resultElement.setAttribute("probability", "" + pprophet);
        if (allNttProb == null)
            allNttProb = "(" + pprophet + "," + pprophet + "," + pprophet + ")";
        resultElement.setAttribute("all_ntt_prob", allNttProb);
        analysisNode.appendChild(resultElement);

        if (fval != 0)
        {
            Element scoreSummaryElement = analysisNode.getOwnerDocument().createElement("search_score_summary");
            resultElement.appendChild(scoreSummaryElement);
            Element parameterElement = analysisNode.getOwnerDocument().createElement("parameter");
            scoreSummaryElement.appendChild(parameterElement);
            parameterElement.setAttribute("name", "fval");
            parameterElement.setAttribute("value", "" + fval);
        }
    }

    /**
     * Add a search score. Doesn't make any attempt to prevent duplicates
     * @param searchHit search hit to add the score to
     * @param searchScoreName name of the score
     * @param searchScoreValue value of the score, as a string
     */
    protected void addSearchScore(MsmsPipelineAnalysisDocument.MsmsPipelineAnalysis.MsmsRunSummary.SpectrumQuery.SearchResult.SearchHit searchHit,
                                  String searchScoreName, String searchScoreValue)
    {
        NameValueType newSearchScore = searchHit.addNewSearchScore();
        newSearchScore.setName(searchScoreName);
        XmlAnySimpleType valueSimpleType = XmlAnySimpleType.Factory.newInstance();
        valueSimpleType.setStringValue(searchScoreValue);
        newSearchScore.setValue(valueSimpleType);
    }

    /**
     * Utility method to create a spectrumQuery, added to the first run summary
     * @param scanFirst start scan
     * @param scanLast end scan
     * @param charge assumed charge
     * @param index index of the spectrum query
     * @return the newly added spectrum query
     */
    protected MsmsPipelineAnalysisDocument.MsmsPipelineAnalysis.MsmsRunSummary.SpectrumQuery
    addSpectrumQuery(int scanFirst, int scanLast, int charge, int index)
    {
        MsmsPipelineAnalysisDocument.MsmsPipelineAnalysis.MsmsRunSummary.SpectrumQuery spectrumQuery =
                _firstRunSummary.addNewSpectrumQuery();
        spectrumQuery.setIndex(index);
        spectrumQuery.setStartScan(scanFirst);
        spectrumQuery.setEndScan(scanLast);
        spectrumQuery.setAssumedCharge(BigInteger.valueOf(charge));
        return spectrumQuery;
    }

    /**
     * Utility method for adding modified aminoacids and terminal modification masses to a
     * search hit. Does nothing if there are no per-residue modifications and both terminal
     * masses are 0. Otherwise a modification_info is added whose modified peptide string is
     * built as: optional "n[mass]", the peptide residues each optionally followed by
     * "[mass]", optional "c[mass]".
     *
     * @param searchHit search hit to annotate; its peptide must already be set
     * @param modifiedAminoAcids per-residue modification lists, indexed like the peptide
     *        sequence (entries may be null); may itself be null when only terminal
     *        modifications are present
     * @param nTermModMass n-terminal modification mass, 0 meaning absent
     * @param cTermModMass c-terminal modification mass, 0 meaning absent
     */
    protected void addModificationsToSearchHit(MsmsPipelineAnalysisDocument.MsmsPipelineAnalysis.MsmsRunSummary.SpectrumQuery.SearchResult.SearchHit searchHit,
                                               List<ModifiedAminoAcid>[] modifiedAminoAcids,
                                               float nTermModMass, float cTermModMass)
    {
        if (modifiedAminoAcids == null && nTermModMass == 0 && cTermModMass == 0)
            return;

        MsmsPipelineAnalysisDocument.MsmsPipelineAnalysis.MsmsRunSummary.SpectrumQuery.SearchResult.SearchHit.ModificationInfo
                xmlBeansModInfo = searchHit.addNewModificationInfo();
        String peptideSequence = searchHit.getPeptide();
        StringBuilder modifiedPeptide = new StringBuilder();

        if (nTermModMass != 0)
        {
            xmlBeansModInfo.setModNtermMass(nTermModMass);
            modifiedPeptide.append("n[").append((int) nTermModMass).append("]");
        }

        if (modifiedAminoAcids == null)
        {
            //only terminal modifications: the residues are carried over unmodified.
            //(Previously this case dereferenced the null array and threw.)
            if (peptideSequence != null)
                modifiedPeptide.append(peptideSequence);
        }
        else
        {
            //modifiedAminoAcids.length guaranteed == peptideSequence.length();
            for (int i = 0; i < modifiedAminoAcids.length; i++)
            {
                modifiedPeptide.append(peptideSequence.charAt(i));
                if (modifiedAminoAcids[i] == null)
                    continue;

                //there may be multiple modifications. If so, too bad. All of them are declared
                //as mod_aminoacid_mass, but only the highest modification mass is shown in the
                //modified peptide string
                double greatestModMass = 0f;
                for (ModifiedAminoAcid mod : modifiedAminoAcids[i])
                {
                    MsmsPipelineAnalysisDocument.MsmsPipelineAnalysis.MsmsRunSummary.SpectrumQuery.SearchResult.SearchHit.ModificationInfo.ModAminoacidMass
                            xmlBeansModAminoacidMass = xmlBeansModInfo.addNewModAminoacidMass();
                    //0-based to 1-based array indexing conversion
                    xmlBeansModAminoacidMass.setPosition(BigInteger.valueOf(i + 1));
                    xmlBeansModAminoacidMass.setMass(mod.getMass());
                    if (mod.getMass() > greatestModMass)
                        greatestModMass = mod.getMass();
                }
                if (greatestModMass > 0f)
                    modifiedPeptide.append("[").append(Math.round(greatestModMass)).append("]");
            }
        }

        //the c-terminal marker has to be appended BEFORE the modified peptide is stored,
        //otherwise it never makes it into the output
        if (cTermModMass != 0)
        {
            xmlBeansModInfo.setModCtermMass(cTermModMass);
            modifiedPeptide.append("c[").append((int) cTermModMass).append("]");
        }

        xmlBeansModInfo.setModifiedPeptide(modifiedPeptide.toString());
    }

    /**
     * Better only call once: every call adds another search_database to the search summary
     * @param databasePath local path of the search database
     */
    public void setSearchDatabase(String databasePath)
    {
        MsmsPipelineAnalysisDocument.MsmsPipelineAnalysis.MsmsRunSummary.SearchSummary.SearchDatabase searchDatabase =
                getSearchSummary().addNewSearchDatabase();
        searchDatabase.setLocalPath(databasePath);
    }

    /**
     * Better only call once: every call adds another enzymatic_search_constraint.
     * TODO: un-hardcode trypsin
     * @param maxCleavages maximum number of internal cleavages
     * @param minTermini minimum number of termini
     */
    public void setSearchConstraints(int maxCleavages, int minTermini)
    {
        MsmsPipelineAnalysisDocument.MsmsPipelineAnalysis.MsmsRunSummary.SearchSummary.EnzymaticSearchConstraint
                enzSearchConstraint = getSearchSummary().addNewEnzymaticSearchConstraint();
        enzSearchConstraint.setMaxNumInternalCleavages(BigInteger.valueOf(maxCleavages));
        enzSearchConstraint.setMinNumberTermini(BigInteger.valueOf(minTermini));
        enzSearchConstraint.setEnzyme("trypsin");
    }

    /**
     * Hook run at the start of write(). Applies the base name, if one was set, to the first
     * run summary and to the search summary.
     */
    protected void preWrite()
    {
        if (_baseName != null)
        {
            _firstRunSummary.setBaseName(_baseName);
            getSearchSummary().setBaseName(_baseName);
        }
    }

    /**
     * Derives the prefix for spectrum names from an output filename: everything up to the
     * /last/ ".", followed by ".". Exception: if the name ends with .pep.xml (any case),
     * "pep.xml" is left out as a whole.
     * @param fileName name of the output file, without directories
     * @return the derived base string, always ending in "."
     */
    private static String deriveSpectrumBaseString(String fileName)
    {
        String base = fileName;
        //special handling: if .pep.xml, change to .xml before removing extension
        if (base.toLowerCase().endsWith(".pep.xml"))
            base = base.substring(0, base.length() - 8) + ".xml";
        if (base.contains("."))
            base = base.substring(0, base.lastIndexOf("."));
        return base + ".";
    }

    /**
     * Write out the full document, with all modifications and features, to a file.
     *
     * The document shell is serialized once with a sentinel element inside the first run
     * summary; the text before the sentinel is written first, then the subclass writes its
     * spectrum queries, then the text after the sentinel is written. If the shell cannot be
     * split at the sentinel, an error is logged and nothing is written.
     *
     * The file is written as UTF-8, matching the encoding declared in the XML prolog, and
     * the writer is always closed.
     *
     * @param file file to write to
     * @throws IOException if the file can't be opened or an error occurs while writing
     */
    public void write(File file) throws IOException
    {
        preWrite();

        Node runSummaryNode = _firstRunSummary.getDomNode();

        //if there isn't an explicitly defined base string for all spectra, create it from
        //the output filename
        if ("".equals(_spectrumBaseString))
            _spectrumBaseString = deriveSpectrumBaseString(file.getName());

        //add required namespace attribute
        setAnalysisAttribute("xmlns", "http://regis-web.systemsbiology.net/pepXML");

        //add a sentinel node that tells us where to split the document to insert features and
        //modifications, which, conveniently, is the same place for both
        Node featureLocationNode = runSummaryNode.getOwnerDocument().createElement("SENTINEL_FEATURE_LOCATION");
        runSummaryNode.appendChild(featureLocationNode);

        addModificationsToXML();

        //create the xml that defines the document structure, then immediately remove our dummy
        //node so that it can't linger in the document if anything below fails
        String documentShell = _xmlBeansPepXmlDoc.xmlText(_optionsForPrinting);
        runSummaryNode.removeChild(featureLocationNode);

        //By default, namespace stuff will be written to every opening and closing tag. This gives
        //some "parsers" heartburn
        documentShell = documentShell.replace("<pep:", "<");
        documentShell = documentShell.replace("</pep:", "</");
        //Empty namespace attributes are created, and I don't know of a cleaner way to get rid of them
        //TODO: find a cleaner way to get rid of xmlns attrs on q3ratio_summary, peptideprophet_result
        documentShell = documentShell.replace("xmlns=\"\"", "");

        String[] halves = documentShell.split("<SENTINEL_FEATURE_LOCATION[^\\/]*\\/>");
        if (halves.length != 2)
        {
            _log.error("Failed to create document shell for writing");
            return;
        }
        _documentPrefix = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" + halves[0];
        _documentPostscript = halves[1];

        PrintWriter pw = new PrintWriter(file, OUTPUT_ENCODING);
        try
        {
            pw.print(_documentPrefix);
            writeSpectrumQueries(pw);
            pw.print(_documentPostscript);
            pw.flush();
            //PrintWriter swallows IOExceptions; surface them to the caller
            if (pw.checkError())
                throw new IOException("Error while writing pepXML file " + file.getAbsolutePath());
        }
        finally
        {
            pw.close();
        }
    }

    /**
     * @return the prefix used for spectrum names; "" until set explicitly or derived by write()
     */
    public String getSpectrumBaseString()
    {
        return _spectrumBaseString;
    }

    /**
     * @param spectrumBaseString prefix to use for spectrum names. If left as "", write()
     *        derives it from the output filename
     */
    public void setSpectrumBaseString(String spectrumBaseString)
    {
        this._spectrumBaseString = spectrumBaseString;
    }

    /**
     * @return the precursor mass type that is applied to the search summary
     */
    public String get_precursorMassType()
    {
        return _precursorMassType;
    }

    /**
     * Sets the precursor mass type. If the search summary already exists it is updated as
     * well, so the setting takes effect regardless of call order.
     * @param _precursorMassType precursor mass type. Must be an allowed value
     */
    public void set_precursorMassType(String _precursorMassType)
    {
        this._precursorMassType = _precursorMassType;
        if (_searchSummary != null)
            _searchSummary.setPrecursorMassType(MassType.Enum.forString(_precursorMassType));
    }

    /**
     * @return the search engine name that is written to the search summary
     */
    public String get_searchEngine()
    {
        return _searchEngine;
    }

    /**
     * Sets the search engine name. If the search summary already exists, its search_engine
     * attribute is updated (or created, should it be missing).
     * @param _searchEngine search engine name
     */
    public void set_searchEngine(String _searchEngine)
    {
        this._searchEngine = _searchEngine;
        if (_searchSummary == null)
            return;

        Node existingAttribute =
                _searchSummary.getDomNode().getAttributes().getNamedItem(SEARCH_ENGINE_ATTRIBUTE);
        if (existingAttribute != null)
            existingAttribute.setNodeValue(_searchEngine);
        else
            createSearchEngineAttribute();
    }
}