package au.com.acpfg.tpp;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import org.knime.base.node.io.filereader.DataCellFactory;
import org.knime.core.data.DataCell;
import org.knime.core.data.DataColumnSpec;
import org.knime.core.data.DataColumnSpecCreator;
import org.knime.core.data.DataRow;
import org.knime.core.data.DataTableSpec;
import org.knime.core.data.DataType;
import org.knime.core.data.DataValue;
import org.knime.core.data.RowKey;
import org.knime.core.data.collection.CollectionCellFactory;
import org.knime.core.data.collection.CollectionDataValue;
import org.knime.core.data.collection.ListCell;
import org.knime.core.data.collection.CollectionDataValue.CollectionUtilityFactory;
import org.knime.core.data.container.CellFactory;
import org.knime.core.data.def.DefaultRow;
import org.knime.core.data.def.DoubleCell;
import org.knime.core.data.def.IntCell;
import org.knime.core.data.def.StringCell;
import org.knime.core.node.BufferedDataContainer;
import org.knime.core.node.BufferedDataTable;
import org.knime.core.node.CanceledExecutionException;
import org.knime.core.node.defaultnodesettings.SettingsModelIntegerBounded;
import org.knime.core.node.defaultnodesettings.SettingsModelString;
import org.knime.core.node.ExecutionContext;
import org.knime.core.node.ExecutionMonitor;
import org.knime.core.node.InvalidSettingsException;
import org.knime.core.node.NodeLogger;
import org.knime.core.node.NodeModel;
import org.knime.core.node.NodeSettingsRO;
import org.knime.core.node.NodeSettingsWO;
/**
* This is the model implementation of PepXMLReader.
* Reads PepXML (as produced by the trans-proteomics pipeline) to enable processing of peptide/protein identifications and statistics using KNIME
*
* @author Andrew Cassin
*/
public class PepXMLReaderNodeModel extends NodeModel {
    // the logger instance
    private static final NodeLogger logger = NodeLogger
            .getLogger(PepXMLReaderNodeModel.class);

    static final String CFGKEY_FILE = "xml-filename";
    private static final String DEFAULT_FILE = "/tmp/peptide-identifications.pep.xml";

    /** Number of columns in the single output table. */
    private static final int NUM_COLUMNS = 18;

    /** How often (in rows) the parser polls the execution context for cancellation. */
    private static final int CANCEL_CHECK_INTERVAL = 200;

    private final SettingsModelString m_file = new SettingsModelString(CFGKEY_FILE, DEFAULT_FILE);

    /**
     * XML parsing state: true while the reader is positioned between the start and end
     * tags of a <code>search_score_summary</code> element.
     */
    private boolean m_xml_in_score_summary;

    /**
     * Constructor for the node model: no input ports, one output port.
     */
    protected PepXMLReaderNodeModel() {
        super(0, 1);
    }

    /**
     * Reads the configured file, checks that it is a pepXML document and emits one row
     * per <code>spectrum_query</code> element.
     *
     * {@inheritDoc}
     */
    @Override
    protected BufferedDataTable[] execute(final BufferedDataTable[] inData,
            final ExecutionContext exec) throws Exception {
        // the execution context will provide us with storage capacity, in this
        // case a data container to which we will add rows sequentially
        BufferedDataContainer container = exec.createDataContainer(createOutputSpec());

        final String filename = m_file.getStringValue();
        FileInputStream fis = null;
        XMLStreamReader xsr = null;
        try {
            fis = new FileInputStream(filename);
            XMLInputFactory factory = XMLInputFactory.newInstance();
            // the file is user supplied: do not resolve external entities while parsing
            factory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);
            xsr = factory.createXMLStreamReader(fis);
            // constant-first comparison: getXMLFileType() returns null for unknown documents
            if (!"pepXML".equals(getXMLFileType(xsr))) {
                throw new IOException("PepXML document expected, but not in a compatible format. Aborting.");
            }
            parsePepXML(xsr, exec, container);
        } catch (CanceledExecutionException ce) {
            // user cancellation is not an error, so it is not logged as one
            throw ce;
        } catch (Exception e) {
            logger.error("Unable to process Pep/ProtXML file: " + filename, e);
            throw e;
        } finally {
            // XMLStreamReader.close() does not close the underlying stream, so close both
            if (xsr != null) {
                try {
                    xsr.close();
                } catch (XMLStreamException ignored) {
                    logger.debug("Failed to close XML reader for " + filename);
                }
            }
            if (fis != null) {
                try {
                    fis.close();
                } catch (IOException ignored) {
                    logger.debug("Failed to close input stream for " + filename);
                }
            }
        }

        // once we are done, we close the container and return its table
        container.close();
        BufferedDataTable out = container.getTable();
        return new BufferedDataTable[]{out};
    }

    /**
     * Builds the spec of the single output table. The column order here must match the
     * cell order produced by {@link #createCells(pepXML)}.
     *
     * @return the output table spec (18 columns)
     */
    private static DataTableSpec createOutputSpec() {
        DataType doubleList = DataType.getType(ListCell.class, DoubleCell.TYPE);
        DataColumnSpec[] allColSpecs = new DataColumnSpec[NUM_COLUMNS];
        allColSpecs[0] = new DataColumnSpecCreator("Filename", StringCell.TYPE).createSpec();
        allColSpecs[1] = new DataColumnSpecCreator("Spectrum ID", StringCell.TYPE).createSpec();
        allColSpecs[2] = new DataColumnSpecCreator("Mass", DoubleCell.TYPE).createSpec();
        allColSpecs[3] = new DataColumnSpecCreator("Charge", IntCell.TYPE).createSpec();
        allColSpecs[4] = new DataColumnSpecCreator("Peptide Sequence", StringCell.TYPE).createSpec();
        allColSpecs[5] = new DataColumnSpecCreator("Previous AA", StringCell.TYPE).createSpec();
        allColSpecs[6] = new DataColumnSpecCreator("Next AA", StringCell.TYPE).createSpec();
        allColSpecs[7] = new DataColumnSpecCreator("Modified Peptide Sequence", StringCell.TYPE).createSpec();
        allColSpecs[8] = new DataColumnSpecCreator("Peptide Identification Probability", DoubleCell.TYPE).createSpec();
        allColSpecs[9] = new DataColumnSpecCreator("X!Tandem Scores (expect,hyper,next)", doubleList).createSpec();
        allColSpecs[10] = new DataColumnSpecCreator("Mascot Scores (expect,homology,identity,ion,star)", doubleList).createSpec();
        allColSpecs[11] = new DataColumnSpecCreator("Sequest Scores (xcorr,spscore,sprank,deltacn,deltacnstar)", doubleList).createSpec();
        allColSpecs[12] = new DataColumnSpecCreator("PeptideProphet Scores (fval,ntt,nmc,massd)", doubleList).createSpec();
        allColSpecs[13] = new DataColumnSpecCreator("Protein Accession(s)", StringCell.TYPE).createSpec();
        allColSpecs[14] = new DataColumnSpecCreator("Protein Description(s)", StringCell.TYPE).createSpec();
        allColSpecs[15] = new DataColumnSpecCreator("Matched Ions", IntCell.TYPE).createSpec();
        allColSpecs[16] = new DataColumnSpecCreator("Total Ions", IntCell.TYPE).createSpec();
        allColSpecs[17] = new DataColumnSpecCreator("Hit Rank", IntCell.TYPE).createSpec();
        return new DataTableSpec(allColSpecs);
    }

    /**
     * Clears the XML parsing state so a re-execution starts from a clean slate.
     *
     * {@inheritDoc}
     */
    @Override
    protected void reset() {
        m_xml_in_score_summary = false;
    }

    /**
     * The output structure does not depend on the input file, so the full spec is
     * returned here and is identical to the one used by
     * {@link #execute(BufferedDataTable[], ExecutionContext)}.
     *
     * {@inheritDoc}
     */
    @Override
    protected DataTableSpec[] configure(final DataTableSpec[] inSpecs)
            throws InvalidSettingsException {
        return new DataTableSpec[]{createOutputSpec()};
    }

    /**
     * {@inheritDoc}
     */
    @Override
    protected void saveSettingsTo(final NodeSettingsWO settings) {
        m_file.saveSettingsTo(settings);
    }

    /**
     * {@inheritDoc}
     */
    @Override
    protected void loadValidatedSettingsFrom(final NodeSettingsRO settings)
            throws InvalidSettingsException {
        m_file.loadSettingsFrom(settings);
    }

    /**
     * {@inheritDoc}
     */
    @Override
    protected void validateSettings(final NodeSettingsRO settings)
            throws InvalidSettingsException {
        m_file.validateSettings(settings);
    }

    /**
     * {@inheritDoc}
     */
    @Override
    protected void loadInternals(final File internDir,
            final ExecutionMonitor exec) throws IOException,
            CanceledExecutionException {
        // This node keeps no internals beyond its settings and output table.
    }

    /**
     * {@inheritDoc}
     */
    @Override
    protected void saveInternals(final File internDir,
            final ExecutionMonitor exec) throws IOException,
            CanceledExecutionException {
        // This node keeps no internals beyond its settings and output table.
    }

    /**
     * Consumes the remaining events of the given reader and appends one row to
     * <code>cont</code> for every completed <code>spectrum_query</code> element.
     * Elements that describe a peptide-to-spectrum match but appear outside of any
     * <code>spectrum_query</code> are ignored.
     *
     * @param xmlStreamReader reader to consume; it is not closed by this method
     * @param exec used to poll for cancellation every {@value #CANCEL_CHECK_INTERVAL} rows
     * @param cont container that receives the rows; it is not closed by this method
     * @throws XMLStreamException if the underlying reader reports a parsing problem
     * @throws CanceledExecutionException if the user cancelled execution
     */
    public void parsePepXML(XMLStreamReader xmlStreamReader, ExecutionContext exec, BufferedDataContainer cont) throws XMLStreamException, CanceledExecutionException {
        // never inherit state from an earlier (possibly aborted) parse
        m_xml_in_score_summary = false;

        pepXML curPSM = null; // current peptide-to-spectrum match
        int row = 1;
        while (xmlStreamReader.hasNext()) {
            int event = xmlStreamReader.next();
            if (event == XMLStreamConstants.START_ELEMENT) {
                String elementName = xmlStreamReader.getLocalName();
                if ("peptideprophet_summary".equals(elementName)) {
                    // kept from the original implementation: skip the event following this tag
                    xmlStreamReader.next();
                } else if ("spectrum_query".equals(elementName)) { // new peptide record starts
                    curPSM = new pepXML(m_file.getStringValue());
                    curPSM.parse_pepXML_line(xmlStreamReader);
                } else if ("search_score_summary".equals(elementName)) {
                    m_xml_in_score_summary = true;
                } else if (curPSM != null) {
                    // everything below belongs to the currently open spectrum_query
                    if ("search_hit".equals(elementName)) {
                        curPSM.parse_pepXML_line(xmlStreamReader);
                    } else if ("mod_aminoacid_mass".equals(elementName)) {
                        curPSM.record_AA_mod(xmlStreamReader);
                    } else if ("search_score".equals(elementName)) {
                        curPSM.parse_search_score_line(xmlStreamReader);
                    } else if ("peptideprophet_result".equals(elementName)) {
                        curPSM.record_iniProb(xmlStreamReader);
                    } else if ("parameter".equals(elementName) && m_xml_in_score_summary) {
                        curPSM.record_peptideprophet_scores(xmlStreamReader);
                    }
                }
            } else if (event == XMLStreamConstants.END_ELEMENT) {
                String elementName = xmlStreamReader.getLocalName();
                if ("spectrum_query".equals(elementName)) { // end of peptide record
                    if (curPSM != null) {
                        curPSM.annotate_modPeptide();
                        DataCell[] cells = createCells(curPSM);
                        if (row % CANCEL_CHECK_INTERVAL == 0) {
                            exec.checkCanceled();
                        }
                        cont.addRowToTable(new DefaultRow("Hit" + row, cells));
                        row++;
                        curPSM = null;
                    }
                } else if ("search_score_summary".equals(elementName)) {
                    m_xml_in_score_summary = false;
                }
            }
        }
    }

    /**
     * Converts a completed peptide-to-spectrum match into the cells of one output row,
     * in the column order defined by {@link #createOutputSpec()}.
     *
     * @param psm the completed match, must not be null
     * @return array of {@value #NUM_COLUMNS} cells
     */
    private DataCell[] createCells(pepXML psm) {
        DataCell[] cells = new DataCell[NUM_COLUMNS];
        cells[0] = new StringCell(psm.getFilename());
        cells[1] = new StringCell(psm.getSpecId());
        cells[2] = new DoubleCell(psm.getMass());
        cells[3] = new IntCell(psm.getCharge());
        cells[4] = new StringCell(psm.getPeptide());
        // String.valueOf instead of a shared StringBuffer + setCharAt(0, ...): the latter
        // fails on an empty buffer and mixes the two residues if one is longer than one char
        cells[5] = new StringCell(String.valueOf(psm.getPrevAA()));
        cells[6] = new StringCell(String.valueOf(psm.getNextAA()));
        cells[7] = new StringCell(psm.getModPeptide());
        cells[8] = new DoubleCell(psm.getIniProb());
        // NOTE(review): X!Tandem and Sequest columns are always missing and Mascot scores are
        // always emitted; getXTandemScores()/getSequestScores() exist but are not wired in.
        // Confirm how pepXML exposes the search engine before changing this.
        cells[9] = DataType.getMissingCell();
        cells[10] = getMascotScores(psm);
        cells[11] = DataType.getMissingCell();
        cells[12] = getPeptideProphetScores(psm);
        cells[13] = new StringCell(psm.getProteinIds());
        cells[14] = psm.hasProteinDescr() ? new StringCell(psm.getProteinDescr()) : DataType.getMissingCell();
        cells[15] = new IntCell(psm.getMatchedIons());
        cells[16] = new IntCell(psm.getTotalIons());
        cells[17] = new IntCell(psm.hitRank());
        return cells;
    }

    /**
     * Builds the list cell holding the Mascot scores of the given match, in the order
     * expect, homology, identity, ion, star.
     *
     * @param i the match to read the scores from, must not be null
     * @return list cell with five double cells
     */
    protected ListCell getMascotScores(pepXML i) {
        ArrayList<DoubleCell> scores = new ArrayList<DoubleCell>();
        scores.add(new DoubleCell(i.getMascot_expect()));
        scores.add(new DoubleCell(i.getMascot_homologyscore()));
        scores.add(new DoubleCell(i.getMascot_identityscore()));
        scores.add(new DoubleCell(i.getMascot_ionscore()));
        scores.add(new DoubleCell(i.getMascot_star()));
        return CollectionCellFactory.createListCell(scores);
    }

    /**
     * Builds the list cell holding the X!Tandem scores of the given match, in the order
     * expect, hyperscore, nextscore.
     *
     * @param i the match to read the scores from, must not be null
     * @return list cell with three double cells
     */
    protected ListCell getXTandemScores(pepXML i) {
        ArrayList<DoubleCell> scores = new ArrayList<DoubleCell>();
        scores.add(new DoubleCell(i.getXtandem_expect()));
        scores.add(new DoubleCell(i.getHyperscore()));
        scores.add(new DoubleCell(i.getNextscore()));
        return CollectionCellFactory.createListCell(scores);
    }

    /**
     * Builds the list cell holding the Sequest scores of the given match, in the order
     * xcorr, spscore, sprank, deltacn, deltacnstar.
     *
     * @param i the match to read the scores from, must not be null
     * @return list cell with five double cells
     */
    protected ListCell getSequestScores(pepXML i) {
        ArrayList<DoubleCell> scores = new ArrayList<DoubleCell>();
        scores.add(new DoubleCell(i.getSequest_xcorr()));
        scores.add(new DoubleCell(i.getSequest_spscore()));
        scores.add(new DoubleCell(i.getSequest_sprank()));
        scores.add(new DoubleCell(i.getSequest_deltacn()));
        scores.add(new DoubleCell(i.getSequest_deltacnstar()));
        return CollectionCellFactory.createListCell(scores);
    }

    /**
     * Builds the list cell holding the PeptideProphet scores of the given match, in the
     * order fval, ntt, nmc, massd.
     *
     * @param i the match to read the scores from, must not be null
     * @return list cell with four double cells
     */
    protected ListCell getPeptideProphetScores(pepXML i) {
        ArrayList<DoubleCell> scores = new ArrayList<DoubleCell>();
        scores.add(new DoubleCell(i.getPP_fval()));
        scores.add(new DoubleCell(i.getPP_ntt()));
        scores.add(new DoubleCell(i.getPP_nmc()));
        scores.add(new DoubleCell(i.getPP_massd()));
        return CollectionCellFactory.createListCell(scores);
    }

    /**
     * Advances the reader until the first element that identifies the document type.
     * The reader is left positioned on that element.
     *
     * @param xmlStreamReader reader to advance
     * @return "pepXML" if a <code>peptideprophet_summary</code> element is found first,
     *         "protXML" if a <code>protein_summary</code> element is found first, or
     *         <code>null</code> if the document ends without either
     * @throws XMLStreamException if the underlying reader reports a parsing problem
     */
    private String getXMLFileType(XMLStreamReader xmlStreamReader) throws XMLStreamException {
        while (xmlStreamReader.hasNext()) {
            int event = xmlStreamReader.next();
            if (event == XMLStreamConstants.START_ELEMENT) {
                String elementName = xmlStreamReader.getLocalName();
                if ("peptideprophet_summary".equals(elementName)) {
                    return "pepXML";
                }
                if ("protein_summary".equals(elementName)) {
                    return "protXML";
                }
            }
        }
        return null;
    }
}