package lssminer; import java.io.File; import java.io.IOException; import org.knime.core.data.DataCell; import org.knime.core.data.DataColumnSpec; import org.knime.core.data.DataColumnSpecCreator; import org.knime.core.data.DataRow; import org.knime.core.data.DataTableSpec; import org.knime.core.data.RowIterator; import org.knime.core.data.RowKey; import org.knime.core.data.def.DefaultRow; import org.knime.core.data.def.IntCell; import org.knime.core.data.def.StringCell; import org.knime.core.node.BufferedDataContainer; import org.knime.core.node.BufferedDataTable; import org.knime.core.node.CanceledExecutionException; import org.knime.core.node.ExecutionContext; import org.knime.core.node.ExecutionMonitor; import org.knime.core.node.InvalidSettingsException; import org.knime.core.node.NodeModel; import org.knime.core.node.NodeSettingsRO; import org.knime.core.node.NodeSettingsWO; import org.knime.core.node.defaultnodesettings.SettingsModelBoolean; import org.knime.core.node.defaultnodesettings.SettingsModelIntegerBounded; import org.knime.core.node.defaultnodesettings.SettingsModelString; /** * This is the model implementation of LSSMiner. * The Longest Shared Sequence Miner looks for the longest subsequence of the test data inside the training sequences. * * @author Manuel Wildner */ public class LSSMinerNodeModel extends NodeModel { /** * The settings models for the dialog components to handle user settings. */ private SettingsModelString m_testSeqColumnSelection = createTestSeqColumnModel(); private SettingsModelString m_trainingSeqColumnSelection = createTrainingSeqColumnModel(); private SettingsModelIntegerBounded m_maxTestGapSelection = createMaxTestGapModel(); private SettingsModelIntegerBounded m_maxTrainGapSelection = createMaxTrainGapModel(); private SettingsModelIntegerBounded m_minSeqLengthSelection = createMinSeqLengthGapModel(); private SettingsModelIntegerBounded m_maxSeqLengthVariationSelection = createMaxSeqLengthVariationGapModel(); private SettingsModelBoolean m_appendSharedSeqLength = createAppendSharedSeqLengthModel(); private SettingsModelBoolean m_appendSharedSeq = createAppendSharedSeqModel(); private int maxTestGap = 1; private int maxTrainGap = 5; // TODO write a settingsmodel for it private int minSeqLength = 5; private int maxSeqLengthVariation = 1; private boolean appendSeqLength = true; private boolean appendSeq = false; /** * Constructor for the node model. */ protected LSSMinerNodeModel() { super(2, 1); } /** * {@inheritDoc} */ @Override protected BufferedDataTable[] execute(final BufferedDataTable[] inData, final ExecutionContext exec) throws Exception { if (inData == null || inData[0] == null || inData[1] == null) { return inData; } // stores meta data about the tables DataTableSpec inDataSpec0 = inData[0].getDataTableSpec(); // test data int rowNum0 = inData[0].getRowCount(); DataTableSpec inDataSpec1 = inData[1].getDataTableSpec(); // training data int rowNum1 = inData[1].getRowCount(); /* * store the positions of needed columns. */ int seqColPos0 = inDataSpec0.findColumnIndex(m_testSeqColumnSelection .getStringValue()); int seqColPos1 = inDataSpec1.findColumnIndex(m_trainingSeqColumnSelection .getStringValue()); /* * update parameters which are specified by the user */ maxTestGap = m_maxTestGapSelection.getIntValue(); maxTrainGap = m_maxTrainGapSelection.getIntValue(); minSeqLength = m_minSeqLengthSelection.getIntValue(); maxSeqLengthVariation = m_maxSeqLengthVariationSelection.getIntValue(); appendSeqLength = m_appendSharedSeqLength.getBooleanValue(); appendSeq = m_appendSharedSeq.getBooleanValue(); int numOutColumns; if (appendSeq ^ appendSeqLength) { numOutColumns = 3; } else if(appendSeq && appendSeqLength) { numOutColumns = 4; } else { numOutColumns = 2; } RowIterator rowIter1 = inData[1].iterator(); // the structure of the output table DataColumnSpec[] allColSpecs = new DataColumnSpec[numOutColumns]; allColSpecs[0] = new DataColumnSpecCreator("RowID(Training)", IntCell.TYPE) .createSpec(); allColSpecs[1] = new DataColumnSpecCreator("RowID(Test)", IntCell.TYPE) .createSpec(); if(appendSeqLength) { allColSpecs[2] = new DataColumnSpecCreator("SharedSeqLength", IntCell.TYPE) .createSpec(); } if(appendSeq) { if(appendSeqLength) { allColSpecs[3] = new DataColumnSpecCreator("SharedSeqence", StringCell.TYPE) .createSpec(); } else { allColSpecs[2] = new DataColumnSpecCreator("SharedSeqence", StringCell.TYPE) .createSpec(); } } DataTableSpec outputSpec = new DataTableSpec(allColSpecs); // stores the current longest shared sequence length int[] lssLength = new int[rowNum0]; // look for sequences in the training data BufferedDataContainer container = exec.createDataContainer(outputSpec); int rowCountOut = 0; while(rowIter1.hasNext()) { DataRow row1 = rowIter1.next(); String[] trainingTokens = ((StringCell) (row1 .getCell(seqColPos1))).getStringValue().split(","); int rowCountTest = 0; RowIterator rowIter0 = inData[0].iterator(); while(rowIter0.hasNext()) { DataRow row0 = rowIter0.next(); String[] testTokens = ((StringCell) (row0 .getCell(seqColPos0))).getStringValue().split(","); int testPointer = testTokens.length - 1; int foundCount = 0; int trainGapCount = 0; StringBuilder stringBuilder = new StringBuilder(); for (int i = trainingTokens.length - 1; i >= 0; i--) { if (trainGapCount <= maxTrainGap) { int currFoundCount = foundCount; int testLoopRange = (foundCount > 0) ? maxTestGap : testTokens.length - 1; for (int j = 0; j <= testLoopRange; j++) { if (testPointer - j >= 0) { if (trainingTokens[i] .equals(testTokens[testPointer - j])) { if (appendSeq) { stringBuilder.insert(0, trainingTokens[i] + ","); } foundCount++; trainGapCount = 0; testPointer -= (j + 1); break; } } else { break; } } if (foundCount > currFoundCount) { if (foundCount > 0) { trainGapCount++; } } } else { break; } } if(foundCount >= minSeqLength && foundCount >= lssLength[rowCountTest] - maxSeqLengthVariation) { if (foundCount > lssLength[rowCountTest]) { lssLength[rowCountTest] = foundCount; } /* * Create the new row */ RowKey key = new RowKey("Row" + rowCountOut); int rowNumberTrain = Integer.parseInt(row1.getKey().getString().substring(3)); // the cells of the current row, the types of the cells must // match the column spec (see above) DataCell[] cells = new DataCell[numOutColumns]; cells[0] = new IntCell(rowNumberTrain); cells[1] = new IntCell(Integer.parseInt(row0.getKey().getString().substring(1))); if(appendSeqLength) { cells[2] = new IntCell(foundCount); } if(appendSeq) { stringBuilder.delete(stringBuilder.length() - 2, stringBuilder.length()); // stringBuilder.reverse(); if(appendSeqLength) { cells[3] = new StringCell(stringBuilder.toString()); } else { cells[2] = new StringCell(stringBuilder.toString()); } } DataRow row = new DefaultRow(key, cells); container.addRowToTable(row); // check if the execution monitor was canceled exec.checkCanceled(); exec.setProgress(rowNumberTrain / (double) rowNum1, "Adding row " + rowCountOut); rowCountOut++; } rowCountTest++; } } container.close(); BufferedDataTable out = container.getTable(); return new BufferedDataTable[] { out }; } /** * {@inheritDoc} */ @Override protected void reset() { // TODO: generated method stub } /** * {@inheritDoc} */ @Override protected DataTableSpec[] configure(final DataTableSpec[] inSpecs) throws InvalidSettingsException { // TODO: generated method stub return new DataTableSpec[]{null}; } /** * {@inheritDoc} */ @Override protected void saveSettingsTo(final NodeSettingsWO settings) { m_testSeqColumnSelection.saveSettingsTo(settings); m_trainingSeqColumnSelection.saveSettingsTo(settings); m_maxTestGapSelection.saveSettingsTo(settings); m_maxTrainGapSelection.saveSettingsTo(settings); m_minSeqLengthSelection.saveSettingsTo(settings); m_maxSeqLengthVariationSelection.saveSettingsTo(settings); m_appendSharedSeqLength.saveSettingsTo(settings); m_appendSharedSeq.saveSettingsTo(settings); } /** * {@inheritDoc} */ @Override protected void loadValidatedSettingsFrom(final NodeSettingsRO settings) throws InvalidSettingsException { m_testSeqColumnSelection.loadSettingsFrom(settings); m_trainingSeqColumnSelection.loadSettingsFrom(settings); m_maxTestGapSelection.loadSettingsFrom(settings); m_maxTrainGapSelection.loadSettingsFrom(settings); m_minSeqLengthSelection.loadSettingsFrom(settings); m_maxSeqLengthVariationSelection.loadSettingsFrom(settings); m_appendSharedSeqLength.loadSettingsFrom(settings); m_appendSharedSeq.loadSettingsFrom(settings); } /** * {@inheritDoc} */ @Override protected void validateSettings(final NodeSettingsRO settings) throws InvalidSettingsException { m_testSeqColumnSelection.validateSettings(settings); m_trainingSeqColumnSelection.validateSettings(settings); m_maxTestGapSelection.validateSettings(settings); m_maxTrainGapSelection.validateSettings(settings); m_minSeqLengthSelection.validateSettings(settings); m_maxSeqLengthVariationSelection.validateSettings(settings); m_appendSharedSeqLength.validateSettings(settings); m_appendSharedSeq.validateSettings(settings); } /** * {@inheritDoc} */ @Override protected void loadInternals(final File internDir, final ExecutionMonitor exec) throws IOException, CanceledExecutionException { // TODO: generated method stub } /** * {@inheritDoc} */ @Override protected void saveInternals(final File internDir, final ExecutionMonitor exec) throws IOException, CanceledExecutionException { // TODO: generated method stub } /** * Creation of the different Settings Models to communicate with the node * dialog */ protected static SettingsModelString createTestSeqColumnModel() { return new SettingsModelString("test_seq_column_selection", "POLYLINE"); } protected static SettingsModelString createTrainingSeqColumnModel() { return new SettingsModelString("training_seq_column_selection", "POLYLINE"); } protected static SettingsModelIntegerBounded createMaxTestGapModel() { return new SettingsModelIntegerBounded("max_test_gap_selection", 1, 0, 100000); } protected static SettingsModelIntegerBounded createMaxTrainGapModel() { return new SettingsModelIntegerBounded("max_train_gap_selection", 5, 0, 100000); } protected static SettingsModelIntegerBounded createMinSeqLengthGapModel() { return new SettingsModelIntegerBounded("min_seq_length_selection", 5, 1, 100000); } protected static SettingsModelIntegerBounded createMaxSeqLengthVariationGapModel() { return new SettingsModelIntegerBounded("max_seq_length_variation_selection", 1, 0, 100000); } protected static SettingsModelBoolean createAppendSharedSeqLengthModel() { return new SettingsModelBoolean("append_shared_seq_length", true); } protected static SettingsModelBoolean createAppendSharedSeqModel() { return new SettingsModelBoolean("append_shared_seq", false); } }