package de.tud.inf.operator.mm;

import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;

import Jama.EigenvalueDecomposition;
import Jama.Matrix;

import com.rapidminer.example.Attribute;
import com.rapidminer.example.Attributes;
import com.rapidminer.example.Example;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.example.table.AttributeFactory;
import com.rapidminer.example.table.DataRowFactory;
import com.rapidminer.example.table.MemoryExampleTable;
import com.rapidminer.operator.IOContainer;
import com.rapidminer.operator.IOObject;
import com.rapidminer.operator.Operator;
import com.rapidminer.operator.OperatorCreationException;
import com.rapidminer.operator.OperatorDescription;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.UserError;
import com.rapidminer.operator.learner.clustering.clusterer.KMeans;
import com.rapidminer.operator.preprocessing.IdTagging;
import com.rapidminer.tools.Ontology;
import com.rapidminer.tools.OperatorService;
import com.rapidminer.tools.att.AttributeSet;

import de.tud.inf.operator.mm.util.ClusteringInfo;
import de.tud.inf.operator.mm.util.MetaConfig;

/**
 * This class implements the Cluster-and-Select strategy: the clusterings of the input example set
 * are themselves spectrally clustered (based on their pairwise NMI values), and from each cluster
 * the clustering with the highest SNMI is selected.
 *
 * <p>See <a href="http://www.siam.org/proceedings/datamining/2008/dm08_71_fern.pdf">the SDM'08
 * paper</a> for the algorithm.</p>
 *
 * <p>To avoid instability caused by the mantissa of IEEE 754 doubles
 * (<a href="http://en.wikipedia.org/wiki/IEEE_754-1985">IEEE 754-1985</a>), the entries of the
 * matrix L are rounded to a fixed number of decimal places ({@link #DECIMAL_PLACES}).</p>
 *
 * @version $Revision$
 * @author Andre Jaehnig
 */
public class CASSelector extends AbstractSelector {

    /************************************************************************************************
     * FIELDS
     ***********************************************************************************************/

    /** Number of decimal places kept when rounding the entries of the matrix L. */
    private static final int DECIMAL_PLACES = 10;

    /** Column name for the cluster of this selector. */
    private static final String CAS_COLUMN_NAME_CLUSTER = "cas_cluster";

    /** Column name for the indicator which clustering is selected. */
    private static final String CAS_COLUMN_NAME_SELECTED = "cas_selected";

    /** Dummy column name. */
    private static final String WORKING_COLUMN_NAME = "working";

    /************************************************************************************************
     * GETTER & SETTER
     ***********************************************************************************************/

    /*
     * none
     */

    /************************************************************************************************
     * CONSTRUCTOR
     ***********************************************************************************************/

    /**
     * Constructor.
     *
     * @param description the operator description provided by RapidMiner
     */
    public CASSelector(OperatorDescription description) {
        super(description);
    }

    /************************************************************************************************
     * PUBLIC METHODS
     ***********************************************************************************************/

    /*
     * (non-Javadoc)
     *
     * @see com.rapidminer.operator.Operator#apply()
     */
    @Override
    public IOObject[] apply() throws OperatorException {
        double decimalMultiplier = Math.pow(10.0d, DECIMAL_PLACES);
        this.logNote("Number of used decimal places: " + DECIMAL_PLACES);

        // get example set with the clusterings
        ExampleSet exampleSet = this.getInput(ExampleSet.class);
        int exampleSetSize = exampleSet.size();
        this.logNote("Input example-set has " + exampleSetSize + " elements.");

        // get parameters
        String metaFileName = this.getParameterAsString(PARAMETER_META_FILENAME);
        String selectorFileName = this.getParameterAsString(PARAMETER_SELECTOR_FILENAME);
        MetaConfig mc = MetaConfig.load(metaFileName);
        String snmiColumnName = mc.getSnmiColumnName();
        String clusterColumnPrefix = mc.getClusteringColumnPrefix();
        int sampleSize = this.getParameterAsInt(PARAMETER_SAMPLE_SIZE);
        if (sampleSize < 1 || sampleSize > exampleSetSize) {
            throw new UserError(this, 116, new Object[] { PARAMETER_SAMPLE_SIZE, sampleSize });
        }
        this.logNote("Requested clustering sample size: " + sampleSize);

        // create attributes for the selection flag and for the cluster-assignment
        Attribute casClusterAttr =
                AttributeFactory.createAttribute(CAS_COLUMN_NAME_CLUSTER, Ontology.NOMINAL);
        Attribute casSelectedAttr =
                AttributeFactory.createAttribute(CAS_COLUMN_NAME_SELECTED, Ontology.NOMINAL);
        exampleSet.getExampleTable().addAttribute(casClusterAttr);
        exampleSet.getExampleTable().addAttribute(casSelectedAttr);

        // add attribute to view
        exampleSet.getAttributes().setSpecialAttribute(casClusterAttr, CAS_COLUMN_NAME_CLUSTER);
        exampleSet.getAttributes().setSpecialAttribute(casSelectedAttr, CAS_COLUMN_NAME_SELECTED);

        // add dummy attribute-columns (only to the table, not to the view)
        Attribute workingSNMIAttr =
                AttributeFactory.createAttribute(WORKING_COLUMN_NAME + "1", Ontology.REAL);
        Attribute workingNMIAttr =
                AttributeFactory.createAttribute(WORKING_COLUMN_NAME + "2", Ontology.REAL);
        Attribute workingSumAttr =
                AttributeFactory.createAttribute(WORKING_COLUMN_NAME + "3", Ontology.REAL);
        exampleSet.getExampleTable().addAttribute(workingSNMIAttr);
        exampleSet.getExampleTable().addAttribute(workingNMIAttr);
        exampleSet.getExampleTable().addAttribute(workingSumAttr);

        // get SNMI attribute
        Attribute snmiAttr = exampleSet.getAttributes().get(snmiColumnName);
        if (snmiAttr == null) {
            throw new UserError(this, 111, snmiColumnName);
        }

        // spectral part: L = D^(-1/2) A D^(-1/2), top eigenvectors, row-normalization
        Matrix l = this.buildLMatrix(exampleSet, snmiAttr, clusterColumnPrefix, decimalMultiplier);
        Matrix x = this.selectEigenvectorMatrix(l, sampleSize);
        Matrix y = normalizeRows(x);

        // treating each row in Y as a point, cluster the rows via k-means
        ExampleSet kmeansExampleSet = this.clusterEigenvectorRows(y, sampleSize);

        // copy the cluster column to the output example set and, per cluster, remember the index of
        // the row with the highest SNMI (the "select" step of cluster-and-select)
        Attribute kmeansClusterAttr =
                kmeansExampleSet.getAttributes().getSpecial(Attributes.CLUSTER_NAME);
        Map<String, Integer> selectedIndices = new HashMap<String, Integer>();
        Iterator<Example> it = kmeansExampleSet.iterator();
        int counter = 0;
        while (it.hasNext()) {
            Example example = it.next();
            String clusterValue = example.getNominalValue(kmeansClusterAttr);
            Example outputExample = exampleSet.getExample(counter);
            outputExample.setValue(casClusterAttr, clusterValue);
            // keep the index of the best (max SNMI) representative of this cluster
            Integer oldIdx = selectedIndices.get(clusterValue);
            if (oldIdx == null
                    || outputExample.getValue(snmiAttr) > exampleSet.getExample(oldIdx).getValue(snmiAttr)) {
                selectedIndices.put(clusterValue, counter);
            }
            counter++;
        }

        // mark the selected clusterings at the output example set
        counter = 0;
        for (Example example : exampleSet) {
            if (selectedIndices.get(example.getNominalValue(casClusterAttr)) == counter) {
                example.setValue(casSelectedAttr, "true");
            } else {
                example.setValue(casSelectedAttr, "false");
            }
            counter++;
        }

        // write meta config
        mc.setSelectorFileName(selectorFileName);
        ClusteringInfo ci = new ClusteringInfo();
        ci.setInfoColumnName(CAS_COLUMN_NAME_CLUSTER);
        ci.setSelectedColumnName(CAS_COLUMN_NAME_SELECTED);
        ci.setSampleSize(sampleSize);
        mc.getClusteringInfo().put(this.getClass().getName(), ci);
        mc.save(metaFileName);

        return new IOObject[] { exampleSet };
    }

    /************************************************************************************************
     * PRIVATE METHODS
     ***********************************************************************************************/

    /**
     * Builds the affinity matrix A (pairwise NMI values) and the diagonal matrix D (SNMI values)
     * from the example set and returns L = D^(-1/2) A D^(-1/2), with every entry rounded to
     * {@link #DECIMAL_PLACES} decimal places.
     *
     * <p>Because D is diagonal, D^(-1/2) is computed element-wise as x = 1 / sqrt(x).</p>
     *
     * @param exampleSet          input example set holding one row per clustering
     * @param snmiAttr            attribute holding the SNMI value of each clustering
     * @param clusterColumnPrefix prefix identifying the pairwise-NMI columns
     * @param decimalMultiplier   10^{@link #DECIMAL_PLACES}, used for rounding
     * @return the rounded matrix L
     */
    private Matrix buildLMatrix(ExampleSet exampleSet, Attribute snmiAttr,
            String clusterColumnPrefix, double decimalMultiplier) {
        int size = exampleSet.size();
        this.logNote("Build the affinity matrix based on the NMI-values.");
        Matrix affinityMatrix = new Matrix(size, size);
        Matrix diagonalMatrix = new Matrix(size, size);
        Attributes exampleSetAttributes = exampleSet.getAttributes();
        int i = 0;
        for (Example example : exampleSet) {
            int j = 0;
            for (Attribute attribute : exampleSetAttributes) {
                if (!attribute.getName().startsWith(clusterColumnPrefix)) {
                    // not a pairwise-NMI column -> not relevant
                    continue;
                }
                if (i == j) {
                    // self-affinity is zeroed; the diagonal matrix carries the SNMI instead
                    affinityMatrix.set(i, j, 0.0d);
                    diagonalMatrix.set(i, j, example.getValue(snmiAttr));
                } else {
                    affinityMatrix.set(i, j, example.getValue(attribute));
                }
                j++;
            }
            i++;
        }

        /*
         * calculating L = D^(-1/2) A D^(-1/2) with A = affinityMatrix and D = diagonalMatrix
         *
         * L = (D^(1/2)^-1) A (D^(1/2)^-1) = (sqrt(D) ^ -1) A (sqrt(D) ^ -1)
         *
         * because D is a diagonal matrix the elements x of D can easily be calculated by
         * x = 1 / sqrt(x)
         */
        this.logNote("Calculate matrix D and L (see paper for details).");
        for (int n = 0; n < size; n++) {
            diagonalMatrix.set(n, n, 1.0d / Math.sqrt(diagonalMatrix.get(n, n)));
        }
        Matrix l = diagonalMatrix.times(affinityMatrix).times(diagonalMatrix);

        // round the values of L to a fixed number of decimal places to avoid IEEE-754 instability
        int rows = l.getRowDimension();
        int cols = l.getColumnDimension();
        for (int r = 0; r < rows; r++) {
            for (int c = 0; c < cols; c++) {
                l.set(r, c, Math.round(l.get(r, c) * decimalMultiplier) / decimalMultiplier);
            }
        }
        return l;
    }

    /**
     * Runs an eigenvalue decomposition on L and stacks the {@code sampleSize} eigenvectors with the
     * largest eigenvalues (Jama returns eigenvalues in ascending order, hence the backwards loop)
     * as columns into the matrix X. Eigenvectors whose eigenvalue equals the previously taken one
     * are skipped, as required by the paper.
     *
     * @param l          the (rounded) matrix L
     * @param sampleSize number of eigenvectors to select
     * @return the matrix X of stacked eigenvectors
     * @throws OperatorException if fewer than {@code sampleSize} distinct eigenvectors exist
     */
    private Matrix selectEigenvectorMatrix(Matrix l, int sampleSize) throws OperatorException {
        this.logNote("Execute an eigenvalue-decomposition on the matrix L.");
        EigenvalueDecomposition evd = l.eig();
        Matrix eigenvectorMatrix = evd.getV();
        double[] eigenvalues = evd.getRealEigenvalues();
        int size = l.getRowDimension();

        Matrix x = new Matrix(size, sampleSize);
        int taken = 0;
        double prevEigenvalue = Double.NaN;
        for (int j = eigenvalues.length - 1; j >= 0 && taken < sampleSize; j--) {
            double eigenvalue = eigenvalues[j];
            // exact comparison is intended here: L was rounded to fixed decimal places above
            if (eigenvalue != prevEigenvalue) {
                Matrix curVector = eigenvectorMatrix.getMatrix(0, size - 1, j, j);
                x.setMatrix(0, size - 1, taken, taken, curVector);
                prevEigenvalue = eigenvalue;
                taken++;
            }
        }
        // fixed off-by-one: the original tested 'taken < sampleSize - 1' and thereby accepted a
        // matrix X with one all-zero column; also throw OperatorException instead of raw Error
        if (taken < sampleSize) {
            throw new OperatorException("Couldn't find enough eigenvectors.");
        }
        return x;
    }

    /**
     * Forms the matrix Y from X by re-normalizing each of X's rows to unit length.
     *
     * @param x the matrix of stacked eigenvectors
     * @return the row-normalized matrix Y (same dimensions as X)
     */
    private static Matrix normalizeRows(Matrix x) {
        int rows = x.getRowDimension();
        int cols = x.getColumnDimension();
        Matrix y = new Matrix(rows, cols);
        for (int r = 0; r < rows; r++) {
            // length of the row vector
            double length = 0.0d;
            for (int c = 0; c < cols; c++) {
                length += Math.pow(x.get(r, c), 2);
            }
            length = Math.sqrt(length);
            if (length == 0.0d) {
                // all-zero row: leave it zero instead of producing NaN via 0/0
                continue;
            }
            // normalize the row values of X and put them into Y
            for (int c = 0; c < cols; c++) {
                y.set(r, c, x.get(r, c) / length);
            }
        }
        return y;
    }

    /**
     * Treats each row of Y as a point and clusters the points via k-means (k = sampleSize) after
     * id-tagging them. The cluster assignment is attached to the returned example set as the
     * special cluster attribute.
     *
     * @param y          the row-normalized eigenvector matrix
     * @param sampleSize number of clusters (k) and of columns in Y
     * @return the example set built from Y, carrying the k-means cluster attribute
     * @throws OperatorException if the embedded operators cannot be created or fail
     */
    private ExampleSet clusterEigenvectorRows(Matrix y, int sampleSize) throws OperatorException {
        // create 'sampleSize' attributes, one per eigenvector dimension
        AttributeSet attributeSet = new AttributeSet();
        for (int i = 0; i < sampleSize; i++) {
            attributeSet.addAttribute(AttributeFactory.createAttribute("selected" + i, Ontology.REAL));
        }

        // create table and example set for the k-means clustering
        this.logNote("Execute a k-Means clustering on the eigenvectors (k = " + sampleSize + ").");
        MemoryExampleTable table = new MemoryExampleTable(attributeSet.getAllAttributes());
        DataRowFactory drf = new DataRowFactory(DataRowFactory.TYPE_DOUBLE_ARRAY, '.');
        Double[] values = new Double[attributeSet.getAllAttributes().size()];
        int rows = y.getRowDimension();
        int cols = y.getColumnDimension();
        for (int r = 0; r < rows; r++) {
            for (int c = 0; c < cols; c++) {
                values[c] = y.get(r, c);
            }
            table.addDataRow(drf.create(values, table.getAttributes()));
        }
        ExampleSet kmeansExampleSet = table.createExampleSet(attributeSet);

        // run id-tagging and k-means
        Operator idTagging;
        Operator kMeans;
        try {
            idTagging = OperatorService.createOperator(IdTagging.class);
            kMeans = OperatorService.createOperator(KMeans.class);
        } catch (OperatorCreationException oce) {
            // preserve the cause instead of throwing a raw java.lang.Error with only the message
            throw new OperatorException("Cannot create embedded operator: " + oce.getMessage(), oce);
        }
        idTagging.apply(new IOContainer(kmeansExampleSet));
        kMeans.setParameter(KMeans.PARAMETER_K, String.valueOf(sampleSize));
        kMeans.setParameter(KMeans.PARAMETER_ADD_CLUSTER_ATTRIBUTE, "true");
        kMeans.apply(new IOContainer(kmeansExampleSet));
        return kmeansExampleSet;
    }
}