package de.tud.inf.operator.mm;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import Jama.EigenvalueDecomposition;
import Jama.Matrix;
import com.rapidminer.example.Attribute;
import com.rapidminer.example.Attributes;
import com.rapidminer.example.Example;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.example.table.AttributeFactory;
import com.rapidminer.example.table.DataRowFactory;
import com.rapidminer.example.table.MemoryExampleTable;
import com.rapidminer.operator.IOContainer;
import com.rapidminer.operator.IOObject;
import com.rapidminer.operator.Operator;
import com.rapidminer.operator.OperatorCreationException;
import com.rapidminer.operator.OperatorDescription;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.UserError;
import com.rapidminer.operator.learner.clustering.clusterer.KMeans;
import com.rapidminer.operator.preprocessing.IdTagging;
import com.rapidminer.tools.Ontology;
import com.rapidminer.tools.OperatorService;
import com.rapidminer.tools.att.AttributeSet;
import de.tud.inf.operator.mm.util.ClusteringInfo;
import de.tud.inf.operator.mm.util.MetaConfig;
/**
* This class implements the Cluster-and-Select strategy.
*
* <a href="http://www.siam.org/proceedings/datamining/2008/dm08_71_fern.pdf">http://www.siam.org/proceedings/datamining/2008/dm08_71_fern.pdf</a>
*
* To avoid instability caused by the limited mantissa precision of IEEE 754 floating-point numbers, the number of
* decimal places is set to a fixed value.
*
* <a href="http://en.wikipedia.org/wiki/IEEE_754-1985">http://en.wikipedia.org/wiki/IEEE_754-1985</a>
*
* @version $Revision$
* @author Andre Jaehnig
*/
public class CASSelector extends AbstractSelector {

    /************************************************************************************************
     * FIELDS
     ***********************************************************************************************/

    /** Number of decimal places the entries of the matrix L are rounded to before the decomposition. */
    private static final int DECIMAL_PLACES = 10;

    /** Column name for the cluster of this selector. */
    private static final String CAS_COLUMN_NAME_CLUSTER = "cas_cluster";

    /** Column name for the indicator which clustering is selected. */
    private static final String CAS_COLUMN_NAME_SELECTED = "cas_selected";

    /** Prefix of the dummy column names. */
    private static final String WORKING_COLUMN_NAME = "working";

    /************************************************************************************************
     * GETTER & SETTER
     ***********************************************************************************************/

    /*
     * none
     */

    /************************************************************************************************
     * CONSTRUCTOR
     ***********************************************************************************************/

    /**
     * Constructor.
     *
     * @param description
     *            the operator description handed to the super class
     */
    public CASSelector(OperatorDescription description) {
        super(description);
    }

    /************************************************************************************************
     * PUBLIC METHODS
     ***********************************************************************************************/

    /**
     * Selects clusterings from the input example set.
     * <p>
     * The method builds an affinity matrix A from the columns whose name starts with the clustering column prefix
     * and a diagonal matrix D from the SNMI column, computes L = D^(-1/2) A D^(-1/2), takes the eigenvectors of the
     * {@code sampleSize} largest distinct eigenvalues, normalizes the rows of the resulting matrix and clusters
     * these rows with k-means (k = {@code sampleSize}). Within every cluster the example with the highest SNMI
     * value is flagged as selected. The cluster assignment and the selection flag are written to two new special
     * attributes of the input example set, and the meta configuration file is updated.
     *
     * @return an array containing the (modified) input example set
     * @throws OperatorException
     *             if the sample size is out of range, the SNMI column is missing, or a nested operator fails
     * @throws Error
     *             if fewer than {@code sampleSize} distinct eigenvalues exist or a nested operator cannot be
     *             created
     * @see com.rapidminer.operator.Operator#apply()
     */
    @Override
    public IOObject[] apply() throws OperatorException {
        double decimalMultiplier = Math.pow(10.0d, DECIMAL_PLACES);
        this.logNote("Number of used decimal places: " + DECIMAL_PLACES);

        // get example set with the clusterings
        ExampleSet exampleSet = this.getInput(ExampleSet.class);
        int exampleSetSize = exampleSet.size();
        this.logNote("Input example-set has " + exampleSetSize + " elements.");

        // get parameters
        String metaFileName = this.getParameterAsString(PARAMETER_META_FILENAME);
        String selectorFileName = this.getParameterAsString(PARAMETER_SELECTOR_FILENAME);
        MetaConfig mc = MetaConfig.load(metaFileName);
        String snmiColumnName = mc.getSnmiColumnName();
        String clusterColumnPrefix = mc.getClusteringColumnPrefix();
        int sampleSize = this.getParameterAsInt(PARAMETER_SAMPLE_SIZE);
        if (sampleSize < 1 || sampleSize > exampleSetSize) {
            throw new UserError(this, 116, new Object[] { PARAMETER_SAMPLE_SIZE, sampleSize });
        }
        this.logNote("Requested clustering sample size: " + sampleSize);

        // create attributes for the selection flag and for the cluster-assignment
        Attribute casClusterAttr = AttributeFactory.createAttribute(CAS_COLUMN_NAME_CLUSTER, Ontology.NOMINAL);
        Attribute casSelectedAttr = AttributeFactory.createAttribute(CAS_COLUMN_NAME_SELECTED, Ontology.NOMINAL);
        exampleSet.getExampleTable().addAttribute(casClusterAttr);
        exampleSet.getExampleTable().addAttribute(casSelectedAttr);

        // add attribute to view
        exampleSet.getAttributes().setSpecialAttribute(casClusterAttr, CAS_COLUMN_NAME_CLUSTER);
        exampleSet.getAttributes().setSpecialAttribute(casSelectedAttr, CAS_COLUMN_NAME_SELECTED);

        // add dummy attribute-columns (only to the table); they are not read or written anywhere else in this
        // method but are kept because adding them changes the example table
        Attribute workingSNMIAttr = AttributeFactory.createAttribute(WORKING_COLUMN_NAME + "1", Ontology.REAL);
        Attribute workingNMIAttr = AttributeFactory.createAttribute(WORKING_COLUMN_NAME + "2", Ontology.REAL);
        Attribute workingSumAttr = AttributeFactory.createAttribute(WORKING_COLUMN_NAME + "3", Ontology.REAL);
        exampleSet.getExampleTable().addAttribute(workingSNMIAttr);
        exampleSet.getExampleTable().addAttribute(workingNMIAttr);
        exampleSet.getExampleTable().addAttribute(workingSumAttr);

        // get SNMI attribute
        Attribute snmiAttr = exampleSet.getAttributes().get(snmiColumnName);
        if (snmiAttr == null) {
            throw new UserError(this, 111, snmiColumnName);
        }

        // build the affinity and the diagonal matrix based on the NMI
        // NOTE(review): assumes the number of columns carrying the clustering prefix equals the number of
        // examples (square NMI matrix); more columns than examples would index outside the matrices - confirm
        this.logNote("Build the affinity matrix based on the NMI-values.");
        Matrix affinityMatrix = new Matrix(exampleSetSize, exampleSetSize);
        Matrix diagonalMatrix = new Matrix(exampleSetSize, exampleSetSize);
        Attributes exampleSetAttributes = exampleSet.getAttributes();
        Iterator<Example> it = exampleSet.iterator();
        int row = 0;
        while (it.hasNext()) {
            Example example = it.next();
            int col = 0;
            for (Attribute attribute : exampleSetAttributes) {
                if (!attribute.getName().startsWith(clusterColumnPrefix)) {
                    // not relevant
                    continue;
                }
                if (row == col) {
                    affinityMatrix.set(row, col, 0.0d);
                    diagonalMatrix.set(row, col, example.getValue(snmiAttr));
                }
                else {
                    affinityMatrix.set(row, col, example.getValue(attribute));
                }
                col++;
            }
            row++;
        }

        /*
         * calculating L = D^(-1/2) A D^(-1/2) with A = affinityMatrix and D = diagonalMatrix
         *
         * because D is a diagonal matrix, D^(-1/2) is obtained by replacing every diagonal element x by
         * 1 / sqrt(x)
         */
        // NOTE(review): presumably the SNMI values are strictly positive; a value of 0 yields an infinite entry
        this.logNote("Calculate matrix D and L (see paper for details).");
        for (int n = 0; n < exampleSetSize; n++) {
            diagonalMatrix.set(n, n, 1.0d / Math.sqrt(diagonalMatrix.get(n, n)));
        }
        Matrix l = diagonalMatrix.times(affinityMatrix).times(diagonalMatrix);

        // round the values of L
        roundEntries(l, decimalMultiplier);

        // get the eigenvalues and -vectors
        this.log("Execute an eigenvalue-decomposition on the matrix L.");
        EigenvalueDecomposition evd = l.eig();

        // form the matrix X from the selected eigenvectors and the matrix Y by re-normalizing each of X's rows
        Matrix x = selectEigenvectors(evd, exampleSetSize, sampleSize);
        Matrix y = normalizeRows(x);

        // treat each row in Y as a point and cluster the points via k-means
        this.logNote("Execute a k-Means clustering on the eigenvectors (k = " + sampleSize + ").");
        ExampleSet kmeansExampleSet = clusterRows(y, sampleSize);

        // copy cluster column to output example set and remember, per cluster, the index with the highest SNMI
        Attribute kmeansClusterAttr = kmeansExampleSet.getAttributes().getSpecial(Attributes.CLUSTER_NAME);
        Map<String, Integer> selectedIndices = new HashMap<String, Integer>();
        it = kmeansExampleSet.iterator();
        int counter = 0;
        while (it.hasNext()) {
            String clusterValue = it.next().getNominalValue(kmeansClusterAttr);
            Example outputExample = exampleSet.getExample(counter);
            outputExample.setValue(casClusterAttr, clusterValue);

            // get old selected index for this cluster
            Integer oldIdx = selectedIndices.get(clusterValue);
            if (oldIdx == null
                    || outputExample.getValue(snmiAttr) > exampleSet.getExample(oldIdx).getValue(snmiAttr)) {
                // first example of this cluster, or a higher SNMI than the current candidate
                selectedIndices.put(clusterValue, counter);
            }
            counter++;
        }

        // mark the selected clusterings at the output example set
        it = exampleSet.iterator();
        counter = 0;
        while (it.hasNext()) {
            Example example = it.next();
            boolean selected = selectedIndices.get(example.getNominalValue(casClusterAttr)).intValue() == counter;
            example.setValue(casSelectedAttr, selected ? "true" : "false");
            counter++;
        }

        // write meta config
        mc.setSelectorFileName(selectorFileName);
        ClusteringInfo ci = new ClusteringInfo();
        ci.setInfoColumnName(CAS_COLUMN_NAME_CLUSTER);
        ci.setSelectedColumnName(CAS_COLUMN_NAME_SELECTED);
        ci.setSampleSize(sampleSize);
        mc.getClusteringInfo().put(this.getClass().getName(), ci);
        mc.save(metaFileName);

        return new IOObject[] { exampleSet };
    }

    /************************************************************************************************
     * PRIVATE METHODS
     ***********************************************************************************************/

    /**
     * Rounds every entry of the given matrix in place to the precision given by the multiplier.
     *
     * @param matrix
     *            the matrix to round
     * @param multiplier
     *            the factor applied before and removed after rounding (10 to the power of the decimal places)
     */
    private static void roundEntries(Matrix matrix, double multiplier) {
        int rows = matrix.getRowDimension();
        int cols = matrix.getColumnDimension();
        for (int r = 0; r < rows; r++) {
            for (int c = 0; c < cols; c++) {
                matrix.set(r, c, Math.round(matrix.get(r, c) * multiplier) / multiplier);
            }
        }
    }

    /**
     * Stacks the eigenvectors of the {@code sampleSize} highest eigenvalues as columns of a new matrix, walking
     * the eigenvalue array from its last to its first element and skipping an eigenvalue that equals the
     * previously selected one.
     *
     * @param evd
     *            the eigenvalue decomposition to read eigenvalues and eigenvectors from
     * @param rowCount
     *            the number of rows of the eigenvector matrix
     * @param sampleSize
     *            the number of eigenvectors to select
     * @return a {@code rowCount x sampleSize} matrix holding the selected eigenvectors in its columns
     * @throws Error
     *             if fewer than {@code sampleSize} eigenvectors could be selected
     */
    private static Matrix selectEigenvectors(EigenvalueDecomposition evd, int rowCount, int sampleSize) {
        Matrix eigenvectorMatrix = evd.getV();
        double[] eigenvalues = evd.getRealEigenvalues();
        Matrix x = new Matrix(rowCount, sampleSize);

        int selected = 0;
        // NaN never equals any value, so the first eigenvalue is always accepted
        double prevEigenvalue = Double.NaN;
        for (int j = eigenvalues.length - 1; j >= 0 && selected < sampleSize; j--) {
            double eigenvalue = eigenvalues[j];
            if (eigenvalue != prevEigenvalue) {
                // select this one
                Matrix curVector = eigenvectorMatrix.getMatrix(0, rowCount - 1, j, j);
                x.setMatrix(0, rowCount - 1, selected, selected, curVector);
                prevEigenvalue = eigenvalue;
                selected++;
            }
        }

        // 'selected' counts the filled columns, so anything below sampleSize leaves an empty column in X
        if (selected < sampleSize) {
            throw new Error("Couldn't find enough eigenvectors.");
        }
        return x;
    }

    /**
     * Creates a matrix whose rows are the rows of the given matrix scaled to Euclidean length 1.
     *
     * @param x
     *            the matrix whose rows are normalized
     * @return a new matrix of the same dimension; rows of length 0 are left as zero rows
     */
    private static Matrix normalizeRows(Matrix x) {
        int rows = x.getRowDimension();
        int cols = x.getColumnDimension();
        Matrix y = new Matrix(rows, cols);
        for (int r = 0; r < rows; r++) {
            // get length of the row vector
            double length = 0.0d;
            for (int c = 0; c < cols; c++) {
                length += Math.pow(x.get(r, c), 2);
            }
            length = Math.sqrt(length);
            if (length == 0.0d) {
                // dividing by zero would produce NaN coordinates; keep the zero row instead
                continue;
            }
            // normalize the row values of X and put them into Y
            for (int c = 0; c < cols; c++) {
                y.set(r, c, x.get(r, c) / length);
            }
        }
        return y;
    }

    /**
     * Copies the rows of the given matrix into a new in-memory example set and applies the id tagging and the
     * k-means operators to it.
     *
     * @param y
     *            the matrix whose rows are the points to cluster
     * @param k
     *            the value for the k parameter of the k-means operator; also the number of created attributes
     * @return the example set the operators were applied to
     * @throws OperatorException
     *             if one of the nested operators fails
     * @throws Error
     *             if one of the nested operators cannot be created
     */
    private ExampleSet clusterRows(Matrix y, int k) throws OperatorException {
        // create 'k' attributes
        AttributeSet attributeSet = new AttributeSet();
        for (int a = 0; a < k; a++) {
            attributeSet.addAttribute(AttributeFactory.createAttribute("selected" + a, Ontology.REAL));
        }

        // create table and example set for the kmeans clustering
        MemoryExampleTable table = new MemoryExampleTable(attributeSet.getAllAttributes());
        DataRowFactory drf = new DataRowFactory(DataRowFactory.TYPE_DOUBLE_ARRAY, '.');
        Double[] values = new Double[attributeSet.getAllAttributes().size()];
        int rows = y.getRowDimension();
        int cols = y.getColumnDimension();
        for (int r = 0; r < rows; r++) {
            for (int c = 0; c < cols; c++) {
                values[c] = y.get(r, c);
            }
            // add values to the table
            table.addDataRow(drf.create(values, table.getAttributes()));
        }
        ExampleSet kmeansExampleSet = table.createExampleSet(attributeSet);

        // run kmeans
        Operator kMeans = null;
        Operator idTagging = null;
        try {
            idTagging = OperatorService.createOperator(IdTagging.class);
            kMeans = OperatorService.createOperator(KMeans.class);
        }
        catch (OperatorCreationException oce) {
            // keep the original exception as cause so the stack trace is not lost
            throw new Error(oce.getMessage(), oce);
        }
        idTagging.apply(new IOContainer(kmeansExampleSet));
        kMeans.setParameter(KMeans.PARAMETER_K, String.valueOf(k));
        kMeans.setParameter(KMeans.PARAMETER_ADD_CLUSTER_ATTRIBUTE, "true");
        kMeans.apply(new IOContainer(kmeansExampleSet));
        return kmeansExampleSet;
    }
}