/* * RapidMiner * * Copyright (C) 2001-2011 by Rapid-I and the contributors * * Complete list of developers available at our web site: * * http://rapid-i.com * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see http://www.gnu.org/licenses/. */ package com.rapidminer.operator.clustering.clusterer.soft; import java.util.Collection; import java.util.LinkedList; import java.util.List; import java.util.Random; import java.util.Vector; import Jama.Matrix; import com.rapidminer.example.Attribute; import com.rapidminer.example.Example; import com.rapidminer.example.ExampleSet; import com.rapidminer.example.Tools; import com.rapidminer.example.table.AttributeFactory; import com.rapidminer.operator.OperatorCreationException; import com.rapidminer.operator.OperatorDescription; import com.rapidminer.operator.OperatorException; import com.rapidminer.operator.UserError; import com.rapidminer.operator.clustering.ClusterModel; import com.rapidminer.operator.clustering.FlatFuzzyClusterModel; import com.rapidminer.operator.clustering.clusterer.KMeans; import com.rapidminer.operator.clustering.clusterer.RMAbstractClusterer; import com.rapidminer.operator.ports.metadata.AttributeMetaData; import com.rapidminer.parameter.ParameterType; import com.rapidminer.parameter.ParameterTypeBoolean; import com.rapidminer.parameter.ParameterTypeCategory; import com.rapidminer.parameter.ParameterTypeDouble; import com.rapidminer.parameter.ParameterTypeInt; import com.rapidminer.parameter.UndefinedParameterError; import com.rapidminer.tools.Ontology; import com.rapidminer.tools.OperatorService; import com.rapidminer.tools.RandomGenerator; import com.rapidminer.tools.math.VectorMath; /** * This operator represents an implementation of the EM-algorithm. * * @author Regina Fritsch */ public class EMClusterer extends RMAbstractClusterer { /** The parameter name for "the maximal number of clusters" */ public static final String PARAMETER_K = "k"; /** * The parameter name for "the maximal number of runs of the k method with random initialization that are * performed" */ public static final String PARAMETER_MAX_RUNS = "max_runs"; /** The parameter name for "the maximal number of iterations performed for one run of the k method" */ public static final String PARAMETER_MAX_OPTIMIZATION_STEPS = "max_optimization_steps"; /** The parameter name for "the quality, which has to be fulfilled for the stopping of the soft clustering" */ public static final String PARAMETER_QUALITY = "quality"; /** The parameter name for "Indicates if the probabilities will be shown in example table" */ public static final String PARAMETER_SHOW_PROBABILITIES = "show_probabilities"; /** The parameter name for "Indicates the initialization distribution" */ public static final String PARAMETER_INITIALIZATION_DISTRIBUTION = "inital_distribution"; /** The parameter name for "List of the different init distributions" */ public static final String[] INIT_DISTRIBUTION = { "randomly assigned examples", "k-means run", "average parameters" }; /** The parameter name for "Init distributions randomly assigned" */ public static final int RANDOMLY_ASSIGNED = 0; /** The parameter name for "Init distributions hard clustering" */ public static final int K_MEANS = 1; /** The parameter name for "Init distributions average parameters" */ public static final int AVERAGE_PARAMETERS = 2; /** The parameter name for "Indicates if the example set has correlated attributes" */ public static final String PARAMETER_CORRELATED = "correlated_attributes"; public EMClusterer(OperatorDescription description) { super(description); } @Override protected Collection<AttributeMetaData> getAdditionalAttributes() { List<AttributeMetaData> propAttributes = new LinkedList<AttributeMetaData>(); try { int k = getParameterAsInt(PARAMETER_K); for (int i = 0; i < k; i++) { AttributeMetaData newAttr = new AttributeMetaData("cluster_" + i + "_probability", Ontology.REAL, "cluster_" + i + "_probability"); propAttributes.add(newAttr); } } catch (UndefinedParameterError e) { } return propAttributes; } /* * Creates the Clustermodel. */ public ClusterModel createClusterModel(ExampleSet exampleSet) throws OperatorException { FlatFuzzyClusterModel bestModel = null; int restoreMaxRuns = getParameterAsInt(PARAMETER_MAX_RUNS); boolean restoreCorrelated = getParameterAsBoolean(PARAMETER_CORRELATED); boolean isCorrelated = getParameterAsBoolean(PARAMETER_CORRELATED); int k = getParameterAsInt(PARAMETER_K); int initSpecialSize = exampleSet.getAttributes().specialSize(); double[][] exampleInClusterProbability = new double[exampleSet.size()][k]; double max = Double.NEGATIVE_INFINITY; int exceptionCounter = 0; // the iterations for (int iter = 0; iter < getParameterAsInt(PARAMETER_MAX_RUNS); iter++) { FlatFuzzyClusterModel result = new FlatFuzzyClusterModel(exampleSet, k, getParameterAsBoolean(RMAbstractClusterer.PARAMETER_ADD_AS_LABEL), getParameterAsBoolean(RMAbstractClusterer.PARAMETER_REMOVE_UNLABELED)); FlatFuzzyClusterModel oldResult = null; // initialize the model try { init(exampleSet, result, k, initSpecialSize, exampleInClusterProbability); } catch (OperatorCreationException e1) { e1.printStackTrace(); } boolean stableState = false; double logLikelyHood_old = Double.POSITIVE_INFINITY; double logLikelyHood = 0; // the optimization-steps int optiStep = 0; int[] clusterAssignments = new int[exampleSet.size()]; try { for (optiStep = 0; (optiStep < getParameterAsInt(PARAMETER_MAX_OPTIMIZATION_STEPS)) && !stableState; optiStep++) { stableState = true; oldResult = result; result = new FlatFuzzyClusterModel(exampleSet, k, getParameterAsBoolean(RMAbstractClusterer.PARAMETER_ADD_AS_LABEL), getParameterAsBoolean(RMAbstractClusterer.PARAMETER_REMOVE_UNLABELED)); // Compute the probabilities for each example with each cluster if (isCorrelated) { expectationCorrelated(exampleSet, k, exampleInClusterProbability, oldResult); } else { expectationNonCorrelated(exampleSet, k, exampleInClusterProbability, oldResult); } // compute the hard-clustering from the soft-clustering (assignments of the examples to the clusters) for (int exampleIndex = 0; exampleIndex < exampleSet.size(); exampleIndex++) { int bestIndex = bestIndex(exampleIndex, k, exampleInClusterProbability); if (bestIndex < 0) bestIndex = RandomGenerator.getGlobalRandomGenerator().nextInt(result.getNumberOfClusters()); clusterAssignments[exampleIndex] = bestIndex; } result.setClusterAssignments(clusterAssignments, exampleSet); // Recalculate the values: cluster probabilities, means and standard deviations maximization(exampleSet, k, exampleInClusterProbability, result); // test if the quality of the soft-clustering performs the user-defined quality logLikelyHood = computeLogLikelyhood(k, exampleInClusterProbability, result); double difference = logLikelyHood_old - logLikelyHood; if (!(Math.abs(difference) < getParameterAsDouble(PARAMETER_QUALITY))) { stableState = false; } logLikelyHood_old = logLikelyHood; } } catch (Exception e) { exceptionCounter++; // If there occurs an exception, don't stop at the first time and if there are some useable models don't discard them. if (exceptionCounter > restoreMaxRuns) { // if there are not enough models, start again without the option correlated if ((iter - (exceptionCounter - 1)) < Math.round(restoreMaxRuns * 0.49)) { getLogger().info("Can't compute the inverse of the covariance matrix. Maybe the Matrix is singular. Changing option \"correlated_attributes\" to false."); setParameter(PARAMETER_CORRELATED, "" + false); setParameter(PARAMETER_MAX_RUNS, "" + restoreMaxRuns); bestModel = (FlatFuzzyClusterModel) createClusterModel(exampleSet); } break; } else { setParameter(PARAMETER_MAX_RUNS, "" + (getParameterAsInt(PARAMETER_MAX_RUNS) + 1)); continue; } } // check if the model of the current iteration is better than the models computed before if (Math.abs(logLikelyHood) > max) { max = Math.abs(logLikelyHood); bestModel = result; if (showProbs() == true) { setProbabilitiesInTable(exampleSet, exampleInClusterProbability); bestModel.setExampleInClusterProbability(exampleInClusterProbability); } } } // restore original values setParameter(PARAMETER_MAX_RUNS, "" + restoreMaxRuns); setParameter(PARAMETER_CORRELATED, "" + restoreCorrelated); return bestModel; } /* * INIT SECTOR */ /* * Main init method. */ private void init(ExampleSet exampleSet, FlatFuzzyClusterModel result, int k, int initSpecialSize, double[][] exampleInClusterProbability) throws OperatorException, OperatorCreationException { // init means, standard deviations (or covariance matrix) and cluster probabilities according to specified distribution int distribution = getParameterAsInt(PARAMETER_INITIALIZATION_DISTRIBUTION); switch (distribution) { case RANDOMLY_ASSIGNED: try { // allocate the examples randomly to the clusters Random random = RandomGenerator.getRandomGenerator(this); int clustersFilled; do { clustersFilled = 0; double[][] clusterMeans = new double[k][exampleSet.getAttributes().size()]; int i = 0; for (Example ex : exampleSet) { int cluster = random.nextInt(k); exampleInClusterProbability[i][cluster] = 1; int j = 0; for (Attribute attribute : exampleSet.getAttributes()) { clusterMeans[cluster][j] += ex.getValue(attribute); j++; } i++; } // check if there is at least one example in each cluster for (i = 0; i < k; i++) { // set means in the model (allready not normalized) result.setClusterMean(i, clusterMeans[i]); for (int j = 0; j < exampleInClusterProbability.length; j++) { if (exampleInClusterProbability[j][i] == 1) { clustersFilled++; break; } } } } while (clustersFilled < k); } catch (UndefinedParameterError e) { } // compute means (normalized), stdDev...) computeValuesWithClusterMemberships(exampleSet, k, exampleInClusterProbability, result); if (isCorrelated()) { initCovarianceMatrix(exampleSet, exampleInClusterProbability, result, k); } break; case K_MEANS: // allocate the examples according to the k-means run to the clusters KMeans clusterAlgorithm = OperatorService.createOperator(KMeans.class); ExampleSet clusterSet = (ExampleSet) exampleSet.clone(); clusterAlgorithm.setParameter(KMeans.PARAMETER_K, "" + k); clusterAlgorithm.setParameter(RMAbstractClusterer.PARAMETER_ADD_CLUSTER_ATTRIBUTE, "true"); clusterAlgorithm.generateClusterModel(clusterSet); // ad a side effect, add cluster attribute to clusterSet double[][] clusterMeans = new double[k][exampleSet.getAttributes().size()]; int exampleIndex = 0; Attribute clusterAttribute = clusterSet.getAttributes().getCluster(); for (Example example: clusterSet) { int clusterIndex = (int) example.getValue(clusterAttribute); exampleInClusterProbability[exampleIndex][clusterIndex] = 1; int j = 0; for (Attribute attribute : clusterSet.getAttributes()) { clusterMeans[clusterIndex][j] += example.getValue(attribute); j++; } exampleIndex++; } for (int i = 0; i < k; i++) { result.setClusterMean(i, clusterMeans[i]); } // compute means (normalized), stdDev...) computeValuesWithClusterMemberships(exampleSet, k, exampleInClusterProbability, result); if (isCorrelated()) { initCovarianceMatrix(exampleSet, exampleInClusterProbability, result, k); } break; case AVERAGE_PARAMETERS: default: Random random = RandomGenerator.getRandomGenerator(this); initAverageParameters(exampleSet, k, exampleInClusterProbability, result, random); break; } // show probabilities in example table? if (showProbs()) { if (exampleSet.getAttributes().specialSize() == initSpecialSize) { for (int i = 0; i < k; i++) { String name = "cluster_" + i + "_probability"; Attribute newAttribute = AttributeFactory.createAttribute(Ontology.REAL); newAttribute.setName(name); exampleSet.getExampleTable().addAttribute(newAttribute); exampleSet.getAttributes().setSpecialAttribute(newAttribute, name); } setProbabilitiesInTable(exampleSet, exampleInClusterProbability); } } } /* * !This method does not work alone! * * Computes the initial mean, standard deviation and cluster probabilities, for given initial cluster classifications * and means already summed up. */ private void computeValuesWithClusterMemberships(ExampleSet exampleSet, int k, double[][] exampleInClusterProbability, FlatFuzzyClusterModel result) { // compute means for (int i = 0; i < k; i++) { int denominator = 0; for (int j = 0; j < exampleInClusterProbability.length; j++) { if (exampleInClusterProbability[j][i] == 1) { denominator++; } } double[] clusterMean = new double[result.getClusterMean(i).length]; for (int j = 0; j < result.getClusterMean(i).length; j++) { clusterMean[j] = result.getClusterMean(i)[j] / denominator; } result.setClusterMean(i, clusterMean); } // compute standard deviations (& cluster probabilities) for (int i = 0; i < k; i++) { int denominator = 0; double clusterStDeviation = 0; for (int j = 0; j < exampleInClusterProbability.length; j++) { if (exampleInClusterProbability[j][i] == 1) { double[] helpVector = VectorMath.vectorSubtraction(exampleToArray(exampleSet.getExample(j)), result.getClusterMean(i)); clusterStDeviation += VectorMath.vectorMultiplication(helpVector, helpVector); denominator++; } } result.setClusterStandardDeviation(i, clusterStDeviation / denominator); result.setClusterProbability(i, (double) denominator / exampleSet.size()); } } /* * compute examplesInClusterProbability [P(C_i|x)] to initialize clusterCovarianceMatrix [Sigma_i] */ private void initCovarianceMatrix(ExampleSet exampleSet, double[][] exampleInClusterProbability, FlatFuzzyClusterModel result, int k) { // compute examplesInClusterProbabilities [P(C_i|x)] (the probabilities for each example with each cluster) expectationNonCorrelated(exampleSet, k, exampleInClusterProbability, result); // init clusterCovarianceMatrix [Sigma_i] computeCovarianceMatrix(exampleSet, exampleInClusterProbability, result, k); result.clearClusterStandardDeviations(); } /* * Initialize means, standard deviations and cluster probabilities, by computing the averages of this values over the * exampleSet. */ private void initAverageParameters(ExampleSet exampleSet, int k, double[][] exampleInClusterProbability, FlatFuzzyClusterModel result, Random random) { // various initializations double[] max = new double[exampleSet.getAttributes().size()]; double[] min = new double[exampleSet.getAttributes().size()]; double[] average = new double[exampleSet.getAttributes().size()]; for (int j = 0; j < min.length; j++) { min[j] = Double.POSITIVE_INFINITY; } // compute average, minimum and maximum values of the attributes int i = 0; for (Example ex : exampleSet) { int j = 0; for (Attribute attribute : exampleSet.getAttributes()) { double value = ex.getValue(attribute); average[j] += value; if (value < min[j]) { min[j] = value; } else if (value > max[j]) { max[j] = value; } j++; } i++; } for (int j = 0; j < average.length; j++) { average[j] = average[j] / exampleSet.size(); } // make it random (to get different initializations for the different iterations) double[] offset = VectorMath.vectorDivision(VectorMath.vectorSubtraction(max, min), (k * 2)); min = VectorMath.vectorAddition(min, getOffset(offset, random)); average = VectorMath.vectorAddition(average, getOffset(offset, random)); max = VectorMath.vectorAddition(max, getOffset(offset, random)); // compute average means, standard deviations double[] help = VectorMath.vectorSubtraction(average, min); help = VectorMath.vectorDivision(help, (k / 2 + 1)); double[] help2 = VectorMath.vectorSubtraction(max, average); help2 = VectorMath.vectorDivision(help2, (k / 2 + 1)); int j = 0; for (i = 0; i < k; i++) { double[] clusterMean = new double[exampleSet.getAttributes().size()]; double clusterStDeviation; double clusterProbability; if (i < k / 2) { clusterMean = VectorMath.vectorAddition(VectorMath.vectorMultiplication(help, (i + 1)), min); clusterStDeviation = VectorMath.vectorMultiplication(help, help); } else if ((i == k / 2) && (k % 2 == 1)) { clusterMean = average; double[] help3 = VectorMath.vectorMultiplication(help, -1); clusterStDeviation = VectorMath.vectorMultiplication(help3, help3); clusterStDeviation += VectorMath.vectorMultiplication(help2, help2); clusterStDeviation = clusterStDeviation / 2; } else { clusterMean = VectorMath.vectorAddition(average, VectorMath.vectorMultiplication(help2, (j + 1))); clusterStDeviation = VectorMath.vectorMultiplication(help2, help2); j++; } // set all cluster probabilities to the same value clusterProbability = (double) 1 / k; result.setClusterMean(i, clusterMean); result.setClusterStandardDeviation(i, clusterStDeviation); result.setClusterProbability(i, clusterProbability); } if (isCorrelated()) { initCovarianceMatrix(exampleSet, exampleInClusterProbability, result, k); } } /* * Computes a random offset within a range. (range: (+/- input offset) */ private double[] getOffset(double[] offset, Random random) { double multi = 2 * random.nextDouble() - 1; // number between -1 and 1 return VectorMath.vectorMultiplication(offset, multi); } /* * END: INIT SECTOR */ /* * Computes to which cluster an example fits best. */ protected int bestIndex(int exampleIndex, int k, double[][] exampleInClusterProbability) throws Exception { int bestIndex = -1; double bestIndexValue = 0; for (int i = 0; i < k; i++) { if (bestIndexValue < exampleInClusterProbability[exampleIndex][i]) { bestIndexValue = exampleInClusterProbability[exampleIndex][i]; bestIndex = i; } } return bestIndex; } /* * Computes the probabilities for each example with each cluster (exampleClusterProbs). (with StdDev) */ protected void expectationNonCorrelated(ExampleSet exampleSet, int k, double[][] exampleInClusterProbability, FlatFuzzyClusterModel oldResult) { int j = 0; for (Example ex : exampleSet) { double sum = 0; for (int i = 0; i < k; i++) { double[] helpVector = VectorMath.vectorSubtraction(exampleToArray(ex), oldResult.getClusterMean(i)); double stDev = oldResult.getClusterStandardDeviation(i); // stDev must be greater than 0: division by zero if(stDev == 0) { stDev = 1E-10; } // formula see: http://jmlr.csail.mit.edu/papers/volume6/banerjee05b/banerjee05b.pdf (page 1725 + 1729) exampleInClusterProbability[j][i] = ((1 / Math.sqrt(Math.pow(2 * Math.PI * stDev, exampleSet.getAttributes().size()))) * Math.exp(-1 * (VectorMath.vectorMultiplication(helpVector, helpVector) / (2 * stDev))) * oldResult .getClusterProbability(i)); sum += exampleInClusterProbability[j][i]; } for (int i = 0; i < k; i++) { exampleInClusterProbability[j][i] = exampleInClusterProbability[j][i] / sum; } j++; } } /* * Computes the probabilities for each example with each cluster (exampleClusterProbs). (with covarianceMatrix) */ protected void expectationCorrelated(ExampleSet exampleSet, int k, double[][] exampleInClusterProbability, FlatFuzzyClusterModel oldResult) throws Exception { int j = 0; for (Example ex : exampleSet) { double sum = 0; Vector<Integer> problems = new Vector<Integer>(); for (int i = 0; i < k; i++) { double[] helpVector = VectorMath.vectorSubtraction(exampleToArray(ex), oldResult.getClusterMean(i)); double[][] helpMatrix = new double[helpVector.length][1]; for (int l = 0; l < helpVector.length; l++) { helpMatrix[l][0] = helpVector[l]; } Matrix matrix = new Matrix(helpMatrix); matrix = (matrix.transpose().times((new Matrix(oldResult.getClusterCovarianceMatrix(i))).inverse())) // invCovMatrix .times(matrix); double secondPart = Math.exp(matrix.getArray()[0][0] * (-0.5)); double determinant = (new Matrix(oldResult.getClusterCovarianceMatrix(i))).det(); if (determinant < 0) { determinant *= -1; } // this is here(!) only the conditional probability: P(Example_j|Cluster_i) * W_i exampleInClusterProbability[j][i] = (1 / Math.sqrt(Math.pow(2 * Math.PI, exampleSet.getAttributes().size()) * determinant)) * secondPart * oldResult.getClusterProbability(i); if (exampleInClusterProbability[j][i] == Double.POSITIVE_INFINITY) { problems.add(i); } // sum = P[x] -> probability of a example sum += exampleInClusterProbability[j][i]; } // sometimes double is not able to represent the exampleInClusterProbability, that is only the case if the // probabilitity is very closely to 1, when this happens the probability is set to 1 and all others to 0. for (int i = 0; i < k; i++) { if (problems.isEmpty()) { exampleInClusterProbability[j][i] = exampleInClusterProbability[j][i] / sum; } else { if (exampleInClusterProbability[j][i] == Double.POSITIVE_INFINITY) { exampleInClusterProbability[j][i] = 1.0; } else { exampleInClusterProbability[j][i] = 0.0; } } } j++; } } /* * Computes the new values of: * - cluster means [my_i] AND * - clusterprobabilities [P(Cluster_i)] AND * - cluster standard deviation [sigma_i] OR * - cluster covariance matrix [Sigma_i] * with the probabilities of each example to each cluster [P(Cluster_i|example)] */ protected void maximization(ExampleSet exampleSet, int k, double[][] exampleInClusterProbability, FlatFuzzyClusterModel result) { for (int i = 0; i < k; i++) { double probabilitySum = 0; int j = 0; double[] clusterMean = new double[exampleSet.getAttributes().size()]; for (Example example : exampleSet) { probabilitySum += exampleInClusterProbability[j][i]; clusterMean = VectorMath.vectorAddition(clusterMean, VectorMath.vectorMultiplication(exampleToArray(example), exampleInClusterProbability[j][i])); j++; } result.setClusterMean(i, VectorMath.vectorDivision(clusterMean, probabilitySum)); result.setClusterProbability(i, probabilitySum / exampleSet.size()); if (!isCorrelated()) { j = 0; double clusterStDeviation = 0; for (Example example : exampleSet) { double[] helpVector = VectorMath.vectorSubtraction(exampleToArray(example), result.getClusterMean(i)); clusterStDeviation += exampleInClusterProbability[j][i] * (VectorMath.vectorMultiplication(helpVector, helpVector)); j++; } result.setClusterStandardDeviation(i, clusterStDeviation / probabilitySum); } } if (isCorrelated()) { computeCovarianceMatrix(exampleSet, exampleInClusterProbability, result, k); } } /* * Computes clusterCovarianceMatrix. [Sigma_i] */ private void computeCovarianceMatrix(ExampleSet exampleSet, double[][] exampleInClusterProbability, FlatFuzzyClusterModel result, int k) { for (int i = 0; i < k; i++) { Matrix matrix_old = null; Matrix matrix = null; double probSum = 0; int id = 0; for (Example example : exampleSet) { double[] helpVector = VectorMath.vectorSubtraction(exampleToArray(example), result.getClusterMean(i)); double[][] helpMatrix = new double[helpVector.length][1]; for (int j = 0; j < helpVector.length; j++) { helpMatrix[j][0] = helpVector[j]; } matrix = new Matrix(helpMatrix); matrix = matrix.times(matrix.transpose()).times(exampleInClusterProbability[id][i]); probSum += exampleInClusterProbability[id][i]; if (matrix_old != null) { matrix = matrix_old.plus(matrix); } matrix_old = matrix; id++; } double[][] covarianceMatrix = matrix.getArray(); covarianceMatrix = VectorMath.matrixDivision(covarianceMatrix, probSum); result.setClusterCovarianceMatrix(i, covarianceMatrix); } } /* * Computes the loglikelyhood. */ protected double computeLogLikelyhood(int k, double[][] exampleInClusterProbability, FlatFuzzyClusterModel resultModel) { double result = 0; double temp = 0; for (int n = 0; n < exampleInClusterProbability.length; n++) { for (int i = 0; i < k; i++) { temp += resultModel.getClusterProbability(i) * exampleInClusterProbability[n][i]; } result += Math.log(temp); } return result; } /* * Show cluster probabilities in table? */ private boolean showProbs() { if (getParameterAsBoolean(PARAMETER_SHOW_PROBABILITIES) == true) { return true; } return false; } /* * Are there correlated attributes in the example set? */ private boolean isCorrelated() { if (!getParameterAsBoolean(PARAMETER_CORRELATED)) { return false; } return true; } /* * Sets the cluster probabilities in the table, according to the actual values in exampleClusterProbs. */ private void setProbabilitiesInTable(ExampleSet exampleSet, double[][] exampleInClusterProbability) throws OperatorException { for (int i = 0; i < getParameterAsInt(PARAMETER_K); i++) { String name = "cluster_" + i + "_probability"; int j = 0; for (Example ex : exampleSet) { ex.setValue(exampleSet.getAttributes().get(name), exampleInClusterProbability[j][i]); j++; } } } /* * Computes an array of an example. Important for some math operations. */ private double[] exampleToArray(Example example) { double[] result = new double[example.getAttributes().size()]; int i = 0; for (Attribute attribute : example.getAttributes()) { result[i] = example.getValue(attribute); i++; } return result; } @Override public ClusterModel generateClusterModel(ExampleSet exampleSet) throws OperatorException { // get parameters int k = getParameterAsInt(PARAMETER_K); // perform checks Tools.isNonEmpty(exampleSet); Tools.checkAndCreateIds(exampleSet); if (exampleSet.size() < k) { logWarning("number of clusters (k) = " + k + " > number of objects =" + exampleSet.size()); throw new UserError(this, 142, k); } ClusterModel model = createClusterModel(exampleSet); Attribute idAttribute = exampleSet.getAttributes().getId(); if (addsClusterAttribute()) { Attribute cluster = AttributeFactory.createAttribute("cluster", Ontology.NOMINAL); exampleSet.getExampleTable().addAttribute(cluster); exampleSet.getAttributes().setCluster(cluster); int i = 0; if (idAttribute.isNumerical()) { for (Example example : exampleSet) { example.setValue(cluster, "cluster_" + model.getClusterIndexOfId(example.getValue(idAttribute))); i++; } } else { for (Example example : exampleSet) { example.setValue(cluster, "cluster_" + model.getClusterIndexOfId(example.getValueAsString(idAttribute))); i++; } } } return model; } @Override public List<ParameterType> getParameterTypes() { List<ParameterType> types = new LinkedList<ParameterType>(); ParameterType type = new ParameterTypeInt(PARAMETER_K, "The number of clusters which should be found.", 2, Integer.MAX_VALUE, 2); type.setExpert(false); types.add(type); types.addAll(super.getParameterTypes()); types.add(new ParameterTypeInt(PARAMETER_MAX_RUNS, "The maximal number of runs of this operator with random initialization that are performed.", 1,Integer.MAX_VALUE, 5, false)); types.add(new ParameterTypeInt(PARAMETER_MAX_OPTIMIZATION_STEPS, "The maximal number of iterations performed for one run of this operator.", 1, Integer.MAX_VALUE, 100, false)); types.add(new ParameterTypeDouble(PARAMETER_QUALITY,"The quality that must be fullfilled before the algorithm stops. (The rising of the loglikelyhood that must be undercut)", 1.0E-15, 1.0E-1, 1.0E-10)); types.addAll(RandomGenerator.getRandomGeneratorParameters(this)); types.add(new ParameterTypeBoolean(PARAMETER_SHOW_PROBABILITIES, "Insert probabilities for every cluster with every example in the example set.",true)); types.add(new ParameterTypeCategory(PARAMETER_INITIALIZATION_DISTRIBUTION, "Indicates the inital distribution of the centroids.", INIT_DISTRIBUTION, K_MEANS)); types.add(new ParameterTypeBoolean(PARAMETER_CORRELATED, "Has to be activated, if the example set contains correlated attributes.", true)); return types; } }