/*
* RapidMiner
*
* Copyright (C) 2001-2011 by Rapid-I and the contributors
*
* Complete list of developers available at our web site:
*
* http://rapid-i.com
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see http://www.gnu.org/licenses/.
*/
package com.rapidminer.operator.clustering.clusterer.soft;
import java.util.Collection;
import java.util.LinkedList;
import java.util.List;
import java.util.Random;
import java.util.Vector;
import Jama.Matrix;
import com.rapidminer.example.Attribute;
import com.rapidminer.example.Example;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.example.Tools;
import com.rapidminer.example.table.AttributeFactory;
import com.rapidminer.operator.OperatorCreationException;
import com.rapidminer.operator.OperatorDescription;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.UserError;
import com.rapidminer.operator.clustering.ClusterModel;
import com.rapidminer.operator.clustering.FlatFuzzyClusterModel;
import com.rapidminer.operator.clustering.clusterer.KMeans;
import com.rapidminer.operator.clustering.clusterer.RMAbstractClusterer;
import com.rapidminer.operator.ports.metadata.AttributeMetaData;
import com.rapidminer.parameter.ParameterType;
import com.rapidminer.parameter.ParameterTypeBoolean;
import com.rapidminer.parameter.ParameterTypeCategory;
import com.rapidminer.parameter.ParameterTypeDouble;
import com.rapidminer.parameter.ParameterTypeInt;
import com.rapidminer.parameter.UndefinedParameterError;
import com.rapidminer.tools.Ontology;
import com.rapidminer.tools.OperatorService;
import com.rapidminer.tools.RandomGenerator;
import com.rapidminer.tools.math.VectorMath;
/**
* This operator represents an implementation of the EM-algorithm.
*
* @author Regina Fritsch
*/
public class EMClusterer extends RMAbstractClusterer {
/** The parameter name for "the maximal number of clusters" */
public static final String PARAMETER_K = "k";
/**
* The parameter name for "the maximal number of runs of this operator with random initialization that are
* performed"
*/
public static final String PARAMETER_MAX_RUNS = "max_runs";
/** The parameter name for "the maximal number of iterations performed for one run of this operator" */
public static final String PARAMETER_MAX_OPTIMIZATION_STEPS = "max_optimization_steps";
/** The parameter name for "the quality, which has to be fulfilled for the stopping of the soft clustering" */
public static final String PARAMETER_QUALITY = "quality";
/** The parameter name for "Indicates if the probabilities will be shown in example table" */
public static final String PARAMETER_SHOW_PROBABILITIES = "show_probabilities";
/** The parameter name for "Indicates the initialization distribution" */
public static final String PARAMETER_INITIALIZATION_DISTRIBUTION = "inital_distribution";
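/** The selectable initialization distributions; the position of an entry is its category index (see {@link #RANDOMLY_ASSIGNED}, {@link #K_MEANS}, {@link #AVERAGE_PARAMETERS}). */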
public static final String[] INIT_DISTRIBUTION = { "randomly assigned examples", "k-means run", "average parameters" };
/** Index of the entry "randomly assigned examples" in {@link #INIT_DISTRIBUTION}. */
public static final int RANDOMLY_ASSIGNED = 0;
/** Index of the entry "k-means run" in {@link #INIT_DISTRIBUTION}. */
public static final int K_MEANS = 1;
/** Index of the entry "average parameters" in {@link #INIT_DISTRIBUTION}. */
public static final int AVERAGE_PARAMETERS = 2;
/** The parameter name for "Indicates if the example set has correlated attributes" */
public static final String PARAMETER_CORRELATED = "correlated_attributes";
/**
 * Creates the operator; the description is passed on unchanged to the superclass constructor.
 *
 * @param description the operator description
 */
public EMClusterer(OperatorDescription description) {
super(description);
}
/**
 * Announces the per-cluster probability attributes ({@code cluster_<i>_probability}) for the meta data
 * propagation.
 * <p>
 * These attributes are only added to the example set when {@link #PARAMETER_SHOW_PROBABILITIES} is enabled, so
 * they are only announced in that case.
 *
 * @return one meta data entry per cluster; an empty list if the probabilities are not shown or
 *         {@link #PARAMETER_K} cannot be read
 */
@Override
protected Collection<AttributeMetaData> getAdditionalAttributes() {
    List<AttributeMetaData> propAttributes = new LinkedList<AttributeMetaData>();
    if (!getParameterAsBoolean(PARAMETER_SHOW_PROBABILITIES)) {
        // the probability attributes are not created at runtime in this case
        return propAttributes;
    }
    try {
        int k = getParameterAsInt(PARAMETER_K);
        for (int i = 0; i < k; i++) {
            String name = "cluster_" + i + "_probability";
            propAttributes.add(new AttributeMetaData(name, Ontology.REAL, name));
        }
    } catch (UndefinedParameterError ignored) {
        // k cannot be read: nothing can be announced, so the list stays empty
    }
    return propAttributes;
}
/**
 * Creates the cluster model by running the EM algorithm several times and keeping the best run.
 * <p>
 * Every run initializes a model with {@code init} and then alternates expectation and maximization steps until
 * the change of the value returned by {@code computeLogLikelyhood} falls below {@link #PARAMETER_QUALITY} or
 * {@link #PARAMETER_MAX_OPTIMIZATION_STEPS} steps were performed. A run that throws an exception is discarded and
 * {@link #PARAMETER_MAX_RUNS} is temporarily raised by one to make up for it. As soon as more runs have failed
 * than were originally configured, the loop ends; if too few runs succeeded by then, the whole procedure is
 * repeated once with {@link #PARAMETER_CORRELATED} switched off.
 * <p>
 * {@link #PARAMETER_MAX_RUNS} and {@link #PARAMETER_CORRELATED} are restored to their original values before this
 * method returns, also when it is left by an exception.
 *
 * @param exampleSet the examples to cluster
 * @return the best model found, or {@code null} if no run finished without an exception
 * @throws OperatorException if a parameter cannot be read or the initialization fails
 */
public ClusterModel createClusterModel(ExampleSet exampleSet) throws OperatorException {
    FlatFuzzyClusterModel bestModel = null;
    final int restoreMaxRuns = getParameterAsInt(PARAMETER_MAX_RUNS);
    final boolean restoreCorrelated = getParameterAsBoolean(PARAMETER_CORRELATED);
    final boolean isCorrelated = restoreCorrelated;
    final int k = getParameterAsInt(PARAMETER_K);
    final int maxOptimizationSteps = getParameterAsInt(PARAMETER_MAX_OPTIMIZATION_STEPS);
    final double quality = getParameterAsDouble(PARAMETER_QUALITY);
    final boolean addAsLabel = getParameterAsBoolean(RMAbstractClusterer.PARAMETER_ADD_AS_LABEL);
    final boolean removeUnlabeled = getParameterAsBoolean(RMAbstractClusterer.PARAMETER_REMOVE_UNLABELED);
    final int initSpecialSize = exampleSet.getAttributes().specialSize();
    double max = Double.NEGATIVE_INFINITY;
    int exceptionCounter = 0;
    try {
        // the runs; the upper bound is re-read on purpose because it is raised whenever a run fails
        for (int iter = 0; iter < getParameterAsInt(PARAMETER_MAX_RUNS); iter++) {
            // Every run works on its own matrix: the matrix of the best run is handed to the model below and
            // must not be overwritten by later runs, and no state of a previous run may leak into the next
            // initialization.
            double[][] exampleInClusterProbability = new double[exampleSet.size()][k];
            FlatFuzzyClusterModel result = new FlatFuzzyClusterModel(exampleSet, k, addAsLabel, removeUnlabeled);
            // initialize the model
            try {
                init(exampleSet, result, k, initSpecialSize, exampleInClusterProbability);
            } catch (OperatorCreationException e) {
                // without the helper operator the model stays uninitialized, so continuing would be pointless
                throw new OperatorException("Cannot create the operator needed to initialize the clusters.", e);
            }
            boolean stableState = false;
            double previousLogLikelihood = Double.POSITIVE_INFINITY;
            double logLikelihood = 0;
            int[] clusterAssignments = new int[exampleSet.size()];
            try {
                // the optimization steps
                for (int optiStep = 0; (optiStep < maxOptimizationSteps) && !stableState; optiStep++) {
                    FlatFuzzyClusterModel oldResult = result;
                    result = new FlatFuzzyClusterModel(exampleSet, k, addAsLabel, removeUnlabeled);
                    // compute the probabilities for each example with each cluster
                    if (isCorrelated) {
                        expectationCorrelated(exampleSet, k, exampleInClusterProbability, oldResult);
                    } else {
                        expectationNonCorrelated(exampleSet, k, exampleInClusterProbability, oldResult);
                    }
                    // derive the hard clustering from the soft clustering
                    for (int exampleIndex = 0; exampleIndex < exampleSet.size(); exampleIndex++) {
                        int bestIndex = bestIndex(exampleIndex, k, exampleInClusterProbability);
                        if (bestIndex < 0) {
                            // no cluster has a positive probability: pick one at random
                            bestIndex = RandomGenerator.getGlobalRandomGenerator().nextInt(result.getNumberOfClusters());
                        }
                        clusterAssignments[exampleIndex] = bestIndex;
                    }
                    result.setClusterAssignments(clusterAssignments, exampleSet);
                    // recalculate cluster probabilities, means and standard deviations (or covariances)
                    maximization(exampleSet, k, exampleInClusterProbability, result);
                    // stop as soon as the change drops below the user-defined quality; a NaN difference never
                    // compares as smaller and therefore counts as "not stable"
                    logLikelihood = computeLogLikelyhood(k, exampleInClusterProbability, result);
                    stableState = Math.abs(previousLogLikelihood - logLikelihood) < quality;
                    previousLogLikelihood = logLikelihood;
                }
            } catch (Exception e) {
                // Deliberately broad: a failing run (e.g. a covariance matrix that cannot be inverted) must not
                // abort the operator right away, and usable models of earlier runs must not be discarded.
                exceptionCounter++;
                if (exceptionCounter > restoreMaxRuns) {
                    // if there are not enough models, start again without the option correlated
                    if ((iter - (exceptionCounter - 1)) < Math.round(restoreMaxRuns * 0.49)) {
                        getLogger().info("Can't compute the inverse of the covariance matrix. Maybe the Matrix is singular. Changing option \"correlated_attributes\" to false.");
                        setParameter(PARAMETER_CORRELATED, "" + false);
                        setParameter(PARAMETER_MAX_RUNS, "" + restoreMaxRuns);
                        bestModel = (FlatFuzzyClusterModel) createClusterModel(exampleSet);
                    }
                    break;
                } else {
                    // grant one extra run as a replacement for the failed one
                    setParameter(PARAMETER_MAX_RUNS, "" + (getParameterAsInt(PARAMETER_MAX_RUNS) + 1));
                    continue;
                }
            }
            // keep the run with the largest absolute value of the quality measure
            // NOTE(review): selection by absolute value is kept as is - confirm that this is the intended order
            if (Math.abs(logLikelihood) > max) {
                max = Math.abs(logLikelihood);
                bestModel = result;
                if (showProbs()) {
                    setProbabilitiesInTable(exampleSet, exampleInClusterProbability);
                    bestModel.setExampleInClusterProbability(exampleInClusterProbability);
                }
            }
        }
    } finally {
        // restore original values, also if an exception leaves this method
        setParameter(PARAMETER_MAX_RUNS, "" + restoreMaxRuns);
        setParameter(PARAMETER_CORRELATED, "" + restoreCorrelated);
    }
    return bestModel;
}
/*
 * INIT SECTOR
 */
/**
 * Main init method: initializes means, standard deviations (or covariance matrices) and cluster probabilities of
 * the given model according to {@link #PARAMETER_INITIALIZATION_DISTRIBUTION}.
 * <p>
 * For the two hard initializations ("randomly assigned examples" and "k-means run") every row of
 * {@code exampleInClusterProbability} is reset before the example is assigned, so that each example belongs to
 * exactly one cluster regardless of what the matrix contained before.
 * <p>
 * If the probabilities are to be shown and no special attribute was added since {@code initSpecialSize} was
 * taken, the probability attributes are created and filled with the initial values.
 *
 * @param exampleSet the examples to cluster
 * @param result the model to initialize
 * @param k the number of clusters
 * @param initSpecialSize the number of special attributes of the example set before the first initialization
 * @param exampleInClusterProbability matrix [example][cluster] that receives the initial memberships
 * @throws OperatorException if a parameter cannot be read or the k-means run fails
 * @throws OperatorCreationException if the k-means operator cannot be created
 */
private void init(ExampleSet exampleSet, FlatFuzzyClusterModel result, int k, int initSpecialSize, double[][] exampleInClusterProbability) throws OperatorException, OperatorCreationException {
    int distribution = getParameterAsInt(PARAMETER_INITIALIZATION_DISTRIBUTION);
    switch (distribution) {
    case RANDOMLY_ASSIGNED: {
        // allocate the examples randomly to the clusters; repeat until no cluster is empty
        Random random = RandomGenerator.getRandomGenerator(this);
        int clustersFilled;
        do {
            clustersFilled = 0;
            double[][] clusterMeans = new double[k][exampleSet.getAttributes().size()];
            int exampleIndex = 0;
            for (Example example : exampleSet) {
                double[] memberships = exampleInClusterProbability[exampleIndex];
                // drop assignments of a previous attempt or a previous run
                java.util.Arrays.fill(memberships, 0.0d);
                int cluster = random.nextInt(k);
                memberships[cluster] = 1;
                int attributeIndex = 0;
                for (Attribute attribute : exampleSet.getAttributes()) {
                    clusterMeans[cluster][attributeIndex] += example.getValue(attribute);
                    attributeIndex++;
                }
                exampleIndex++;
            }
            for (int cluster = 0; cluster < k; cluster++) {
                // store the summed values; they are normalized by computeValuesWithClusterMemberships below
                result.setClusterMean(cluster, clusterMeans[cluster]);
                // check if there is at least one example in this cluster
                for (double[] memberships : exampleInClusterProbability) {
                    if (memberships[cluster] == 1) {
                        clustersFilled++;
                        break;
                    }
                }
            }
        } while (clustersFilled < k);
        // compute means (normalized), standard deviations and cluster probabilities
        computeValuesWithClusterMemberships(exampleSet, k, exampleInClusterProbability, result);
        if (isCorrelated()) {
            initCovarianceMatrix(exampleSet, exampleInClusterProbability, result, k);
        }
        break;
    }
    case K_MEANS: {
        // allocate the examples to the clusters according to a k-means run
        KMeans clusterAlgorithm = OperatorService.createOperator(KMeans.class);
        ExampleSet clusterSet = (ExampleSet) exampleSet.clone();
        clusterAlgorithm.setParameter(KMeans.PARAMETER_K, "" + k);
        clusterAlgorithm.setParameter(RMAbstractClusterer.PARAMETER_ADD_CLUSTER_ATTRIBUTE, "true");
        // as a side effect, this adds the cluster attribute to clusterSet
        clusterAlgorithm.generateClusterModel(clusterSet);
        double[][] clusterMeans = new double[k][exampleSet.getAttributes().size()];
        Attribute clusterAttribute = clusterSet.getAttributes().getCluster();
        int exampleIndex = 0;
        for (Example example : clusterSet) {
            double[] memberships = exampleInClusterProbability[exampleIndex];
            // drop memberships left over from a previous run
            java.util.Arrays.fill(memberships, 0.0d);
            int clusterIndex = (int) example.getValue(clusterAttribute);
            memberships[clusterIndex] = 1;
            int attributeIndex = 0;
            for (Attribute attribute : clusterSet.getAttributes()) {
                clusterMeans[clusterIndex][attributeIndex] += example.getValue(attribute);
                attributeIndex++;
            }
            exampleIndex++;
        }
        for (int cluster = 0; cluster < k; cluster++) {
            result.setClusterMean(cluster, clusterMeans[cluster]);
        }
        // compute means (normalized), standard deviations and cluster probabilities
        computeValuesWithClusterMemberships(exampleSet, k, exampleInClusterProbability, result);
        if (isCorrelated()) {
            initCovarianceMatrix(exampleSet, exampleInClusterProbability, result, k);
        }
        break;
    }
    case AVERAGE_PARAMETERS:
    default: {
        Random random = RandomGenerator.getRandomGenerator(this);
        initAverageParameters(exampleSet, k, exampleInClusterProbability, result, random);
        break;
    }
    }
    // show probabilities in example table? The attributes are only created as long as no special attribute
    // was added since initSpecialSize was taken.
    if (showProbs() && exampleSet.getAttributes().specialSize() == initSpecialSize) {
        for (int i = 0; i < k; i++) {
            String name = "cluster_" + i + "_probability";
            Attribute newAttribute = AttributeFactory.createAttribute(Ontology.REAL);
            newAttribute.setName(name);
            exampleSet.getExampleTable().addAttribute(newAttribute);
            exampleSet.getAttributes().setSpecialAttribute(newAttribute, name);
        }
        setProbabilitiesInTable(exampleSet, exampleInClusterProbability);
    }
}
/**
 * Derives the initial means, standard deviations and cluster probabilities from a hard cluster assignment.
 * <p>
 * This method does not work alone: it expects the cluster means stored in {@code result} to hold the summed
 * attribute values of the cluster members, and an entry of exactly 1 in {@code exampleInClusterProbability} for
 * every example that belongs to a cluster.
 *
 * @param exampleSet the clustered examples
 * @param k the number of clusters
 * @param exampleInClusterProbability matrix [example][cluster] with the hard assignment
 * @param result the model whose means are normalized and whose remaining values are set
 */
private void computeValuesWithClusterMemberships(ExampleSet exampleSet, int k, double[][] exampleInClusterProbability, FlatFuzzyClusterModel result) {
    // first pass: turn the summed attribute values into means
    for (int cluster = 0; cluster < k; cluster++) {
        int members = 0;
        for (double[] memberships : exampleInClusterProbability) {
            if (memberships[cluster] == 1) {
                members++;
            }
        }
        double[] normalizedMean = new double[result.getClusterMean(cluster).length];
        for (int attributeIndex = 0; attributeIndex < result.getClusterMean(cluster).length; attributeIndex++) {
            normalizedMean[attributeIndex] = result.getClusterMean(cluster)[attributeIndex] / members;
        }
        result.setClusterMean(cluster, normalizedMean);
    }
    // second pass: mean squared distance of the members to their cluster mean, and the cluster share
    for (int cluster = 0; cluster < k; cluster++) {
        int members = 0;
        double squaredDistanceSum = 0;
        for (int exampleIndex = 0; exampleIndex < exampleInClusterProbability.length; exampleIndex++) {
            if (exampleInClusterProbability[exampleIndex][cluster] == 1) {
                double[] exampleValues = exampleToArray(exampleSet.getExample(exampleIndex));
                double[] deviation = VectorMath.vectorSubtraction(exampleValues, result.getClusterMean(cluster));
                squaredDistanceSum += VectorMath.vectorMultiplication(deviation, deviation);
                members++;
            }
        }
        result.setClusterStandardDeviation(cluster, squaredDistanceSum / members);
        result.setClusterProbability(cluster, (double) members / exampleSet.size());
    }
}
/**
 * Initializes the cluster covariance matrices [Sigma_i] of the given model.
 * <p>
 * The order of the three steps matters: the covariance matrices are computed from the probabilities that the
 * first step writes into {@code exampleInClusterProbability}, and that first step reads the standard deviations
 * which the last step clears.
 *
 * @param exampleSet the examples to cluster
 * @param exampleInClusterProbability matrix [example][cluster]; overwritten with P(C_i|x)
 * @param result the model that receives the covariance matrices
 * @param k the number of clusters
 */
private void initCovarianceMatrix(ExampleSet exampleSet, double[][] exampleInClusterProbability, FlatFuzzyClusterModel result, int k) {
// compute examplesInClusterProbabilities [P(C_i|x)] (the probabilities for each example with each cluster)
expectationNonCorrelated(exampleSet, k, exampleInClusterProbability, result);
// init clusterCovarianceMatrix [Sigma_i]
computeCovarianceMatrix(exampleSet, exampleInClusterProbability, result, k);
// the covariance matrices are in place now, so the standard deviations are cleared on the model
result.clearClusterStandardDeviations();
}
/**
 * Initializes means, standard deviations and cluster probabilities from the per-attribute minimum, average and
 * maximum of the example set.
 * <p>
 * Minimum, average and maximum are each shifted by a random offset of at most {@code (max - min) / (2 * k)} per
 * attribute, so that different runs start from different values. The first {@code k / 2} cluster means are
 * spread between minimum and average, the remaining ones between average and maximum; for an odd {@code k} the
 * middle cluster is placed on the average. All cluster probabilities are set to {@code 1 / k}.
 *
 * @param exampleSet the examples to cluster
 * @param k the number of clusters
 * @param exampleInClusterProbability matrix [example][cluster]; only used when the covariance matrices are
 *        initialized
 * @param result the model to initialize
 * @param random source of the random offsets
 */
private void initAverageParameters(ExampleSet exampleSet, int k, double[][] exampleInClusterProbability, FlatFuzzyClusterModel result, Random random) {
    int numberOfAttributes = exampleSet.getAttributes().size();
    double[] max = new double[numberOfAttributes];
    double[] min = new double[numberOfAttributes];
    double[] average = new double[numberOfAttributes];
    for (int j = 0; j < numberOfAttributes; j++) {
        min[j] = Double.POSITIVE_INFINITY;
        // must not start at 0, otherwise the maximum of an attribute with only negative values is wrong
        max[j] = Double.NEGATIVE_INFINITY;
    }
    // compute average, minimum and maximum values of the attributes
    for (Example ex : exampleSet) {
        int j = 0;
        for (Attribute attribute : exampleSet.getAttributes()) {
            double value = ex.getValue(attribute);
            average[j] += value;
            // two independent checks: one value can be the new minimum and the new maximum at the same time
            if (value < min[j]) {
                min[j] = value;
            }
            if (value > max[j]) {
                max[j] = value;
            }
            j++;
        }
    }
    for (int j = 0; j < average.length; j++) {
        average[j] = average[j] / exampleSet.size();
    }
    // make it random (to get different initializations for the different iterations)
    double[] offset = VectorMath.vectorDivision(VectorMath.vectorSubtraction(max, min), (k * 2));
    min = VectorMath.vectorAddition(min, getOffset(offset, random));
    average = VectorMath.vectorAddition(average, getOffset(offset, random));
    max = VectorMath.vectorAddition(max, getOffset(offset, random));
    // step widths below and above the average; k / 2 is an integer division on purpose
    double[] lowerStep = VectorMath.vectorDivision(VectorMath.vectorSubtraction(average, min), (k / 2 + 1));
    double[] upperStep = VectorMath.vectorDivision(VectorMath.vectorSubtraction(max, average), (k / 2 + 1));
    int upperIndex = 0;
    for (int i = 0; i < k; i++) {
        double[] clusterMean;
        double clusterStDeviation;
        if (i < k / 2) {
            // lower half: walk from the minimum towards the average
            clusterMean = VectorMath.vectorAddition(VectorMath.vectorMultiplication(lowerStep, (i + 1)), min);
            clusterStDeviation = VectorMath.vectorMultiplication(lowerStep, lowerStep);
        } else if ((i == k / 2) && (k % 2 == 1)) {
            // middle cluster of an odd k: sits on the average and uses the mean of both squared step widths
            clusterMean = average;
            clusterStDeviation = VectorMath.vectorMultiplication(lowerStep, lowerStep);
            clusterStDeviation += VectorMath.vectorMultiplication(upperStep, upperStep);
            clusterStDeviation = clusterStDeviation / 2;
        } else {
            // upper half: walk from the average towards the maximum
            clusterMean = VectorMath.vectorAddition(average, VectorMath.vectorMultiplication(upperStep, (upperIndex + 1)));
            clusterStDeviation = VectorMath.vectorMultiplication(upperStep, upperStep);
            upperIndex++;
        }
        result.setClusterMean(i, clusterMean);
        result.setClusterStandardDeviation(i, clusterStDeviation);
        // all clusters start with the same probability
        result.setClusterProbability(i, (double) 1 / k);
    }
    if (isCorrelated()) {
        initCovarianceMatrix(exampleSet, exampleInClusterProbability, result, k);
    }
}
/**
 * Scales the given offset vector by a single random factor drawn from the range -1 to 1, i.e. the result lies
 * within +/- the input offset.
 *
 * @param offset the maximal offset per attribute
 * @param random source of the random factor
 * @return the scaled offset vector
 */
private double[] getOffset(double[] offset, Random random) {
    // stretch the random value from [0, 1) to [-1, 1)
    final double factor = random.nextDouble() * 2 - 1;
    double[] scaledOffset = VectorMath.vectorMultiplication(offset, factor);
    return scaledOffset;
}
/*
* END: INIT SECTOR
*/
/**
 * Determines the cluster an example fits best, i.e. the cluster with the highest probability for this example.
 * Only probabilities greater than 0 are considered; of several equal probabilities the first one wins.
 *
 * @param exampleIndex the row of the example in {@code exampleInClusterProbability}
 * @param k the number of clusters
 * @param exampleInClusterProbability matrix [example][cluster]
 * @return the index of the best cluster, or -1 if no cluster has a probability greater than 0
 */
protected int bestIndex(int exampleIndex, int k, double[][] exampleInClusterProbability) throws Exception {
    int winner = -1;
    double winnerProbability = 0;
    int cluster = 0;
    while (cluster < k) {
        final double candidate = exampleInClusterProbability[exampleIndex][cluster];
        if (candidate > winnerProbability) {
            winner = cluster;
            winnerProbability = candidate;
        }
        cluster++;
    }
    return winner;
}
/**
 * Expectation step for non-correlated attributes: computes the probability of each example for each cluster
 * from the cluster means, the cluster standard deviations and the cluster probabilities of the old model.
 * <p>
 * Every row of {@code exampleInClusterProbability} is overwritten and normalized by its row sum.
 *
 * @param exampleSet the examples to cluster
 * @param k the number of clusters
 * @param exampleInClusterProbability matrix [example][cluster] that receives the probabilities
 * @param oldResult the model of the previous step
 */
protected void expectationNonCorrelated(ExampleSet exampleSet, int k, double[][] exampleInClusterProbability, FlatFuzzyClusterModel oldResult) {
    final int numberOfAttributes = exampleSet.getAttributes().size();
    int exampleIndex = 0;
    for (Example example : exampleSet) {
        // the attribute values do not depend on the cluster, so they are extracted once per example
        final double[] exampleValues = exampleToArray(example);
        final double[] probabilities = exampleInClusterProbability[exampleIndex];
        double sum = 0;
        for (int cluster = 0; cluster < k; cluster++) {
            double[] deviation = VectorMath.vectorSubtraction(exampleValues, oldResult.getClusterMean(cluster));
            double stDev = oldResult.getClusterStandardDeviation(cluster);
            // stDev must not be 0: it is used as a divisor below
            if (stDev == 0) {
                stDev = 1E-10;
            }
            // formula see: http://jmlr.csail.mit.edu/papers/volume6/banerjee05b/banerjee05b.pdf (page 1725 + 1729)
            probabilities[cluster] = ((1 / Math.sqrt(Math.pow(2 * Math.PI * stDev, numberOfAttributes)))
                    * Math.exp(-1 * (VectorMath.vectorMultiplication(deviation, deviation) / (2 * stDev)))
                    * oldResult.getClusterProbability(cluster));
            sum += probabilities[cluster];
        }
        // normalize, so that the probabilities of one example sum up to 1
        for (int cluster = 0; cluster < k; cluster++) {
            probabilities[cluster] = probabilities[cluster] / sum;
        }
        exampleIndex++;
    }
}
/**
 * Expectation step for correlated attributes: computes the probability of each example for each cluster from
 * the cluster means, the cluster covariance matrices and the cluster probabilities of the old model.
 * <p>
 * The inverse and the determinant of a covariance matrix depend on the cluster only, so they are computed once
 * per cluster before the examples are processed. A covariance matrix that cannot be inverted therefore makes
 * this method fail before any probability is written.
 *
 * @param exampleSet the examples to cluster
 * @param k the number of clusters
 * @param exampleInClusterProbability matrix [example][cluster] that receives the probabilities
 * @param oldResult the model of the previous step
 * @throws Exception if a covariance matrix cannot be inverted
 */
protected void expectationCorrelated(ExampleSet exampleSet, int k, double[][] exampleInClusterProbability, FlatFuzzyClusterModel oldResult) throws Exception {
    final int numberOfAttributes = exampleSet.getAttributes().size();
    // per-cluster values that are independent of the examples
    final Matrix[] inverseCovariances = new Matrix[k];
    final double[] normalizations = new double[k];
    for (int cluster = 0; cluster < k; cluster++) {
        Matrix covariance = new Matrix(oldResult.getClusterCovarianceMatrix(cluster));
        inverseCovariances[cluster] = covariance.inverse();
        double determinant = covariance.det();
        if (determinant < 0) {
            determinant *= -1;
        }
        normalizations[cluster] = 1 / Math.sqrt(Math.pow(2 * Math.PI, numberOfAttributes) * determinant);
    }
    int exampleIndex = 0;
    for (Example example : exampleSet) {
        final double[] exampleValues = exampleToArray(example);
        final double[] probabilities = exampleInClusterProbability[exampleIndex];
        double sum = 0;
        boolean overflow = false;
        for (int cluster = 0; cluster < k; cluster++) {
            double[] deviation = VectorMath.vectorSubtraction(exampleValues, oldResult.getClusterMean(cluster));
            double[][] column = new double[deviation.length][1];
            for (int row = 0; row < deviation.length; row++) {
                column[row][0] = deviation[row];
            }
            Matrix columnVector = new Matrix(column);
            // (x - my_i)^T * Sigma_i^-1 * (x - my_i), a 1x1 matrix
            Matrix quadraticForm = columnVector.transpose().times(inverseCovariances[cluster]).times(columnVector);
            double secondPart = Math.exp(quadraticForm.getArray()[0][0] * (-0.5));
            // this is here(!) only the conditional probability: P(Example_j|Cluster_i) * W_i
            probabilities[cluster] = normalizations[cluster] * secondPart * oldResult.getClusterProbability(cluster);
            if (probabilities[cluster] == Double.POSITIVE_INFINITY) {
                overflow = true;
            }
            // sum = P[x] -> probability of an example
            sum += probabilities[cluster];
        }
        // Sometimes double is not able to represent the probability. In that case the overflowed entries are
        // set to 1 and all others to 0; otherwise the row is normalized by its sum.
        for (int cluster = 0; cluster < k; cluster++) {
            if (!overflow) {
                probabilities[cluster] = probabilities[cluster] / sum;
            } else if (probabilities[cluster] == Double.POSITIVE_INFINITY) {
                probabilities[cluster] = 1.0;
            } else {
                probabilities[cluster] = 0.0;
            }
        }
        exampleIndex++;
    }
}
/**
 * Maximization step. Computes from the probabilities of each example for each cluster [P(Cluster_i|example)] the
 * new values of:
 * <ul>
 * <li>the cluster means [my_i],</li>
 * <li>the cluster probabilities [P(Cluster_i)], and</li>
 * <li>the cluster standard deviations [sigma_i], or, for correlated attributes, the cluster covariance matrices
 * [Sigma_i].</li>
 * </ul>
 *
 * @param exampleSet the examples to cluster
 * @param k the number of clusters
 * @param exampleInClusterProbability matrix [example][cluster]
 * @param result the model that receives the new values
 */
protected void maximization(ExampleSet exampleSet, int k, double[][] exampleInClusterProbability, FlatFuzzyClusterModel result) {
    for (int cluster = 0; cluster < k; cluster++) {
        // probability-weighted sum of the examples and the sum of the weights
        double weightSum = 0;
        double[] weightedSum = new double[exampleSet.getAttributes().size()];
        int exampleIndex = 0;
        for (Example example : exampleSet) {
            final double weight = exampleInClusterProbability[exampleIndex][cluster];
            weightSum += weight;
            double[] contribution = VectorMath.vectorMultiplication(exampleToArray(example), weight);
            weightedSum = VectorMath.vectorAddition(weightedSum, contribution);
            exampleIndex++;
        }
        result.setClusterMean(cluster, VectorMath.vectorDivision(weightedSum, weightSum));
        result.setClusterProbability(cluster, weightSum / exampleSet.size());
        if (isCorrelated()) {
            // the covariance matrices are computed for all clusters at once below
            continue;
        }
        // probability-weighted mean of the squared distances to the new mean
        double weightedSquaredDistance = 0;
        exampleIndex = 0;
        for (Example example : exampleSet) {
            double[] deviation = VectorMath.vectorSubtraction(exampleToArray(example), result.getClusterMean(cluster));
            double squaredDistance = VectorMath.vectorMultiplication(deviation, deviation);
            weightedSquaredDistance += exampleInClusterProbability[exampleIndex][cluster] * squaredDistance;
            exampleIndex++;
        }
        result.setClusterStandardDeviation(cluster, weightedSquaredDistance / weightSum);
    }
    if (isCorrelated()) {
        computeCovarianceMatrix(exampleSet, exampleInClusterProbability, result, k);
    }
}
/**
 * Computes the covariance matrix [Sigma_i] of every cluster: the probability-weighted sum of the outer products
 * of the deviations from the cluster mean, divided by the sum of the probabilities.
 *
 * @param exampleSet the examples to cluster
 * @param exampleInClusterProbability matrix [example][cluster] with the weights
 * @param result the model that provides the cluster means and receives the covariance matrices
 * @param k the number of clusters
 */
private void computeCovarianceMatrix(ExampleSet exampleSet, double[][] exampleInClusterProbability, FlatFuzzyClusterModel result, int k) {
    for (int cluster = 0; cluster < k; cluster++) {
        Matrix scatter = null;
        double weightSum = 0;
        int exampleIndex = 0;
        for (Example example : exampleSet) {
            double[] deviation = VectorMath.vectorSubtraction(exampleToArray(example), result.getClusterMean(cluster));
            // the deviation as a column vector
            double[][] column = new double[deviation.length][1];
            for (int row = 0; row < deviation.length; row++) {
                column[row][0] = deviation[row];
            }
            Matrix columnVector = new Matrix(column);
            final double weight = exampleInClusterProbability[exampleIndex][cluster];
            Matrix weightedOuterProduct = columnVector.times(columnVector.transpose()).times(weight);
            weightSum += weight;
            scatter = (scatter == null) ? weightedOuterProduct : scatter.plus(weightedOuterProduct);
            exampleIndex++;
        }
        double[][] covarianceMatrix = VectorMath.matrixDivision(scatter.getArray(), weightSum);
        result.setClusterCovarianceMatrix(cluster, covarianceMatrix);
    }
}
/**
 * Computes the value that is used as log-likelihood of a model: for every example the products of cluster
 * probability and example-in-cluster probability are added to a running sum, and the logarithm of that running
 * sum is added to the result.
 * <p>
 * NOTE(review): the running sum is carried over from one example to the next instead of starting at 0 for each
 * example. This looks unintended, but the stopping criterion and the model selection are based on the values
 * computed this way, so it is kept as is - confirm before changing.
 *
 * @param k the number of clusters
 * @param exampleInClusterProbability matrix [example][cluster]
 * @param resultModel the model that provides the cluster probabilities
 * @return the sum of the logarithms
 */
protected double computeLogLikelyhood(int k, double[][] exampleInClusterProbability, FlatFuzzyClusterModel resultModel) {
    double logSum = 0;
    double runningSum = 0;
    for (double[] probabilities : exampleInClusterProbability) {
        for (int cluster = 0; cluster < k; cluster++) {
            runningSum += resultModel.getClusterProbability(cluster) * probabilities[cluster];
        }
        logSum += Math.log(runningSum);
    }
    return logSum;
}
/**
 * Tells whether the cluster probabilities are to be shown in the example table.
 *
 * @return the value of {@link #PARAMETER_SHOW_PROBABILITIES}
 */
private boolean showProbs() {
    return getParameterAsBoolean(PARAMETER_SHOW_PROBABILITIES);
}
/**
 * Tells whether the example set is treated as having correlated attributes.
 *
 * @return the value of {@link #PARAMETER_CORRELATED}
 */
private boolean isCorrelated() {
    return getParameterAsBoolean(PARAMETER_CORRELATED);
}
/**
 * Writes the given probabilities into the attributes {@code cluster_<i>_probability} of the example set.
 *
 * @param exampleSet the example set; must already contain the probability attributes
 * @param exampleInClusterProbability matrix [example][cluster] with the values to write
 * @throws OperatorException if {@link #PARAMETER_K} cannot be read
 */
private void setProbabilitiesInTable(ExampleSet exampleSet, double[][] exampleInClusterProbability) throws OperatorException {
    final int k = getParameterAsInt(PARAMETER_K);
    for (int cluster = 0; cluster < k; cluster++) {
        // the attribute is the same for all examples, so it is looked up once per cluster
        Attribute probabilityAttribute = exampleSet.getAttributes().get("cluster_" + cluster + "_probability");
        int exampleIndex = 0;
        for (Example example : exampleSet) {
            example.setValue(probabilityAttribute, exampleInClusterProbability[exampleIndex][cluster]);
            exampleIndex++;
        }
    }
}
/**
 * Copies the values of an example into an array, one entry per attribute in iteration order of
 * {@code example.getAttributes()}. Needed for the vector operations.
 *
 * @param example the example to convert
 * @return the attribute values of the example
 */
private double[] exampleToArray(Example example) {
    final double[] values = new double[example.getAttributes().size()];
    int position = 0;
    for (Attribute attribute : example.getAttributes()) {
        values[position++] = example.getValue(attribute);
    }
    return values;
}
/**
 * Checks the example set, creates the cluster model and, if requested, adds a nominal attribute "cluster" whose
 * values are "cluster_" followed by the cluster index the model reports for the id of the example.
 *
 * @param exampleSet the examples to cluster
 * @return the created cluster model
 * @throws OperatorException if a check fails; a {@link UserError} if there are fewer examples than clusters
 */
@Override
public ClusterModel generateClusterModel(ExampleSet exampleSet) throws OperatorException {
    final int k = getParameterAsInt(PARAMETER_K);

    // perform checks
    Tools.isNonEmpty(exampleSet);
    Tools.checkAndCreateIds(exampleSet);
    if (k > exampleSet.size()) {
        logWarning("number of clusters (k) = " + k + " > number of objects =" + exampleSet.size());
        throw new UserError(this, 142, k);
    }

    ClusterModel model = createClusterModel(exampleSet);
    Attribute idAttribute = exampleSet.getAttributes().getId();
    if (!addsClusterAttribute()) {
        return model;
    }

    Attribute cluster = AttributeFactory.createAttribute("cluster", Ontology.NOMINAL);
    exampleSet.getExampleTable().addAttribute(cluster);
    exampleSet.getAttributes().setCluster(cluster);
    // the model is asked with the numerical value or the string value, depending on the type of the id
    if (idAttribute.isNumerical()) {
        for (Example example : exampleSet) {
            int clusterIndex = model.getClusterIndexOfId(example.getValue(idAttribute));
            example.setValue(cluster, "cluster_" + clusterIndex);
        }
    } else {
        for (Example example : exampleSet) {
            int clusterIndex = model.getClusterIndexOfId(example.getValueAsString(idAttribute));
            example.setValue(cluster, "cluster_" + clusterIndex);
        }
    }
    return model;
}
/**
 * Declares the parameters of this operator: the number of clusters, the parameters of the superclass, the limits
 * for runs and optimization steps, the quality threshold, the random generator parameters, and the switches for
 * showing probabilities, the initial distribution and correlated attributes.
 *
 * @return the parameter types in the order in which they are declared
 */
@Override
public List<ParameterType> getParameterTypes() {
    List<ParameterType> types = new LinkedList<ParameterType>();

    ParameterType numberOfClusters = new ParameterTypeInt(PARAMETER_K, "The number of clusters which should be found.", 2, Integer.MAX_VALUE, 2);
    numberOfClusters.setExpert(false);
    types.add(numberOfClusters);

    types.addAll(super.getParameterTypes());

    ParameterType maxRuns = new ParameterTypeInt(PARAMETER_MAX_RUNS, "The maximal number of runs of this operator with random initialization that are performed.", 1, Integer.MAX_VALUE, 5, false);
    types.add(maxRuns);

    ParameterType maxOptimizationSteps = new ParameterTypeInt(PARAMETER_MAX_OPTIMIZATION_STEPS, "The maximal number of iterations performed for one run of this operator.", 1, Integer.MAX_VALUE, 100, false);
    types.add(maxOptimizationSteps);

    ParameterType quality = new ParameterTypeDouble(PARAMETER_QUALITY, "The quality that must be fullfilled before the algorithm stops. (The rising of the loglikelyhood that must be undercut)", 1.0E-15, 1.0E-1, 1.0E-10);
    types.add(quality);

    types.addAll(RandomGenerator.getRandomGeneratorParameters(this));

    ParameterType showProbabilities = new ParameterTypeBoolean(PARAMETER_SHOW_PROBABILITIES, "Insert probabilities for every cluster with every example in the example set.", true);
    types.add(showProbabilities);

    ParameterType initialDistribution = new ParameterTypeCategory(PARAMETER_INITIALIZATION_DISTRIBUTION, "Indicates the inital distribution of the centroids.", INIT_DISTRIBUTION, K_MEANS);
    types.add(initialDistribution);

    ParameterType correlated = new ParameterTypeBoolean(PARAMETER_CORRELATED, "Has to be activated, if the example set contains correlated attributes.", true);
    types.add(correlated);

    return types;
}
}