/** * Copyright (C) 2001-2017 by RapidMiner and the contributors * * Complete list of developers available at our web site: * * http://rapidminer.com * * This program is free software: you can redistribute it and/or modify it under the terms of the * GNU Affero General Public License as published by the Free Software Foundation, either version 3 * of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without * even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License along with this program. * If not, see http://www.gnu.org/licenses/. */ package de.dfki.madm.operator.clustering; import java.util.ArrayList; import com.rapidminer.example.Attribute; import com.rapidminer.example.Attributes; import com.rapidminer.example.Example; import com.rapidminer.example.ExampleSet; import com.rapidminer.example.Tools; import com.rapidminer.example.set.SplittedExampleSet; import com.rapidminer.example.table.AttributeFactory; import com.rapidminer.operator.Operator; import com.rapidminer.operator.OperatorDescription; import com.rapidminer.operator.OperatorException; import com.rapidminer.operator.OperatorProgress; import com.rapidminer.operator.ProcessStoppedException; import com.rapidminer.operator.UserError; import com.rapidminer.operator.clustering.Centroid; import com.rapidminer.operator.clustering.CentroidClusterModel; import com.rapidminer.operator.clustering.Cluster; import com.rapidminer.operator.clustering.ClusterModel; import com.rapidminer.operator.clustering.clusterer.FastKMeans; import com.rapidminer.operator.clustering.clusterer.KMeans; import com.rapidminer.operator.clustering.clusterer.RMAbstractClusterer; import com.rapidminer.tools.Ontology; import com.rapidminer.tools.math.similarity.DistanceMeasure; import de.dfki.madm.operator.KMeanspp; public class XMeansCore extends RMAbstractClusterer { private static final int INTERMEDIATE_PROGRESS = 20; private ExampleSet exampleSet = null; private int examplesize = -1; private DistanceMeasure measure = null; private int k_min = -1; private int k_max = -1; private boolean kpp = false; private int maxOptimizationSteps = -1; private int maxRuns = -1; private OperatorDescription description = null; private Attributes attributes = null; private int dimension = -1; private int[] centroidAssignments = null; private String ClusteringAlgorithm = ""; private Operator executingOperator = null; /** * Initialization of X-Mean * * @param eSet * ExamleSet to cluster * @param k_min * minimal number of cluster * @param k_max * maximal number of cluster * @param kpp * using K++-Algorithem to determin the first centroids * @param maxOptimizationSteps * maximal optimationsteps of k-Means * @param maxRuns * The maximal number of runs of k-Means with random initialization that are * performed. * @param description * @param measure * MeasureType to use * @param cluster_alg * Clustering Algorithm to use */ public XMeansCore(ExampleSet eSet, int k_min, int k_max, boolean kpp, int maxOptimizationSteps, int maxRuns, OperatorDescription description, DistanceMeasure measure, String cluster_alg) { super(description); this.exampleSet = eSet; this.measure = measure; this.k_max = k_max; this.k_min = k_min; this.kpp = kpp; this.maxOptimizationSteps = maxOptimizationSteps; this.maxRuns = maxRuns; this.description = description; this.centroidAssignments = new int[exampleSet.size()]; this.ClusteringAlgorithm = cluster_alg; } /** * Running X-Means Algorithm * * @return Clustered Model * @throws OperatorException */ public ClusterModel doXMean() throws OperatorException { examplesize = exampleSet.size(); measure.init(exampleSet); // checking and creating ids if necessary Tools.checkAndCreateIds(exampleSet); // additional checks Tools.onlyNonMissingValues(exampleSet, getOperatorClassName(), this, new String[0]); if (exampleSet.size() < k_min) { throw new UserError(this, 142, k_min); } // extracting attribute names attributes = exampleSet.getAttributes(); ArrayList<String> attributeNames = new ArrayList<String>(attributes.size()); for (Attribute attribute : attributes) { attributeNames.add(attribute.getName()); } CentroidClusterModel bestModel = null; RMAbstractClusterer KMean = null; // get the Clustering Algorithm if (this.ClusteringAlgorithm.equals("FastKMeans")) { KMean = new FastKMeans(description); ((FastKMeans) KMean).setPresetMeasure(measure); } else if (this.ClusteringAlgorithm.equals("KMeans")) { KMean = new KMeans(description); ((KMeans) KMean).setPresetMeasure(measure); } else { throw new OperatorException("Unknown kmeans algorithm: " + ClusteringAlgorithm); } // Set Parameters for Clustering Algorithm KMean.setParameter("k", k_min + ""); KMean.setParameter("max_runs", maxRuns + ""); KMean.setParameter("max_optimization_steps", maxOptimizationSteps + ""); KMean.setParameter(KMeanspp.PARAMETER_USE_KPP, kpp + ""); // initialize progress OperatorProgress operatorProgress = null; if (executingOperator != null && executingOperator.getProgress() != null) { operatorProgress = executingOperator.getProgress(); operatorProgress.setTotal(100); } // get the first run bestModel = (CentroidClusterModel) KMean.generateClusterModel(exampleSet); if (operatorProgress != null) { operatorProgress.setCompleted(INTERMEDIATE_PROGRESS); } // save Dimension of data dimension = bestModel.getCentroid(0).getCentroid().length; // calculate first BIC double current_m_BIC = this.calcBIC(bestModel); boolean change = true; boolean addAsLabel = getParameterAsBoolean(RMAbstractClusterer.PARAMETER_ADD_AS_LABEL); boolean removeUnlabeled = getParameterAsBoolean(RMAbstractClusterer.PARAMETER_REMOVE_UNLABELED); while (bestModel.getCentroids().size() < k_max && change) { change = false; int array_size = bestModel.getClusters().size(); CentroidClusterModel[] Children = new CentroidClusterModel[array_size]; CentroidClusterModel[] Parent = new CentroidClusterModel[array_size]; SplittedExampleSet splittedSet = SplittedExampleSet.splitByAttribute(exampleSet, exampleSet.getAttributes().get("cluster")); if (splittedSet.getNumberOfSubsets() < array_size) { break; } int anz = 0; // get all Child-cluster for (@SuppressWarnings("unused") Cluster cl : bestModel.getClusters()) { splittedSet.selectSingleSubset(anz); KMean.setParameter("k", 2 + ""); Children[anz] = (CentroidClusterModel) KMean.generateClusterModel(splittedSet); KMean.setParameter("k", 1 + ""); Parent[anz] = (CentroidClusterModel) KMean.generateClusterModel(splittedSet); anz++; } Double[] SaveDiffBic = new Double[array_size]; boolean[] takeChange = new boolean[array_size]; int change_anz = 0; // check which Children to take for (int i = 0; i < Parent.length; i++) { double BICc = calcBIC(Children[i]); double BICp = calcBIC(Parent[i]); if (BICc > BICp) { // take Children takeChange[i] = true; SaveDiffBic[i] = BICc - BICp; change_anz++; } else { takeChange[i] = false; } } CentroidClusterModel model = null; if (change_anz + array_size < k_max) { // all children are in the limit model = new CentroidClusterModel(exampleSet, change_anz + array_size, attributeNames, measure, addAsLabel, removeUnlabeled); int id = 0; for (int i = 0; i < array_size; i++) { if (takeChange[i]) { for (Centroid z : Children[i].getCentroids()) { model.assignExample(id, z.getCentroid()); id++; } } else { model.assignExample(id, Parent[i].getCentroid(0).getCentroid()); id++; } } } else { // pick the best children model = new CentroidClusterModel(exampleSet, k_max, attributeNames, measure, addAsLabel, removeUnlabeled); double hilf = 0; CentroidClusterModel hilf2 = null; // sort for (int i = 0; i < takeChange.length - 1; i++) { for (int j = i + 1; j < takeChange.length; j++) { if (SaveDiffBic[j] > SaveDiffBic[i]) { hilf = SaveDiffBic[j]; SaveDiffBic[j] = SaveDiffBic[i]; SaveDiffBic[i] = hilf; hilf2 = Children[j]; Children[j] = Children[i]; Children[i] = hilf2; hilf2 = Parent[j]; Parent[j] = Parent[i]; Parent[i] = hilf2; } } } int id = 0; int anz1 = 0; for (int i = 0; i < array_size; i++) { if (takeChange[i]) { for (Centroid z : Children[i].getCentroids()) { model.assignExample(id, z.getCentroid()); id++; anz1++; } } else { model.assignExample(id, Parent[i].getCentroid(0).getCentroid()); id++; anz1++; } if (anz1 >= k_max) { break; } } } model.finishAssign(); model = this.assinePoints(model); double new_m_BIC = calcBIC(model); // check if the new BIC is better than the old if (new_m_BIC > current_m_BIC) { change = true; bestModel = model; current_m_BIC = new_m_BIC; } else { model = null; } if (operatorProgress != null) { if (bestModel.getCentroids().size() > k_max) { operatorProgress.complete(); } else { operatorProgress.setCompleted((int) (INTERMEDIATE_PROGRESS + (100.0 - INTERMEDIATE_PROGRESS) * bestModel.getCentroids().size() / k_max)); } } } if (addsClusterAttribute()) { Attribute cluster = AttributeFactory.createAttribute("cluster", Ontology.NOMINAL); exampleSet.getExampleTable().addAttribute(cluster); exampleSet.getAttributes().setCluster(cluster); int i = 0; for (Example example : exampleSet) { example.setValue(cluster, "cluster_" + centroidAssignments[i]); i++; } } if (operatorProgress != null) { operatorProgress.complete(); } return bestModel; } /** * assign the Points to cluster * * @param model * @return */ private CentroidClusterModel assinePoints(CentroidClusterModel model) { double[] values = new double[attributes.size()]; int i = 0; for (Example example : exampleSet) { double[] exampleValues = getAsDoubleArray(example, attributes, values); double nearestDistance = measure.calculateDistance(model.getCentroidCoordinates(0), exampleValues); int nearestIndex = 0; int id = 0; for (Centroid cr : model.getCentroids()) { double distance = measure.calculateDistance(cr.getCentroid(), exampleValues); if (distance < nearestDistance) { nearestDistance = distance; nearestIndex = id; } id++; } centroidAssignments[i] = nearestIndex; i++; } model.setClusterAssignments(centroidAssignments, exampleSet); return model; } /** * Calculate the BIC like in the paper by Dan Pelleg and Andrew Moore * * @param bestModel * @return BIC of the given modell * @throws ProcessStoppedException */ private double calcBIC(CentroidClusterModel bestModel) throws ProcessStoppedException { double loglike = 0; int numCenters = bestModel.getNumberOfClusters(); int numDimensions = bestModel.getCentroidCoordinates(0).length; int numParameters = numCenters - 1 + // probabilities numCenters * numDimensions + // means numCenters; // variance params for (Cluster c : bestModel.getClusters()) { int current_id = c.getClusterId(); loglike += logLikelihoodEstimate(c, bestModel.getCentroidCoordinates(current_id), numCenters); } loglike -= numParameters / 2.0 * Math.log(examplesize); return loglike; } private double[] getAsDoubleArray(Example example, Attributes attributes, double[] values) { int i = 0; for (Attribute attribute : attributes) { values[i] = example.getValue(attribute); i++; } return values; } private double logLikelihoodEstimate(Cluster c, double[] centroid, int K) { double l = 0; double R = examplesize; double Rn = c.getNumberOfExamples(); double M = dimension; double d = 0; double[] values = new double[attributes.size()]; if (Rn > 1) { double sum = 0; final Attribute idAttribute = exampleSet.getAttributes().getId(); boolean idIsNominal = idAttribute.isNominal(); exampleSet.remapIds(); for (Object ob : c.getExampleIds()) { Example example; if (idIsNominal) { example = exampleSet.getExampleFromId(idAttribute.getMapping().mapString((String) ob)); } else { example = exampleSet.getExampleFromId(((Double) ob).intValue()); } if (example == null) { throw new RuntimeException("Unknown id: " + ob); } sum += Math.pow(measure.calculateDistance(centroid, getAsDoubleArray(example, attributes, values)), 2); } d = 1.0 / (Rn - K) * sum; l = -(Rn / 2.0) * Math.log(2.0 * Math.PI) - Rn * M / 2.0 * Math.log(d) - (Rn - K) / 2.0 + Rn * Math.log(Rn) - Rn * Math.log(R); } return l; } @Override public ClusterModel generateClusterModel(ExampleSet exampleSet) throws OperatorException { return null; } /** * The operator in which XMeans is done. Used to display the progress. * * @param executingOperator * The executing XMeans operator */ public void setExecutingOperator(Operator executingOperator) { this.executingOperator = executingOperator; } }