XMeansCore.java example

Explorer
rapidminer-5-master
/*
 *  RapidMiner
 *
 *  Copyright (C) 2001-2014 by RapidMiner and the contributors
 *
 *  Complete list of developers available at our web site:
 *
 *       http://rapidminer.com
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Affero General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Affero General Public License for more details.
 *
 *  You should have received a copy of the GNU Affero General Public License
 *  along with this program.  If not, see http://www.gnu.org/licenses/.
 */
package de.dfki.madm.operator.clustering;

import java.util.ArrayList;

import com.rapidminer.example.Attribute;
import com.rapidminer.example.Attributes;
import com.rapidminer.example.Example;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.example.Tools;
import com.rapidminer.example.set.SplittedExampleSet;
import com.rapidminer.example.table.AttributeFactory;
import com.rapidminer.operator.OperatorDescription;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.UserError;
import com.rapidminer.operator.clustering.Centroid;
import com.rapidminer.operator.clustering.CentroidClusterModel;
import com.rapidminer.operator.clustering.Cluster;
import com.rapidminer.operator.clustering.ClusterModel;
import com.rapidminer.operator.clustering.clusterer.FastKMeans;
import com.rapidminer.operator.clustering.clusterer.KMeans;
import com.rapidminer.operator.clustering.clusterer.RMAbstractClusterer;
import com.rapidminer.tools.Ontology;
import com.rapidminer.tools.math.similarity.DistanceMeasure;

import de.dfki.madm.operator.KMeanspp;

public class XMeansCore extends RMAbstractClusterer {

	private ExampleSet exampleSet = null;
	private int examplesize = -1;
	private DistanceMeasure measure = null;
	private int k_min = -1;
	private int k_max = -1;
	private boolean kpp = false;
	private int maxOptimizationSteps = -1;
	private int maxRuns = -1;
	private OperatorDescription description = null;
	private Attributes attributes = null;
	private int dimension = -1;
	private int[] centroidAssignments = null;
	private String ClusteringAlgorithm = "";

	/**
	 * Initialization of X-Mean
	 * 
	 * @param eSet ExamleSet to cluster
	 * @param k_min minimal number of cluster
	 * @param k_max maximal number of cluster
	 * @param kpp using K++-Algorithem to determin the first centroids
	 * @param maxOptimizationSteps maximal optimationsteps of k-Means
	 * @param maxRuns The maximal number of runs of k-Means with random initialization that are performed.
	 * @param description
	 * @param measure MeasureType to use
	 * @param cluster_alg Clustering Algorithm to use
	 */
	public XMeansCore(ExampleSet eSet, int k_min, int k_max, boolean kpp,
			int maxOptimizationSteps, int maxRuns,
			OperatorDescription description, DistanceMeasure measure,
			String cluster_alg) {
		super(description);

		this.exampleSet = eSet;
		this.measure = measure;
		this.k_max = k_max;
		this.k_min = k_min;
		this.kpp = kpp;
		this.maxOptimizationSteps = maxOptimizationSteps;
		this.maxRuns = maxRuns;
		this.description = description;
		this.centroidAssignments = new int[exampleSet.size()];
		this.ClusteringAlgorithm = cluster_alg;
	}

	/**
	 * Running X-Means Algorithm
	 * 
	 * @return Clustered Model
	 * @throws OperatorException
	 */
	public ClusterModel doXMean() throws OperatorException {
		examplesize = exampleSet.size();

		measure.init(exampleSet);

		// checking and creating ids if necessary
		Tools.checkAndCreateIds(exampleSet);

		// additional checks
		Tools.onlyNonMissingValues(exampleSet, "KMeans");
		if (exampleSet.size() < k_min) {
			throw new UserError(this, 142, k_min);
		}

		// extracting attribute names
		attributes = exampleSet.getAttributes();
		ArrayList<String> attributeNames = new ArrayList<String>(
				attributes.size());
		for (Attribute attribute : attributes)
			attributeNames.add(attribute.getName());

		CentroidClusterModel bestModel = null;

		RMAbstractClusterer KMean = null;

		// get the Clustering Algorithm
		if (this.ClusteringAlgorithm.equals("FastKMeans")) {
			KMean = new FastKMeans(description);
			((FastKMeans) KMean).setPresetMeasure(measure);
		} else if (this.ClusteringAlgorithm.equals("KMeans")) {
			KMean = new KMeans(description);
			((KMeans) KMean).setPresetMeasure(measure);
		}
		
		// Set Parameters for Clustering Algorithm
		KMean.setParameter("k", k_min + "");
		KMean.setParameter("max_runs", maxRuns + "");
		KMean.setParameter("max_optimization_steps", maxOptimizationSteps+ "");
		KMean.setParameter(KMeanspp.PARAMETER_USE_KPP, kpp+"");
		
		// get the first run
		bestModel = (CentroidClusterModel) KMean.generateClusterModel(exampleSet);
		
		// save Dimension of data
		dimension = bestModel.getCentroid(0).getCentroid().length;

		// calculate first BIC
		double current_m_BIC = this.calcBIC(bestModel);

		boolean change = true;
		
		while (bestModel.getCentroids().size() < k_max && change) {
			checkForStop();
			change = false;
			int array_size = bestModel.getClusters().size();

			CentroidClusterModel[] Children = new CentroidClusterModel[array_size];
			CentroidClusterModel[] Parent = new CentroidClusterModel[array_size];
			SplittedExampleSet splittedSet = SplittedExampleSet.splitByAttribute(exampleSet, exampleSet.getAttributes().get("cluster"));

			if (splittedSet.getNumberOfSubsets() < array_size) {
				break;
			}
			int anz = 0;
			
			// get all Child-cluster
			for (@SuppressWarnings("unused") Cluster cl : bestModel.getClusters()) {
				splittedSet.selectSingleSubset(anz);

				KMean.setParameter("k", 2 + "");
				Children[anz] = (CentroidClusterModel) KMean
						.generateClusterModel(splittedSet);
				KMean.setParameter("k", 1 + "");
				Parent[anz] = (CentroidClusterModel) KMean
						.generateClusterModel(splittedSet);
				anz++;
			}

			Double[] SaveDiffBic = new Double[array_size];
			boolean[] takeChange = new boolean[array_size];
			int change_anz = 0;
			// check which Children to take
			for (int i = 0; i < Parent.length; i++) {
				double BICc = calcBIC(Children[i]);
				double BICp = calcBIC(Parent[i]);
				if (BICc > BICp) {
					// take Children
					takeChange[i] = true;
					SaveDiffBic[i] = (BICc - BICp);
					change_anz++;
				} else {
					takeChange[i] = false;
				}
			}

			CentroidClusterModel model = null;
			if ((change_anz + array_size) < k_max) {
				// all children are in the limit
				model = new CentroidClusterModel(exampleSet, (change_anz + array_size), attributeNames, measure, getParameterAsBoolean(RMAbstractClusterer.PARAMETER_ADD_AS_LABEL), getParameterAsBoolean(RMAbstractClusterer.PARAMETER_REMOVE_UNLABELED));

				int id = 0;
				for (int i = 0; i < array_size; i++) {
					if (takeChange[i]) {
						for (Centroid z : Children[i].getCentroids()) {
							model.assignExample(id, z.getCentroid());
							id++;
						}
					} else {
						model.assignExample(id, Parent[i].getCentroid(0) .getCentroid());
						id++;
					}
				}
			} else {
				// pick the best children
				model = new CentroidClusterModel(exampleSet, k_max, attributeNames, measure, getParameterAsBoolean(RMAbstractClusterer.PARAMETER_ADD_AS_LABEL), getParameterAsBoolean(RMAbstractClusterer.PARAMETER_REMOVE_UNLABELED));
				double hilf = 0;
				CentroidClusterModel hilf2 = null;
				//sort
				for (int i=0; i< (takeChange.length-1); i++) {
					for (int j=(i+1); j < takeChange.length; j++) {
						if (SaveDiffBic[j] > SaveDiffBic[i]) {
							hilf = SaveDiffBic[j];
							SaveDiffBic[j] = SaveDiffBic[i];
							SaveDiffBic[i] = hilf;
							
							hilf2 = Children[j];
							Children[j] = Children[i];
							Children[i] = hilf2;
							
							hilf2 = Parent[j];
							Parent[j] = Parent[i];
							Parent[i] = hilf2;
						}
					}
				}
				
				int id = 0;
				int anz1 = 0;
				for (int i = 0; i < array_size; i++) {
					if (takeChange[i]) {
						for (Centroid z : Children[i].getCentroids()) {
							model.assignExample(id, z.getCentroid());
							id++;
							anz1++;
						}
					} else {
						model.assignExample(id, Parent[i].getCentroid(0) .getCentroid());
						id++;
						anz1++;
					}
					if (anz1 >= k_max)
						break;
				}
			}

			model.finishAssign();

			model = this.assinePoints(model);

			double new_m_BIC = calcBIC(model);
			
			// check if the new BIC is better than the old
			if (new_m_BIC > current_m_BIC) {
				change = true;
				bestModel = model;
				current_m_BIC = new_m_BIC;
			} else {
				model = null;
			}
		}

		if (addsClusterAttribute()) {
			Attribute cluster = AttributeFactory.createAttribute("cluster",
					Ontology.NOMINAL);
			exampleSet.getExampleTable().addAttribute(cluster);
			exampleSet.getAttributes().setCluster(cluster);
			int i = 0;
			for (Example example : exampleSet) {
				example.setValue(cluster, "cluster_" + centroidAssignments[i]);
				i++;
			}
		}
		return bestModel;
	}

	/**
	 * assign the Points to cluster
	 * 
	 * @param model 
	 * @return
	 */
	private CentroidClusterModel assinePoints(CentroidClusterModel model) {
		double[] values = new double[attributes.size()];
		int i = 0;
		for (Example example : exampleSet) {
			double[] exampleValues = getAsDoubleArray(example, attributes,
					values);
			double nearestDistance = measure.calculateDistance(
					model.getCentroidCoordinates(0), exampleValues);
			int nearestIndex = 0;
			int id = 0;
			for (Centroid cr : model.getCentroids()) {
				double distance = measure.calculateDistance(cr.getCentroid(),
						exampleValues);
				if (distance < nearestDistance) {
					nearestDistance = distance;
					nearestIndex = id;
				}
				id++;
			}
			centroidAssignments[i] = nearestIndex;
			i++;
		}

		model.setClusterAssignments(centroidAssignments, exampleSet);
		return model;
	}

	/**
	 * Calculate the BIC like in the paper by Dan Pelleg and Andrew Moore
	 * 
	 * @param bestModel
	 * @return BIC of the given modell
	 */
	private double calcBIC(CentroidClusterModel bestModel) {
		double loglike = 0;
		int numCenters = bestModel.getNumberOfClusters();
		int numDimensions = bestModel.getCentroidCoordinates(0).length;

		int numParameters = (numCenters - 1) + // probabilities
				numCenters * numDimensions + // means
				numCenters; // variance params

		for (Cluster c : bestModel.getClusters()) {
			int current_id = c.getClusterId();
			loglike += logLikelihoodEstimate(c,
					bestModel.getCentroidCoordinates(current_id),
					bestModel.getClusterAssignments(exampleSet), numCenters);
		}

		loglike -= (numParameters / 2.0) * Math.log(examplesize);
		return loglike;
	}

	private double[] getAsDoubleArray(Example example, Attributes attributes,
			double[] values) {
		int i = 0;
		for (Attribute attribute : attributes) {
			values[i] = example.getValue(attribute);
			i++;
		}
		return values;
	}

	private double logLikelihoodEstimate(Cluster c, double[] centroid,
			int[] as, int K) {
		double l = 0;
		double R = examplesize;
		double Rn = c.getNumberOfExamples();
		double M = dimension;
		double d = 0;
		double[] values = new double[attributes.size()];

		if (Rn > 1) {
			double sum = 0;

			final Attribute idAttribute = exampleSet.getAttributes().getId();
			boolean idIsNominal = idAttribute.isNominal();
			exampleSet.remapIds();
			for (Object ob : c.getExampleIds()) {
				Example example;
				if (idIsNominal) {					
					example = exampleSet.getExampleFromId(idAttribute.getMapping().mapString((String)ob));
				} else {
					example = exampleSet.getExampleFromId(((Double) ob).intValue());
				}
				if (example == null) {
					throw new RuntimeException("Unknown id: "+ob);
				}
				sum += Math.pow((measure.calculateDistance(
						centroid,
						getAsDoubleArray(example, attributes,
								values))), 2);
			}

			d = (1.0 / (Rn - K)) * sum;

			l = -(Rn / 2.0) * Math.log(2.0 * Math.PI) - ((Rn * M) / 2.0)
					* Math.log(d) - (Rn - K) / 2.0 + Rn * Math.log(Rn) - Rn
					* Math.log(R);
		}
		return l;
	}

	@Override
	public ClusterModel generateClusterModel(ExampleSet exampleSet)
			throws OperatorException {
		return null;
	}
}