package de.tud.inf.operator.mm; import java.util.Arrays; import java.util.Iterator; import java.util.LinkedList; import java.util.List; import java.util.Random; import com.rapidminer.example.Attribute; import com.rapidminer.example.Attributes; import com.rapidminer.example.Example; import com.rapidminer.example.ExampleSet; import com.rapidminer.example.table.AttributeFactory; import com.rapidminer.operator.IOContainer; import com.rapidminer.operator.IOObject; import com.rapidminer.operator.MissingIOObjectException; import com.rapidminer.operator.ModelApplier; import com.rapidminer.operator.Operator; import com.rapidminer.operator.OperatorCreationException; import com.rapidminer.operator.OperatorDescription; import com.rapidminer.operator.OperatorException; import com.rapidminer.operator.features.selection.RandomSelection; import com.rapidminer.operator.features.transformation.PCA; import com.rapidminer.operator.learner.clustering.clusterer.KMeans; import com.rapidminer.parameter.ParameterType; import com.rapidminer.parameter.ParameterTypeInt; import com.rapidminer.parameter.ParameterTypeString; import com.rapidminer.tools.Ontology; import com.rapidminer.tools.OperatorService; import de.tud.inf.operator.mm.util.MetaConfig; /** * This operator creates a library of different clusterings of the same data-source. For now there are implemented 3 * strategies that will be executed one after another and each of them produces 1/3 of the library. * * @version $Revision$ * @author Andre Jaehnig */ public class ClusterEnsembleCreator extends Operator { /************************************************************************************************ * FIELDS ***********************************************************************************************/ /** Number of clusterings each strategy should produce. */ public static final String PARAMETER_CLUSTERING_COUNT_PER_STRATEGY = "count_per_strategy"; /** Minimum number of clusters that should appear within a clustering. */ public static final String PARAMETER_K_MIN = "k_min"; /** Maximum number of clusters that should appear within a clustering. */ public static final String PARAMETER_K_MAX = "k_max"; /** Number of classes within the data. */ public static final String PARAMETER_CLASS_COUNT = "class_count"; /** Prefix for the new cluster columns. */ public static final String PARAMETER_CLUSTER_COLUMN_PREFIX = "cluster_prefix"; /** List of attributes that are only classifying and should not be used for clustering. */ public static final String PARAMETER_CLASSIFYING_ATTRIBUTES = "classifying_attributes"; /** Filename of the meta configuration file. */ public static final String PARAMETER_META_FILENAME = "meta_filename"; /** Filename of the ensemble file. */ public static final String PARAMETER_ENSEMBLE_FILENAME = "ensemble_filename"; /** Filename of the raw data. */ public static final String PARAMETER_DATA_FILENAME = "data_filename"; /** Operator for K-means clustering. */ private Operator kMeans = null; /** Operator for a random feature selection. */ private Operator randomFeatureSelector = null; /** Operator for a PCA. */ private Operator pca = null; /** Operator for applying a model. */ private Operator modelApplier = null; /** Random number creator. */ private Random random = null; /** The prefix for the cluster column. */ private String clusterColumnPrefix = null; /************************************************************************************************ * GETTER & SETTER ***********************************************************************************************/ /* * (non-Javadoc) * * @see com.rapidminer.operator.Operator#getInputClasses() */ @Override public Class<?>[] getInputClasses() { return new Class[] { ExampleSet.class }; } /* * (non-Javadoc) * * @see com.rapidminer.operator.Operator#getOutputClasses() */ @Override public Class<?>[] getOutputClasses() { return new Class[] { ExampleSet.class }; } /* * (non-Javadoc) * * @see com.rapidminer.operator.Operator#getParameterTypes() */ public List<ParameterType> getParameterTypes() { List<ParameterType> types = super.getParameterTypes(); types.add(new ParameterTypeInt(PARAMETER_CLUSTERING_COUNT_PER_STRATEGY, "The number of clusterings each strategy should produce.", 1, Integer.MAX_VALUE, 200)); types.add(new ParameterTypeInt(PARAMETER_K_MIN, "Minimum number of clusters that should appear within a clustering.", 2, Integer.MAX_VALUE, true)); types.add(new ParameterTypeInt(PARAMETER_K_MAX, "Maximum number of clusters that should appear within a clustering.", 2, Integer.MAX_VALUE, true)); types.add(new ParameterTypeInt(PARAMETER_CLASS_COUNT, "Number of classes within the data.", 1, Integer.MAX_VALUE, true)); types.add(new ParameterTypeString(PARAMETER_CLASSIFYING_ATTRIBUTES, "List of attributes that are only classifying and should not be used for clustering.", "")); types.add(new ParameterTypeString(PARAMETER_CLUSTER_COLUMN_PREFIX, "Prefix for the new cluster columns.", "cr")); types.add(new ParameterTypeString(PARAMETER_META_FILENAME, "Filename of the meta configuration file.")); types.add(new ParameterTypeString(PARAMETER_ENSEMBLE_FILENAME, "Filename of the ensemble file.")); types.add(new ParameterTypeString(PARAMETER_DATA_FILENAME, "Filename of the raw data file.")); return types; } /************************************************************************************************ * CONSTRUCTOR ***********************************************************************************************/ /** * Constructor. * * @param description Description of the operator. */ public ClusterEnsembleCreator(OperatorDescription description) { super(description); } /************************************************************************************************ * PUBLIC METHODS ***********************************************************************************************/ /* * (non-Javadoc) * * @see com.rapidminer.operator.Operator#apply() */ @Override public IOObject[] apply() throws OperatorException { // get input example set and some details ExampleSet inputExampleSet = getInput(ExampleSet.class); int elementCount = inputExampleSet.size(); int dimensionCount = inputExampleSet.getAttributes().size(); this.logNote("Input example-set has " + elementCount + " elements and " + dimensionCount + " dimensions."); // create a copy for the ouput ExampleSet outputExampleSet = (ExampleSet) inputExampleSet.clone(); // read out parameters int countPerStrategy = this.getParameterAsInt(PARAMETER_CLUSTERING_COUNT_PER_STRATEGY); String classifyingAttributesStr = this.getParameterAsString(PARAMETER_CLASSIFYING_ATTRIBUTES); List<String> classifyingAttributes = null; if (classifyingAttributesStr != null && classifyingAttributesStr.length() > 0) { classifyingAttributes = Arrays.asList(classifyingAttributesStr.split(",")); } else { classifyingAttributes = new LinkedList<String>(); } this.clusterColumnPrefix = this.getParameterAsString(PARAMETER_CLUSTER_COLUMN_PREFIX); String metaFileName = this.getParameterAsString(PARAMETER_META_FILENAME); String ensembleFileName = this.getParameterAsString(PARAMETER_ENSEMBLE_FILENAME); String rawDataFileName = this.getParameterAsString(PARAMETER_DATA_FILENAME); int kMin, kMax; if (this.isParameterSet(PARAMETER_K_MIN)) { kMin = this.getParameterAsInt(PARAMETER_K_MIN); } else { // kMin is not given kMin = 2; } if (this.isParameterSet(PARAMETER_K_MAX)) { kMax = this.getParameterAsInt(PARAMETER_K_MAX); } else { // kMax is not given kMax = (int) Math.sqrt(elementCount); } if (this.isParameterSet(PARAMETER_CLASS_COUNT)) { if (this.isParameterSet(PARAMETER_K_MAX)) { // ignore class count this.logWarning("Parameter " + PARAMETER_CLASS_COUNT + " will be ignored."); } else { this.logNote("Setting Parameter " + PARAMETER_K_MAX + " to 2 * " + PARAMETER_CLASS_COUNT + "."); int classCount = this.getParameterAsInt(PARAMETER_CLASS_COUNT); kMax = 2 * classCount; } } // check kMin/kMax-values if (kMin < 2) { throw new Error("Parameter " + PARAMETER_K_MIN + " has to be at least '2'."); } if (kMax > elementCount) { throw new Error("Parameter " + PARAMETER_K_MAX + " can not be larger than the number of elements of this data-set."); } else if (kMax < kMin) { throw new Error("Parameter " + PARAMETER_K_MAX + " can not be smaller than parameter " + PARAMETER_K_MIN + "."); } this.logNote("Global settings for the clustering ensemble:"); this.logNote("kMin = " + kMin + "\tkMax = " + kMax); this.logNote("Clustering-count per strategy: " + countPerStrategy); this.logNote("Classifying attributes: " + classifyingAttributesStr); // create the necessary operators try { // K-means operator this.kMeans = OperatorService.createOperator(KMeans.class); // random feature selector operator this.randomFeatureSelector = OperatorService.createOperator(RandomSelection.class); // pca this.pca = OperatorService.createOperator(PCA.class); // model applier this.modelApplier = OperatorService.createOperator(ModelApplier.class); } catch (OperatorCreationException oce) { throw new Error(oce.getMessage()); } // mark classifying attributes as special ones Attribute attr = null; for (String attrName : classifyingAttributes) { // get the attribute attr = inputExampleSet.getAttributes().get(attrName); // set special inputExampleSet.getAttributes().setSpecialAttribute(attr, attrName); } // write meta file to disk MetaConfig mc = new MetaConfig(); mc.setClassifyingAttributeNames(classifyingAttributes); mc.setClusteringColumnPrefix(this.clusterColumnPrefix); mc.setClusteringCount(3 * countPerStrategy); mc.setEnsembleFileName(ensembleFileName); mc.setIdColumnName(inputExampleSet.getAttributes().getId().getName()); mc.setDataFileName(rawDataFileName); mc.save(metaFileName); // initialize some strategy wide variables int clusteringCounter = 0; this.random = new Random(); IOContainer operatorOutput = null; // ############################################################################################ // STRATEGY 1 // kMeans with different random initializations. // ############################################################################################ this.logNote("Running strategy 1:"); this.logNote("The different clustering solutions are obtained by applying K-means to the " + "same data with different random intialization."); this.logNote("==================="); for (int i = 0; i < countPerStrategy; i++) { this.logNote("Pass #" + clusteringCounter); // execute k-means operatorOutput = this.runKMeans(inputExampleSet, kMin, kMax, true); // copy cluster-attribute to output example set outputExampleSet = this.copyClusterColumnToOutput(outputExampleSet, operatorOutput, clusteringCounter); clusteringCounter++; this.logNote("----------"); } // ############################################################################################ // STRATEGY 2 // kMeans with different random feature subsets. // ############################################################################################ this.logNote("Running strategy 2:"); this.logNote("The different clustering solutions are obtained by using different random " + "feature subsets."); this.logNote("==================="); // get min/max values for the feature count int featureCountMin = 2; int featureCountMax = (int) Math.round(dimensionCount / 2.0d); if (featureCountMax < featureCountMin) { featureCountMax = featureCountMin; } this.logNote("Local settings for strategy 2:"); this.logNote("min-feature-count = " + featureCountMin + "\tmax-feature-count = " + featureCountMax); ExampleSet reducedExampleSet = null; for (int i = 0; i < countPerStrategy; i++) { this.logNote("Pass #" + clusteringCounter); // execute feature reduction operatorOutput = this.runFeatureReduction(inputExampleSet, featureCountMin, featureCountMax); reducedExampleSet = operatorOutput.get(ExampleSet.class); // execute k-means operatorOutput = this.runKMeans(reducedExampleSet, kMin, kMax, false); // copy cluster-attribute to output example set outputExampleSet = this.copyClusterColumnToOutput(outputExampleSet, operatorOutput, clusteringCounter); clusteringCounter++; this.logNote("----------"); } // ############################################################################################ // STRATEGY 3 // kMeans with different PCs // ############################################################################################ this.logNote("Running strategy 3:"); this.logNote("Use of different random linear projections of the features to create different " + "clustering solutions."); this.logNote("==================="); // get min/max values for the feature count featureCountMin = 2; featureCountMax = (int) Math.round((dimensionCount - classifyingAttributes.size()) / 2.0d); if (featureCountMax < featureCountMin) { featureCountMax = featureCountMin; } this.logNote("Local settings for strategy 3:"); this.logNote("min-feature-count = " + featureCountMin + "\tmax-feature-count = " + featureCountMax); // execute pca this.logNote("Starting a full PCA on the input example set."); this.pca.setParameter(PCA.PARAMETER_REDUCTION_TYPE, String.valueOf(PCA.REDUCTION_NONE)); operatorOutput = this.pca.apply(new IOContainer(inputExampleSet)); // apply pca model operatorOutput = this.modelApplier.apply(operatorOutput); ExampleSet pcaExampleSet = operatorOutput.get(ExampleSet.class); this.logNote("PCA finished!"); for (int i = 0; i < countPerStrategy; i++) { this.logNote("Pass #" + clusteringCounter); // execute feature reduction operatorOutput = this.runFeatureReduction(pcaExampleSet, featureCountMin, featureCountMax); reducedExampleSet = operatorOutput.get(ExampleSet.class); // execute k-means operatorOutput = this.runKMeans(reducedExampleSet, kMin, kMax, false); // copy cluster-attribute to output example set outputExampleSet = this.copyClusterColumnToOutput(outputExampleSet, operatorOutput, clusteringCounter); clusteringCounter++; this.logNote("----------"); } return new IOObject[] { outputExampleSet }; } /************************************************************************************************ * PRIVATE METHODS ***********************************************************************************************/ /** * This method runs an k-Means clustering on the input example set and returns the output example set and the * clustering model. The parameter k will be randomly drawn between kMin and kMax. Also there can be specified if the * initialization of the centroids should be randomized or not. * * @param exampleSet The input example set that should clustered. * @param kMin Minimum number of clusters. * @param kMax Maximum number of clusters. * @param randomSeed Flag for a random initialization of the centroids. * @return Container with the output example set and the cluster model. */ private IOContainer runKMeans(ExampleSet exampleSet, int kMin, int kMax, boolean randomSeed) { // get new seed int seed; if (randomSeed) { seed = this.random.nextInt(Integer.MAX_VALUE); } else { seed = -1; } // get new k int k = this.random.nextInt(kMax - kMin + 1) + kMin; // setting parameters this.logNote("K-means-settings:\tk = " + k + "\tseed = " + seed); kMeans.setParameter(KMeans.PARAMETER_LOCAL_RANDOM_SEED, String.valueOf(seed)); kMeans.setParameter(KMeans.PARAMETER_K, String.valueOf(k)); // fire! IOContainer output; try { output = this.kMeans.apply(new IOContainer(exampleSet)); } catch (OperatorException e) { e.printStackTrace(); throw new Error("Error while running K-means.",e); } return output; } /** * This methods selects a random subset of features from the input example set and returns the according reduced * example set. The number of features will be randomly drawn between the two given boundaries. * * @param exampleSet The input example set. * @param minFeatures Minimum number of features. * @param maxFeatures Maximum number of features. * @return The reduced example set. */ private IOContainer runFeatureReduction(ExampleSet exampleSet, int minFeatures, int maxFeatures) { // get new seed int seed = this.random.nextInt(Integer.MAX_VALUE); // get new feature count int featureCount = this.random.nextInt(maxFeatures - minFeatures + 1) + minFeatures; // setting parameters this.logNote("Feature-reduction-settings:\tfeature-count = " + featureCount + "\tseed = " + seed); this.randomFeatureSelector.setParameter(RandomSelection.PARAMETER_NUMBER_OF_FEATURES, String .valueOf(featureCount)); this.randomFeatureSelector.setParameter(RandomSelection.PARAMETER_LOCAL_RANDOM_SEED, String.valueOf(seed)); // fire! IOContainer output; try { output = this.randomFeatureSelector.apply(new IOContainer(exampleSet)); } catch (OperatorException e) { throw new Error("Error while running feature reduction."); } return output; } /** * This method copies the cluster-assigments from the clustering output to the given example set as a new column. * * @param exampleSet The input example set to which the new column should be added. * @param clusteringOutput The out of the clustering (clustering result). * @param num The current number of the clustering. * @return An example set with an extra column for the given clustering result. */ private ExampleSet copyClusterColumnToOutput(ExampleSet exampleSet, IOContainer clusteringOutput, int num) { // get example set from the clustering ExampleSet clusteringExampleSet; try { clusteringExampleSet = clusteringOutput.get(ExampleSet.class); } catch (MissingIOObjectException e) { throw new Error("Something went wrong :/"); } Attribute clusterAttribute = clusteringExampleSet.getAttributes().getSpecial(Attributes.CLUSTER_NAME); // add new attribute to the output example set Attribute newClusterAttribute = AttributeFactory .createAttribute(this.clusterColumnPrefix + num, Ontology.NOMINAL); exampleSet.getExampleTable().addAttribute(newClusterAttribute); exampleSet.getAttributes().setSpecialAttribute(newClusterAttribute, this.clusterColumnPrefix + num); // iterate through clustering example set and copy the cluster attribute values Iterator<Example> it1 = clusteringExampleSet.iterator(); Iterator<Example> it2 = exampleSet.iterator(); Example example1 = null; Example example2 = null; while (it1.hasNext() && it2.hasNext()) { example1 = it1.next(); example2 = it2.next(); example2.setValue(newClusterAttribute, example1.getNominalValue(clusterAttribute)); } return exampleSet; } }