ClusterEnsembleCreator.java example

Explorer
ComplexRapidMiner-master
- operator
- src


package de.tud.inf.operator.mm;

import java.util.Arrays;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Random;

import com.rapidminer.example.Attribute;
import com.rapidminer.example.Attributes;
import com.rapidminer.example.Example;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.example.table.AttributeFactory;
import com.rapidminer.operator.IOContainer;
import com.rapidminer.operator.IOObject;
import com.rapidminer.operator.MissingIOObjectException;
import com.rapidminer.operator.ModelApplier;
import com.rapidminer.operator.Operator;
import com.rapidminer.operator.OperatorCreationException;
import com.rapidminer.operator.OperatorDescription;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.features.selection.RandomSelection;
import com.rapidminer.operator.features.transformation.PCA;
import com.rapidminer.operator.learner.clustering.clusterer.KMeans;
import com.rapidminer.parameter.ParameterType;
import com.rapidminer.parameter.ParameterTypeInt;
import com.rapidminer.parameter.ParameterTypeString;
import com.rapidminer.tools.Ontology;
import com.rapidminer.tools.OperatorService;

import de.tud.inf.operator.mm.util.MetaConfig;


/**
 * This operator creates a library of different clusterings of the same data source. Currently three strategies are
 * implemented; they are executed one after another and each of them produces one third of the library.
 * 
 * @version $Revision$
 * @author Andre Jaehnig
 */
public class ClusterEnsembleCreator extends Operator {

   /************************************************************************************************
    * FIELDS
    ***********************************************************************************************/

   /** Parameter key: how many clusterings every single strategy contributes to the ensemble. */
   public static final String PARAMETER_CLUSTERING_COUNT_PER_STRATEGY = "count_per_strategy";

   /** Parameter key: lower bound for the number of clusters within one clustering. */
   public static final String PARAMETER_K_MIN = "k_min";

   /** Parameter key: upper bound for the number of clusters within one clustering. */
   public static final String PARAMETER_K_MAX = "k_max";

   /** Parameter key: number of classes within the data. */
   public static final String PARAMETER_CLASS_COUNT = "class_count";

   /** Parameter key: prefix used for the names of the generated cluster columns. */
   public static final String PARAMETER_CLUSTER_COLUMN_PREFIX = "cluster_prefix";

   /** Parameter key: list of attributes that only classify and must not take part in the clustering. */
   public static final String PARAMETER_CLASSIFYING_ATTRIBUTES = "classifying_attributes";

   /** Parameter key: filename of the meta configuration file. */
   public static final String PARAMETER_META_FILENAME = "meta_filename";

   /** Parameter key: filename of the ensemble file. */
   public static final String PARAMETER_ENSEMBLE_FILENAME = "ensemble_filename";

   /** Parameter key: filename of the raw data. */
   public static final String PARAMETER_DATA_FILENAME = "data_filename";

   /** K-means clustering operator; created during {@link #apply()}. */
   private Operator kMeans;

   /** Random feature selection operator; created during {@link #apply()}. */
   private Operator randomFeatureSelector;

   /** PCA operator; created during {@link #apply()}. */
   private Operator pca;

   /** Operator that applies a model to an example set; created during {@link #apply()}. */
   private Operator modelApplier;

   /** Source of the random seeds, cluster counts and feature counts; created during {@link #apply()}. */
   private Random random;

   /** Prefix of the cluster column names, read from the operator parameters during {@link #apply()}. */
   private String clusterColumnPrefix;

   /************************************************************************************************
    * GETTER & SETTER
    ***********************************************************************************************/

   /**
    * Declares what this operator consumes: exactly one example set.
    *
    * @return An array holding {@code ExampleSet.class} as its only element.
    * @see com.rapidminer.operator.Operator#getInputClasses()
    */
   @Override
   public Class<?>[] getInputClasses() {
      Class<?>[] consumedClasses = { ExampleSet.class };
      return consumedClasses;
   }

   /**
    * Declares what this operator delivers: exactly one example set.
    *
    * @return An array holding {@code ExampleSet.class} as its only element.
    * @see com.rapidminer.operator.Operator#getOutputClasses()
    */
   @Override
   public Class<?>[] getOutputClasses() {
      Class<?>[] deliveredClasses = { ExampleSet.class };
      return deliveredClasses;
   }

   /**
    * Extends the parameter list inherited from the super class with the parameters of this operator.
    * <p>
    * The clustering count per strategy defaults to 200, the classifying attributes to the empty string and the cluster
    * column prefix to "cr". {@code k_min}, {@code k_max} and {@code class_count} are registered without a default
    * value (the trailing {@code true} presumably marks them as optional - {@link #apply()} checks whether they are
    * set before reading them).
    * 
    * @return The inherited parameter types followed by the ones declared here.
    * @see com.rapidminer.operator.Operator#getParameterTypes()
    */
   @Override
   public List<ParameterType> getParameterTypes() {
      List<ParameterType> types = super.getParameterTypes();

      types.add(new ParameterTypeInt(PARAMETER_CLUSTERING_COUNT_PER_STRATEGY,
            "The number of clusterings each strategy should produce.", 1, Integer.MAX_VALUE, 200));
      types.add(new ParameterTypeInt(PARAMETER_K_MIN,
            "Minimum number of clusters that should appear within a clustering.", 2, Integer.MAX_VALUE, true));
      types.add(new ParameterTypeInt(PARAMETER_K_MAX,
            "Maximum number of clusters that should appear within a clustering.", 2, Integer.MAX_VALUE, true));
      types.add(new ParameterTypeInt(PARAMETER_CLASS_COUNT, "Number of classes within the data.", 1, Integer.MAX_VALUE,
            true));
      types.add(new ParameterTypeString(PARAMETER_CLASSIFYING_ATTRIBUTES,
            "List of attributes that are only classifying and should not be used for clustering.", ""));
      types.add(new ParameterTypeString(PARAMETER_CLUSTER_COLUMN_PREFIX, "Prefix for the new cluster columns.", "cr"));
      types.add(new ParameterTypeString(PARAMETER_META_FILENAME, "Filename of the meta configuration file."));
      types.add(new ParameterTypeString(PARAMETER_ENSEMBLE_FILENAME, "Filename of the ensemble file."));
      types.add(new ParameterTypeString(PARAMETER_DATA_FILENAME, "Filename of the raw data file."));

      return types;
   }

   /************************************************************************************************
    * CONSTRUCTOR
    ***********************************************************************************************/

   /**
    * Creates the operator. All work is delegated to the super class constructor; the inner operators and the random
    * number generator are not created here but in {@link #apply()}.
    * 
    * @param description Description of the operator, handed through to the super class.
    */
   public ClusterEnsembleCreator(OperatorDescription description) {
      super(description);
   }

   /************************************************************************************************
    * PUBLIC METHODS
    ***********************************************************************************************/

   /**
    * Builds the clustering ensemble for the input example set.
    * <p>
    * The method reads and validates the parameters, creates the inner operators, marks the classifying attributes as
    * special attributes of the input, writes the meta configuration file and then runs three strategies one after
    * another. Each strategy produces {@code count_per_strategy} clusterings; every clustering is added as a new
    * nominal column to a copy of the input example set:
    * <ol>
    * <li>K-means on the unchanged data with a fresh random seed per pass,</li>
    * <li>K-means on a random feature subset of the data,</li>
    * <li>K-means on a random subset of the principal components of the data.</li>
    * </ol>
    * 
    * @return An array with the output example set (copy of the input plus one cluster column per clustering).
    * @throws OperatorException If a parameter value is invalid, a classifying attribute or the id attribute is
    *            missing, an inner operator can not be created or one of the inner operators fails.
    * @see com.rapidminer.operator.Operator#apply()
    */
   @Override
   public IOObject[] apply() throws OperatorException {
      // get input example set and some details
      ExampleSet inputExampleSet = getInput(ExampleSet.class);
      int elementCount = inputExampleSet.size();
      int dimensionCount = inputExampleSet.getAttributes().size();
      this.logNote("Input example-set has " + elementCount + " elements and " + dimensionCount + " dimensions.");

      // create a copy for the output (taken before the classifying attributes are marked as special below)
      ExampleSet outputExampleSet = (ExampleSet) inputExampleSet.clone();

      // read out parameters
      int countPerStrategy = this.getParameterAsInt(PARAMETER_CLUSTERING_COUNT_PER_STRATEGY);
      String classifyingAttributesStr = this.getParameterAsString(PARAMETER_CLASSIFYING_ATTRIBUTES);
      List<String> classifyingAttributes = parseAttributeNames(classifyingAttributesStr);
      this.clusterColumnPrefix = this.getParameterAsString(PARAMETER_CLUSTER_COLUMN_PREFIX);
      String metaFileName = this.getParameterAsString(PARAMETER_META_FILENAME);
      String ensembleFileName = this.getParameterAsString(PARAMETER_ENSEMBLE_FILENAME);
      String rawDataFileName = this.getParameterAsString(PARAMETER_DATA_FILENAME);

      // kMin falls back to 2, kMax to sqrt(#elements) when they are not given
      int kMin = 2;
      if (this.isParameterSet(PARAMETER_K_MIN)) {
         kMin = this.getParameterAsInt(PARAMETER_K_MIN);
      }
      boolean kMaxGiven = this.isParameterSet(PARAMETER_K_MAX);
      int kMax;
      if (kMaxGiven) {
         kMax = this.getParameterAsInt(PARAMETER_K_MAX);
      }
      else {
         kMax = (int) Math.sqrt(elementCount);
      }
      if (this.isParameterSet(PARAMETER_CLASS_COUNT)) {
         if (kMaxGiven) {
            // an explicit kMax wins over the class count
            this.logWarning("Parameter " + PARAMETER_CLASS_COUNT + " will be ignored.");
         }
         else {
            this.logNote("Setting Parameter " + PARAMETER_K_MAX + " to 2 * " + PARAMETER_CLASS_COUNT + ".");
            int classCount = this.getParameterAsInt(PARAMETER_CLASS_COUNT);
            kMax = 2 * classCount;
         }
      }

      // check kMin/kMax-values; report problems through the declared exception type instead of java.lang.Error
      if (kMin < 2) {
         throw new OperatorException("Parameter " + PARAMETER_K_MIN + " has to be at least '2'.");
      }
      if (kMax > elementCount) {
         throw new OperatorException("Parameter " + PARAMETER_K_MAX
                                     + " can not be larger than the number of elements of this data-set.");
      }
      else if (kMax < kMin) {
         throw new OperatorException("Parameter " + PARAMETER_K_MAX + " can not be smaller than parameter "
                                     + PARAMETER_K_MIN + ".");
      }
      this.logNote("Global settings for the clustering ensemble:");
      this.logNote("kMin = " + kMin + "\tkMax = " + kMax);
      this.logNote("Clustering-count per strategy: " + countPerStrategy);
      this.logNote("Classifying attributes: " + classifyingAttributesStr);

      // create the necessary operators
      try {
         this.kMeans = OperatorService.createOperator(KMeans.class);
         this.randomFeatureSelector = OperatorService.createOperator(RandomSelection.class);
         this.pca = OperatorService.createOperator(PCA.class);
         this.modelApplier = OperatorService.createOperator(ModelApplier.class);
      }
      catch (OperatorCreationException oce) {
         // keep the original exception as cause so that the stack trace is not lost
         throw new OperatorException(oce.getMessage(), oce);
      }

      // mark classifying attributes as special ones
      for (String attrName : classifyingAttributes) {
         Attribute attr = inputExampleSet.getAttributes().get(attrName);
         if (attr == null) {
            throw new OperatorException("Classifying attribute '" + attrName
                                        + "' does not exist in the input example set.");
         }
         inputExampleSet.getAttributes().setSpecialAttribute(attr, attrName);
      }

      // the meta configuration needs the name of the id column
      Attribute idAttribute = inputExampleSet.getAttributes().getId();
      if (idAttribute == null) {
         throw new OperatorException("The input example set has no id attribute, but the meta configuration "
                                     + "requires the name of the id column.");
      }

      // write meta file to disk
      MetaConfig mc = new MetaConfig();
      mc.setClassifyingAttributeNames(classifyingAttributes);
      mc.setClusteringColumnPrefix(this.clusterColumnPrefix);
      mc.setClusteringCount(3 * countPerStrategy);
      mc.setEnsembleFileName(ensembleFileName);
      mc.setIdColumnName(idAttribute.getName());
      mc.setDataFileName(rawDataFileName);
      mc.save(metaFileName);

      // initialize some strategy wide variables
      int clusteringCounter = 0;
      this.random = new Random();
      IOContainer operatorOutput = null;

      // ############################################################################################
      // STRATEGY 1
      // kMeans with different random initializations.
      // ############################################################################################
      this.logNote("Running strategy 1:");
      this.logNote("The different clustering solutions are obtained by applying K-means to the "
                   + "same data with different random intialization.");
      this.logNote("===================");

      for (int i = 0; i < countPerStrategy; i++) {
         this.logNote("Pass #" + clusteringCounter);

         // execute k-means
         operatorOutput = this.runKMeans(inputExampleSet, kMin, kMax, true);

         // copy cluster-attribute to output example set
         outputExampleSet = this.copyClusterColumnToOutput(outputExampleSet, operatorOutput, clusteringCounter);

         clusteringCounter++;
         this.logNote("----------");
      }

      // ############################################################################################
      // STRATEGY 2
      // kMeans with different random feature subsets.
      // ############################################################################################
      this.logNote("Running strategy 2:");
      this.logNote("The different clustering solutions are obtained by using different random feature subsets.");
      this.logNote("===================");

      // get min/max values for the feature count
      // NOTE(review): unlike strategy 3 the upper bound is not reduced by the number of classifying attributes,
      // although those were marked as special above - confirm whether this difference is intended.
      int featureCountMin = 2;
      int featureCountMax = (int) Math.round(dimensionCount / 2.0d);
      if (featureCountMax < featureCountMin) {
         featureCountMax = featureCountMin;
      }
      this.logNote("Local settings for strategy 2:");
      this.logNote("min-feature-count = " + featureCountMin + "\tmax-feature-count = " + featureCountMax);

      outputExampleSet = this.runSubsetStrategy(inputExampleSet, outputExampleSet, countPerStrategy,
            clusteringCounter, featureCountMin, featureCountMax, kMin, kMax);
      clusteringCounter += countPerStrategy;

      // ############################################################################################
      // STRATEGY 3
      // kMeans with different PCs
      // ############################################################################################
      this.logNote("Running strategy 3:");
      this.logNote("Use of different random linear projections of the features to create different "
                   + "clustering solutions.");
      this.logNote("===================");

      // get min/max values for the feature count
      featureCountMin = 2;
      featureCountMax = (int) Math.round((dimensionCount - classifyingAttributes.size()) / 2.0d);
      if (featureCountMax < featureCountMin) {
         featureCountMax = featureCountMin;
      }
      this.logNote("Local settings for strategy 3:");
      this.logNote("min-feature-count = " + featureCountMin + "\tmax-feature-count = " + featureCountMax);

      // execute pca
      this.logNote("Starting a full PCA on the input example set.");
      this.pca.setParameter(PCA.PARAMETER_REDUCTION_TYPE, String.valueOf(PCA.REDUCTION_NONE));
      operatorOutput = this.pca.apply(new IOContainer(inputExampleSet));
      // apply pca model
      operatorOutput = this.modelApplier.apply(operatorOutput);
      ExampleSet pcaExampleSet = operatorOutput.get(ExampleSet.class);
      this.logNote("PCA finished!");

      outputExampleSet = this.runSubsetStrategy(pcaExampleSet, outputExampleSet, countPerStrategy, clusteringCounter,
            featureCountMin, featureCountMax, kMin, kMax);

      return new IOObject[] { outputExampleSet };
   }

   /**
    * Splits a comma separated list of attribute names into the single names. Surrounding whitespace is removed from
    * every name and empty entries are skipped, so "a, b" yields the names "a" and "b".
    * 
    * @param commaSeparatedNames The raw parameter value, may be {@code null} or empty.
    * @return The attribute names in their original order; empty if no name is given.
    */
   private static List<String> parseAttributeNames(String commaSeparatedNames) {
      List<String> names = new LinkedList<String>();
      if (commaSeparatedNames == null) {
         return names;
      }
      for (String rawName : commaSeparatedNames.split(",")) {
         String name = rawName.trim();
         if (name.length() > 0) {
            names.add(name);
         }
      }
      return names;
   }

   /**
    * Runs the passes shared by strategy 2 and 3: select a random feature subset of the source example set, cluster
    * the reduced example set with K-means and add the resulting cluster column to the output example set.
    * 
    * @param sourceExampleSet The example set the feature subsets are drawn from.
    * @param outputExampleSet The example set that collects the cluster columns.
    * @param passCount Number of clusterings to produce.
    * @param firstClusteringNumber Number of the first clustering; it is increased by one with every pass.
    * @param featureCountMin Minimum number of features of a subset.
    * @param featureCountMax Maximum number of features of a subset.
    * @param kMin Minimum number of clusters.
    * @param kMax Maximum number of clusters.
    * @return The output example set with one additional cluster column per pass.
    * @throws OperatorException If one of the inner operators fails.
    */
   private ExampleSet runSubsetStrategy(ExampleSet sourceExampleSet, ExampleSet outputExampleSet, int passCount,
         int firstClusteringNumber, int featureCountMin, int featureCountMax, int kMin, int kMax)
         throws OperatorException {
      ExampleSet result = outputExampleSet;
      for (int pass = 0; pass < passCount; pass++) {
         int clusteringNumber = firstClusteringNumber + pass;
         this.logNote("Pass #" + clusteringNumber);

         // execute feature reduction
         IOContainer reductionOutput = this.runFeatureReduction(sourceExampleSet, featureCountMin, featureCountMax);
         ExampleSet reducedExampleSet = reductionOutput.get(ExampleSet.class);

         // execute k-means
         IOContainer clusteringOutput = this.runKMeans(reducedExampleSet, kMin, kMax, false);

         // copy cluster-attribute to output example set
         result = this.copyClusterColumnToOutput(result, clusteringOutput, clusteringNumber);

         this.logNote("----------");
      }
      return result;
   }

   /************************************************************************************************
    * PRIVATE METHODS
    ***********************************************************************************************/

   /**
    * Runs a K-means clustering on the given example set and returns the output of the K-means operator. The number
    * of clusters k is drawn randomly from the range [kMin, kMax] (both inclusive).
    * <p>
    * If {@code randomSeed} is set, a fresh random seed is drawn and handed to the operator; otherwise the seed
    * parameter is set to -1 (presumably "no local seed" for the K-means operator - TODO confirm).
    * 
    * @param exampleSet The input example set that should be clustered.
    * @param kMin Minimum number of clusters.
    * @param kMax Maximum number of clusters; must not be smaller than {@code kMin}.
    * @param randomSeed Flag for a random initialization of the centroids.
    * @return Container with the output of the K-means operator.
    * @throws OperatorException If the K-means operator fails. The original exception is passed on unchanged instead
    *            of being printed and wrapped into a {@link Error}.
    */
   private IOContainer runKMeans(ExampleSet exampleSet, int kMin, int kMax, boolean randomSeed)
         throws OperatorException {
      // get new seed (drawn before k to keep the order of the random numbers)
      int seed = randomSeed ? this.random.nextInt(Integer.MAX_VALUE) : -1;

      // get new k
      int k = this.random.nextInt(kMax - kMin + 1) + kMin;

      // setting parameters
      this.logNote("K-means-settings:\tk = " + k + "\tseed = " + seed);
      this.kMeans.setParameter(KMeans.PARAMETER_LOCAL_RANDOM_SEED, String.valueOf(seed));
      this.kMeans.setParameter(KMeans.PARAMETER_K, String.valueOf(k));

      // fire!
      return this.kMeans.apply(new IOContainer(exampleSet));
   }

   /**
    * Selects a random subset of features from the input example set and returns the output of the random feature
    * selection operator. The number of features is drawn randomly from the range [minFeatures, maxFeatures] (both
    * inclusive) and a fresh random seed is handed to the operator on every call.
    * 
    * @param exampleSet The input example set.
    * @param minFeatures Minimum number of features.
    * @param maxFeatures Maximum number of features; must not be smaller than {@code minFeatures}.
    * @return Container with the output of the feature selection operator (the reduced example set).
    * @throws OperatorException If the feature selection operator fails. The original exception is passed on
    *            unchanged; previously it was replaced by a {@link Error} without cause.
    */
   private IOContainer runFeatureReduction(ExampleSet exampleSet, int minFeatures, int maxFeatures)
         throws OperatorException {
      // get new seed (drawn before the feature count to keep the order of the random numbers)
      int seed = this.random.nextInt(Integer.MAX_VALUE);

      // get new feature count
      int featureCount = this.random.nextInt(maxFeatures - minFeatures + 1) + minFeatures;

      // setting parameters
      this.logNote("Feature-reduction-settings:\tfeature-count = " + featureCount + "\tseed = " + seed);
      this.randomFeatureSelector.setParameter(RandomSelection.PARAMETER_NUMBER_OF_FEATURES, String
            .valueOf(featureCount));
      this.randomFeatureSelector.setParameter(RandomSelection.PARAMETER_LOCAL_RANDOM_SEED, String.valueOf(seed));

      // fire!
      return this.randomFeatureSelector.apply(new IOContainer(exampleSet));
   }

   /**
    * Copies the cluster assignments from the clustering output to the given example set as a new column. The new
    * column is a nominal attribute named {@code <cluster prefix><num>}; it is added to the example table of the
    * example set and registered as special attribute under the same name.
    * <p>
    * The values are copied pairwise in iteration order of both example sets. If the example sets differ in size, only
    * the common prefix is copied and a warning is logged.
    * 
    * @param exampleSet The example set to which the new column should be added.
    * @param clusteringOutput The output of the clustering (clustering result).
    * @param num The current number of the clustering.
    * @return The given example set, extended by the column for the given clustering result.
    * @throws OperatorException If the clustering output contains no example set or the example set has no cluster
    *            attribute.
    */
   private ExampleSet copyClusterColumnToOutput(ExampleSet exampleSet, IOContainer clusteringOutput, int num)
         throws OperatorException {
      // get example set from the clustering; a missing example set is reported with its original exception
      ExampleSet clusteringExampleSet = clusteringOutput.get(ExampleSet.class);
      Attribute clusterAttribute = clusteringExampleSet.getAttributes().getSpecial(Attributes.CLUSTER_NAME);
      if (clusterAttribute == null) {
         throw new OperatorException("The result of clustering #" + num + " contains no cluster attribute.");
      }

      // add new attribute to the output example set
      String columnName = this.clusterColumnPrefix + num;
      Attribute newClusterAttribute = AttributeFactory.createAttribute(columnName, Ontology.NOMINAL);
      exampleSet.getExampleTable().addAttribute(newClusterAttribute);
      exampleSet.getAttributes().setSpecialAttribute(newClusterAttribute, columnName);

      // iterate through both example sets in parallel and copy the cluster attribute values
      Iterator<Example> sourceIterator = clusteringExampleSet.iterator();
      Iterator<Example> targetIterator = exampleSet.iterator();
      while (sourceIterator.hasNext() && targetIterator.hasNext()) {
         Example source = sourceIterator.next();
         Example target = targetIterator.next();
         target.setValue(newClusterAttribute, source.getNominalValue(clusterAttribute));
      }
      if (sourceIterator.hasNext() || targetIterator.hasNext()) {
         // do not hide a size mismatch: the remaining examples have no (or no copied) cluster value
         this.logWarning("Clustering #" + num + " and the output example set differ in size; "
                         + "only the common examples received a cluster value.");
      }

      return exampleSet;
   }

}