// ClusteringAggregation.java example



package de.tud.inf.operator.mm;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

import com.rapidminer.example.Attribute;
import com.rapidminer.example.Example;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.example.table.AttributeFactory;
import com.rapidminer.operator.IOObject;
import com.rapidminer.operator.Operator;
import com.rapidminer.operator.OperatorCreationException;
import com.rapidminer.operator.OperatorDescription;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.io.CSVExampleSource;
import com.rapidminer.operator.io.SimpleExampleSource;
import com.rapidminer.parameter.ParameterType;
import com.rapidminer.parameter.ParameterTypeString;
import com.rapidminer.tools.Ontology;
import com.rapidminer.tools.OperatorService;

import de.tud.inf.operator.mm.util.ClusteringInfo;
import de.tud.inf.operator.mm.util.MetaConfig;
import de.tud.inf.operator.mm.util.SortingIndex;


/**
 * Operator that combines a selected subset of clusterings into one aggregated clustering.
 * <p>
 * The operator loads a {@link MetaConfig}, reads the ensemble CSV (its rows are logged as
 * "elements") and the selector CSV (its rows are logged as "clusterings"), and determines which
 * clustering columns are flagged with "true" in the selector's column. For every pair of elements
 * it computes the fraction of selected clusterings that put the two elements into different
 * clusters; pairs with a fraction of at most 0.5 are placed into the same aggregated cluster. The
 * resulting cluster ids are written into a new special attribute, renumbered to consecutive
 * values, and the meta configuration is updated and saved.
 * 
 * @version $Revision$
 * @author Andre Jaehnig
 */
public class ClusteringAggregation extends Operator {

   /************************************************************************************************
    * FIELDS
    ***********************************************************************************************/

   /** Filename of the meta configuration file. */
   public static final String PARAMETER_META_FILENAME = "meta_filename";
   
   /** Filename of the aggregation file. */
   public static final String PARAMETER_AGGREGATION_FILENAME = "aggregation_filename";

   /** Name of the class that should be used for the selection. */
   public static final String PARAMETER_SELECTOR = "selector";

   /** Name of the column containing the aggregation information. */
   public static final String PARAMETER_AGGREGATION_COLUMN_NAME = "aggregation_column";

   /**
    * Key under which the file name is passed to the CSV reader operator. The original note asks
    * for it to stay equal to {@code SimpleXMLExampleSource.PARAMETER_FILENAME}.
    */
   public static final String PARAMETER_FILENAME = "filename";

   /** Value of the aggregation attribute for elements that are not (yet) part of any cluster. */
   private static final int UNASSIGNED = Integer.MIN_VALUE;

   /************************************************************************************************
    * GETTER & SETTER
    ***********************************************************************************************/

   /*
    * (non-Javadoc)
    * 
    * @see com.rapidminer.operator.Operator#getInputClasses()
    */
   @Override
   public Class<?>[] getInputClasses() {
      return new Class[0];
   }

   /*
    * (non-Javadoc)
    * 
    * @see com.rapidminer.operator.Operator#getOutputClasses()
    */
   @Override
   public Class<?>[] getOutputClasses() {
      return new Class[] { ExampleSet.class };
   }

   /*
    * (non-Javadoc)
    * 
    * @see com.rapidminer.operator.Operator#getParameterTypes()
    */
   @Override
   public List<ParameterType> getParameterTypes() {
      List<ParameterType> types = super.getParameterTypes();

      types.add(new ParameterTypeString(PARAMETER_META_FILENAME, "Filename of the meta configuration file."));
      types.add(new ParameterTypeString(PARAMETER_AGGREGATION_FILENAME, "Filename of the aggregation file."));
      
      types.add(new ParameterTypeString(PARAMETER_SELECTOR,
            "Name of the class that should be used for the selection."));
      types.add(new ParameterTypeString(PARAMETER_AGGREGATION_COLUMN_NAME,
            "Name of the column containing the aggregation information.", "crAggr"));

      return types;
   }

   /************************************************************************************************
    * CONSTRUCTOR
    ***********************************************************************************************/

   /**
    * Constructor.
    * 
    * @param description the operator description handed to the {@link Operator} super class
    */
   public ClusteringAggregation(OperatorDescription description) {
      super(description);
   }

   /************************************************************************************************
    * PUBLIC METHODS
    ***********************************************************************************************/

   /**
    * Reads the ensemble and selector CSV files named in the meta configuration, aggregates the
    * selected clusterings and stores the outcome in the meta configuration.
    * 
    * @return a one-element array holding the clustering example set extended by the aggregation
    *         attribute
    * @throws OperatorException if one of the nested operator calls fails
    * @throws Error if the CSV reader cannot be created, the selector or its column is unknown, or
    *            the number of matched clustering columns differs from the number of selected rows
    * @see com.rapidminer.operator.Operator#apply()
    */
   @Override
   public IOObject[] apply() throws OperatorException {
      // get parameters
      String metaFileName = this.getParameterAsString(PARAMETER_META_FILENAME);
      String aggregationFileName = this.getParameterAsString(PARAMETER_AGGREGATION_FILENAME);
      String selector = this.getParameterAsString(PARAMETER_SELECTOR);
      String aggregationColumnName = this.getParameterAsString(PARAMETER_AGGREGATION_COLUMN_NAME);

      MetaConfig mc = MetaConfig.load(metaFileName);
      String clusterColumnPrefix = mc.getClusteringColumnPrefix();
      String idColumnName = mc.getIdColumnName();
      String csvFileNameClustering = mc.getEnsembleFileName();
      String csvFileNameSelection = mc.getSelectorFileName();
      Map<String, ClusteringInfo> clusteringInfos = mc.getClusteringInfo();
      ClusteringInfo selectorInfo = clusteringInfos.get(selector);
      if (selectorInfo == null) {
         // fail with a readable message instead of a bare NullPointerException
         throw new Error("The meta configuration contains no clustering info for selector ('" + selector + "')");
      }
      String selectionColumnName = selectorInfo.getSelectedColumnName();

      // read csv files and get example sets
      Operator csvReader = this.createCsvReader();

      // first the clustering csv
      csvReader.setParameter(PARAMETER_FILENAME, csvFileNameClustering);
      csvReader.setParameter(CSVExampleSource.PARAMETER_ID_NAME, idColumnName);
      csvReader.setParameter(CSVExampleSource.PARAMETER_USE_COMMENT_CHARACTERS, "false");
      ExampleSet clusteringExampleSet = (ExampleSet) csvReader.apply()[0];
      int elementCount = clusteringExampleSet.size();

      // and now the selection csv
      csvReader.setParameter(PARAMETER_FILENAME, csvFileNameSelection);
      csvReader.setParameter(CSVExampleSource.PARAMETER_ID_NAME, "");
      ExampleSet selectionExampleSet = (ExampleSet) csvReader.apply()[0];
      int clusteringCount = selectionExampleSet.size();

      this.logNote("Input is: " + elementCount + " elements and " + clusteringCount + " different clusterings.");

      // read out which clusterings are important
      List<Integer> selectedClusteringIds = this.findSelectedClusteringIds(selectionExampleSet,
            selectionColumnName);
      List<Attribute> selectedClusteringAttributes = this.collectSelectedAttributes(clusteringExampleSet,
            selectedClusteringIds, clusterColumnPrefix, aggregationColumnName);
      int selectedClusteringCount = selectedClusteringAttributes.size();

      // check if we have all selected attributes
      if (selectedClusteringIds.size() != selectedClusteringCount) {
         throw new Error("Error while retrieving all selected clustering attributes.");
      }
      this.logNote(selectedClusteringCount + " clusterings are selected.");

      // add new attribute to the clustering example set for the aggregation
      Attribute aggregationAttr = AttributeFactory.createAttribute(aggregationColumnName, Ontology.INTEGER);
      clusteringExampleSet.getExampleTable().addAttribute(aggregationAttr);
      clusteringExampleSet.getAttributes().setSpecialAttribute(aggregationAttr, aggregationColumnName);

      this.aggregate(clusteringExampleSet, selectedClusteringAttributes, aggregationAttr);
      int clusterCount = this.renumberClusterIds(clusteringExampleSet, aggregationAttr);

      this.logNote("Aggregation contains " + clusterCount + " cluster.");

      // write meta config
      mc.setAggregationFileName(aggregationFileName);
      mc.setAggregationColumnName(aggregationColumnName);
      mc.setAggregationClusterCount(clusterCount);
      mc.setSelectorUsedForAggregation(selector);
      mc.save(metaFileName);

      return new IOObject[] { clusteringExampleSet };
   }

   /************************************************************************************************
    * PRIVATE METHODS
    ***********************************************************************************************/

   /**
    * Creates the CSV reader operator used for both input files.
    * 
    * @return a new {@link CSVExampleSource} operator
    * @throws Error if the operator cannot be created; the original exception is kept as cause
    */
   private Operator createCsvReader() {
      try {
         return OperatorService.createOperator(CSVExampleSource.class);
      }
      catch (OperatorCreationException oce) {
         throw new Error(oce.getMessage(), oce);
      }
   }

   /**
    * Collects the row indices of the selection example set whose selection column is "true".
    * 
    * @param selectionExampleSet example set read from the selector CSV
    * @param selectionColumnName name of the column holding the selection flag
    * @return zero-based row indices of all selected rows, in iteration order
    * @throws OperatorException declared for the example set accessors
    * @throws Error if the selection column does not exist
    */
   private List<Integer> findSelectedClusteringIds(ExampleSet selectionExampleSet, String selectionColumnName)
         throws OperatorException {
      Attribute selectionAttr = selectionExampleSet.getAttributes().get(selectionColumnName);
      if (selectionAttr == null) {
         throw new Error("At the selection CSV is no such column ('" + selectionColumnName + "')");
      }

      List<Integer> selectedIds = new ArrayList<Integer>();
      int rowIndex = 0;
      Iterator<Example> rows = selectionExampleSet.iterator();
      while (rows.hasNext()) {
         if ("true".equalsIgnoreCase(rows.next().getNominalValue(selectionAttr))) {
            selectedIds.add(rowIndex);
         }
         rowIndex++;
      }
      return selectedIds;
   }

   /**
    * Maps the selected row indices onto the relevant attributes of the clustering example set.
    * Only attributes whose name starts with the cluster column prefix or equals the aggregation
    * column name are counted; the n-th counted attribute corresponds to selection row n.
    * 
    * @param clusteringExampleSet example set read from the ensemble CSV
    * @param selectedClusteringIds indices of the selected clusterings
    * @param clusterColumnPrefix name prefix of clustering columns
    * @param aggregationColumnName name of the aggregation column
    * @return the attributes of all selected clusterings
    * @throws OperatorException declared for the example set accessors
    */
   private List<Attribute> collectSelectedAttributes(ExampleSet clusteringExampleSet,
         List<Integer> selectedClusteringIds, String clusterColumnPrefix, String aggregationColumnName)
         throws OperatorException {
      List<Attribute> selectedAttributes = new ArrayList<Attribute>();
      StringBuilder message = new StringBuilder("Following clusterings are selected: ");
      int clusteringIndex = 0;
      for (Attribute attr : clusteringExampleSet.getAttributes()) {
         String name = attr.getName();
         if (!name.startsWith(clusterColumnPrefix) && !name.equals(aggregationColumnName)) {
            // not relevant
            continue;
         }
         if (selectedClusteringIds.contains(clusteringIndex)) {
            selectedAttributes.add(attr);
            message.append(name).append(' ');
         }
         clusteringIndex++;
      }
      this.logNote(message.toString());
      return selectedAttributes;
   }

   /**
    * Assigns preliminary cluster ids to the aggregation attribute. Two elements end up in the
    * same cluster if at most half of the selected clusterings separate them.
    * 
    * @param clusteringExampleSet example set whose aggregation attribute is filled
    * @param selectedClusteringAttributes attributes of the selected clusterings
    * @param aggregationAttr attribute receiving the cluster ids
    * @throws OperatorException declared for the example set accessors
    */
   private void aggregate(ExampleSet clusteringExampleSet, List<Attribute> selectedClusteringAttributes,
         Attribute aggregationAttr) throws OperatorException {
      // set dummy value for the new attribute
      Iterator<Example> initIt = clusteringExampleSet.iterator();
      while (initIt.hasNext()) {
         initIt.next().setValue(aggregationAttr, UNASSIGNED);
      }

      double baseWeight = 1.0d / (double) selectedClusteringAttributes.size();
      int nextClusterId = 0;

      // for each combination of two elements
      Iterator<Example> outerIt = clusteringExampleSet.iterator();
      while (outerIt.hasNext()) { // u
         Example outer = outerIt.next();
         Iterator<Example> innerIt = clusteringExampleSet.iterator();
         while (innerIt.hasNext()) { // v
            Example inner = innerIt.next();
            if (inner.getId() >= outer.getId()) {
               // do avoid check of v vs. u (because we already have u vs. v)
               break;
            }

            // calculate fraction of clusterings that places u and v in different clusters
            double weight = 0.0d;
            for (Attribute attr : selectedClusteringAttributes) {
               if ((int) outer.getNumericalValue(attr) != (int) inner.getNumericalValue(attr)) {
                  weight++;
               }
            }
            weight *= baseWeight;

            // check if the edge between u and v is of interest
            if (!(weight <= 0.5)) {
               continue;
            }

            // primitives on purpose: comparing boxed Integers with ==/!= compares references
            int outerClusterId = (int) outer.getNumericalValue(aggregationAttr);
            int innerClusterId = (int) inner.getNumericalValue(aggregationAttr);

            if (outerClusterId == UNASSIGNED && innerClusterId == UNASSIGNED) {
               // both are not in a cluster -> add a new one for this both
               outer.setValue(aggregationAttr, nextClusterId);
               inner.setValue(aggregationAttr, nextClusterId);
               this.logNote("Added a new cluster (#" + nextClusterId + ")");
               nextClusterId++;
            }
            else if (outerClusterId != UNASSIGNED && innerClusterId != UNASSIGNED) {
               // both are already in clusters -> merge them if they differ
               if (outerClusterId != innerClusterId) {
                  this.logNote("Merge cluster #" + innerClusterId + " into cluster #" + outerClusterId);
                  this.relabel(clusteringExampleSet, aggregationAttr, innerClusterId, outerClusterId);
               }
            }
            else if (outerClusterId == UNASSIGNED) {
               // only v is in a cluster -> add u to the same
               outer.setValue(aggregationAttr, innerClusterId);
            }
            else {
               // only u is in a cluster -> add v to the same
               inner.setValue(aggregationAttr, outerClusterId);
            }
         }
      }
   }

   /**
    * Replaces every occurrence of one cluster id in the aggregation attribute by another.
    * 
    * @param clusteringExampleSet example set to update
    * @param aggregationAttr attribute holding the cluster ids
    * @param fromClusterId id to be replaced
    * @param toClusterId replacement id
    * @throws OperatorException declared for the example set accessors
    */
   private void relabel(ExampleSet clusteringExampleSet, Attribute aggregationAttr, int fromClusterId,
         int toClusterId) throws OperatorException {
      Iterator<Example> it = clusteringExampleSet.iterator();
      while (it.hasNext()) {
         Example candidate = it.next();
         if ((int) candidate.getNumericalValue(aggregationAttr) == fromClusterId) {
            candidate.setValue(aggregationAttr, toClusterId);
         }
      }
   }

   /**
    * Renumbers the cluster ids of the aggregation attribute to the consecutive range
    * {@code 0 .. count-1}. Elements are visited in the order given by the sorted
    * {@link SortingIndex} list, so elements with equal ids are adjacent and every change of the
    * old id starts a new id. Each old id maps to exactly one new id, so distinct groups can no
    * longer collide.
    * <p>
    * NOTE(review): elements that were never assigned to a cluster all share the same marker value
    * and therefore form one common group here, as they did before this rewrite - confirm that
    * this is intended rather than one cluster per unassigned element.
    * 
    * @param clusteringExampleSet example set to update
    * @param aggregationAttr attribute holding the cluster ids
    * @return number of distinct clusters after renumbering (0 for an empty example set)
    * @throws OperatorException declared for the example set accessors
    */
   private int renumberClusterIds(ExampleSet clusteringExampleSet, Attribute aggregationAttr)
         throws OperatorException {
      List<SortingIndex> sortedIndex = new ArrayList<SortingIndex>(clusteringExampleSet.size());
      int position = 0;
      Iterator<Example> it = clusteringExampleSet.iterator();
      while (it.hasNext()) {
         Example example = it.next();
         sortedIndex.add(new SortingIndex(Double.valueOf(example.getNumericalValue(aggregationAttr)), position));
         position++;
      }
      Collections.sort(sortedIndex);

      int newClusterId = -1;
      int previousOldId = 0;
      for (SortingIndex entry : sortedIndex) {
         Example example = clusteringExampleSet.getExample(entry.getIndex());
         // read the old id before it is overwritten below
         int oldId = (int) example.getNumericalValue(aggregationAttr);
         if (newClusterId < 0 || oldId != previousOldId) {
            newClusterId++;
            previousOldId = oldId;
         }
         example.setValue(aggregationAttr, newClusterId);
      }
      return newClusterId + 1;
   }

}