package de.tud.inf.operator.mm;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

import com.rapidminer.example.Attribute;
import com.rapidminer.example.Example;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.example.table.AttributeFactory;
import com.rapidminer.operator.IOObject;
import com.rapidminer.operator.Operator;
import com.rapidminer.operator.OperatorCreationException;
import com.rapidminer.operator.OperatorDescription;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.io.CSVExampleSource;
import com.rapidminer.operator.io.SimpleExampleSource;
import com.rapidminer.parameter.ParameterType;
import com.rapidminer.parameter.ParameterTypeString;
import com.rapidminer.tools.Ontology;
import com.rapidminer.tools.OperatorService;

import de.tud.inf.operator.mm.util.ClusteringInfo;
import de.tud.inf.operator.mm.util.MetaConfig;
import de.tud.inf.operator.mm.util.SortingIndex;

/**
 * Operator that aggregates a selected subset of clusterings into one clustering.
 * <p>
 * The operator reads a meta configuration, loads the clustering CSV and the selection CSV it
 * names, and determines which clustering columns are selected for the configured selector. For
 * every pair of elements it computes the fraction of selected clusterings that place the two
 * elements in different clusters. Pairs whose fraction is at most 0.5 are put into the same
 * aggregated cluster; clusters that become connected this way are merged. The result is written
 * to a new integer attribute of the clustering example set, and the meta configuration is updated
 * and saved.
 *
 * @version $Revision$
 * @author Andre Jaehnig
 */
public class ClusteringAggregation extends Operator {

    /************************************************************************************************
     * FIELDS
     ***********************************************************************************************/

    /** Filename of the meta configuration file. */
    public static final String PARAMETER_META_FILENAME = "meta_filename";

    /** Filename of the aggregation file. */
    public static final String PARAMETER_AGGREGATION_FILENAME = "aggregation_filename";

    /** Name of the class that should be used for the selection. */
    public static final String PARAMETER_SELECTOR = "selector";

    /** Name of the column containing the aggregation information. */
    public static final String PARAMETER_AGGREGATION_COLUMN_NAME = "aggregation_column";

    /**
     * Parameter key used to pass the file name to the CSV reader operator created in
     * {@link #apply()}. It must be the same as the file name parameter key of that reader.
     */
    public static final String PARAMETER_FILENAME = "filename";

    /************************************************************************************************
     * GETTER & SETTER
     ***********************************************************************************************/

    /*
     * (non-Javadoc)
     *
     * @see com.rapidminer.operator.Operator#getInputClasses()
     */
    @Override
    public Class<?>[] getInputClasses() {
        // this operator reads everything from files and consumes no input objects
        return new Class<?>[0];
    }

    /*
     * (non-Javadoc)
     *
     * @see com.rapidminer.operator.Operator#getOutputClasses()
     */
    @Override
    public Class<?>[] getOutputClasses() {
        return new Class<?>[] { ExampleSet.class };
    }

    /*
     * (non-Javadoc)
     *
     * @see com.rapidminer.operator.Operator#getParameterTypes()
     */
    @Override
    public List<ParameterType> getParameterTypes() {
        List<ParameterType> types = super.getParameterTypes();
        types.add(new ParameterTypeString(PARAMETER_META_FILENAME,
                "Filename of the meta configuration file."));
        types.add(new ParameterTypeString(PARAMETER_AGGREGATION_FILENAME,
                "Filename of the aggregation file."));
        types.add(new ParameterTypeString(PARAMETER_SELECTOR,
                "Name of the class that should be used for the selection."));
        types.add(new ParameterTypeString(PARAMETER_AGGREGATION_COLUMN_NAME,
                "Name of the column containing the aggregation information.", "crAggr"));
        return types;
    }

    /************************************************************************************************
     * CONSTRUCTOR
     ***********************************************************************************************/

    /**
     * Constructor.
     *
     * @param description the operator description handed to the {@link Operator} super class
     */
    public ClusteringAggregation(OperatorDescription description) {
        super(description);
    }

    /************************************************************************************************
     * PUBLIC METHODS
     ***********************************************************************************************/

    /**
     * Loads the clustering and selection CSV files named by the meta configuration, aggregates
     * the selected clusterings into a new integer attribute and stores the aggregation settings
     * back into the meta configuration.
     *
     * @return an array containing the clustering example set extended by the aggregation attribute
     * @throws OperatorException if the CSV reader operator fails
     * @throws Error if the CSV reader cannot be created, if the meta configuration holds no
     *         clustering info for the selector, if the selection column is missing, or if not all
     *         selected clusterings could be resolved to attributes (this operator has always
     *         signalled these conditions with {@link Error}; the type is kept for compatibility)
     *
     * @see com.rapidminer.operator.Operator#apply()
     */
    @Override
    public IOObject[] apply() throws OperatorException {
        // get parameters
        String metaFileName = this.getParameterAsString(PARAMETER_META_FILENAME);
        String aggregationFileName = this.getParameterAsString(PARAMETER_AGGREGATION_FILENAME);
        String selector = this.getParameterAsString(PARAMETER_SELECTOR);
        String aggregationColumnName = this.getParameterAsString(PARAMETER_AGGREGATION_COLUMN_NAME);

        MetaConfig mc = MetaConfig.load(metaFileName);
        String clusterColumnPrefix = mc.getClusteringColumnPrefix();
        String idColumnName = mc.getIdColumnName();
        String csvFileNameClustering = mc.getEnsembleFileName();
        String csvFileNameSelection = mc.getSelectorFileName();

        // fail with a readable message instead of a bare NullPointerException
        Map<String, ClusteringInfo> clusteringInfos = mc.getClusteringInfo();
        ClusteringInfo selectorInfo = (clusteringInfos == null) ? null : clusteringInfos.get(selector);
        if (selectorInfo == null) {
            throw new Error("The meta configuration contains no clustering info for selector ('"
                    + selector + "')");
        }
        String selectionColumnName = selectorInfo.getSelectedColumnName();

        // read csv files and get example sets
        Operator csvReader = createCsvReader();

        // first the clustering csv
        csvReader.setParameter(PARAMETER_FILENAME, csvFileNameClustering);
        csvReader.setParameter(CSVExampleSource.PARAMETER_ID_NAME, idColumnName);
        csvReader.setParameter(CSVExampleSource.PARAMETER_USE_COMMENT_CHARACTERS, "false");
        ExampleSet clusteringExampleSet = (ExampleSet) csvReader.apply()[0];
        int elementCount = clusteringExampleSet.size();

        // and now the selection csv (same reader, so the comment-character setting stays in effect)
        csvReader.setParameter(PARAMETER_FILENAME, csvFileNameSelection);
        csvReader.setParameter(CSVExampleSource.PARAMETER_ID_NAME, "");
        ExampleSet selectionExampleSet = (ExampleSet) csvReader.apply()[0];
        int clusteringCount = selectionExampleSet.size();

        this.logNote("Input is: " + elementCount + " elements and " + clusteringCount
                + " different clusterings.");

        // read out which clusterings are important
        List<Integer> selectedClusteringIds =
                readSelectedClusteringIds(selectionExampleSet, selectionColumnName);
        List<Attribute> selectedClusteringAttributes = collectSelectedAttributes(
                clusteringExampleSet, selectedClusteringIds, clusterColumnPrefix, aggregationColumnName);
        int selectedClusteringCount = selectedClusteringAttributes.size();

        // check if we have all selected attributes
        if (selectedClusteringIds.size() != selectedClusteringCount) {
            throw new Error("Error while retrieving all selected clustering attributes.");
        }
        this.logNote(selectedClusteringCount + " clusterings are selected.");

        // add new attribute to the clustering example set for the aggregation
        Attribute aggregationAttr =
                AttributeFactory.createAttribute(aggregationColumnName, Ontology.INTEGER);
        clusteringExampleSet.getExampleTable().addAttribute(aggregationAttr);
        clusteringExampleSet.getAttributes().setSpecialAttribute(aggregationAttr, aggregationColumnName);

        // set dummy value for this new attribute; Integer.MIN_VALUE marks "not in any cluster yet"
        for (Example element : clusteringExampleSet) {
            element.setValue(aggregationAttr, Integer.MIN_VALUE);
        }

        linkAgreeingExamples(clusteringExampleSet, selectedClusteringAttributes, aggregationAttr);

        // rename cluster ids
        int aggregationClusterCount =
                renumberClusters(clusteringExampleSet, aggregationAttr, elementCount);
        this.logNote("Aggregation contains " + aggregationClusterCount + " cluster.");

        // write meta config
        mc.setAggregationFileName(aggregationFileName);
        mc.setAggregationColumnName(aggregationColumnName);
        mc.setAggregationClusterCount(aggregationClusterCount);
        mc.setSelectorUsedForAggregation(selector);
        mc.save(metaFileName);

        return new IOObject[] { clusteringExampleSet };
    }

    /************************************************************************************************
     * PRIVATE METHODS
     ***********************************************************************************************/

    /**
     * Creates the CSV reader operator used to load both input files.
     *
     * @return a new {@link CSVExampleSource} operator instance
     * @throws Error if the operator cannot be created; the original exception is kept as cause
     */
    private Operator createCsvReader() {
        try {
            return OperatorService.createOperator(CSVExampleSource.class);
        } catch (OperatorCreationException oce) {
            throw new Error(oce.getMessage(), oce);
        }
    }

    /**
     * Collects the row positions of the selection example set whose selection column holds
     * "true" (case-insensitive).
     *
     * @param selectionExampleSet example set read from the selection CSV
     * @param selectionColumnName name of the column that flags selected clusterings
     * @return zero-based row positions of the selected clusterings, in ascending order
     * @throws Error if the selection example set has no attribute with the given name
     */
    private List<Integer> readSelectedClusteringIds(ExampleSet selectionExampleSet,
            String selectionColumnName) {
        Attribute selectionAttr = selectionExampleSet.getAttributes().get(selectionColumnName);
        if (selectionAttr == null) {
            throw new Error("At the selection CSV is no such column ('" + selectionColumnName + "')");
        }

        List<Integer> selectedClusteringIds = new ArrayList<Integer>();
        int position = 0;
        for (Example row : selectionExampleSet) {
            if ("true".equalsIgnoreCase(row.getNominalValue(selectionAttr))) {
                selectedClusteringIds.add(position);
            }
            position++;
        }
        return selectedClusteringIds;
    }

    /**
     * Picks the attributes of the clustering example set that belong to the selected clusterings.
     * Only attributes whose name starts with the clustering column prefix, or equals the
     * aggregation column name, are counted; the n-th counted attribute corresponds to the n-th
     * row of the selection example set.
     *
     * @param clusteringExampleSet example set read from the clustering CSV
     * @param selectedClusteringIds positions of the selected clusterings
     * @param clusterColumnPrefix name prefix of clustering columns
     * @param aggregationColumnName name of the aggregation column
     * @return the attributes of the selected clusterings, in attribute iteration order
     */
    private List<Attribute> collectSelectedAttributes(ExampleSet clusteringExampleSet,
            List<Integer> selectedClusteringIds, String clusterColumnPrefix,
            String aggregationColumnName) {
        List<Attribute> selectedClusteringAttributes = new LinkedList<Attribute>();
        StringBuilder selectedNames = new StringBuilder("Following clusterings are selected: ");
        int position = 0;
        for (Attribute attr : clusteringExampleSet.getAttributes()) {
            String attrName = attr.getName();
            boolean relevant = attrName.startsWith(clusterColumnPrefix)
                    || attrName.equals(aggregationColumnName);
            if (!relevant) {
                continue;
            }
            if (selectedClusteringIds.contains(position)) {
                selectedClusteringAttributes.add(attr);
                selectedNames.append(attrName).append(" ");
            }
            position++;
        }
        this.logNote(selectedNames.toString());
        return selectedClusteringAttributes;
    }

    /**
     * Assigns raw cluster ids to the aggregation attribute. For every unordered pair of elements
     * the fraction of selected clusterings that separate the two elements is computed; if it is
     * at most 0.5 the two elements are placed in the same cluster, creating a new cluster,
     * extending an existing one, or merging two existing ones as needed.
     * <p>
     * Pairs are enumerated by iteration position (every element against all elements that come
     * before it), so the result does not depend on the id values being sorted.
     *
     * @param clusteringExampleSet the elements to cluster; its aggregation attribute is updated
     * @param selectedClusteringAttributes attributes of the clusterings taking part in the vote
     * @param aggregationAttr attribute holding the cluster id, {@code Integer.MIN_VALUE} meaning
     *        "not assigned"
     */
    private void linkAgreeingExamples(ExampleSet clusteringExampleSet,
            List<Attribute> selectedClusteringAttributes, Attribute aggregationAttr) {
        // With zero selected clusterings this is infinite and every weight below evaluates to
        // NaN, so the "<= 0.5" test fails and no pair is linked.
        double baseWeight = 1.0d / (double) selectedClusteringAttributes.size();
        int curClusterCount = 0;

        int outerPosition = 0;
        for (Example example : clusteringExampleSet) { // u
            int innerPosition = 0;
            for (Example innerExample : clusteringExampleSet) { // v
                if (innerPosition >= outerPosition) {
                    // avoid checking v vs. u (because we already have u vs. v) and u vs. u
                    break;
                }
                innerPosition++;

                // calculate fraction of clusterings that places u and v in different clusters
                int disagreements = 0;
                for (Attribute attr : selectedClusteringAttributes) {
                    if ((int) example.getNumericalValue(attr)
                            != (int) innerExample.getNumericalValue(attr)) {
                        disagreements++;
                    }
                }
                double weight = disagreements * baseWeight;

                // check if the edge between u and v is of interest
                if (!(weight <= 0.5)) {
                    continue;
                }

                // primitives on purpose: comparing boxed Integers with != compares references
                int clusterId = (int) example.getNumericalValue(aggregationAttr);
                int innerClusterId = (int) innerExample.getNumericalValue(aggregationAttr);
                boolean outerAssigned = clusterId != Integer.MIN_VALUE;
                boolean innerAssigned = innerClusterId != Integer.MIN_VALUE;

                if (!outerAssigned && !innerAssigned) {
                    // both are not in a cluster -> add a new one for both
                    example.setValue(aggregationAttr, curClusterCount);
                    innerExample.setValue(aggregationAttr, curClusterCount);
                    this.logNote("Added a new cluster (#" + curClusterCount + ")");
                    curClusterCount++;
                } else if (outerAssigned && innerAssigned) {
                    // both are already in clusters -> merge them if the clusters differ
                    if (clusterId != innerClusterId) {
                        this.logNote("Merge cluster #" + innerClusterId + " into cluster #" + clusterId);
                        for (Example member : clusteringExampleSet) {
                            if ((int) member.getNumericalValue(aggregationAttr) == innerClusterId) {
                                member.setValue(aggregationAttr, clusterId);
                            }
                        }
                    }
                } else if (!outerAssigned) {
                    // only v is in a cluster -> add u to the same
                    example.setValue(aggregationAttr, innerClusterId);
                } else {
                    // only u is in a cluster -> add v to the same
                    innerExample.setValue(aggregationAttr, clusterId);
                }
            }
            outerPosition++;
        }
    }

    /**
     * Replaces the raw cluster ids by consecutive ids starting at 0. Elements are grouped by
     * their raw id; each distinct raw id receives the next free id, so two different raw ids can
     * never end up with the same new id and the returned count is exact.
     * <p>
     * NOTE(review): elements that were never linked to another element all still carry
     * {@code Integer.MIN_VALUE} and therefore end up together in one common cluster, as they did
     * before. Confirm whether they should become singleton clusters instead.
     *
     * @param clusteringExampleSet the example set whose aggregation attribute is rewritten
     * @param aggregationAttr attribute holding the cluster ids
     * @param elementCount number of examples, used to size the sort index
     * @return the number of distinct clusters after renumbering (0 for an empty example set)
     */
    private int renumberClusters(ExampleSet clusteringExampleSet, Attribute aggregationAttr,
            int elementCount) {
        // sort row positions by raw cluster id so that equal ids become adjacent
        // (presumably SortingIndex orders by the given value; verify against SortingIndex)
        List<SortingIndex> sortedIndex = new ArrayList<SortingIndex>(elementCount);
        int position = 0;
        for (Example element : clusteringExampleSet) {
            sortedIndex.add(new SortingIndex(
                    Double.valueOf(element.getNumericalValue(aggregationAttr)), position));
            position++;
        }
        Collections.sort(sortedIndex);

        int denseId = -1;
        boolean first = true;
        int previousRawId = 0;
        for (SortingIndex entry : sortedIndex) {
            Example element = clusteringExampleSet.getExample(entry.getIndex());
            // each row is read exactly once before it is overwritten, so rewriting in place is safe
            int rawId = (int) element.getNumericalValue(aggregationAttr);
            if (first || rawId != previousRawId) {
                denseId++;
                previousRawId = rawId;
                first = false;
            }
            element.setValue(aggregationAttr, denseId);
        }
        return denseId + 1;
    }
}