package de.tud.inf.operator.mm; import java.util.ArrayList; import java.util.Collections; import java.util.Iterator; import java.util.LinkedList; import java.util.List; import com.rapidminer.example.Attribute; import com.rapidminer.example.Example; import com.rapidminer.example.ExampleSet; import com.rapidminer.example.table.AttributeFactory; import com.rapidminer.example.table.DataRow; import com.rapidminer.operator.IOObject; import com.rapidminer.operator.OperatorDescription; import com.rapidminer.operator.OperatorException; import com.rapidminer.operator.UserError; import com.rapidminer.tools.Ontology; import de.tud.inf.operator.mm.util.ClusteringInfo; import de.tud.inf.operator.mm.util.MetaConfig; import de.tud.inf.operator.mm.util.SortingIndex; /** * This class implements the Joint-Criterion strategy. * * {@link http://www.siam.org/proceedings/datamining/2008/dm08_71_fern.pdf} * * @version $Revision$ * @author Andre Jaehnig */ public class JointCriterionSelector extends AbstractSelector { /************************************************************************************************ * FIELDS ***********************************************************************************************/ /** Column name with the indices of the selected order. */ private static final String JOINT_CRITERION_COLUMN_NAME_ORDER = "jc_order"; /** Column name with the selected flag. */ private static final String JOINT_CRITERION_COLUMN_NAME_SELECTED = "jc_selected"; /** Weighting-factor for the calculation. */ private static final double ALPHA = 0.5d; /************************************************************************************************ * GETTER & SETTER ***********************************************************************************************/ /* * none */ /************************************************************************************************ * CONSTRUCTOR ***********************************************************************************************/ /** * Constructor. * * @param description */ public JointCriterionSelector(OperatorDescription description) { super(description); } /************************************************************************************************ * PUBLIC METHODS ***********************************************************************************************/ /* * (non-Javadoc) * * @see com.rapidminer.operator.Operator#apply() */ @Override public IOObject[] apply() throws OperatorException { // get example set ExampleSet exampleSet = this.getInput(ExampleSet.class); int exampleSetSize = exampleSet.size(); this.logNote("Input example-set has " + exampleSetSize + " elements."); // get parameters String metaFileName = this.getParameterAsString(PARAMETER_META_FILENAME); String selectorFileName = this.getParameterAsString(PARAMETER_SELECTOR_FILENAME); MetaConfig mc = MetaConfig.load(metaFileName); String snmiColumnName = mc.getSnmiColumnName(); String clusterColumnPrefix = mc.getClusteringColumnPrefix(); int sampleSize = this.getParameterAsInt(PARAMETER_SAMPLE_SIZE); if (sampleSize < 1 || sampleSize > exampleSetSize) { throw new UserError(this, 116, new Object[] { PARAMETER_SAMPLE_SIZE, sampleSize }); } this.logNote("Requested clustering sample size: " + sampleSize); // create attributes for the selection flag and for a general order of selection Attribute jcOrderAttr = AttributeFactory.createAttribute(JOINT_CRITERION_COLUMN_NAME_ORDER, Ontology.INTEGER); Attribute jcSelectedAttr = AttributeFactory.createAttribute(JOINT_CRITERION_COLUMN_NAME_SELECTED, Ontology.NOMINAL); exampleSet.getExampleTable().addAttribute(jcOrderAttr); exampleSet.getExampleTable().addAttribute(jcSelectedAttr); // add attribute to view exampleSet.getAttributes().setSpecialAttribute(jcOrderAttr, JOINT_CRITERION_COLUMN_NAME_ORDER); exampleSet.getAttributes().setSpecialAttribute(jcSelectedAttr, JOINT_CRITERION_COLUMN_NAME_SELECTED); // add dummy attribute-column (only to the table) Attribute workingSNMIAttr = AttributeFactory.createAttribute(WORKING_COLUMN_NAME + "1", Ontology.REAL); Attribute workingNMIAttr = AttributeFactory.createAttribute(WORKING_COLUMN_NAME + "2", Ontology.REAL); Attribute workingSumAttr = AttributeFactory.createAttribute(WORKING_COLUMN_NAME + "3", Ontology.REAL); exampleSet.getExampleTable().addAttribute(workingSNMIAttr); exampleSet.getExampleTable().addAttribute(workingNMIAttr); exampleSet.getExampleTable().addAttribute(workingSumAttr); // get a sorted iterator over the snmi-column of the nmi-csv-file Attribute snmiAttr = exampleSet.getAttributes().get(snmiColumnName); if (snmiAttr == null) { throw new UserError(this, 111, snmiColumnName); } List<SortingIndex> sortedIndex = new ArrayList<SortingIndex>(exampleSetSize); int counter = 0; Iterator<Example> it = exampleSet.iterator(); Example example = null; while (it.hasNext()) { example = it.next(); sortedIndex.add(new SortingIndex(Double.valueOf(example.getNumericalValue(snmiAttr)), counter)); counter++; } Collections.sort(sortedIndex); // setting first selected element-index (the one with the highest snmi) int selectedIndex = sortedIndex.get(exampleSetSize - 1).getIndex(); // a list with all (so far) selected indices List<Integer> selectedIndices = new LinkedList<Integer>(); // fill selection attributes int order = 0; double sumSNMIs = 0.0d; double sumNMIs = 0.0d; double curSNMI, curNMI, newSumSNMIs, newSumNMIs; DataRow selectedRow = null; DataRow curRow = null; for (int i = 0; i < exampleSetSize; i++) { for (int j = 0; j < exampleSetSize; j++) { if (selectedIndices.contains(j)) { // not relevant anymore continue; } curRow = exampleSet.getExampleTable().getDataRow(j); curSNMI = curRow.get(snmiAttr); curNMI = 0.0d; for (Attribute attr : exampleSet.getAttributes()) { if (!attr.getName().startsWith(clusterColumnPrefix)) { // not relevant continue; } if (selectedIndices.contains(Integer.valueOf(attr.getName().substring(clusterColumnPrefix.length())))) { curNMI += 1.0d - curRow.get(attr); } } newSumSNMIs = sumSNMIs + curSNMI; newSumNMIs = sumNMIs + curNMI; curRow.set(workingSNMIAttr, newSumSNMIs); curRow.set(workingNMIAttr, newSumNMIs); curRow.set(workingSumAttr, ALPHA * newSumSNMIs + (1.0d - ALPHA) * newSumNMIs); } // find maximum sortedIndex = new ArrayList<SortingIndex>(exampleSetSize); counter = 0; it = exampleSet.iterator(); while (it.hasNext()) { example = it.next(); sortedIndex.add(new SortingIndex(Double.valueOf(example.getNumericalValue(workingSumAttr)), counter)); counter++; } Collections.sort(sortedIndex); // set new selected index selectedIndex = sortedIndex.get(exampleSetSize - 1).getIndex(); selectedIndices.add(selectedIndex); // get new basic values for the calculation selectedRow = exampleSet.getExampleTable().getDataRow(selectedIndex); sumSNMIs = selectedRow.get(workingSNMIAttr); sumNMIs = selectedRow.get(workingNMIAttr); // reset working values of this selected index so that they are no longer relevant selectedRow.set(workingSNMIAttr, Double.NEGATIVE_INFINITY); selectedRow.set(workingNMIAttr, Double.NEGATIVE_INFINITY); selectedRow.set(workingSumAttr, Double.NEGATIVE_INFINITY); // set ordering-index selectedRow.set(jcOrderAttr, order); // set selected flag if (order < sampleSize) { exampleSet.getExample(selectedIndex).setValue(jcSelectedAttr, "true"); } else { exampleSet.getExample(selectedIndex).setValue(jcSelectedAttr, "false"); } order++; } // remove working attributes exampleSet.getExampleTable().removeAttribute(workingSNMIAttr); exampleSet.getExampleTable().removeAttribute(workingNMIAttr); exampleSet.getExampleTable().removeAttribute(workingSumAttr); // write meta config mc.setSelectorFileName(selectorFileName); ClusteringInfo ci = new ClusteringInfo(); ci.setInfoColumnName(JOINT_CRITERION_COLUMN_NAME_ORDER); ci.setSelectedColumnName(JOINT_CRITERION_COLUMN_NAME_SELECTED); ci.setSampleSize(sampleSize); mc.getClusteringInfo().put(this.getClass().getName(), ci); mc.save(metaFileName); return new IOObject[] { exampleSet }; } /************************************************************************************************ * PRIVATE METHODS ***********************************************************************************************/ /* * none */ }