/*
 *  RapidMiner
 *
 *  Copyright (C) 2001-2008 by Rapid-I and the contributors
 *
 *  Complete list of developers available at our web site:
 *
 *       http://rapid-i.com
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Affero General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Affero General Public License for more details.
 *
 *  You should have received a copy of the GNU Affero General Public License
 *  along with this program.  If not, see http://www.gnu.org/licenses/.
 */
package com.rapidminer.operator.visualization.dependencies;

import java.util.List;

import com.rapidminer.example.Attribute;
import com.rapidminer.example.Example;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.operator.IOContainer;
import com.rapidminer.operator.Operator;
import com.rapidminer.operator.OperatorCreationException;
import com.rapidminer.operator.OperatorDescription;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.preprocessing.discretization.BinDiscretization;
import com.rapidminer.parameter.ParameterType;
import com.rapidminer.parameter.ParameterTypeInt;
import com.rapidminer.tools.OperatorService;
import com.rapidminer.tools.math.MathFunctions;

/**
 * <p>This operator calculates the mutual information matrix between all attributes of the
 * input example set. This operator simply produces a dependency matrix which up to now
 * cannot be used by other operators but can be displayed to the user in the result tab.</p>
 *
 * <p>Please note that this simple implementation
 * performs a data scan for each attribute combination and might therefore take
 * some time for non-memory example tables.</p>
 *
 * @author Ingo Mierswa
 * @version $Id: MutualInformationMatrixOperator.java,v 1.1 2008/08/25 08:10:33 ingomierswa Exp $
 */
public class MutualInformationMatrixOperator extends AbstractPairwiseMatrixOperator {

	public MutualInformationMatrixOperator(OperatorDescription description) {
		super(description);
	}

	/**
	 * Discretizes a clone of the input example set with a {@link BinDiscretization}
	 * operator configured to create a view, using this operator's number-of-bins parameter.
	 *
	 * @param eSet the example set to discretize; it is cloned before the operator is applied
	 * @return the example set delivered by the discretization operator
	 * @throws OperatorException if the discretization operator cannot be created (the
	 *         creation failure is attached as the cause) or if applying it fails
	 */
	protected ExampleSet performPreprocessing(ExampleSet eSet) throws OperatorException {
		try {
			Operator discretizationOperator = OperatorService.createOperator(BinDiscretization.class);
			discretizationOperator.setParameter(
					BinDiscretization.PARAMETER_NUMBER_OF_BINS,
					String.valueOf(getParameterAsInt(BinDiscretization.PARAMETER_NUMBER_OF_BINS)));
			discretizationOperator.setParameter(BinDiscretization.PARAMETER_CREATE_VIEW, "true");
			return discretizationOperator.apply(new IOContainer((ExampleSet) eSet.clone())).get(ExampleSet.class);
		} catch (OperatorCreationException e) {
			// should not happen; keep the original exception as cause so the stack trace is not lost
			throw new OperatorException(getName() + ": Cannot create discretization operator (" + e + ").", e);
		}
	}

	public String getMatrixName() {
		return "Mutual Information";
	}

	/**
	 * Calculates the mutual information of both attributes as
	 * <code>H(first) + H(second) - H(first, second)</code>.
	 *
	 * <p>Only examples in which <em>both</em> attribute values are not NaN contribute to the
	 * estimate, and both marginal distributions are derived from the same joint count table.
	 * Estimating the marginals and the joint distribution from different subsets of the
	 * examples could otherwise yield inconsistent (even negative) results.</p>
	 *
	 * <p>The example values are cast to <code>int</code> and used as indices into tables sized
	 * by the attributes' mappings, so both attributes must provide a mapping.</p>
	 *
	 * @param exampleSet the examples to scan
	 * @param firstAttribute the first attribute of the pair
	 * @param secondAttribute the second attribute of the pair
	 * @return the estimated mutual information, or 0 if no example has a non-NaN value
	 *         for both attributes
	 */
	public double getMatrixValue(ExampleSet exampleSet, Attribute firstAttribute, Attribute secondAttribute) {
		int firstSize = firstAttribute.getMapping().size();
		int secondSize = secondAttribute.getMapping().size();

		// count joint occurrences over the examples where both values are present
		double[][] jointCounts = new double[firstSize][secondSize];
		double pairCounter = 0.0d;
		for (Example example : exampleSet) {
			double firstValue = example.getValue(firstAttribute);
			double secondValue = example.getValue(secondAttribute);
			if (Double.isNaN(firstValue) || Double.isNaN(secondValue)) {
				continue;
			}
			jointCounts[(int) firstValue][(int) secondValue]++;
			pairCounter++;
		}

		// without any jointly observed pair there is nothing to estimate
		if (pairCounter == 0.0d) {
			return 0.0d;
		}

		// derive marginal counts from the joint table and transform everything to probabilities;
		// the joint probabilities are stored row by row
		double[] firstProbabilities = new double[firstSize];
		double[] secondProbabilities = new double[secondSize];
		double[] jointProbabilities = new double[firstSize * secondSize];
		for (int i = 0; i < firstSize; i++) {
			for (int j = 0; j < secondSize; j++) {
				double count = jointCounts[i][j];
				firstProbabilities[i] += count;
				secondProbabilities[j] += count;
				jointProbabilities[i * secondSize + j] = count / pairCounter;
			}
		}
		for (int i = 0; i < firstSize; i++) {
			firstProbabilities[i] /= pairCounter;
		}
		for (int j = 0; j < secondSize; j++) {
			secondProbabilities[j] /= pairCounter;
		}

		double firstEntropy = entropy(firstProbabilities);
		double secondEntropy = entropy(secondProbabilities);
		double jointEntropy = entropy(jointProbabilities);
		return firstEntropy + secondEntropy - jointEntropy;
	}

	/**
	 * Returns the negated sum of <code>p * MathFunctions.ld(p)</code> over all strictly
	 * positive entries of the given array; entries that are zero are skipped.
	 */
	private static double entropy(double[] probabilities) {
		double sum = 0.0d;
		for (double probability : probabilities) {
			if (probability > 0.0d) {
				sum += probability * MathFunctions.ld(probability);
			}
		}
		return -sum;
	}

	/**
	 * Returns the parameter types of the super class extended by the number of bins
	 * (minimum 2, default 10) which is handed to the discretization operator.
	 */
	public List<ParameterType> getParameterTypes() {
		List<ParameterType> types = super.getParameterTypes();
		types.add(new ParameterTypeInt(BinDiscretization.PARAMETER_NUMBER_OF_BINS, "Indicates the number of bins used for numerical attributes.", 2, Integer.MAX_VALUE, 10));
		return types;
	}
}