/* * RapidMiner * * Copyright (C) 2001-2008 by Rapid-I and the contributors * * Complete list of developers available at our web site: * * http://rapid-i.com * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see http://www.gnu.org/licenses/. */ package com.rapidminer.operator.validation.significance; import java.awt.Color; import javax.swing.JEditorPane; import javax.swing.JLabel; import com.rapidminer.gui.tools.ExtendedJScrollPane; import com.rapidminer.gui.tools.SwingTools; import com.rapidminer.operator.IOContainer; import com.rapidminer.operator.OperatorDescription; import com.rapidminer.operator.performance.PerformanceCriterion; import com.rapidminer.operator.performance.PerformanceVector; import com.rapidminer.tools.Tools; import com.rapidminer.tools.math.FDistribution; import com.rapidminer.tools.math.SignificanceTestResult; /** * Determines if the null hypothesis (all actual mean values are the same) holds * for the input performance vectors. This operator uses a simple (pairwise) * t-test to determine the probability that the null hypothesis is wrong. Since * a t-test can only be applied on two performance vectors this test will be * applied to all possible pairs. The result is a significance matrix. However, * pairwise t-test may introduce a larger type I error. It is recommended to * apply an additional ANOVA test to determine if the null hypothesis is wrong * at all. * * @author Ingo Mierswa * @version $Id: TTestSignificanceTestOperator.java,v 1.5 2006/03/21 15:35:53 * ingomierswa Exp $ */ public class TTestSignificanceTestOperator extends SignificanceTestOperator { /** The result for a paired t-test. */ public static class TTestSignificanceTestResult extends SignificanceTestResult { private static final long serialVersionUID = -5412090499056975997L; private PerformanceVector[] allVectors; private double[][] probMatrix; private double alpha = 0.05d; public TTestSignificanceTestResult(PerformanceVector[] allVectors, double[][] probMatrix, double alpha) { this.allVectors = allVectors; this.probMatrix = probMatrix; this.alpha = alpha; } public String getName() { return "Pairwise t-Test"; } /** Returns NaN since no single probability will be delivered. */ public double getProbability() { return Double.NaN; } public String toString() { StringBuffer result = new StringBuffer(); result.append("Probabilities for random values with the same result:" + Tools.getLineSeparator()); for (int i = 0; i < allVectors.length; i++) { for (int j = 0; j < allVectors.length; j++) { if (!Double.isNaN(probMatrix[i][j])) result.append(Tools.formatNumber(probMatrix[i][j]) + "\t"); else result.append("-----\t"); } result.append(Tools.getLineSeparator()); } result.append("Values smaller than alpha=" + Tools.formatNumber(alpha) + " indicate a probably significant difference between the mean values!" + Tools.getLineSeparator()); result.append("List of performance values:" + Tools.getLineSeparator()); for (int i = 0; i < allVectors.length; i++) { result.append(i + ": " + Tools.formatNumber(allVectors[i].getMainCriterion().getAverage()) + " +/- " + Tools.formatNumber(Math.sqrt(allVectors[i].getMainCriterion().getVariance())) + Tools.getLineSeparator()); } return result.toString(); } public java.awt.Component getVisualizationComponent(IOContainer container) { StringBuffer buffer = new StringBuffer(); Color bgColor = SwingTools.LIGHTEST_YELLOW; String bgColorString = Integer.toHexString(bgColor.getRed()) + Integer.toHexString(bgColor.getGreen()) + Integer.toHexString(bgColor.getBlue()); buffer.append("<table bgcolor=\""+bgColorString+"\" border=\"1\">"); buffer.append("<tr><td></td>"); for (int i = 0; i < allVectors.length; i++) { buffer.append("<td>" + Tools.formatNumber(allVectors[i].getMainCriterion().getAverage()) + " +/- " + Tools.formatNumber(Math.sqrt(allVectors[i].getMainCriterion().getVariance())) + "</td>"); } buffer.append("</tr>"); for (int i = 0; i < allVectors.length; i++) { buffer.append("<tr><td>" + Tools.formatNumber(allVectors[i].getMainCriterion().getAverage()) + " +/- " + Tools.formatNumber(Math.sqrt(allVectors[i].getMainCriterion().getVariance())) + "</td>"); for (int j = 0; j < allVectors.length; j++) { buffer.append("<td>"); if (!Double.isNaN(probMatrix[i][j])) { double prob = probMatrix[i][j]; if (prob < alpha) { buffer.append("<b>"); } buffer.append(Tools.formatNumber(prob)); if (prob < alpha) { buffer.append("</b>"); } } buffer.append("</td>"); } buffer.append("</tr>"); } buffer.append("</table>"); buffer.append("<br>Probabilities for random values with the same result.<br>Bold values are smaller than alpha=" + Tools.formatNumber(alpha) + " which indicates a probably significant difference between the actual mean values!"); JEditorPane textPane = new JEditorPane("text/html", "<html><h1>" + getName() + "</h1>" + buffer.toString() + "</html>"); textPane.setBackground((new JLabel()).getBackground()); textPane.setBorder(javax.swing.BorderFactory.createEmptyBorder(11, 11, 11, 11)); return new ExtendedJScrollPane(textPane); } } public TTestSignificanceTestOperator(OperatorDescription description) { super(description); } public SignificanceTestResult performSignificanceTest(PerformanceVector[] allVectors, double alpha) { double[][] resultMatrix = new double[allVectors.length][allVectors.length]; for (int i = 0; i < allVectors.length; i++) { for (int j = 0; j < (i + 1); j++) resultMatrix[i][j] = Double.NaN; // fill lower triangle with // NaN --> empty in result // string for (int j = i + 1; j < allVectors.length; j++) { resultMatrix[i][j] = getProbability(allVectors[i].getMainCriterion(), allVectors[j].getMainCriterion()); } } return new TTestSignificanceTestResult(allVectors, resultMatrix, alpha); } private double getProbability(PerformanceCriterion pc1, PerformanceCriterion pc2) { double totalDeviation = ((pc1.getAverageCount() - 1) * pc1.getVariance() + (pc2.getAverageCount() - 1) * pc2.getVariance()) / (pc1.getAverageCount() + pc2.getAverageCount() - 2); double factor = 1.0d / (1.0d / pc1.getAverageCount() + 1.0d / pc2.getAverageCount()); double diff = pc1.getAverage() - pc2.getAverage(); double t = factor * diff * diff / totalDeviation; FDistribution fDist = new FDistribution(1, pc1.getAverageCount() + pc2.getAverageCount() - 2); double prob = fDist.getProbabilityForValue(t); prob = prob < 0 ? 1.0d : 1.0d - prob; return prob; } public int getMinSize() { return 2; } public int getMaxSize() { return Integer.MAX_VALUE; } }