/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

/*
 *    ContingencyTables.java
 *    Copyright (C) 1999-2012 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.core;

/**
 * Class implementing some statistical routines for contingency tables.
 *
 * @author Eibe Frank (eibe@cs.waikato.ac.nz)
 * @version $Revision: 8925 $
 */
public class ContingencyTables implements RevisionHandler {

  /** The natural logarithm of 2 */
  private static double log2 = Math.log(2);

  /**
   * Returns chi-squared probability for a given matrix.
   *
   * @param matrix the contingency table
   * @param yates is Yates' correction to be used?
   * @return the chi-squared probability
   */
  public static double chiSquared(double[][] matrix, boolean yates) {

    int df = (matrix.length - 1) * (matrix[0].length - 1);

    return Statistics.chiSquaredProbability(chiVal(matrix, yates), df);
  }

  /**
   * Computes the chi-squared statistic for a contingency table.
   *
   * @param matrix the contingency table
   * @param useYates is Yates' correction to be used?
   * @return the value of the chi-squared statistic
   */
  public static double chiVal(double[][] matrix, boolean useYates) {

    int df, nrows, ncols, row, col;
    double[] rtotal, ctotal;
    double expect = 0, chival = 0, n = 0;
    boolean yates = true;

    nrows = matrix.length;
    ncols = matrix[0].length;
    rtotal = new double[nrows];
    ctotal = new double[ncols];
    for (row = 0; row < nrows; row++) {
      for (col = 0; col < ncols; col++) {
        rtotal[row] += matrix[row][col];
        ctotal[col] += matrix[row][col];
        n += matrix[row][col];
      }
    }
    df = (nrows - 1) * (ncols - 1);
    if ((df > 1) || (!useYates)) {
      yates = false;
    } else if (df <= 0) {
      return 0;
    }
    chival = 0.0;
    for (row = 0; row < nrows; row++) {
      if (Utils.gr(rtotal[row], 0)) {
        for (col = 0; col < ncols; col++) {
          if (Utils.gr(ctotal[col], 0)) {
            expect = (ctotal[col] * rtotal[row]) / n;
            chival += chiCell(matrix[row][col], expect, yates);
          }
        }
      }
    }
    return chival;
  }
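  // The value computed by chiVal() is the standard Pearson statistic,
  //   chi^2 = sum over all cells of (observed - expected)^2 / expected,
  // where expected = (row total * column total) / n. Yates' correction is
  // applied by chiCell() only when the table has a single degree of freedom
  // and useYates is true; it replaces each numerator by
  // (|observed - expected| - 0.5)^2. A minimal usage sketch (the counts are
  // illustrative only):
  //
  //   double[][] table = {{10, 20}, {30, 40}};
  //   double stat = ContingencyTables.chiVal(table, true);    // Yates-corrected statistic
  //   double p = ContingencyTables.chiSquared(table, true);   // corresponding probability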
  /**
   * Tests if Cochran's criterion is fulfilled for the given contingency
   * table. Rows and columns with all zeros are not considered relevant.
   *
   * @param matrix the contingency table to be tested
   * @return true if the contingency table is ok, false if not
   */
  public static boolean cochransCriterion(double[][] matrix) {

    double[] rtotal, ctotal;
    double n = 0, expect, smallfreq = 5;
    int smallcount = 0, nonZeroRows = 0, nonZeroColumns = 0, nrows, ncols,
      row, col;

    nrows = matrix.length;
    ncols = matrix[0].length;
    rtotal = new double[nrows];
    ctotal = new double[ncols];
    for (row = 0; row < nrows; row++) {
      for (col = 0; col < ncols; col++) {
        rtotal[row] += matrix[row][col];
        ctotal[col] += matrix[row][col];
        n += matrix[row][col];
      }
    }
    for (row = 0; row < nrows; row++) {
      if (Utils.gr(rtotal[row], 0)) {
        nonZeroRows++;
      }
    }
    for (col = 0; col < ncols; col++) {
      if (Utils.gr(ctotal[col], 0)) {
        nonZeroColumns++;
      }
    }
    for (row = 0; row < nrows; row++) {
      if (Utils.gr(rtotal[row], 0)) {
        for (col = 0; col < ncols; col++) {
          if (Utils.gr(ctotal[col], 0)) {
            expect = (ctotal[col] * rtotal[row]) / n;
            if (Utils.sm(expect, smallfreq)) {
              if (Utils.sm(expect, 1)) {
                return false;
              } else {
                smallcount++;
                if (smallcount > (nonZeroRows * nonZeroColumns) / smallfreq) {
                  return false;
                }
              }
            }
          }
        }
      }
    }
    return true;
  }

  /**
   * Computes Cramer's V for a contingency table.
   *
   * @param matrix the contingency table
   * @return Cramer's V
   */
  public static double CramersV(double[][] matrix) {

    int row, col, nrows, ncols, min;
    double n = 0;

    nrows = matrix.length;
    ncols = matrix[0].length;
    for (row = 0; row < nrows; row++) {
      for (col = 0; col < ncols; col++) {
        n += matrix[row][col];
      }
    }
    min = nrows < ncols ? nrows - 1 : ncols - 1;
    if ((min == 0) || Utils.eq(n, 0))
      return 0;
    return Math.sqrt(chiVal(matrix, false) / (n * (double) min));
  }

  /**
   * Computes the entropy of the given array.
   *
   * @param array the array
   * @return the entropy
   */
  public static double entropy(double[] array) {

    double returnValue = 0, sum = 0;

    for (int i = 0; i < array.length; i++) {
      returnValue -= lnFunc(array[i]);
      sum += array[i];
    }
    if (Utils.eq(sum, 0)) {
      return 0;
    } else {
      return (returnValue + lnFunc(sum)) / (sum * log2);
    }
  }

  /**
   * Computes conditional entropy of the rows given
   * the columns.
   *
   * @param matrix the contingency table
   * @return the conditional entropy of the rows given the columns
   */
  public static double entropyConditionedOnColumns(double[][] matrix) {

    double returnValue = 0, sumForColumn, total = 0;

    for (int j = 0; j < matrix[0].length; j++) {
      sumForColumn = 0;
      for (int i = 0; i < matrix.length; i++) {
        returnValue = returnValue + lnFunc(matrix[i][j]);
        sumForColumn += matrix[i][j];
      }
      returnValue = returnValue - lnFunc(sumForColumn);
      total += sumForColumn;
    }
    if (Utils.eq(total, 0)) {
      return 0;
    }
    return -returnValue / (total * log2);
  }

  /**
   * Computes conditional entropy of the columns given
   * the rows.
   *
   * @param matrix the contingency table
   * @return the conditional entropy of the columns given the rows
   */
  public static double entropyConditionedOnRows(double[][] matrix) {

    double returnValue = 0, sumForRow, total = 0;

    for (int i = 0; i < matrix.length; i++) {
      sumForRow = 0;
      for (int j = 0; j < matrix[0].length; j++) {
        returnValue = returnValue + lnFunc(matrix[i][j]);
        sumForRow += matrix[i][j];
      }
      returnValue = returnValue - lnFunc(sumForRow);
      total += sumForRow;
    }
    if (Utils.eq(total, 0)) {
      return 0;
    }
    return -returnValue / (total * log2);
  }
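  // The entropy routines above operate on raw counts rather than
  // probabilities: for counts n_k with total N, the entropy in bits is
  //   (N * ln(N) - sum_k n_k * ln(n_k)) / (N * ln 2),
  // which equals -sum_k (n_k / N) * log2(n_k / N). The two conditional
  // variants apply the same identity per column or per row, giving
  // H(rows | columns) and H(columns | rows) respectively.
  // A minimal sketch (illustrative counts):
  //
  //   double h = ContingencyTables.entropy(new double[] {50, 50});  // ~1 bit for an even split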
  /**
   * Computes conditional entropy of the columns given the rows
   * of the test matrix with respect to the train matrix. Uses a
   * Laplace prior. Does NOT normalize the entropy.
   *
   * @param train the train matrix
   * @param test the test matrix
   * @param numClasses the number of symbols for Laplace
   * @return the entropy
   */
  public static double entropyConditionedOnRows(double[][] train,
                                                double[][] test,
                                                double numClasses) {

    double returnValue = 0, trainSumForRow, testSumForRow, testSum = 0;

    for (int i = 0; i < test.length; i++) {
      trainSumForRow = 0;
      testSumForRow = 0;
      for (int j = 0; j < test[0].length; j++) {
        returnValue -= test[i][j] * Math.log(train[i][j] + 1);
        trainSumForRow += train[i][j];
        testSumForRow += test[i][j];
      }
      testSum = testSumForRow;
      returnValue += testSumForRow * Math.log(trainSumForRow + numClasses);
    }
    return returnValue / (testSum * log2);
  }

  /**
   * Computes the rows' entropy for the given contingency table.
   *
   * @param matrix the contingency table
   * @return the rows' entropy
   */
  public static double entropyOverRows(double[][] matrix) {

    double returnValue = 0, sumForRow, total = 0;

    for (int i = 0; i < matrix.length; i++) {
      sumForRow = 0;
      for (int j = 0; j < matrix[0].length; j++) {
        sumForRow += matrix[i][j];
      }
      returnValue = returnValue - lnFunc(sumForRow);
      total += sumForRow;
    }
    if (Utils.eq(total, 0)) {
      return 0;
    }
    return (returnValue + lnFunc(total)) / (total * log2);
  }

  /**
   * Computes the columns' entropy for the given contingency table.
   *
   * @param matrix the contingency table
   * @return the columns' entropy
   */
  public static double entropyOverColumns(double[][] matrix) {

    double returnValue = 0, sumForColumn, total = 0;

    for (int j = 0; j < matrix[0].length; j++) {
      sumForColumn = 0;
      for (int i = 0; i < matrix.length; i++) {
        sumForColumn += matrix[i][j];
      }
      returnValue = returnValue - lnFunc(sumForColumn);
      total += sumForColumn;
    }
    if (Utils.eq(total, 0)) {
      return 0;
    }
    return (returnValue + lnFunc(total)) / (total * log2);
  }

  /**
   * Computes the gain ratio for a contingency table (split on rows).
   * Returns 0 if the split entropy is 0.
   *
   * @param matrix the contingency table
   * @return the gain ratio
   */
  public static double gainRatio(double[][] matrix) {

    double preSplit = 0, postSplit = 0, splitEnt = 0,
      sumForRow, sumForColumn, total = 0, infoGain;

    // Compute entropy before split
    for (int i = 0; i < matrix[0].length; i++) {
      sumForColumn = 0;
      for (int j = 0; j < matrix.length; j++)
        sumForColumn += matrix[j][i];
      preSplit += lnFunc(sumForColumn);
      total += sumForColumn;
    }
    preSplit -= lnFunc(total);

    // Compute entropy after split and split entropy
    for (int i = 0; i < matrix.length; i++) {
      sumForRow = 0;
      for (int j = 0; j < matrix[0].length; j++) {
        postSplit += lnFunc(matrix[i][j]);
        sumForRow += matrix[i][j];
      }
      splitEnt += lnFunc(sumForRow);
    }
    postSplit -= splitEnt;
    splitEnt -= lnFunc(total);

    infoGain = preSplit - postSplit;
    if (Utils.eq(splitEnt, 0))
      return 0;
    return infoGain / splitEnt;
  }
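  // gainRatio() computes everything from unnormalized lnFunc-based sums: up to
  // a common factor of -N * ln 2, preSplit corresponds to H(columns), postSplit
  // to H(columns | rows) and splitEnt to H(rows), so the returned quotient is
  //   gain ratio = (H(columns) - H(columns | rows)) / H(rows),
  // i.e. the information gain of the row split normalized by its split entropy.
  // The common factor cancels, which is why no division by the total count or
  // by ln 2 appears in the method itself.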
  /**
   * Returns negative base 2 logarithm of multiple hypergeometric
   * probability for a contingency table.
   *
   * @param matrix the contingency table
   * @return the log of the hypergeometric probability of the contingency table
   */
  public static double log2MultipleHypergeometric(double[][] matrix) {

    double sum = 0, sumForRow, sumForColumn, total = 0;

    for (int i = 0; i < matrix.length; i++) {
      sumForRow = 0;
      for (int j = 0; j < matrix[i].length; j++) {
        sumForRow += matrix[i][j];
      }
      sum += SpecialFunctions.lnFactorial(sumForRow);
      total += sumForRow;
    }
    for (int j = 0; j < matrix[0].length; j++) {
      sumForColumn = 0;
      for (int i = 0; i < matrix.length; i++) {
        sumForColumn += matrix[i][j];
      }
      sum += SpecialFunctions.lnFactorial(sumForColumn);
    }
    for (int i = 0; i < matrix.length; i++) {
      for (int j = 0; j < matrix[i].length; j++) {
        sum -= SpecialFunctions.lnFactorial(matrix[i][j]);
      }
    }
    sum -= SpecialFunctions.lnFactorial(total);
    return -sum / log2;
  }

  /**
   * Reduces a matrix by deleting all zero rows and columns.
   *
   * @param matrix the matrix to be reduced
   * @return the matrix with all zero rows and columns deleted
   */
  public static double[][] reduceMatrix(double[][] matrix) {

    int row, col, currCol, currRow, nrows, ncols,
      nonZeroRows = 0, nonZeroColumns = 0;
    double[] rtotal, ctotal;
    double[][] newMatrix;

    nrows = matrix.length;
    ncols = matrix[0].length;
    rtotal = new double[nrows];
    ctotal = new double[ncols];
    for (row = 0; row < nrows; row++) {
      for (col = 0; col < ncols; col++) {
        rtotal[row] += matrix[row][col];
        ctotal[col] += matrix[row][col];
      }
    }
    for (row = 0; row < nrows; row++) {
      if (Utils.gr(rtotal[row], 0)) {
        nonZeroRows++;
      }
    }
    for (col = 0; col < ncols; col++) {
      if (Utils.gr(ctotal[col], 0)) {
        nonZeroColumns++;
      }
    }
    newMatrix = new double[nonZeroRows][nonZeroColumns];
    currRow = 0;
    for (row = 0; row < nrows; row++) {
      if (Utils.gr(rtotal[row], 0)) {
        currCol = 0;
        for (col = 0; col < ncols; col++) {
          if (Utils.gr(ctotal[col], 0)) {
            newMatrix[currRow][currCol] = matrix[row][col];
            currCol++;
          }
        }
        currRow++;
      }
    }
    return newMatrix;
  }

  /**
   * Calculates the symmetrical uncertainty for base 2.
   *
   * @param matrix the contingency table
   * @return the calculated symmetrical uncertainty
   */
  public static double symmetricalUncertainty(double[][] matrix) {

    double sumForColumn, sumForRow, total = 0, columnEntropy = 0,
      rowEntropy = 0, entropyConditionedOnRows = 0, infoGain = 0;

    // Compute entropy for columns
    for (int i = 0; i < matrix[0].length; i++) {
      sumForColumn = 0;
      for (int j = 0; j < matrix.length; j++) {
        sumForColumn += matrix[j][i];
      }
      columnEntropy += lnFunc(sumForColumn);
      total += sumForColumn;
    }
    columnEntropy -= lnFunc(total);

    // Compute entropy for rows and conditional entropy
    for (int i = 0; i < matrix.length; i++) {
      sumForRow = 0;
      for (int j = 0; j < matrix[0].length; j++) {
        sumForRow += matrix[i][j];
        entropyConditionedOnRows += lnFunc(matrix[i][j]);
      }
      rowEntropy += lnFunc(sumForRow);
    }
    entropyConditionedOnRows -= rowEntropy;
    rowEntropy -= lnFunc(total);
    infoGain = columnEntropy - entropyConditionedOnRows;
    if (Utils.eq(columnEntropy, 0) || Utils.eq(rowEntropy, 0))
      return 0;
    return 2.0 * (infoGain / (columnEntropy + rowEntropy));
  }
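  // Symmetrical uncertainty rescales the information gain of the row split so
  // that it is symmetric in rows and columns and lies in [0, 1]:
  //   SU = 2 * (H(columns) - H(columns | rows)) / (H(columns) + H(rows)).
  // As in gainRatio(), the method works with unnormalized lnFunc-based sums;
  // the common -N * ln 2 factor cancels in the quotient.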
  /**
   * Computes Goodman and Kruskal's tau-value for a contingency table.
   *
   * @param matrix the contingency table
   * @return Goodman and Kruskal's tau-value
   */
  public static double tauVal(double[][] matrix) {

    int nrows, ncols, row, col;
    double[] ctotal;
    double maxcol = 0, max, maxtotal = 0, n = 0;

    nrows = matrix.length;
    ncols = matrix[0].length;
    ctotal = new double[ncols];
    for (row = 0; row < nrows; row++) {
      max = 0;
      for (col = 0; col < ncols; col++) {
        if (Utils.gr(matrix[row][col], max))
          max = matrix[row][col];
        ctotal[col] += matrix[row][col];
        n += matrix[row][col];
      }
      maxtotal += max;
    }
    if (Utils.eq(n, 0)) {
      return 0;
    }
    maxcol = ctotal[Utils.maxIndex(ctotal)];
    return (maxtotal - maxcol) / (n - maxcol);
  }

  /**
   * Helper method for computing entropy.
   *
   * @param num the number
   * @return num * ln(num), or 0 if num is not positive
   */
  private static double lnFunc(double num) {

    if (num <= 0) {
      return 0;
    } else {
      return num * Math.log(num);
    }
  }

  /**
   * Computes chi-value for one cell in a contingency table.
   *
   * @param freq the observed frequency in the cell
   * @param expected the expected frequency in the cell
   * @param yates whether Yates' correction is to be applied
   * @return the chi-value for that cell; 0 if the expected value is
   * too close to zero
   */
  private static double chiCell(double freq, double expected, boolean yates) {

    // Cell in empty row and column?
    if (Utils.smOrEq(expected, 0)) {
      return 0;
    }

    // Compute difference between observed and expected value
    double diff = Math.abs(freq - expected);
    if (yates) {

      // Apply Yates' correction if wanted
      diff -= 0.5;

      // The difference should never be negative
      if (diff < 0) {
        diff = 0;
      }
    }

    // Return chi-value for the cell
    return (diff * diff / expected);
  }

  /**
   * Returns the revision string.
   *
   * @return the revision
   */
  public String getRevision() {
    return RevisionUtils.extract("$Revision: 8925 $");
  }
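  // Note on the helpers above: lnFunc(x) returns x * ln(x) and uses the usual
  // convention 0 * ln(0) = 0 (any non-positive argument yields 0), which is
  // what lets the entropy methods pass raw counts, including zeros, straight
  // through. Similarly, chiCell() returns 0 for cells whose expected count is
  // non-positive, so empty rows and columns contribute nothing to the
  // chi-squared statistic.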
  /**
   * Main method for testing this class.
   */
  public static void main(String[] ops) {

    double[] firstRow = {10, 5, 20};
    double[] secondRow = {2, 10, 6};
    double[] thirdRow = {5, 10, 10};
    double[][] matrix = new double[3][0];

    matrix[0] = firstRow;
    matrix[1] = secondRow;
    matrix[2] = thirdRow;
    for (int i = 0; i < matrix.length; i++) {
      for (int j = 0; j < matrix[i].length; j++) {
        System.out.print(matrix[i][j] + " ");
      }
      System.out.println();
    }
    System.out.println("Chi-squared probability: "
        + ContingencyTables.chiSquared(matrix, false));
    System.out.println("Chi-squared value: "
        + ContingencyTables.chiVal(matrix, false));
    System.out.println("Cochran's criterion fulfilled: "
        + ContingencyTables.cochransCriterion(matrix));
    System.out.println("Cramer's V: " + ContingencyTables.CramersV(matrix));
    System.out.println("Entropy of first row: "
        + ContingencyTables.entropy(firstRow));
    System.out.println("Entropy conditioned on columns: "
        + ContingencyTables.entropyConditionedOnColumns(matrix));
    System.out.println("Entropy conditioned on rows: "
        + ContingencyTables.entropyConditionedOnRows(matrix));
    System.out.println("Entropy conditioned on rows (with Laplace): "
        + ContingencyTables.entropyConditionedOnRows(matrix, matrix, 3));
    System.out.println("Entropy of rows: "
        + ContingencyTables.entropyOverRows(matrix));
    System.out.println("Entropy of columns: "
        + ContingencyTables.entropyOverColumns(matrix));
    System.out.println("Gain ratio: " + ContingencyTables.gainRatio(matrix));
    System.out.println("Negative log2 of multiple hypergeometric probability: "
        + ContingencyTables.log2MultipleHypergeometric(matrix));
    System.out.println("Symmetrical uncertainty: "
        + ContingencyTables.symmetricalUncertainty(matrix));
    System.out.println("Tau value: " + ContingencyTables.tauVal(matrix));

    double[][] newMatrix = new double[3][3];
    newMatrix[0][0] = 1; newMatrix[0][1] = 0; newMatrix[0][2] = 1;
    newMatrix[1][0] = 0; newMatrix[1][1] = 0; newMatrix[1][2] = 0;
    newMatrix[2][0] = 1; newMatrix[2][1] = 0; newMatrix[2][2] = 1;
    System.out.println("Matrix with empty row and column: ");
    for (int i = 0; i < newMatrix.length; i++) {
      for (int j = 0; j < newMatrix[i].length; j++) {
        System.out.print(newMatrix[i][j] + " ");
      }
      System.out.println();
    }
    System.out.println("Reduced matrix: ");
    newMatrix = ContingencyTables.reduceMatrix(newMatrix);
    for (int i = 0; i < newMatrix.length; i++) {
      for (int j = 0; j < newMatrix[i].length; j++) {
        System.out.print(newMatrix[i][j] + " ");
      }
      System.out.println();
    }
  }
}