/*
* avenir: Predictive analytic based on Hadoop Map Reduce
* Author: Pranab Ghosh
*
* Licensed under the Apache License, Version 2.0 (the "License"); you
* may not use this file except in compliance with the License. You may
* obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
package org.avenir.util;
import org.chombo.util.TabularData;
/**
* Contingency matrix for correlation between categorical attributes
* @author pranab
*
*/
public class ContingencyMatrix extends TabularData {
    // Marginal totals, recomputed by getAggregates() at the start of every
    // statistic so they always reflect the current cell counts. Held as long
    // so that summing many int cells cannot overflow. Because the statistics
    // rewrite these fields, an instance must not be shared between threads
    // without external synchronization.
    private long[] rowSum;
    private long[] colSum;
    private long totalCount;

    /**
     * Delegates to the superclass no-argument constructor.
     */
    public ContingencyMatrix() {
        super();
    }

    /**
     * Delegates to the superclass constructor with the given dimensions.
     * @param numRow number of rows
     * @param numCol number of columns
     */
    public ContingencyMatrix(int numRow, int numCol) {
        super(numRow, numCol);
    }

    /**
     * Adds the cell counts of another matrix to this one, cell by cell.
     * The loops run over this matrix's dimensions, so {@code other} must be at
     * least as large in both dimensions; any extra cells of a larger matrix
     * are not read.
     * @param other matrix whose counts are added into this one
     */
    public void aggregate(ContingencyMatrix other) {
        for (int r = 0; r < numRow; ++r) {
            for (int c = 0; c < numCol; ++c) {
                table[r][c] += other.table[r][c];
            }
        }
    }

    /**
     * Sums all cells of the matrix.
     * @return total of all cell counts, accumulated as an int
     */
    public int getSum() {
        int sum = 0;
        for (int r = 0; r < numRow; ++r) {
            for (int c = 0; c < numCol; ++c) {
                sum += table[r][c];
            }
        }
        return sum;
    }

    /**
     * Recomputes row totals, column totals and the grand total in one pass.
     * Totals are stored unmodified (an empty row or column stays 0); callers
     * guard their own divisions.
     */
    private void getAggregates() {
        rowSum = new long[numRow];
        colSum = new long[numCol];
        totalCount = 0;
        for (int i = 0; i < numRow; ++i) {
            for (int j = 0; j < numCol; ++j) {
                int cell = table[i][j];
                rowSum[i] += cell;
                colSum[j] += cell;
                totalCount += cell;
            }
        }
    }

    /**
     * Computes the Cramer index of the table: Pearson's mean square
     * contingency (chi-square divided by the total count) divided by
     * {@code min(rows, columns) - 1}. This is the square of Cramer's V.
     * Cell counts are assumed to be non-negative.
     * @return the Cramer index, or 0.0 when it is undefined (no observations,
     *         or fewer than two rows or two columns)
     */
    public double cramerIndex() {
        getAggregates();
        int smallerDim = Math.min(numRow, numCol);
        if (totalCount <= 0 || smallerDim < 2) {
            // nothing to correlate; also avoids dividing by (smallerDim - 1) == 0
            return 0.0;
        }

        // chi-square / N  ==  sum( n_ij^2 / (rowSum_i * colSum_j) ) - 1
        double pearson = 0;
        for (int i = 0; i < numRow; ++i) {
            if (rowSum[i] == 0) {
                continue;    // empty row: every term is zero
            }
            for (int j = 0; j < numCol; ++j) {
                if (colSum[j] == 0) {
                    continue;    // empty column: term is zero, skip the 0/0
                }
                pearson += ((double) table[i][j] * table[i][j]) / ((double) rowSum[i] * colSum[j]);
            }
        }
        pearson -= 1.0;

        return pearson / (smallerDim - 1);
    }

    /**
     * Row totals as fractions of the grand total. Requires getAggregates() to
     * have been called and totalCount to be positive.
     */
    private double[] rowSumAsDouble() {
        double[] rowSumDouble = new double[numRow];
        for (int i = 0; i < numRow; ++i) {
            rowSumDouble[i] = (double) rowSum[i] / totalCount;
        }
        return rowSumDouble;
    }

    /**
     * Column totals as fractions of the grand total. Requires getAggregates()
     * to have been called and totalCount to be positive.
     */
    private double[] colSumAsDouble() {
        double[] colSumDouble = new double[numCol];
        for (int j = 0; j < numCol; ++j) {
            colSumDouble[j] = (double) colSum[j] / totalCount;
        }
        return colSumDouble;
    }

    /**
     * Computes the concentration coefficient (Goodman and Kruskal's tau) with
     * the column attribute as the dependent variable:
     * {@code (sum_i sum_j p_ij^2 / p_i. - sum_j p_.j^2) / (1 - sum_j p_.j^2)}.
     * Cell counts are assumed to be non-negative.
     * @return the coefficient, or 0.0 when it is undefined (no observations,
     *         or all observations fall in a single column)
     */
    public double concentrationCoeff() {
        getAggregates();
        if (totalCount <= 0) {
            return 0.0;
        }
        double[] rowSumDouble = rowSumAsDouble();
        double[] colSumDouble = colSumAsDouble();

        double sumOne = 0;
        for (int i = 0; i < numRow; ++i) {
            if (rowSum[i] == 0) {
                continue;    // empty row contributes nothing; skip the 0/0
            }
            double elSqSum = 0;
            for (int j = 0; j < numCol; ++j) {
                double elem = (double) table[i][j] / totalCount;
                elSqSum += elem * elem;
            }
            sumOne += elSqSum / rowSumDouble[i];
        }

        double sumTwo = 0;
        for (int j = 0; j < numCol; ++j) {
            sumTwo += colSumDouble[j] * colSumDouble[j];
        }

        double denom = 1.0 - sumTwo;
        if (denom <= 0) {
            // the column attribute has no variation to explain
            return 0.0;
        }
        return (sumOne - sumTwo) / denom;
    }

    /**
     * Computes the uncertainty coefficient (Theil's U) with the column
     * attribute as the dependent variable: the mutual information between
     * rows and columns divided by the entropy of the column distribution,
     * {@code -sum_i sum_j p_ij log(p_ij / (p_i. p_.j)) / sum_j p_.j log(p_.j)}.
     * The logarithm base cancels in the ratio. Cell counts are assumed to be
     * non-negative.
     * @return the coefficient (0 for independent attributes, 1 when the row
     *         determines the column), or 0.0 when it is undefined (no
     *         observations, or all observations fall in a single column)
     */
    public double uncertaintyCoeff() {
        getAggregates();
        if (totalCount <= 0) {
            return 0.0;
        }
        double[] rowSumDouble = rowSumAsDouble();
        double[] colSumDouble = colSumAsDouble();

        double sumOne = 0;
        for (int i = 0; i < numRow; ++i) {
            for (int j = 0; j < numCol; ++j) {
                if (table[i][j] == 0) {
                    // 0 * log(0) is taken as 0; evaluating it would yield NaN
                    continue;
                }
                double elem = (double) table[i][j] / totalCount;
                sumOne += elem * Math.log10(elem / (rowSumDouble[i] * colSumDouble[j]));
            }
        }

        double sumTwo = 0;
        for (int j = 0; j < numCol; ++j) {
            if (colSum[j] == 0) {
                continue;    // empty column: 0 * log(0) taken as 0
            }
            sumTwo += colSumDouble[j] * Math.log10(colSumDouble[j]);
        }

        if (sumTwo == 0.0) {
            // column distribution has zero entropy; nothing to explain
            return 0.0;
        }
        // sumTwo is the negated column entropy, hence the sign flip
        return -sumOne / sumTwo;
    }
}