package edu.cmu.minorthird.classify.transform;
/**
 * A 2-by-2 matrix indicating the association between 2 variables.
 * Useful in feature selection or feature association experiments.
 *
 * <p>Contains scores for Chi-Squared, Pointwise Mutual Information and
 * Compensated Pointwise Mutual Information.
 *
 * <p>Given 2 variables X and Y, the matrix is
 * <pre>
 *   [a b]   a = X and Y;      b = X and Not Y
 *   [c d]   c = Not X and Y;  d = Not X and Not Y
 * </pre>
 *
 * @author Vitor Carvalho (March 2005)
 */
public class ContingencyTable {

    private final long a;
    private final long b;
    private final long c;
    private final long d;

    /**
     * Builds a table from the four cell counts.
     *
     * @param a count of (X and Y)
     * @param b count of (X and Not Y)
     * @param c count of (Not X and Y)
     * @param d count of (Not X and Not Y)
     */
    public ContingencyTable(long a, long b, long c, long d) {
        this.a = a;
        this.b = b;
        this.c = c;
        this.d = d;
    }

    /**
     * Computes the Chi-Squared statistic
     * {@code n * (a*d - b*c)^2 / ((a+b) * (a+c) * (c+d) * (b+d))}.
     *
     * <p>The final combination is done in log scale so that the squared
     * numerator and the four-way product in the denominator cannot overflow.
     *
     * @return the Chi-Squared score; 0.0 when any row or column total is zero
     *         (the statistic is undefined there and the table carries no
     *         evidence of association) or when the variables are exactly
     *         independent
     */
    public double getChiSquared() {
        long rowX = a + b;
        long colY = a + c;
        long rowNotX = c + d;
        long colNotY = b + d;

        // A zero marginal would yield log(0) terms that cancel into NaN.
        if (rowX == 0 || colY == 0 || rowNotX == 0 || colNotY == 0) {
            return 0.0;
        }

        double crossDiff = absCrossProductDifference();
        if (crossDiff == 0.0) {
            return 0.0;
        }

        double logN = Math.log(total());
        double logNum = 2 * Math.log(crossDiff);
        double logDen = Math.log(rowX) + Math.log(colY) + Math.log(rowNotX) + Math.log(colNotY);
        return Math.exp(logN + logNum - logDen);
    }

    /**
     * Returns {@code |a*d - b*c|} as a double. Uses plain long arithmetic when
     * it is exact and falls back to arbitrary precision when a product would
     * overflow a long.
     */
    private double absCrossProductDifference() {
        try {
            long diff = Math.subtractExact(Math.multiplyExact(a, d), Math.multiplyExact(b, c));
            return Math.abs((double) diff);
        } catch (ArithmeticException overflow) {
            java.math.BigInteger ad = java.math.BigInteger.valueOf(a).multiply(java.math.BigInteger.valueOf(d));
            java.math.BigInteger bc = java.math.BigInteger.valueOf(b).multiply(java.math.BigInteger.valueOf(c));
            return ad.subtract(bc).abs().doubleValue();
        }
    }

    /**
     * Computes the Pointwise Mutual Information between X and Y,
     * {@code log2(a * n / ((a+b) * (a+c)))}, evaluated in log scale.
     *
     * @return the PMI in bits, or 0.0 when {@code a} is zero
     */
    public double getPMutualInfo() {
        if (a == 0) {
            return 0.0;
        }
        double logN = Math.log(total());
        double logMarginals = Math.log(a + b) + Math.log(a + c);
        double natural = Math.log(a) + logN - logMarginals;
        // Convert from natural log to base 2.
        return natural / Math.log(2.0);
    }

    /**
     * Computes the Compensated Pointwise Mutual Information, i.e.
     * {@code count * getPMutualInfo()}. The multiplication is meant to
     * compensate for the low-frequency bias of plain PMI.
     *
     * @param count the co-occurrence count to scale the PMI by
     * @return the PMI multiplied by {@code count}
     */
    public double getCompensatedPMutualInfo(int count) {
        return getPMutualInfo() * count;
    }

    @Override
    public String toString() {
        return "CTable: [ " + a + " , " + b + " , " + c + " , " + d + " ]";
    }

    /**
     * @return the sum of the four cells
     */
    public long total() {
        return a + b + c + d;
    }

    /**
     * Command-line demo: prints the scores for the table given by four
     * integer arguments. Prints a usage message and exits when fewer than
     * four arguments are supplied.
     *
     * @param args {@code a_value b_value c_value d_value}
     */
    public static void main(String[] args) {
        if (args == null || args.length < 4) {
            System.out.println("Usage: java ContingencyTable a_value b_value c_value d_value");
            return;
        }
        ContingencyTable ct = new ContingencyTable(
                Long.parseLong(args[0]),
                Long.parseLong(args[1]),
                Long.parseLong(args[2]),
                Long.parseLong(args[3]));
        System.out.println("Score chi = " + ct.getChiSquared());
        System.out.println("Score PMI = " + ct.getPMutualInfo());
        System.out.println("Score PMI comp = " + ct.getCompensatedPMutualInfo(3));
        System.out.println(ct.toString());
    }
}