package tests;
import distributions.FisherDistribution;
import distributions.NormalDistribution;
import fileIO.OutFile;
import java.io.FileReader;
import java.util.Arrays;
import java.util.IdentityHashMap;
import java.util.Map;
import transformations.ArrayPair;
import transformations.LinearModel;
import transformations.MatrixSort;
import weka.classifiers.functions.LinearRegression;
import weka.core.Instance;
import weka.core.Instances;
/**
*
* Statistical tests on regression residuals. Note: it is not known where the classes MatrixSort and LinearModel are located!
*/
public class ResidualTests {

    /**
     * Goldfeld-Quandt style test statistic for heteroscedasticity.
     *
     * <p>The cases are ordered with {@link MatrixSort} (constructed with {@code pos};
     * presumably this sorts on attribute {@code pos} - confirm against MatrixSort),
     * the middle third is discarded, a {@link LinearModel} is fitted separately to the
     * first and last groups, and the ratio of their mean squared errors is returned.
     * The CDF and density of the statistic are printed as diagnostics only.
     *
     * @param X regressors, indexed {@code X[attribute][case]}
     * @param Y response, one value per case
     * @param pos passed to {@code MatrixSort} to select the ordering attribute
     * @return q = (SSE1/(n1-p-1)) / (SSE3/(n3-p-1)), where p = X.length-1
     * @throws IllegalArgumentException if either outer group has no residual degrees of freedom
     */
    public static double goldfeldQuandt(double[][] X, double[] Y, int pos) {
        int numAttributes = X.length;
        int n = Y.length;

        // Copy the data case by case so it can be sorted. The response of each case is
        // tracked by object identity so that it can be recovered after sorting without
        // relying on how MatrixSort stores it.
        MatrixSort[] ms = new MatrixSort[n];
        Map<MatrixSort, Double> responseOf = new IdentityHashMap<MatrixSort, Double>();
        for (int i = 0; i < n; i++) {
            double[] x = new double[numAttributes];
            for (int j = 0; j < numAttributes; j++) {
                x[j] = X[j][i];
            }
            ms[i] = new MatrixSort(x, Y[i], pos);
            responseOf.put(ms[i], Double.valueOf(Y[i]));
        }
        Arrays.sort(ms);

        // Split into three sets of size n1, n2 = n/3 (the discarded middle) and n3.
        int p = numAttributes - 1;
        int n2 = n / 3;
        int n1 = (n - n2) / 2;
        int n3 = n - n1 - n2;
        System.out.println("n1 = "+n1+" n2 = "+n2+" n3 = "+n3);

        int df1 = n1 - p - 1;
        int df3 = n3 - p - 1;
        if (df1 <= 0 || df3 <= 0) {
            throw new IllegalArgumentException(
                    "Too few cases for the Goldfeld-Quandt test: n1 = " + n1 + ", n3 = " + n3
                    + ", p = " + p);
        }

        // Fit a regression to the first and to the last set and record each SSE.
        double s1 = subsetSSE(ms, responseOf, 0, n1, numAttributes);
        double s3 = subsetSSE(ms, responseOf, n1 + n2, n, numAttributes);

        // q = (SSE1/(n1-p-1)) / (SSE3/(n3-p-1))
        double q = (df3 * s1) / (df1 * s3);
        System.out.println(" s1 = "+s1+" s3 = "+s3+" q = "+q+" s3/s1"+s3/s1);

        // Under the null of homoscedasticity q follows F(n1-p-1, n3-p-1).
        // NOTE(review): assumes FisherDistribution takes (numerator df, denominator df) - confirm.
        FisherDistribution f = new FisherDistribution(df1, df3);
        double prob = f.getCDF(q);
        double prob2 = f.getDensity(q);
        System.out.println(" prob = "+prob+" density = "+prob2);
        return q;
    }

    /** Fits a LinearModel to sorted cases [from, to) and returns the SSE it reports. */
    private static double subsetSSE(MatrixSort[] sorted, Map<MatrixSort, Double> responseOf,
            int from, int to, int numAttributes) {
        int size = to - from;
        double[][] subX = new double[numAttributes][size];
        double[] subY = new double[size];
        for (int i = from; i < to; i++) {
            int k = i - from;
            // Take the response belonging to the same (sorted) case as the regressors.
            subY[k] = responseOf.get(sorted[i]).doubleValue();
            for (int j = 0; j < numAttributes; j++) {
                subX[j][k] = sorted[i].x[j];
            }
        }
        LinearModel lm = new LinearModel(subX, subY);
        lm.fitModel();
        lm.findStats();
        return lm.getSSE();
    }

    /**
     * Runs test on the signs of the residuals, using the normal approximation.
     *
     * <p>Residuals are paired with predictions in {@link ArrayPair} objects and sorted
     * (presumably by predicted value - confirm against ArrayPair's ordering). A residual
     * counts as positive only if it is strictly greater than zero.
     *
     * @param pred predicted values
     * @param residuals residuals; must hold at least {@code pred.length} values
     * @return (runs - mean) / sqrt(variance), or NaN when the statistic is undefined
     *         (no data, or all residuals on one side of zero)
     */
    public static double runsTest(double[] pred, double[] residuals) {
        int count = pred.length;
        if (count == 0) {
            return Double.NaN;
        }
        ArrayPair[] ap = new ArrayPair[count];
        for (int i = 0; i < count; i++) {
            ap[i] = new ArrayPair();
            ap[i].predicted = pred[i];
            ap[i].residual = residuals[i];
        }
        Arrays.sort(ap);

        // Count runs of equal sign and the number of positive residuals (p).
        double runsCount = 1;
        boolean positive = ap[0].residual > 0;
        double p = positive ? 1 : 0;
        for (int i = 1; i < count; i++) {
            boolean currentPositive = ap[i].residual > 0;
            if (currentPositive) {
                p++;
            }
            if (currentPositive != positive) {
                runsCount++;
            }
            positive = currentPositive;
        }

        double n = count;
        double twoPQ = 2.0 * p * (n - p);
        // Expected number of runs: 2*p*(n-p)/n + 1.
        double m = twoPQ / n + 1.0;
        // Variance of the number of runs: 2pq(2pq - n) / (n^2 (n-1)).
        double v = (twoPQ * (twoPQ - n)) / (n * n * (n - 1));
        System.out.println("m = "+m+" v = "+v);
        if (!(v > 0)) {
            // Zero or undefined variance: every residual has the same sign, or n < 2.
            return Double.NaN;
        }
        return (runsCount - m) / Math.sqrt(v);
    }

    /**
     * Kolmogorov-Smirnov statistic of the residuals, calling
     * {@link #kolmogorovSmirnoff(double[], double)} with {@code var = 1}.
     */
    public static double kolmogorovSmirnoff(double[] residuals) {
        return kolmogorovSmirnoff(residuals, 1);
    }

    /**
     * Kolmogorov-Smirnov normality statistic for the residuals: the largest gap between
     * the empirical distribution function of the residuals and the CDF of
     * {@code new NormalDistribution(0, var)}.
     *
     * <p>NOTE(review): whether NormalDistribution's second argument is a variance or a
     * standard deviation is not visible here - confirm.
     *
     * @param residuals residuals to test; the array is not modified
     * @param var second constructor argument of the reference NormalDistribution
     * @return the KS statistic D = max(D+, D-); 0 for empty input
     */
    public static double kolmogorovSmirnoff(double[] residuals, double var) {
        int n = residuals.length;
        double[] sorted = new double[n];
        System.arraycopy(residuals, 0, sorted, 0, n);
        Arrays.sort(sorted);

        NormalDistribution norm = new NormalDistribution(0, var);
        double max = 0;
        for (int i = 0; i < n; i++) {
            double theoretical = norm.getCDF(sorted[i]);
            // The empirical CDF jumps from i/n to (i+1)/n at sorted[i]; check both sides.
            double dPlus = (i + 1) / (double) n - theoretical;
            double dMinus = theoretical - i / (double) n;
            max = Math.max(max, Math.max(dPlus, dMinus));
        }
        return max;
    }

    /** Alias for {@link #testHeteroscadisity(double[], double[])}. */
    public static double anscombeProcedure(double[] actual, double[] predicted) {
        return testHeteroscadisity(actual, predicted);
    }

    /**
     * Pearson correlation between the absolute residuals |actual - predicted| and the
     * predicted values. The correlation is also printed to standard output.
     *
     * <p>The value is not adjusted for the number of regressors and is not converted
     * to a t statistic.
     *
     * @param actual observed values; must hold at least {@code predicted.length} values
     * @param predicted fitted values
     * @return the correlation; NaN if either series has zero variance or the input is empty
     */
    public static double testHeteroscadisity(double[] actual, double[] predicted) {
        int n = predicted.length;
        double[] absRes = new double[n];
        double meanPred = 0;
        double meanAbs = 0;
        for (int i = 0; i < n; i++) {
            absRes[i] = Math.abs(actual[i] - predicted[i]);
            meanAbs += absRes[i];
            meanPred += predicted[i];
        }
        meanAbs /= n;
        meanPred /= n;

        // Cross product and the two sums of squares about the means.
        double corr = 0;
        double x = 0;
        double y = 0;
        for (int i = 0; i < n; i++) {
            double devAbs = absRes[i] - meanAbs;
            double devPred = predicted[i] - meanPred;
            corr += devAbs * devPred;
            x += devAbs * devAbs;
            y += devPred * devPred;
        }
        corr = corr / Math.sqrt(x * y);
        System.out.println(" Correlation = "+corr);
        return corr;
    }

    /**
     * Small demo: draws 100 values from {@code new NormalDistribution(0, 3)} and prints
     * their KS statistic against the reference distribution built with {@code var = 1}.
     */
    public static void main(String[] args) {
        int s = 100;
        double[] data = new double[s];
        NormalDistribution n = new NormalDistribution(0, 3);
        for (int i = 0; i < s; i++) {
            data[i] = n.simulate();
        }
        double x = kolmogorovSmirnoff(data, 1);
        System.out.println(" KS stat = "+x);
    }

    /**
     * Fits a Weka LinearRegression to a hard-coded ARFF file, prints the result of
     * {@link #testHeteroscadisity(double[], double[])} on the training predictions and
     * writes the actual/predicted pairs to a hard-coded CSV file. Any exception is
     * reported on standard output and otherwise ignored.
     */
    public static void testHetero() {
        LinearRegression lg = new LinearRegression();
        try {
            Instances data;
            FileReader r = new FileReader("C:/Research/Data/Gavin Competition/Weka Files/Temp Train.arff");
            try {
                data = new Instances(r);
            } finally {
                // The data set is fully loaded by the constructor; release the file handle.
                r.close();
            }
            data.setClassIndex(data.numAttributes() - 1);
            lg.buildClassifier(data);

            double[] predictions = new double[data.numInstances()];
            double[] actual = data.attributeToDoubleArray(data.numAttributes() - 1);
            for (int i = 0; i < predictions.length; i++) {
                Instance inst = data.instance(i);
                predictions[i] = lg.classifyInstance(inst);
            }

            // NOTE(review): OutFile is never closed here; its API is not visible in this
            // file - confirm whether it needs an explicit close/flush.
            OutFile of = new OutFile("C:/Research/Data/Gavin Competition/Weka Files/CorrelationTest.csv");
            // NOTE(review): the value printed below is a correlation, not a t statistic.
            System.out.println(" t stat for homogeneity ="+testHeteroscadisity(actual,predictions));
            for (int i = 0; i < predictions.length; i++) {
                of.writeLine(actual[i]+","+predictions[i]);
            }
        } catch (Exception e) {
            System.out.println(" Error in REsidual Test "+e);
        }
    }
}