package tests; /* Kruscal-Wallis non-parametric test for difference of k-means **For description used in this implementation, see Neave and Worthington Distribution-Free Tests** K-W test is equivalent to ANOVA f-test, except based on RANK rather than observed value. Not analgous to the way Spearman's rank correlation coefficient is calculated. K-W is a generalisation of the Mann-Whitney test for difference of two means Assumes sample independence. Assumes continuous distribution (no ties), although this is not essential (see below) H_0: There is no difference is population means the samples were taken from H_1: There are some differences Test stat calculated thus 1. Find the rank of each data in terms of all samples pooled together (For ties use the average position and adjustment described below if there are a lot of ties). 2. For each sample calculate the average ranks in the sample R_i and the overall mean R 3. Find the weighted sum of the squared deviations from the mean (weighted by each sample size) W. 4. Calculate test stat H=W*12/(N*(N+1)) where N is the total number of data, or more easily H = [12/N(N+1)]* sum{R_i/n_i} - 3(N+1) 5. Large value of H are unlikely if H_0 is true. Not sure of the ddistribution of H under H_0, probably ML test. Critical regions from tables If there are a large number of ties an adjustment should be made. Suppose r values have more than one occurence, then H should be divided by C below. Let t_i be the number of ties for a given data value, then C = 1 - sum{t^3 - t}/[N(N^2-1)] H*=H/C Implementation notes 1. Input must be csv format, c++ type comments are ignored Data must be in ROWS, i.e. 1 row = i sample 2. First implementation assumes equal sample sizes at each level 3. First line of data should contain the number of treatment levels 4. Second line should store 4. The file Test1.csv contains the example from page 245 of Neive The file Test2.csv contains the example from page 249 of Neive that has duplications 5. Output */ import fileIO.*; import java.util.Arrays; public class KruskalWallis { static InFile f; static int N; //Total data size static int k; //number of levels static int[] n; //number of data per level static String fileName = "LearningRate100Rules.csv"; //Hack,. read in from args // static String fileName = "Test2.txt"; //Hack,. read in from args static DataPoint[][] dataByLevel; static DataPoint[] rankedData; static boolean debug = false; public static void main(String args[]) { double H; double C; double H_prime; loadData(); System.out.println("FILE ="+fileName+"\n Treatment levels ="+k+"\t Total data = "+N+" per level ="+n[0]); //Sort Data Arrays.sort(rankedData); for(int i=0;i<N;i++) { rankedData[i].rank=(i+1); if(debug) System.out.print(rankedData[i].d+"\t"); } //Check for duplicates, counts number duplicated, and then recalculates ranks as the averages // System.out.print("\n\nPRIOR TO Duplicate\n"); // for(int i=0;i<N;i++) // System.out.print(rankedData[i].rank+"\t"); adjustRanksForDuplicates(); //Find rank sums double[] rankSums= new double[k]; for(int i=0;i<k;i++) { rankSums[i]=0; for(int j=0;j<n[i];j++) rankSums[i]+=dataByLevel[i][j].rank; } // Find H stat H=0; for(int i=0;i<k;i++) H+=rankSums[i]*rankSums[i]/n[i]; H=H*12/(N*(N+1)); H-=3*(N+1); System.out.println("\n\n H stat = "+H); //Find C stat int i=0; int nextPos=0; int t=0; int tSum=0,t3Sum=0; while(i<N) { if(rankedData[i].rank!=(i+1)) { t=(int)(2*rankedData[i].rank)-i; // System.out.println("\n t = "+t+"i = "+i); nextPos = t-1; t= (t-i-1); tSum+=(t); t3Sum+=t*t*t; i=nextPos; } else i++; } System.out.println("\n\n t sum = "+tSum+"\t t^3 sum = "+t3Sum); C=1- ((double)t3Sum-tSum)/(N*(N*N-1)); H_prime = H/C; System.out.println("\n\n C = "+C+"\t H* = "+H_prime); } public static void loadData() { f = new InFile(fileName); k = f.readInt(); n = new int[k]; N=0; dataByLevel = new DataPoint[k][]; for(int i=0;i<k;i++) { n[i]=f.readInt(); N+=n[i]; dataByLevel[i]=new DataPoint[n[i]]; } double d; int c=0; rankedData = new DataPoint[N]; for(int i=0;i<k;i++) { for(int j=0;j<n[i];j++) { d = f.readDouble(); dataByLevel[i][j]=new DataPoint(d,i,j); rankedData[c]=dataByLevel[i][j]; c++; } } if(debug) { for(int i=0;i<k;i++) { for(int j=0;j<n[i];j++) System.out.print(dataByLevel[i][j].d+"\t"); System.out.print("\n"); } } } public static void adjustRanksForDuplicates() { DataPoint first=rankedData[0]; int count=0; int pos=0; double s,e; for(int i=1;i<N;i++) { if(rankedData[i].d!=first.d) { if(count>0) { s=first.rank; e=i; // System.out.print("First Rank = "+s+"\nLast Rank ="+e+"\n count = "+count+"\n"); for(int j=0;j<=count;j++) rankedData[(int)(s-1)+j].rank=(e+s)/2; count=0; } first=rankedData[i]; } else count++; } if(count>0) { s=first.rank; e=N; // System.out.print("First Rank = "+s+"\nLast Rank ="+e+"\n count = "+count+"\n"); for(int j=0;j<=count;j++) rankedData[(int)(s-1)+j].rank=(e+s)/2; count=0; } } }