package transformations; import fileIO.*; import tests.ResidualTests; public class VarianceStabalisingStepwiseRegression { static int m,n; static double[][] X; static double[][] transformedX; static double[][] workingX; static double[] Y; static double[] transformedY; static double[] powers; static boolean[] included; static boolean[] transformIncluded; static int[] positions; static int size=0; static int[] transformedPositions; static int transSize=0; static final double CRITICAL=7; //0.823; public static void main(String[] args) { //Each one should load X and Y scaling if necessary OutFile f = new OutFile("C:/Research/Data/Gavin Competition/Results/StepwiseTransformationPowerResults.csv"); f.writeLine("Synthetic,MSE,AP,KS,RT,RQ"); int choice=2; Synthetic(); if(choice==0) fullModel(f); else if (choice==1) stepwiseLinear(f); else forwardSelectTransform(f); /* System.out.println(" Starting Temp Full ..."); f.writeLine("Temp Full"); Temperature(); if(choice==0) fullModel(f); else if (choice==1) stepwiseLinear(f); else forwardSelectTransform(f); */ System.out.println(" Starting Temp Reduced ..."); f.writeLine("Temp Reduced"); TemperatureReduced(); if(choice==0) fullModel(f); else if (choice==1) stepwiseLinear(f); else forwardSelectTransform(f); System.out.println(" Starting SO2 ..."); f.writeLine("SO2"); SO2(); if(choice==0) fullModel(f); else if (choice==1) stepwiseLinear(f); else forwardSelectTransform(f); System.out.println(" Starting SO2 Reduced..."); f.writeLine("SO2 Reduced"); SO2Reduced(); if(choice==0) fullModel(f); else if (choice==1) stepwiseLinear(f); else forwardSelectTransform(f); /* System.out.println(" Starting Precip..."); f.writeLine("Precip"); Precip(); if(choice==0) fullModel(f); else if (choice==1) stepwiseLinear(f); else forwardSelectTransform(f); System.out.println(" Starting Precip Reduced..."); */ f.writeLine("Precip Reduced"); PrecipReduced(); if(choice==0) fullModel(f); else if (choice==1) stepwiseLinear(f); else forwardSelectTransform(f); } public static void Synthetic() { m=1; n=256; int n2=128; int synthScale=5; String path="C:/Research/Data/Gavin Competition/Synthetic/"; String p1="Synthetic Train.csv"; String p2="Synthetic Validate.csv"; boolean finished=false; int attCount=0, c; double var,minVar=Double.MAX_VALUE,newVar, oldSSR, newSSR; X=new double[m+1][n]; Y=new double[n]; transformedY=new double[n]; InFile f = new InFile(path+p1); for(int i=0;i<n;i++) X[0][i]=1; for(int i=0;i<n;i++) { for(int j=1;j<=m;j++) X[j][i]=f.readDouble()+synthScale; Y[i]=f.readDouble(); } //1: Fit linear model, estimate S^2, } public static void TemperatureReduced() { m=20; n=7117; double tempScale=10; String path="C:/Research/Data/Gavin Competition/Temperature/TempTransformed Train.csv"; // String path="C:/Research/Data/Gavin Competition/Temperature/TempTraining.csv"; //Attributes to remove int[] collinear= {1,3,4,5,6,7,20,34,35,36,47,48,72,82}; boolean finished=false; int attCount=0, c; double var,minVar=Double.MAX_VALUE,newVar, oldSSR, newSSR; X=new double[m+1][n]; Y=new double[n]; transformedY=new double[n]; InFile f = new InFile(path); for(int i=0;i<n;i++) X[0][i]=1; for(int i=0;i<n;i++) { for(int j=1;j<=m;j++) X[j][i]=f.readDouble()+tempScale; Y[i]=f.readDouble(); } int c1=0,c2=0; } public static void Temperature() { m=106; n=7117; double tempScale=10; String path="C:/Research/Data/Gavin Competition/Temperature/TempTraining.csv"; //Attributes to remove int[] collinear= {1,3,4,5,6,7,20,34,35,36,47,48,72,82}; boolean finished=false; int attCount=0, c; double var,minVar=Double.MAX_VALUE,newVar, oldSSR, newSSR; X=new double[m+1][n]; Y=new double[n]; transformedY=new double[n]; InFile f = new InFile(path); for(int i=0;i<n;i++) X[0][i]=1; for(int i=0;i<n;i++) { for(int j=1;j<=m;j++) X[j][i]=f.readDouble()+tempScale; Y[i]=f.readDouble(); } int c1=0,c2=0; double[][] reducedX=new double[m+1-collinear.length][]; for(int i=0;i<=m;i++) { if(c1>=collinear.length || i!=collinear[c1]) { reducedX[c2]=X[i]; c2++; } else c1++; } X=reducedX; m=reducedX.length-1; } public static void SO2() { m=26; n=15304; int so2Scale=10; String path="C:/Research/Data/Gavin Competition/SO2/"; String p1="SO2Train.csv"; boolean finished=false; int attCount=0, c; double var,minVar=Double.MAX_VALUE,newVar, oldSSR, newSSR; X=new double[m+1][n]; Y=new double[n]; transformedY=new double[n]; InFile f = new InFile(path+p1); for(int i=0;i<n;i++) X[0][i]=1; for(int i=0;i<n;i++) { for(int j=1;j<=m;j++) X[j][i]=f.readDouble()+so2Scale; Y[i]=f.readDouble(); } } public static void SO2Reduced() { m=19; n=15304; int so2Scale=10; String path="C:/Research/Data/Gavin Competition/SO2/"; String p1="SO2TrainReduced.csv"; boolean finished=false; int attCount=0, c; double var,minVar=Double.MAX_VALUE,newVar, oldSSR, newSSR; X=new double[m+1][n]; Y=new double[n]; transformedY=new double[n]; InFile f = new InFile(path+p1); for(int i=0;i<n;i++) X[0][i]=1; for(int i=0;i<n;i++) { for(int j=1;j<=m;j++) X[j][i]=f.readDouble()+so2Scale; Y[i]=f.readDouble(); } } public static void Precip() { m=106; n=7031; int precipScale=6; String path="C:/Research/Data/Gavin Competition/Precipitation/"; String p1="PrecipitationTrain.csv"; boolean finished=false; int attCount=0, c; double var,minVar=Double.MAX_VALUE,newVar, oldSSR, newSSR; X=new double[m+1][n]; Y=new double[n]; transformedY=new double[n]; InFile f = new InFile(path+p1); for(int i=0;i<n;i++) X[0][i]=1; for(int i=0;i<n;i++) { for(int j=1;j<=m;j++) X[j][i]=f.readDouble()+precipScale; Y[i]=f.readDouble(); } } public static void PrecipReduced() { m=20; n=7031; int precipScale=6; String path="C:/Research/Data/Gavin Competition/Precipitation/"; String p1="PrecipTrainReduced.csv"; boolean finished=false; int attCount=0, c; double var,minVar=Double.MAX_VALUE,newVar, oldSSR, newSSR; X=new double[m+1][n]; Y=new double[n]; transformedY=new double[n]; InFile f = new InFile(path+p1); for(int i=0;i<n;i++) X[0][i]=1; for(int i=0;i<n;i++) { for(int j=1;j<=m;j++) X[j][i]=f.readDouble()+precipScale; Y[i]=f.readDouble(); } } static public void findStats(OutFile f2, LinearModel lm) { double s = lm.findStats(); double[] resids=lm.getResiduals(); double[] pred=lm.getPredicted(); double ap=ResidualTests.anscombeProcedure(pred,resids); double ks=ResidualTests.kolmogorovSmirnoff(resids); double rt=ResidualTests.runsTest(pred,resids); double gq=ResidualTests.goldfeldQuandt(X,Y,1); System.out.println("YJ, s^2 = "+s+", AP = "+ap+", KS = "+ks+", RT = "+rt+", GQ = "+gq); f2.writeLine("FullReg,"+s+","+ap+","+ks+","+rt+","+gq); } public static void fullModel(OutFile f2) { LinearModel lm = new LinearModel(X,Y); lm.fitModel(); findStats(f2,lm); //2. Fit YJ double best = YeoJohnson.findBestTransform(X,Y); System.out.println(" Best Transform = "+best); double[] newY = YeoJohnson.transform(best,Y); lm = new LinearModel(X,newY); lm.fitModel(); f2.writeLine("YJ Transform"); findStats(f2,lm); } public static void stepwiseLinear(OutFile f2) { boolean finished=false; included=new boolean[m+1]; int attCount=0; double var,oldSSR,newSSR,newVar; //This is going to record whether the base values are included, not the transformed included[0]=true; //Always include constant for(int i=1;i<=m;i++) included[i]=false; positions=new int[m+1]; while(!finished) { attCount=formatRegressors(); //Fit linear model with current candidates LinearModel lm = new LinearModel(workingX,Y); lm.fitModel(); var=lm.findStats(); oldSSR=lm.getSSR(); // System.out.println(" Att Count = "+attCount+" SSE plain Y = "+var); //Try adding in each variable in position attCount, record best improvement //Fitting one more model the necessary, but makes code clearer //At the end, workingX should have the new candidate in position attCount //Returns the POSITION IN X // int bestPos=findBestAdditionTransformed(attCount); int bestPos=findBestAddition(attCount); attCount++; //If improvement significant, add in permanently by setting flag, otherwise dont lm = new LinearModel(workingX,Y); lm.fitModel(); newVar=lm.findStats(); newSSR=lm.getSSR(); System.out.println(" Verification: New Var = "+newVar); // System.out.println("SSR Change = "+(newSSR-oldSSR)); // System.out.println("Test stat = "+(newSSR-oldSSR)/var); if((newSSR-oldSSR)/var>CRITICAL) { System.out.println("ADDING = "+bestPos); included[bestPos]=true; positions[attCount-1]=bestPos; size=attCount; } else { System.out.println("NOT ADDING = "+bestPos); included[bestPos]=false; finished=true; } if(attCount==m+1) finished=true; // Try removing any already in the model, if no significant worsening, remove int worst; if(attCount>3) { worst=tryRemovals(X[bestPos],newVar, newSSR); if(worst!=-1) { System.out.println(" Removing Element > "+worst); included[worst]=false; int x=0; while(x<attCount) { if(positions[x]==worst) { while(x<attCount-1) { positions[x]=positions[x+1]; x++; } } x++; } attCount--; attCount=formatRegressors(); } } } //Get full daignositics on final model attCount=formatRegressors(); //Fit linear model with current candidates LinearModel lm = new LinearModel(workingX,Y); lm.fitModel(); findStats(f2,lm); } static int count=0; public static void forwardSelectTransform(OutFile f2) { boolean finished=false, useYJ=false; double bestLambda=1,temp; included=new boolean[m+1]; int attCount=0; LinearModel lm; double var,oldSSR,newSSR,newVar; //This is going to record whether the base values are included, not the transformed included[0]=true; //Always include constant for(int i=1;i<=m;i++) included[i]=false; positions=new int[m+1]; powers=new double[m+1]; while(!finished) { attCount=formatRegressors(); //Fit linear model with current candidates lm = new LinearModel(workingX,Y); lm.fitModel(); var=lm.findStats(); oldSSR=lm.getSSR(); // System.out.println(" Att Count = "+attCount+" SSE plain Y = "+var); //Try adding in each variable in position attCount, record best improvement //Fitting one more model the necessary, but makes code clearer //At the end, workingX should have the new candidate in position attCount //Returns the POSITION IN X int bestPos=findBestAdditionTransformed(attCount); // int bestPos=findBestAddition(attCount); attCount++; //If improvement significant, add in permanently by setting flag, otherwise dont lm = new LinearModel(workingX,Y); lm.fitModel(); newVar=lm.findStats(); newSSR=lm.getSSR(); System.out.println(" Verification: New Var = "+newVar); // System.out.println("SSR Change = "+(newSSR-oldSSR)); // System.out.println("Test stat = "+(newSSR-oldSSR)/var); if((newSSR-oldSSR)/var>CRITICAL) { System.out.println("ADDING = "+bestPos); included[bestPos]=true; positions[attCount-1]=bestPos; size=attCount; } else { System.out.println("NOT ADDING = "+bestPos); included[bestPos]=false; finished=true; } if(attCount==m+1) finished=true; attCount=formatRegressors(); //Yeo Johnson first System.out.println(" TRY YJ: "); bestLambda=YeoJohnson.findBestTransform(workingX,Y); // Round to the nearest 0.5 temp=((double)Math.round(bestLambda*2))/2; double alpha=1; System.out.println("Best Lambda value ="+bestLambda+" Rounded = "+temp); int p=0; useYJ=false; if(temp!=1) { transformedY=YeoJohnson.transform(temp,Y); lm=new LinearModel(workingX,transformedY); lm.fitModel(); double s=lm.findInverseStats(temp,Y); useYJ=true; System.out.println("s = "+s); } } //Get full daignositics on final model attCount=formatRegressors(); //Fit linear model with current candidates if(useYJ) { temp=((double)Math.round(bestLambda*2))/2; transformedY=YeoJohnson.transform(temp,Y); lm = new LinearModel(workingX,transformedY); } else lm = new LinearModel(workingX,Y); lm.fitModel(); findStats(f2,lm); OutFile f3 = new OutFile("TestTrans"+count+".csv"); count++; for(int i=0;i<powers.length;i++) f3.writeString(powers[i]+","); for(int j=0;j<X[0].length;j++) { for(int i=0;i<X.length;i++) f3.writeString(X[i][j]+","); f3.writeString("\n"); } } public static void transformCode() { /* //Simple parameter search on variable just entered attCount=formatRegressors(); int p=0; double alpha; while(p<size && positions[p]!=bestPos) p++; System.out.println("p = "+p+" Pos = "+positions[p]); if(p<size) { alpha=PowerSearch.transformRegressor(workingX,Y,p); System.out.println("Alpha = "+alpha); alpha =((double) Math.round(alpha*2))/2.0; System.out.println("Rounded Alpha = "+alpha); if(alpha==0) { for(int i=0;i<X[bestPos].length;i++) X[bestPos][i]=Math.log(X[bestPos][i]); } else if (alpha!=1) { for(int i=0;i<X[bestPos].length;i++) X[bestPos][i]=Math.pow(X[bestPos][i],alpha); } } // // First effort, try YJ and B-T on newly entered // Try Transformations, not going to bother Y with TEMP attCount=formatRegressors(); //Yeo Johnson first System.out.println(" Nos atts = "+attCount); double bestLambda=YeoJohnson.findBestTransform(workingX,Y); //Round to the nearest 0.5 double temp=((double)Math.round(bestLambda*2))/2; double alpha=1; System.out.println("Best Lambda value ="+bestLambda+" Rounded = "+temp); int p=0; boolean yo=false; if(temp!=1) { transformedY=YeoJohnson.transform(temp,Y); lm=new LinearModel(workingX,transformedY); lm.fitModel(); double s=lm.findInverseStats(temp,Y); yo=true; System.out.println("s = "+s); } while(p<size && positions[p]!=bestPos) p++; System.out.println("p = "+p+" Pos = "+positions[p]); if(p<size) { if(yo) alpha=BoxTidwell.transformRegressor(workingX,transformedY,p); else alpha=BoxTidwell.transformRegressor(workingX,Y,p); } System.out.println("************* ALPHA = "+alpha); */ } //Calculate SSR of removing each one except a, then only remove if not significantly worse public static int tryRemovals(double[] inData, double var, double fullSSR) { int worst=0,outPos; double worstSSR=Double.MAX_VALUE,ssr,s; //Swap new one into position 0, always include LinearModel lm; double[][] temp = new double[size-1][]; temp[0]=workingX[0]; temp[1]=inData; double[] out=workingX[1]; double[] t; int[] tempPos=new int[size-1]; int tempOutPos,a; tempPos[0]=0; tempOutPos=positions[1]; tempPos[1]=positions[size-1]; // for(int i=0;i<size;i++) // System.out.println(" Position = "+positions[i]); System.out.println(" size = "+size); for(int i=2;i<size-1;i++) { temp[i]=workingX[i]; tempPos[i]=positions[i]; } // System.out.println(" Removing element "+tempOutPos); // for(int i=0;i<size-1;i++) // System.out.println(" Temp position = "+tempPos[i]); int i=2; do{ //Fit reduced model lm=new LinearModel(temp,Y); lm.fitModel(); //Find new SSR, record the largest of reduced s=lm.findStats(); ssr=lm.getSSR(); // System.out.println(" SSR when removing "+(i-1)+ " which has original position "+positions[(i-1)]+" is = "+ssr+" with s^2="+s); if(ssr<worstSSR) { worstSSR=ssr; worst=i-1; } //Swap attribute in and out if done if(i<size-1) { a=tempOutPos; tempOutPos=tempPos[i]; // System.out.println(" Removing element "+tempOutPos+" Adding element "+a+" back in"); tempPos[i]=a; // for(int j=0;j<size-1;j++) // System.out.println(" Temp position = "+tempPos[j]); t=temp[i]; temp[i]=out; out=t; } i++; }while(i<size); //Test worst, if not significant, return position. //NOTE that the position in the ORIGINAL data recorded by positions[worst] outPos=positions[worst]; if(((fullSSR-worstSSR)/var)<CRITICAL) return outPos; return -1; } public static int findBestAdditionTransformed(int a) { int best=-1; double[][] temp = new double[a+1][]; LinearModel lmTemp; double minSSE=Double.MAX_VALUE,s,bestPower=1; System.arraycopy(workingX, 0, temp, 0, a); for(int i=0;i<included.length;i++) { if(!included[i]) { temp[a]=X[i];// new double[X[i].length]; // for(int j=0;j<X[i].length;j++) // temp[a][j]=X[i][j]; double power=PowerSearch.transformRegressor(temp,Y,a); if(power!=1) temp[a]=PowerSearch.transform(X[i],power); lmTemp=new LinearModel(temp,Y); lmTemp.fitModel(); s=lmTemp.findStats(); // System.out.println(" Adding in attribute = "+i+" with power = "+power+" New MSE = "+s); if(s<minSSE) { minSSE=s; best=i; bestPower=power; } } } if(best>0) //Shouldnt be 0 { System.out.println(" BEST to add = "+best+" with power = "+bestPower+" MSE = "+minSSE); powers[best]=bestPower; temp[a]=PowerSearch.transform(X[best],bestPower); X[best]=temp[a]; workingX=temp; } return best; } public static int findBestAddition(int a) { int best=-1; double[][] temp = new double[a+1][]; LinearModel lmTemp; double minSSE=Double.MAX_VALUE,s; System.arraycopy(workingX, 0, temp, 0, a); for(int i=0;i<included.length;i++) { if(!included[i]) { temp[a]=X[i]; lmTemp=new LinearModel(temp,Y); lmTemp.fitModel(); s=lmTemp.findStats(); // System.out.println(" Adding in attribute = "+i+" New MSE = "+s); if(s<minSSE) { minSSE=s; best=i; } } } if(best>0) //Shouldnt be 0 { System.out.println(" BEST to add = "+best+" with MSE = "+minSSE); temp[a]=X[best]; workingX=temp; } return best; } public static int formatRegressors() { int attCount=0; for(int i=0;i<included.length;i++) if(included[i]) attCount++; workingX= new double[attCount][]; int c=0; for(int i=0;i<included.length;i++) { if(included[i]) { workingX[c]=X[i]; positions[c]=i; c++; } } return attCount; } }