/* * Created on Jul 16, 2008 * @author sunita * * Trainer as in "Exponentiated Gradient Algorithms for Conditional Random Fields and Max-Margin Markov Networks" in JMLR 08 */ package iitb.CRF; import java.util.Arrays; import java.util.Random; import java.util.Vector; import cern.colt.matrix.tdouble.DoubleMatrix2D; import cern.colt.matrix.tdouble.impl.DenseDoubleMatrix2D; public class ExponentiatedGradientDual extends Trainer { Vector<DataSequence> dataSeqs = new Vector<DataSequence>(); double thetaState[][][], thetaTransition[][][][]; double etas[]; // the learning rate. public ExponentiatedGradientDual(CrfParams p) { super(p); } @Override protected void init(CRF model, DataIter data, double[] l) { super.init(model, data, l); int numRecs = 0; for(data.startScan(); data.hasNext();dataSeqs.add(data.next()), numRecs++); thetaState = new double[numRecs][0][0]; thetaTransition = new double[numRecs][0][0][0]; for (int i = 0; i < numRecs; i++) { thetaState[i] = new double[dataSeqs.get(i).length()][numY]; thetaTransition[i] = new double[dataSeqs.get(i).length()][numY][numY]; } tempMi_YY = new DenseDoubleMatrix2D(numY,numY); etas=new double[numRecs]; Arrays.fill(etas, 0.01); } @Override protected void doTrain() { setInitialValues(); Random random = new Random(); for (int iter = 0; iter < params.maxIters; iter++) { int k = random.nextInt(dataSeqs.size()); // now update thetas. DataSequence dataSeq = dataSeqs.get(k); calculateExpFValuesUsingTheta(dataSeq,ExpF,thetaState[k],thetaTransition[k],false); //TODO: need to set etas properly. double eta = etas[k]; for (int i = 0; i < thetaState[k].length; i++) { initMDone = computeLogMiTrainMode(featureGenerator, lambda, dataSeq, i, Mi_YY, Ri_Y, false, reuseM, initMDone); for (int y = 0; y < numY; y++) { thetaState[k][i][y] -= eta*(thetaState[k][i][y]-params.invSigmaSquare*Ri_Y.get(y)); if (i > 0) { for (int yprev = 0; yprev < numY; yprev++) { thetaTransition[k][i][yprev][y] -= eta*(thetaTransition[k][i][yprev][y]-params.invSigmaSquare*Mi_YY.get(yprev,y)); } } } } // update lambda.. for (int f = 0; f < lambda.length; f++) { lambda[f] += ExpF[f]; } calculateExpFValuesUsingTheta(dataSeq,ExpF,thetaState[k],thetaTransition[k],false); for (int f = 0; f < lambda.length; f++) { lambda[f] -= ExpF[f]; } } for (int f = 0; f < lambda.length; f++) { lambda[f] *= params.invSigmaSquare; } } DoubleMatrix2D tempMi_YY; private void calculateExpFValuesUsingTheta(DataSequence dataSeq, double[] expF, double[][] thetaStateK, double[][][] thetaEdgeK,boolean addCorrFVec) { Arrays.fill(expF,RobustMath.LOG0); if ((beta_Y == null) || (beta_Y.length < dataSeq.length())) { allocateAlphaBeta(2*dataSeq.length()+1); } beta_Y[dataSeq.length()-1].assign(0); for (int i = dataSeq.length()-1; i > 0; i--) { // compute the Mi matrix tmp_Y.assign(thetaStateK[i]); tmp_Y.assign(beta_Y[i],sumFunc); tempMi_YY.assign(thetaEdgeK[i]); RobustMath.logMult(tempMi_YY, tmp_Y, beta_Y[i-1],1,0,false,edgeGen); } alpha_Y.assign(0); for (int i = 0; i < dataSeq.length(); i++) { // compute the Mi matrix if (i > 0) { tmp_Y.assign(alpha_Y); tempMi_YY.assign(thetaEdgeK[i]); RobustMath.logMult(tempMi_YY, tmp_Y, newAlpha_Y,1,0,true,edgeGen); tmp_Y.assign(thetaStateK[i]); newAlpha_Y.assign(tmp_Y,sumFunc); } else { newAlpha_Y.assign(thetaStateK[i]); } // find features that fire at this position.. featureGenerator.startScanFeaturesAt(dataSeq, i); while (featureGenerator.hasNext()) { Feature feature = featureGenerator.next(); int f = feature.index(); int yp = feature.y(); int yprev = feature.yprev(); float val = feature.value(); if ((addCorrFVec) && (dataSeq.y(i) == yp) && (((i-1 >= 0) && (yprev == dataSeq.y(i-1))) || (yprev < 0))) { expF[f] -= val; if (params.debugLvl > 2) { System.out.println("Feature fired " + f + " " + feature); } } if (Math.abs(val) < Double.MIN_VALUE) continue; if (val < 0) { System.out.println("ERROR: Cannot process negative feature values in log domains: " + "either disable the '-trainer=ll' flag or ensure feature values are not -ve"); continue; } if (yprev < 0) { expF[f] = RobustMath.logSumExp(ExpF[f], newAlpha_Y.get(yp) + RobustMath.log(val) + beta_Y[i].get(yp)); } else { expF[f] = RobustMath.logSumExp(ExpF[f], alpha_Y.get(yprev)+thetaStateK[i][yp]+tempMi_YY.get(yprev,yp)+RobustMath.log(val)+beta_Y[i].get(yp)); } } alpha_Y.assign(newAlpha_Y); if (params.debugLvl > 2) { System.out.println("Alpha-i " + alpha_Y.toString()); System.out.println("Ri " + Ri_Y.toString()); System.out.println("Mi " + Mi_YY.toString()); System.out.println("Beta-i " + beta_Y[i].toString()); } } } private void setInitialValues() { Arrays.fill(lambda, 0); for (int i = 0; i < dataSeqs.size(); i++) { calculateExpFValuesUsingTheta(dataSeqs.get(i), ExpF, thetaState[i], thetaTransition[i], true); for (int f = 0; f < lambda.length; f++) { lambda[f] -= ExpF[f]; } } } }