SoftLogMarginTrainer.java example

Explorer
MinorThird-master
/* SoftLogMarginTrainer.java
 * Created on Oct 16, 2007
 * 
 * @author sunita
 * @version 1.3
 * 
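 * Objective is log (sum_y hammingLoss(y)*exp(W.(F(xi,y)-F(xi,yi)))).
 * NOTE: after the initial phase, sumProductInner returns the negated sum itself
 * (without the outer log); sumProductInnerOld returns the negated log form.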
 * 
 */
package iitb.CRF;

import cern.colt.matrix.tdouble.DoubleMatrix1D;
import cern.colt.matrix.tdouble.impl.DenseDoubleMatrix1D;

public class SoftLogMarginTrainer extends Trainer {
    /**
     * Forwards weight initialisation to the superclass only while
     * {@code initPhase} is set; otherwise {@code lambda} is left untouched.
     * (Presumably this lets the second training pass start from the weights
     * produced by the first one -- confirm against {@code doTrain}.)
     *
     * @param lambda weight vector handed to the superclass initialiser
     */
    @Override
    protected void setInitValue(double[] lambda) {
        if (!initPhase) {
            // Not in the initial pass: keep whatever lambda currently holds.
            return;
        }
        super.setInitValue(lambda);
    }


    DoubleMatrix1D alphas[]= new DenseDoubleMatrix1D[0];
    DoubleMatrix1D alphaLoss[]= new DenseDoubleMatrix1D[0];
    DoubleMatrix1D betaLoss[]= new DenseDoubleMatrix1D[0];
    boolean initPhase;
    /**
     * Builds the trainer from the given parameters and sets the inherited
     * {@code logProcessing} flag to {@code true}.
     *
     * @param p parameters passed straight to the superclass constructor
     */
    public SoftLogMarginTrainer(CrfParams p) {
        super(p);
        logProcessing = true;
    }
    /**
     * Runs training as two consecutive {@code doTrain()} passes: the first
     * with {@code initPhase} set, the second with it cleared.
     *
     * @param model model passed to {@code init}
     * @param data  training data passed to {@code init}
     * @param l     weight array passed to {@code init}
     * @param eval  evaluator stored in the {@code evaluator} field
     */
    public void train(CRF model, DataIter data, double[] l, Evaluator eval) {
        init(model, data, l);
        evaluator = eval;

        final boolean debugEnabled = params.debugLvl > 0;
        if (debugEnabled) {
            Util.printDbg("Number of features :" + lambda.length);
        }

        // Pass 1: initPhase is on, so the overridden hooks defer to the superclass.
        initPhase = true;
        doTrain();

        // Pass 2: initPhase is off, so sumProductInner uses the loss-weighted objective.
        initPhase = false;
        System.out.println("Exponential loss training...");
        doTrain();
    }
    /**
     * Evaluates the objective contribution of one training sequence and
     * accumulates its gradient into {@code grad}.
     *
     * <p>While {@code initPhase} is set the call is delegated unchanged to the
     * superclass. Otherwise, with {@code n = dataSeq.length()} and
     * {@code correct(y)} the number of positions where {@code y} agrees with
     * the training labels, the method evaluates
     * <pre>
     *   Z = sum_y (n - correct(y)) * exp(W.(F(x,y) - F(x,yTrain)))
     *     = exp(t1) - exp(t2)
     * </pre>
     * and returns {@code -Z} (note: Z itself, not its logarithm).
     *
     * <p>Fixes relative to the previous revision:
     * <ul>
     *   <li>{@code t1} is now taken from {@code beta_Y[0] + Ri_Y} (the full
     *       partition sum). Previously the scratch vector had already been
     *       overwritten with {@code betaLoss[0] + Ri_Y} when {@code t1} was
     *       computed, so the returned value did not match the gradient, which
     *       uses {@code n * exp(logpr - trainWDotF)}.</li>
     *   <li>The work buffers are regrown when any of them is too short, not
     *       only when {@code beta_Y} is.</li>
     * </ul>
     *
     * @param dataSeq          training sequence (labels read through {@code y(i)})
     * @param featureGenerator generator handed to {@code computeLogMiTrainMode}
     * @param lambda           current weights
     * @param grad             gradient accumulator, updated in place
     * @param onlyForwardPass  only forwarded to the superclass during the initial phase
     * @param numRecord        only forwarded to the superclass during the initial phase
     * @param fgenForExpVals   generator scanned per position for the gradient terms
     * @return the superclass value during the initial phase, otherwise {@code -Z}
     */
    @Override
    protected double sumProductInner(DataSequence dataSeq, FeatureGenerator featureGenerator, double[] lambda, double[] grad,
            boolean onlyForwardPass, int numRecord, FeatureGenerator fgenForExpVals) {
        final int seqLen = dataSeq.length();

        // Grow all work buffers together. Checking every array (not just beta_Y)
        // guards against beta_Y having been sized elsewhere while ours are still empty.
        if ((beta_Y == null) || (beta_Y.length < seqLen)
                || (alphas.length < seqLen) || (alphaLoss.length < seqLen) || (betaLoss.length < seqLen)) {
            beta_Y = new DenseDoubleMatrix1D[2*seqLen];
            for (int i = 0; i < beta_Y.length; i++) {
                beta_Y[i] = new DenseDoubleMatrix1D(numY);
            }
            alphas = new DenseDoubleMatrix1D[beta_Y.length];
            alphaLoss = new DenseDoubleMatrix1D[beta_Y.length];
            betaLoss = new DenseDoubleMatrix1D[beta_Y.length];
            for (int i = 0; i < betaLoss.length; i++) {
                betaLoss[i] = new DenseDoubleMatrix1D(numY);
                alphaLoss[i] = new DenseDoubleMatrix1D(numY);
                alphas[i] = new DenseDoubleMatrix1D(numY);
            }
        }
        if (initPhase) {
            return super.sumProductInner(dataSeq, featureGenerator,
                    lambda, grad, onlyForwardPass, numRecord, fgenForExpVals);
        }

        // ---- Backward pass: beta_Y, betaLoss and the score of the training labelling ----
        beta_Y[seqLen-1].assign(0);
        betaLoss[seqLen-1].assign(RobustMath.LOG0);
        double trainWDotF = 0;
        for (int i = seqLen-1; i > 0; i--) {
            // compute the Mi matrix (fills Mi_YY and Ri_Y for position i)
            initMDone = computeLogMiTrainMode(featureGenerator,lambda,dataSeq,i,Mi_YY,Ri_Y,false,reuseM,initMDone);
            tmp_Y.assign(beta_Y[i]);
            tmp_Y.assign(Ri_Y,sumFunc);
            RobustMath.logMult(Mi_YY, tmp_Y, beta_Y[i-1],1,0,false,edgeGen);
            int ycorr = dataSeq.y(i);
            trainWDotF += (Ri_Y.get(ycorr)+Mi_YY.get(dataSeq.y(i-1),ycorr));

            // Seed betaLoss[i-1] with the term that goes through the correct label at i ...
            betaLoss[i-1].assign(RobustMath.LOG0);
            for (int yprev=0; yprev < numY; yprev++) {
                betaLoss[i-1].set(yprev,beta_Y[i].get(ycorr)+Ri_Y.get(ycorr)+Mi_YY.get(yprev,ycorr));
            }

            // ... then add the propagated betaLoss[i] on top (the 1,1 arguments, unlike the
            // 1,0 used for beta_Y above, are presumably "accumulate into target").
            tmp_Y.assign(betaLoss[i]);
            tmp_Y.assign(Ri_Y,sumFunc);
            RobustMath.logMult(Mi_YY, tmp_Y, betaLoss[i-1],1,1,false,edgeGen);
        }

        // ---- Forward pass: alphas, alphaLoss, objective value and gradient ----
        double lossZ = 0;   // Z = exp(t1) - exp(t2); stays 0 if the log-domain check below throws
        double betat2 = 0;  // t2 kept for the consistency assertion at the end
        for (int i = 0; i < seqLen; i++) {
            // compute the Mi matrix (fills Mi_YY and Ri_Y for position i)
            initMDone = computeLogMiTrainMode(featureGenerator,lambda,dataSeq,i,Mi_YY,Ri_Y,false,reuseM,initMDone);
            if (i > 0) {
                tmp_Y.assign(alphas[i-1]);
                RobustMath.logMult(Mi_YY, tmp_Y, alphas[i],1,0,true,edgeGen);
                alphas[i].assign(Ri_Y,sumFunc);
            } else {
                alphas[i].assign(Ri_Y);

                int y0 = dataSeq.y(0);
                trainWDotF += Ri_Y.get(y0);  // trainWDotF is complete from here on

                // Log of the full partition sum. This must be read BEFORE tmp_Y is
                // reused for the betaLoss term below.
                tmp_Y.assign(beta_Y[0]);
                tmp_Y.assign(Ri_Y, sumFunc);
                double logPartition = RobustMath.logSumExp(tmp_Y);

                // Log of the "correct at some position >= 1" mass.
                tmp_Y.assign(betaLoss[i]);
                tmp_Y.assign(Ri_Y,sumFunc);
                double logLaterCorrect = RobustMath.logSumExp(tmp_Y);

                double t1 = logPartition+Math.log(seqLen)-trainWDotF;
                double t2 = RobustMath.logSumExp(logLaterCorrect
                        ,beta_Y[0].get(y0)+Ri_Y.get(y0))-trainWDotF;
                betat2 = t2;
                try {
                    assert(t1 > t2);
                    // Log-domain evaluation of the same difference, kept as a sanity
                    // check only: its result is not used, but if it throws, lossZ stays 0.
                    RobustMath.logMinusExp(t1,t2);
                    lossZ = Math.exp(t1)-Math.exp(t2);
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
            if (i > 0) {
                tmp_Y.assign(alphaLoss[i-1]);
                RobustMath.logMult(Mi_YY, tmp_Y, alphaLoss[i],1,0,true,edgeGen);
                alphaLoss[i].assign(Ri_Y,sumFunc);
            } else {
                alphaLoss[i].assign(RobustMath.LOG0);
            }
            int ycorr = dataSeq.y(i);
            alphaLoss[i].set(ycorr, RobustMath.logSumExp(alphaLoss[i].get(ycorr),alphas[i].get(ycorr)));

            // find features that fire at this position..
            fgenForExpVals.startScanFeaturesAt(dataSeq, i);
            while (fgenForExpVals.hasNext()) {
                Feature feature = fgenForExpVals.next();
                int f = feature.index();
                int yp = feature.y();
                int yprev = feature.yprev();
                float val = feature.value();
                if (Math.abs(val) < Double.MIN_VALUE) continue;
                // Feature fires on the training labelling: contributes Z * F(x,yTrain).
                if ((dataSeq.y(i) == yp) && (((i-1 >= 0) && (yprev == dataSeq.y(i-1))) || (yprev < 0))) {
                    grad[f] += val*lossZ;
                }
                double logpr=beta_Y[i].get(yp);  // log mass of paths using this feature
                double logt2=0;                  // same, weighted by the correct-label terms
                if (yprev < 0) {
                    logpr += alphas[i].get(yp);
                    logt2 = RobustMath.logSumExp(alphaLoss[i].get(yp)+beta_Y[i].get(yp),
                            alphas[i].get(yp)+betaLoss[i].get(yp));
                } else {
                    logpr += alphas[i-1].get(yprev)+Ri_Y.get(yp)+Mi_YY.get(yprev,yp);

                    logt2 =alphaLoss[i-1].get(yprev)+beta_Y[i].get(yp);
                    logt2 = RobustMath.logSumExp(logt2,alphas[i-1].get(yprev)+betaLoss[i].get(yp));
                    if (yp==dataSeq.y(i)) {
                        logt2 = RobustMath.logSumExp(logt2,alphas[i-1].get(yprev)+beta_Y[i].get(yp));
                    }
                    logt2 += (Ri_Y.get(yp)+Mi_YY.get(yprev,yp));
                }
                grad[f] -= val*seqLen*Math.exp(logpr-trainWDotF);
                grad[f] += val*Math.exp(logt2-trainWDotF);
                assert(!Double.isInfinite(grad[f]));
                assert(!Double.isNaN(grad[f]));
            }
        }
        assert(!Double.isNaN(norm(grad)));

        // Consistency check (effective only with -ea): the correct-label mass
        // recomputed from alphas and beta_Y must agree with the beta-side t2.
        // The alpha-side counterpart of t1 would be
        // logSumExp(alphas[n-1]) + log(n) - trainWDotF.
        double testVal=RobustMath.LOG0;
        for (int i = 0; i < seqLen; i++) {
            double t = alphas[i].get(dataSeq.y(i))+beta_Y[i].get(dataSeq.y(i));
            testVal = RobustMath.logSumExp(testVal, t);
        }
        testVal -= trainWDotF;
        assert(Math.abs(testVal-betat2) < 1e-4);

        return -lossZ;
    }
    
    
    /**
     * Earlier variant of the per-sequence objective/gradient computation, kept
     * alongside {@code sumProductInner}. Nothing in this class calls it.
     *
     * <p>Unlike {@code sumProductInner}, this version works with the logarithm
     * of the loss-weighted sum: it accumulates {@code obj} as the weighted sum
     * of the features that fire on the training labels and returns
     * {@code obj - logZ}, where {@code logZ = log(exp(t1) - exp(t2))}.
     * It never consults {@code initPhase}.
     *
     * <p>NOTE(review): this method prints a "Diff ..." line to standard output
     * for every sequence, and {@code trainWDotF} is accumulated here but never
     * read.
     *
     * @param dataSeq          training sequence (labels read through {@code y(i)})
     * @param featureGenerator generator handed to {@code computeLogMiTrainMode}
     * @param lambda           current weights
     * @param grad             gradient accumulator, updated in place
     * @param onlyForwardPass  not used by this method
     * @param numRecord        not used by this method
     * @param fgenForExpVals   generator scanned per position for the gradient terms
     * @return {@code obj - logZ}, with {@code logZ} taken from the alpha-side
     *         recomputation at the end of the method
     */
    protected double sumProductInnerOld(DataSequence dataSeq, FeatureGenerator featureGenerator, double[] lambda, double[] grad, 
            boolean onlyForwardPass, int numRecord, FeatureGenerator fgenForExpVals) {
        // (Re)allocate all work buffers, with 2x headroom, when beta_Y is missing or too short.
        if ((beta_Y == null) || (beta_Y.length < dataSeq.length())) {
            beta_Y = new DenseDoubleMatrix1D[2*dataSeq.length()];
            for (int i = 0; i < beta_Y.length; i++)
                beta_Y[i] = new DenseDoubleMatrix1D(numY);
            alphas= new DenseDoubleMatrix1D[beta_Y.length];
            alphaLoss= new DenseDoubleMatrix1D[beta_Y.length];
            betaLoss= new DenseDoubleMatrix1D[beta_Y.length];
            for (int i = 0; i < betaLoss.length; i++) {
                betaLoss[i] = new DenseDoubleMatrix1D(numY);
                alphaLoss[i] = new DenseDoubleMatrix1D(numY);
                alphas[i] = new DenseDoubleMatrix1D(numY);
            }
        }
        // ---- Backward pass: fill beta_Y and betaLoss from the last position down to 0 ----
        beta_Y[dataSeq.length()-1].assign(0);
        betaLoss[dataSeq.length()-1].assign(RobustMath.LOG0);
        double trainWDotF=0;
        for (int i = dataSeq.length()-1; i > 0; i--) {
            // compute the Mi matrix
            initMDone = computeLogMiTrainMode(featureGenerator,lambda,dataSeq,i,Mi_YY,Ri_Y,false,reuseM,initMDone);
            tmp_Y.assign(beta_Y[i]);
            tmp_Y.assign(Ri_Y,sumFunc);
            RobustMath.logMult(Mi_YY, tmp_Y, beta_Y[i-1],1,0,false,edgeGen);
            int ycorr = dataSeq.y(i);
            trainWDotF += (Ri_Y.get(ycorr)+Mi_YY.get(dataSeq.y(i-1),ycorr));
            
            betaLoss[i-1].assign(RobustMath.LOG0);
           
            // Seed betaLoss[i-1] with the term that passes through the correct label at i.
            for (int yprev=0; yprev < numY; yprev++) {
                betaLoss[i-1].set(yprev,beta_Y[i].get(ycorr)+Ri_Y.get(ycorr)+Mi_YY.get(yprev,ycorr));
            }

            // Add the propagated betaLoss[i]; note the 1,1 arguments here versus 1,0 for
            // beta_Y above (presumably "accumulate into target" -- confirm in RobustMath.logMult).
            tmp_Y.assign(betaLoss[i]);
            tmp_Y.assign(Ri_Y,sumFunc);
            RobustMath.logMult(Mi_YY, tmp_Y, betaLoss[i-1],1,1,false,edgeGen);
        }
        double betaLogZ=0;
        double logZ=0;
        double obj = 0;
        double betat1=0, betat2=0;
        // ---- Forward pass: fill alphas and alphaLoss, accumulate obj and the gradient ----
        for (int i = 0; i < dataSeq.length(); i++) {
            // compute the Mi matrix
            initMDone = computeLogMiTrainMode(featureGenerator,lambda,dataSeq,i,Mi_YY,Ri_Y,false,reuseM,initMDone);
            if (i > 0) {
                tmp_Y.assign(alphas[i-1]);
                RobustMath.logMult(Mi_YY, tmp_Y, alphas[i],1,0,true,edgeGen);
                alphas[i].assign(Ri_Y,sumFunc); 
            } else {
                alphas[i].assign(Ri_Y);

                // t1 = log(n) + log-sum of (beta_Y[0] + Ri_Y); read before tmp_Y is reused below.
                tmp_Y.assign(beta_Y[0]);
                tmp_Y.assign(Ri_Y, sumFunc);
                double t1 = RobustMath.logSumExp(tmp_Y)+Math.log(dataSeq.length());
                
                int ycorr=dataSeq.y(0);
                trainWDotF += Ri_Y.get(ycorr);
                
                tmp_Y.assign(betaLoss[i]);
                tmp_Y.assign(Ri_Y,sumFunc);
                
                // t2 = log of (betaLoss[0] + Ri_Y mass) plus the term for the correct label at position 0.
                double t2= RobustMath.logSumExp(RobustMath.logSumExp(tmp_Y)
                        ,beta_Y[0].get(ycorr)+Ri_Y.get(ycorr));
                betat1=t1; betat2=t2;
                try {
                    assert(t1 > t2);
                    betaLogZ = RobustMath.logMinusExp(t1,t2);
                    // NOTE(review): same quantity evaluated in the linear domain; unlike
                    // sumProductInner, t1/t2 are not shifted by trainWDotF here, so
                    // Math.exp may overflow for large scores -- confirm.
                    logZ = Math.log(Math.exp(t1)-Math.exp(t2));
                    System.out.println("Diff "+(logZ-betaLogZ) + " "+logZ);
                } catch (Exception e) {
                    // Only Exception is caught: a failed assert (AssertionError) still propagates.
                    e.printStackTrace();
                }
            }
            if (i > 0) {
                tmp_Y.assign(alphaLoss[i-1]);
                RobustMath.logMult(Mi_YY, tmp_Y, alphaLoss[i],1,0,true,edgeGen);
                alphaLoss[i].assign(Ri_Y,sumFunc);
            } else {
                alphaLoss[i].assign(RobustMath.LOG0);
            }
            // Fold the plain alpha of the correct label into alphaLoss at this position.
            int ycorr = dataSeq.y(i);
            alphaLoss[i].set(ycorr, RobustMath.logSumExp(alphaLoss[i].get(ycorr),alphas[i].get(ycorr)));

            // find features that fire at this position..
            fgenForExpVals.startScanFeaturesAt(dataSeq, i);
            while (fgenForExpVals.hasNext()) { 
                Feature feature = fgenForExpVals.next();
                int f = feature.index();
                int yp = feature.y();
                int yprev = feature.yprev();
                float val = feature.value();
                // Skip features whose value is (effectively) zero.
                if (Math.abs(val) < Double.MIN_VALUE) continue;
                // Feature fires on the training labelling: empirical count and objective term.
                if ((dataSeq.y(i) == yp) && (((i-1 >= 0) && (yprev == dataSeq.y(i-1))) || (yprev < 0))) {
                    grad[f] += val;
                    obj += val*lambda[f];
                }
                double logpr=beta_Y[i].get(yp);
                double logt2=0;
                if (yprev < 0) {
                    // Feature without a previous label (yprev < 0).
                    logpr += alphas[i].get(yp);
                    logt2 = RobustMath.logSumExp(alphaLoss[i].get(yp)+beta_Y[i].get(yp),
                            alphas[i].get(yp)+betaLoss[i].get(yp));
                } else {
                    // Feature on the (yprev, yp) transition.
                    logpr += alphas[i-1].get(yprev)+Ri_Y.get(yp)+Mi_YY.get(yprev,yp);
                    
                    logt2 =alphaLoss[i-1].get(yprev)+beta_Y[i].get(yp);
                    logt2 = RobustMath.logSumExp(logt2,alphas[i-1].get(yprev)+betaLoss[i].get(yp));
                    if (yp==dataSeq.y(i)) {
                        logt2 = RobustMath.logSumExp(logt2,alphas[i-1].get(yprev)+beta_Y[i].get(yp));
                    }
                    logt2 += (Ri_Y.get(yp)+Mi_YY.get(yprev,yp));
                }
                // Expected count under the loss-weighted distribution, normalised by the
                // beta-side logZ computed at i == 0.
                grad[f] -= val*Math.exp(RobustMath.logMinusExp(Math.log(dataSeq.length())+logpr,logt2)-logZ);
                assert(!Double.isInfinite(grad[f]));
                assert(!Double.isNaN(grad[f]));
            }
        }
        assert(!Double.isNaN(norm(grad)));
        
        // Consistency check (effective only with -ea): correct-label mass from alphas and
        // beta_Y must match the beta-side t2.
        double testVal=RobustMath.LOG0;
        for (int i = 0; i < dataSeq.length(); i++) {
            double t = alphas[i].get(dataSeq.y(i))+beta_Y[i].get(dataSeq.y(i));
            testVal = RobustMath.logSumExp(testVal, t);
        }
        assert(Math.abs(testVal-betat2) < 1e-4);
        
        // Recompute logZ from the forward side; this value (not the beta-side one used in
        // the gradient loop above) is what gets returned.
        double t1 = RobustMath.logSumExp(alphas[dataSeq.length()-1])+Math.log(dataSeq.length());
        double t2= RobustMath.logSumExp(alphaLoss[dataSeq.length()-1]);
        try {
            logZ = RobustMath.logMinusExp(t1,t2);
            if (Math.abs(logZ-betaLogZ) > 1e-2) {
                // Forward and backward estimates disagree: print both gaps. The two
                // logMinusExp calls below discard their results (presumably left as
                // debugger hooks).
                System.out.println((t1-t2)+ " "+(betat1-betat2));
                RobustMath.logMinusExp(t1,t2);
                RobustMath.logMinusExp(betat1, betat2);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return obj-logZ;
    }
    
}