package hep.aida.ref.pdf;

import hep.aida.ext.IFitMethod;
import hep.aida.ref.fitter.fitdata.FitData;
import hep.aida.ref.fitter.fitdata.FitDataIterator;

/**
 * A Pdf built from a given data set.
 * The Pdf is evaluated using the adaptive kernel estimation presented
 * at http://www-wisconsin.cern.ch/~cranmer/keys.html
 *
 * Mirroring should be performed when a distribution does not naturally taper to
 * zero on one or both sides. When the distribution is cut off abruptly, probability
 * leaks past the edge because of the way the (Gaussian) kernel is built. Mirroring
 * the distribution about the abrupt edge minimizes this loss of probability.
 *
 * @author The FreeHEP team @ SLAC.
 *
 */
public class NonParametricPdf extends Function {

    public static final int NO_MIRROR = 0;
    public static final int MIRROR_LEFT = 1;
    public static final int MIRROR_RIGHT = 2;
    public static final int MIRROR_BOTH = 3;

    private static final int nPoints = 1000;
    private static final double sqrt2pi = Math.sqrt(2*Math.PI);

    private Dependent x;
    private double xVal;
    private double rho = 1;
    private double[] lookupTable;
    private double[] dataPoints;
    private double[] weights;
    private int nEntries;
    private int nData;
    private double lowerBound;
    private double upperBound;
    private double binWidth;
    private boolean mirrorLeft;
    private boolean mirrorRight;
    private int type;

    public NonParametricPdf(String name, FitData data, Dependent x) {
        this(name, data, x, MIRROR_BOTH);
    }

    public NonParametricPdf(String name, FitData data, Dependent x, int mirrorCode) {
        super(name);
        this.x = x;

        switch (mirrorCode) {
            case NO_MIRROR:
                mirrorLeft = false;
                mirrorRight = false;
                break;
            case MIRROR_LEFT:
                mirrorLeft = true;
                mirrorRight = false;
                break;
            case MIRROR_RIGHT:
                mirrorLeft = false;
                mirrorRight = true;
                break;
            case MIRROR_BOTH:
                mirrorLeft = true;
                mirrorRight = true;
                break;
            default:
                throw new IllegalArgumentException("Unsupported mirror code "+mirrorCode);
        }

        lowerBound = x.range().lowerBounds()[0];
        upperBound = x.range().upperBounds()[0];
        binWidth = (upperBound - lowerBound)/(nPoints-1);

        VariableList list = new VariableList();
        list.add(x);
        addVariables(list);

        if ( data.dimension() != 1 )
            throw new IllegalArgumentException("Cannot create a multi-dimensional NonParametricPdf");

        type = data.fitType();
        if ( data.fitType() == IFitMethod.BINNED_FIT ) {
            throw new IllegalArgumentException("Cannot create NonParametricPdf for binned data");
        } else {
            initializeUnbinnedDataSet(data);
        }
    }

    public void variableChanged(Variable var) {
        if ( var == x ) xVal = x.value();
    }

    public double functionValue() {
        if ( type == IFitMethod.UNBINNED_FIT ) {
            int i = (int) Math.floor((xVal-lowerBound)/binWidth);
            if ( i < 0 ) {
                System.out.println("Got point below lower bound "+xVal+". Performing linear extrapolation.");
                i = 0;
            }
            if ( i > nPoints-1 ) {
                System.out.println("Got point above upper bound "+xVal+". Performing linear extrapolation.");
                i = nPoints - 1;
            }
            // Linear interpolation between the two nearest entries of the lookup table.
            double dx = (xVal-(lowerBound+i*binWidth))/binWidth;
            double r = lookupTable[i]+dx*(lookupTable[i+1]-lookupTable[i]);
            // System.out.println("Value for "+xVal+" "+i+" "+dx+" "+r);
            return r;
        } else {
            throw new UnsupportedOperationException("Cannot evaluate NonParametricPdf for binned data");
        }
    }

    private void initializeUnbinnedDataSet(FitData d) {
        lookupTable = new double[nPoints+1];
        FitDataIterator iter = (FitDataIterator) d.dataIterator();
        nEntries = iter.entries();
        nData = nEntries;
        if (mirrorLeft)  nEntries += iter.entries();
        if (mirrorRight) nEntries += iter.entries();

        dataPoints = new double[nEntries];
        weights = new double[nEntries];

        double sumX = 0;
        double sumXSquared = 0;
        int count = 0;
        double tmpx;
        while ( iter.next() ) {
            tmpx = iter.vars()[0];
            dataPoints[count] = tmpx;
            sumX += tmpx;
            sumXSquared += tmpx*tmpx;
            count++;
            // Reflect the point about the lower and/or upper bound if mirroring is requested.
            if (mirrorLeft) {
                dataPoints[count] = 2*lowerBound - tmpx;
                count++;
            }
            if (mirrorRight) {
                dataPoints[count] = 2*upperBound - tmpx;
                count++;
            }
        }

        double mean  = sumX/nData;
        double sigma = Math.sqrt(sumXSquared/nData - mean*mean);
        // Rule-of-thumb bandwidth (4/3)^(1/5) * n^(-1/5), scaled by the adjustable factor rho.
        double h    = Math.pow(4./3., 0.2)*Math.pow((double) nData, -0.2)*rho;
        double hmin = h*sigma*Math.sqrt(2.)/10.;
        double norm = h*Math.sqrt(sigma)/(2.0*Math.sqrt(3.0));

        // Adaptive per-point bandwidths: narrower where the pilot density estimate is high,
        // with hmin as a floor.
        for (int j = 0; j < nEntries; j++) {
            weights[j] = norm/Math.sqrt(weightFactor(dataPoints[j], h*sigma));
            if (weights[j] < hmin) weights[j] = hmin;
        }

        // Cache the density on a uniform grid; functionValue() interpolates in this table.
        for (int i = 0; i < nPoints+1; i++)
            lookupTable[i] = evaluateFull( lowerBound + (double) i*binWidth );
    }

    /**
     * Fixed-bandwidth (pilot) density estimate at x, used to derive the adaptive bandwidths.
     */
    private double weightFactor(double x, double sigma) {
        double c = 1./(2*sigma*sigma);
        double y = 0;
        for (int i = 0; i < nEntries; i++) {
            double arg = x - dataPoints[i];
            y += Math.exp(-c*arg*arg);
        }
        return y/(sigma*sqrt2pi*nData);
    }

    /**
     * Full kernel sum at x using the adaptive per-point bandwidths.
     */
    private double evaluateFull( double x ) {
        double y = 0;
        for ( int i = 0; i < nEntries; i++ ) {
            double chi = (x - dataPoints[i])/weights[i];
            y += Math.exp(-0.5*chi*chi)/weights[i];
        }
        return y/(sqrt2pi*nData);
    }

    public boolean hasAnalyticalVariableGradient(Variable var) {
        return false;
    }

    /**
     * FIXME
     * Should the normalization be left at 1, or should it be evaluated numerically?
     * The problem is the leaking of probability at the edges.
     *
     */
    public boolean hasAnalyticalNormalization(Dependent dep) {
        if ( dep == x ) return true;
        return false;
    }

    public double evaluateAnalyticalNormalization(Dependent dep) {
        return 1;
    }
}
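
/*
 * A minimal usage sketch, kept as a comment so the file still compiles. Only the
 * NonParametricPdf constructor, the MIRROR_* codes, variableChanged(Variable) and
 * functionValue() are taken from this class; the Dependent constructor, the way the
 * FitData is filled (here from a hypothetical ITuple "tuple") and Variable.setValue(...)
 * are assumptions about the surrounding FreeHEP API and may need to be adapted.
 *
 *   // Observable with its allowed range (assumed constructor).
 *   Dependent x = new Dependent("x", 0., 10.);
 *
 *   // Unbinned one-dimensional data set for x (assumed way of filling a FitData).
 *   FitData data = new FitData();
 *   data.create(tuple, new String[] {"x"});   // tuple: hypothetical ITuple with the sample
 *
 *   // Build the KEYS pdf, mirroring only at the lower edge where the
 *   // distribution is cut off abruptly.
 *   NonParametricPdf pdf = new NonParametricPdf("keysPdf", data, x, NonParametricPdf.MIRROR_LEFT);
 *
 *   // Evaluate the pdf at a point: update the dependent, then read functionValue().
 *   x.setValue(3.5);                          // assumed Variable setter
 *   pdf.variableChanged(x);
 *   double density = pdf.functionValue();
 */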