ErrorBasedConfidenceIntervalEstimator.java example

Explorer
mdrill-master
- trunk
/*
 * Copyright (c) 2010 Pentaho Corporation.  All rights reserved. 
 * This software was developed by Pentaho Corporation and is provided under the terms 
 * of the GNU Lesser General Public License, Version 2.1. You may not use 
 * this file except in compliance with the license. If you need a copy of the license, 
 * please go to http://www.gnu.org/licenses/lgpl-2.1.txt. The Original Code is Time Series 
 * Forecasting.  The Initial Developer is Pentaho Corporation.
 *
 * Software distributed under the GNU Lesser Public License is distributed on an "AS IS" 
 * basis, WITHOUT WARRANTY OF ANY KIND, either express or  implied. Please refer to 
 * the license for the specific language governing your rights and limitations.
 */

/**
 * ErrorBasedConfidenceIntervalEstimator.java
 * Copyright (C) 2010 Pentaho Corporation
 */
package weka.classifiers.timeseries.core;

import java.io.PrintStream;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import java.util.Collections;

import weka.classifiers.evaluation.NumericPrediction;
import weka.classifiers.timeseries.AbstractForecaster;
import weka.classifiers.timeseries.TSForecaster;
import weka.classifiers.timeseries.eval.ErrorModule;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Utils;

/**
 * Class that computes confidence intervals for a time series forecaster
 * using errors computed on the training data.
 * 
 * @author Mark Hall (mhall{[at]}pentaho{[dot]}com)
 * @version $Revision: 50211 $
 *
 */
public class ErrorBasedConfidenceIntervalEstimator implements Serializable {
  
  /** For serialization */
  private static final long serialVersionUID = -2748314799535071043L;

  /**
   * Holds confidence bound offsets for targets at a certain level. 
   * The outer list corresponds to the fields to forecast (in the same
   * order as supplied to the TSForecaster.setFieldsToForecast() method). The
   * inner lists hold two element arrays of doubles (lower offset at index 0,
   * upper offset at index 1). The first entry of each inner list holds the
   * offsets for a one-step-ahead forecast, the second those for a
   * two-step-ahead forecast, and so on.
   */
  protected List<List<double[]>> m_confidenceLimitsForTargets;
  
  /** The names of the target field(s) */
  protected List<String> m_targetFields;
  
  /** The confidence level for the limits */
  protected double m_confidenceLevel;
  
  /**
   * Get the confidence bound offsets for each target at the supplied
   * confidence level.
   * <p>
   * For each target the non-missing prediction errors are split into
   * positive and negative errors (zero errors are ignored). The upper offset
   * is read from the sorted absolute values of the negative errors and the
   * lower offset is the negated value read from the sorted positive errors.
   * If a target has no positive or no negative errors, both of its offsets
   * are set to the missing value.
   * 
   * @param confidenceLevel the confidence level to use
   * @param predictions the predictions to compute the offsets from - one
   * list of predictions per target, in the same order as the target fields
   * @return a List of confidence offsets - one for each target. Each
   * target's confidence limits are stored in a two element array,
   * where the first element stores the lower bound and the second
   * the upper bound (both are expressed as an offset)
   * @throws Exception if the confidence limits can't be computed
   * for some reason
   */
  public List<double[]> getConfidenceOffsets(double confidenceLevel,
      List<List<NumericPrediction>> predictions)
    throws Exception {
    
    // isEmpty() is checked first so that an empty outer list produces the
    // descriptive exception rather than an IndexOutOfBoundsException
    if (predictions == null || predictions.isEmpty() 
        || predictions.get(0).size() == 0) {
      throw new Exception("No predictions have been seen yet!");
    }
    
    List<double[]> result = new ArrayList<double[]>();
    double tailFraction = 1.0 - confidenceLevel;
    
    for (int i = 0; i < m_targetFields.size(); i++) {
      // need to separate the positive and negative errors
      // into two separate lists (negative errors are stored as magnitudes)
      List<Double> posErrs = new ArrayList<Double>();
      List<Double> negErrs = new ArrayList<Double>();
      
      for (NumericPrediction p : predictions.get(i)) {
        double err = p.error();
        if (Utils.isMissingValue(err)) {
          continue;
        }
        if (err < 0) {
          negErrs.add(Double.valueOf(Math.abs(err)));
        } else if (err > 0) {
          posErrs.add(Double.valueOf(err));
        }
      }
      
      double[] bounds = new double[2];
      bounds[0] = Utils.missingValue();
      bounds[1] = Utils.missingValue();
      
      if (posErrs.size() > 0 && negErrs.size() > 0) {
        // sort into ascending order so that the requested tail can be
        // located by counting back from the end of each list
        Collections.sort(posErrs);
        Collections.sort(negErrs);
        
        int posPosition = tailPosition(posErrs.size(), tailFraction);
        int negPosition = tailPosition(negErrs.size(), tailFraction);

        double upperBound = negErrs.get(negErrs.size() - negPosition);
        double lowerBound = -posErrs.get(posErrs.size() - posPosition);
        bounds[0] = lowerBound;
        bounds[1] = upperBound;
      }
      
      result.add(bounds);
    }

    return result;
  }
  
  /**
   * Computes how far back from the end of a sorted error list the bound
   * should be read from.
   * 
   * @param size the number of errors in the list
   * @param tailFraction the fraction of errors allowed to lie beyond the bound
   * @return the rounded number of errors in the tail, but never less than 1
   */
  private static int tailPosition(int size, double tailFraction) {
    int position = (int)Math.round(size * tailFraction);
    return (position < 1) ? 1 : position;
  }
  
  /**
   * Creates overlay data for "future" instances taken from the training
   * data. All targets are set to missing value in this data. This method
   * is used when the forecaster is being trained using overlay fields.
   * 
   * @param forecaster the forecaster being used
   * @param source the source data to create a set of overlay instances from
   * @param start the index of the instance in the source data that will
   * be the first instance in the overlay data
   * @param numSteps the number of steps to be forecast
   * @return the overlay data as an Instances object. It contains at most
   * numSteps instances - fewer if the source data runs out first
   */
  protected Instances createOverlayForecastData(TSForecaster forecaster, 
      Instances source, int start, int numSteps) {
    
    // can't copy more instances than remain in the source from start onwards
    int toCopy = Math.min(numSteps, source.numInstances() - start);
    Instances overlay = new Instances(source, start, toCopy);
    
    // set all targets to missing
    List<String> fieldsToForecast = 
      AbstractForecaster.stringToList(forecaster.getFieldsToForecast());
    for (int i = 0; i < overlay.numInstances(); i++) {
      Instance current = overlay.instance(i);
      for (String target : fieldsToForecast) {
        current.setValue(overlay.attribute(target), Utils.missingValue());
      }
    }

    return overlay;
  }
  
  /**
   * Computes confidence intervals using the supplied forecaster and
   * training data. No artificial time start value is used (-1 is passed
   * on for it).
   * 
   * @param forecaster the forecaster to use
   * @param insts the training data to use
   * @param numPrime the number of instances to prime the forecaster with
   * @param numSteps the number of steps to forecast (and hence compute
   * intervals for)
   * @param confidenceLevel the confidence level to use
   * @param progress PrintStream objects to report progress to
   * @throws Exception if a problem occurs
   */
  public void calculateConfidenceOffsets(TSForecaster forecaster,
      Instances insts, int numPrime, int numSteps, double confidenceLevel, 
      PrintStream... progress) throws Exception {
    calculateConfidenceOffsets(forecaster, insts, numPrime, -1, numSteps, 
        confidenceLevel, progress);    
  }

  /**
   * Computes confidence intervals using the supplied forecaster and
   * training data.
   * <p>
   * A window of numPrime instances is slid over the training data. At each
   * position the forecaster is primed with the window, a forecast of
   * numSteps steps is made and the predictions for each step are evaluated
   * against the training instances that follow the window. Once all the data
   * has been processed, the collected errors for each step are converted
   * into confidence offsets for every target.
   * 
   * @param forecaster the forecaster to use
   * @param insts the training data
   * @param numPrime the number of instances to prime with
   * @param artificialTimeStartValue start value for the artificial time stamp
   * (if one is being used or -1 otherwise). It is assumed to be the time 
   * value for the first instance in the supplied set of instances
   * @param numSteps number of time steps to compute confidence intervals for
   * @param confidenceLevel the confidence level to use
   * @param progress varargs PrintStream object(s) to report progress to
   * @throws Exception if something goes wrong.
   */
  public void calculateConfidenceOffsets(TSForecaster forecaster,
      Instances insts, int numPrime, int artificialTimeStartValue, 
      int numSteps, double confidenceLevel, PrintStream... progress)
      throws Exception {
    
    if (insts.numInstances() < (numPrime + numSteps)) {
      throw new Exception("We need at least " + (numPrime + numSteps) 
          + " instances in order to calculate confidence limits!");
    }
    
    if (confidenceLevel < 0 || confidenceLevel > 1) {
      throw new Exception("Confidence level must lie between 0 and 1");
    }
    
    m_targetFields = 
      AbstractForecaster.stringToList(forecaster.getFieldsToForecast());
    m_confidenceLevel = confidenceLevel;
    
    // one error module per forecast step
    List<ErrorModule> confidenceCalculators = new ArrayList<ErrorModule>();
    for (int i = 0; i < numSteps; i++) {
      ErrorModule m = new ErrorModule();
      m.setTargetFields(m_targetFields);
      confidenceCalculators.add(m);
    }
    
    Instances primeInsts = new Instances(insts, 0, numPrime);
    primeInsts.compactify();
    if (forecaster instanceof TSLagUser && artificialTimeStartValue >= 0) {
      ((TSLagUser)forecaster).getTSLagMaker().
        setArtificialTimeStartValue(artificialTimeStartValue - 1 + numPrime);
    }    
    
    for (int i = numPrime; i < insts.numInstances(); i++) {
      forecaster.primeForecaster(primeInsts);
      
      if (i % 10 == 0 && progress != null) {
        for (PrintStream p : progress) {
          if (p != null) {
            p.println("Computing confidence intervals: processed " + i + " instances...");
          }
        }
      }
      
      List<List<NumericPrediction>> forecastForSteps = null; 
      if (forecaster instanceof OverlayForecaster && 
          ((OverlayForecaster)forecaster).isUsingOverlayData()) {        
        // can only generate forecasts for remaining training data that
        // we can use as overlay data
        Instances overlay = 
          createOverlayForecastData(forecaster, insts, i, numSteps);

        forecastForSteps = 
          ((OverlayForecaster)forecaster).forecast(numSteps, overlay);        
      } else {
        forecastForSteps = forecaster.forecast(numSteps);
      }
      
      // update the error modules - only for those steps that still have
      // a corresponding actual instance in the training data
      for (int j = 0; j < numSteps && 
        (i + j < insts.numInstances()); j++) {
        
        Instance toPredict = insts.instance(i + j);
        List<NumericPrediction> predsForTargets = forecastForSteps.get(j);
        
        confidenceCalculators.get(j).evaluateForInstance(predsForTargets, toPredict);
      }
      
      // remove the first instance from the primeInsts and then add instance i
      // to the end
      primeInsts.delete(0);
      primeInsts.add(insts.instance(i));
      primeInsts.compactify();      
    }
    
    // Build the result locally and publish it only when complete, so that a
    // failure part way through does not leave a half filled structure behind
    int numTargets = m_targetFields.size();
    List<List<double[]>> limitsForTargets = new ArrayList<List<double[]>>();
    for (int j = 0; j < numTargets; j++) {
      limitsForTargets.add(new ArrayList<double[]>());
    }
    
    if (numTargets > 0) {
      for (int i = 0; i < numSteps; i++) {
        // the offsets for all targets at step i are computed in one call,
        // so compute them once per step rather than once per target
        List<List<NumericPrediction>> predsForStepI = 
          confidenceCalculators.get(i).getPredictionsForAllTargets();
        List<double[]> confOffsetsForStepI = 
          getConfidenceOffsets(confidenceLevel, predsForStepI);
        
        for (int j = 0; j < numTargets; j++) {
          limitsForTargets.get(j).add(confOffsetsForStepI.get(j));
        }
      }
    }
    
    m_confidenceLimitsForTargets = limitsForTargets;
  }

  /**
   * Get the confidence level in use
   * 
   * @return the confidence level
   */
  public double getConfidenceLevel() {

    return m_confidenceLevel;
  }

  /**
   * Get the confidence limits (upper and lower bounds) for the named target
   * at the given step number. The limits are obtained by adding the stored
   * lower and upper offsets for the step to the supplied target value.
   * 
   * @param targetName the name of the target to return the limits for
   * @param targetValue the predicted target value
   * @param stepNum the step number to return the bounds for this target
   * (1 for a one-step-ahead forecast, 2 for two-steps-ahead, and so on)
   * @return an array containing the lower and upper bounds for the supplied
   * target value in elements 0 and 1 respectively.
   * 
   * @throws Exception if the limits have not been computed yet, the target
   * is unknown or there are no limits for the requested step number.
   */
  public double[] getConfidenceLimitsForTarget(String targetName, 
      double targetValue, int stepNum)
      throws Exception {

    if (m_targetFields == null || m_confidenceLimitsForTargets == null) {
      throw new Exception("[ErrorBasedConfidenceLimitEstimator] " +
          "confidence limits have not been computed yet");
    }
    
    int index = m_targetFields.indexOf(targetName);
    
    if (index < 0) {
      throw new Exception("[ErrorBasedConfidenceLimitEstimator] " +
          "unknown target: " + targetName);
    }
    
    List<double[]> confForTarget = m_confidenceLimitsForTargets.get(index); 
    // step numbers are 1-based
    if (stepNum < 1 || stepNum > confForTarget.size()) {
      throw new Exception("[ErrorBasedConfidenceLimitEstimator] no limits available for " +
          "requested step number: " + stepNum);
    }
    
    double[] offsets = confForTarget.get(stepNum - 1);
    double[] limits = new double[2];
    limits[0] = targetValue + offsets[0];
    limits[1] = targetValue + offsets[1];
    
    return limits;
  }
}