FoldParser.java example

Explorer
MinorThird-master
package LBJ2.parse;

import java.util.*;


/**
  * A parser that filters the examples produced by another parser so that
  * <i>k</i>-fold cross validation can be carried out.  The examples of the
  * wrapped parser are conceptually divided into <i>k</i> "folds" (partitions)
  * according to a {@link FoldParser.SplitPolicy}.  One fold is designated the
  * pivot, and this parser returns either exactly the examples inside the
  * pivot fold or exactly the examples outside of it, depending on
  * {@link #fromPivot}.
  *
  * <p> Folds are identified by the indexes 0, 1, ..., <i>k</i> - 1, and the
  * pivot fold is chosen by such an index.
  *
  * @see    FoldParser.SplitPolicy
  * @author Dan Muriello, Nick Rizzolo
 **/
public class FoldParser implements Parser
{
  /** The wrapped parser that supplies the raw examples. */
  protected Parser parser;
  /** How many folds the data is divided into. */
  protected int K;
  /** The policy that decides which example belongs to which fold. */
  protected SplitPolicy splitPolicy;
  /**
    * Index of the pivot fold, i.e., the fold whose examples are either the
    * only ones returned or the only ones withheld.
   **/
  protected int pivot;
  /**
    * <code>true</code> if only pivot fold examples are returned,
    * <code>false</code> if only examples outside the pivot fold are.
   **/
  protected boolean fromPivot;
  /**
    * The number of examples produced by {@link #parser}; only consulted
    * under the sequential and random policies.
   **/
  protected int examples;

  /**
    * Index of the next example to be read from {@link #parser}; fold
    * separators are not counted.
   **/
  protected int exampleIndex;
  /** Index of the fold currently being read; manual policy only. */
  protected int fold;
  /**
    * Inclusive lower bound of the pivot fold.  It is compared against
    * {@link #exampleIndex} under the sequential policy and against
    * {@link #shuffleIndex} under the random policy.
   **/
  protected int lowerBound;
  /**
    * Exclusive upper bound of the pivot fold.  It is compared against
    * {@link #exampleIndex} under the sequential policy and against
    * {@link #shuffleIndex} under the random policy.
   **/
  protected int upperBound;
  /**
    * Random policy only: a permutation of the example indexes in which each
    * fold occupies a contiguous range whose contents are sorted ascending.
   **/
  protected int[] shuffled;
  /** Current position inside {@link #shuffled}. */
  protected int shuffleIndex;


  /**
    * Creates a fold parser when the number of examples is not known in
    * advance.  Supplying the number of examples through the full constructor
    * only saves work under the sequential and random policies, since those
    * are the only ones that need to count the examples.
    *
    * @param parser The parser whose examples are being filtered.
    * @param K      The total number of folds; recomputed (and therefore
    *               ignored) when the splitting policy is manual.
    * @param split  The way in which examples are partitioned into folds.
    * @param pivot  The index of the pivot fold.
    * @param f      Whether to extract examples from the pivot.
   **/
  public FoldParser(Parser parser, int K, SplitPolicy split, int pivot,
                    boolean f) {
    this(parser, K, split, pivot, f, -1);
  }

  /**
    * Creates a fold parser when neither the number of examples nor the
    * number of folds is known in advance.  It is meant for the manual
    * splitting policy only, under which the number of folds is derived from
    * the fold separators found in the data.
    *
    * @param parser The parser whose examples are being filtered.
    * @param split  The way in which examples are partitioned into folds.
    * @param pivot  The index of the pivot fold.
    * @param f      Whether to extract examples from the pivot.
   **/
  public FoldParser(Parser parser, SplitPolicy split, int pivot, boolean f) {
    this(parser, -1, split, pivot, f, -1);
  }

  /**
    * Full constructor.  Depending on the policy, the wrapped parser may be
    * read to exhaustion and reset here: to count examples (sequential and
    * random policies with an unknown count) or to count folds (manual
    * policy).
    *
    * @param parser The parser whose examples are being filtered.
    * @param K      The total number of folds; recomputed (and therefore
    *               ignored) when the splitting policy is manual.
    * @param split  The way in which examples are partitioned into folds.
    * @param pivot  The index of the pivot fold.
    * @param f      Whether to extract examples from the pivot.
    * @param e      The total number of examples coming from
    *               <code>parser</code>, or a negative value (conventionally
    *               -1) if unknown.
   **/
  public FoldParser(Parser parser, int K, SplitPolicy split, int pivot,
                    boolean f, int e) {
    this.K = K;
    splitPolicy = split;
    fromPivot = f;
    examples = e;

    boolean needsCount =
      splitPolicy == SplitPolicy.sequential
      || splitPolicy == SplitPolicy.random;

    // Every negative count means "unknown".  Accepting only -1 would let
    // other negative values reach the array allocation of the random policy
    // or yield negative fold bounds under the sequential policy.
    if (needsCount && examples < 0) examples = countExamples(parser);

    if (splitPolicy == SplitPolicy.random) shuffleIntoFolds();
    if (splitPolicy == SplitPolicy.manual) this.K = countFolds(parser);

    // The field is assigned last on purpose: while it is still null,
    // reset() (reached through setPivot) leaves the wrapped parser alone.
    setPivot(pivot);
    this.parser = parser;
  }


  /**
    * Reads the given parser to exhaustion, counting every object that is not
    * the fold separator, and then resets the parser.
    *
    * @param source  The parser to read.
    * @return The number of examples that were read.
   **/
  private static int countExamples(Parser source) {
    int count = 0;
    Object example = source.next();

    while (example != null) {
      if (example != FoldSeparator.separator) ++count;
      example = source.next();
    }

    source.reset();
    return count;
  }


  /**
    * Reads the given parser to exhaustion, counting fold separators, and
    * then resets the parser.
    *
    * @param source  The parser to read.
    * @return One more than the number of separators that were read.
   **/
  private static int countFolds(Parser source) {
    int folds = 1;
    Object example = source.next();

    while (example != null) {
      if (example == FoldSeparator.separator) ++folds;
      example = source.next();
    }

    source.reset();
    return folds;
  }


  /**
    * Fills {@link #shuffled} with a random permutation of the example
    * indexes and then sorts the range belonging to each fold, so that the
    * members of a fold can be matched against an increasing
    * {@link #exampleIndex} in a single pass.
   **/
  private void shuffleIntoFolds() {
    shuffled = new int[examples];
    for (int i = 0; i < examples; ++i) shuffled[i] = i;

    Random generator = new Random();
    for (int i = 0; i < examples; ++i) {
      int other = i + generator.nextInt(examples - i);
      int held = shuffled[i];
      shuffled[i] = shuffled[other];
      shuffled[other] = held;
    }

    // setPivot is used only for its side effect of computing the bounds of
    // each fold in turn.
    for (int i = 0; i < K; ++i) {
      setPivot(i);
      Arrays.sort(shuffled, lowerBound, upperBound);
    }
  }


  /**
    * Returns the value of {@link #K}, which is computed by the constructor
    * when the splitting policy is manual.
   **/
  public int getK() { return K; }


  /**
    * Chooses whether examples are taken from the pivot fold or from all
    * other folds.
    *
    * @param f  The new value for {@link #fromPivot}.
   **/
  public void setFromPivot(boolean f) { fromPivot = f; }


  /**
    * Selects a new pivot fold.  When <code>p</code> is less than {@link #K},
    * this parser is also {@link #reset() reset}; otherwise only
    * {@link #pivot} is updated and all other state is left as it was.
    *
    * @param p  The index of the new pivot fold.
   **/
  public void setPivot(int p) {
    pivot = p;
    if (p < K) reset();
  }


  /** Returns the value of {@link #pivot}. */
  public int getPivot() { return pivot; }
  /** Returns the value of {@link #parser}. */
  public Parser getParser() { return parser; }


  /**
    * Rewinds this parser to the start of the raw data.  The wrapped parser
    * is reset (if one has been assigned), and every state variable that
    * depends on {@link #pivot} is recomputed, since the pivot may have
    * changed.
    *
    * @see #setPivot(int)
   **/
  public void reset() {
    if (parser != null) parser.reset();

    boolean random = splitPolicy == SplitPolicy.random;

    if (random || splitPolicy == SplitPolicy.sequential) {
      // Folds are as equal in size as possible; the first (examples % K)
      // folds each receive one extra example.
      int foldSize = examples / K;
      int extras = examples % K;
      lowerBound = pivot * foldSize + Math.min(pivot, extras);
      upperBound = (pivot + 1) * foldSize + Math.min(pivot + 1, extras);
    }

    if (random) shuffleIndex = lowerBound;
    if (splitPolicy == SplitPolicy.manual) fold = 0;
    exampleIndex = 0;
  }


  /**
    * Decides whether the object just read from the wrapped parser should be
    * handed to the caller.  Fold separators are never returned.
    *
    * @param example  The next example object.
    * @return <code>true</code> iff the next example should be returned.
   **/
  protected boolean filter(Object example) {
    if (example == FoldSeparator.separator) return false;

    boolean inPivot;

    if (splitPolicy == SplitPolicy.sequential)
      inPivot = lowerBound <= exampleIndex && exampleIndex < upperBound;
    else if (splitPolicy == SplitPolicy.random)
      inPivot = shuffleIndex < upperBound
                && shuffled[shuffleIndex] == exampleIndex;
    else if (splitPolicy == SplitPolicy.kth)
      inPivot = exampleIndex % K == pivot;
    else  // splitPolicy == SplitPolicy.manual
      inPivot = fold == pivot;

    return fromPivot == inPivot;
  }


  /**
    * Updates the bookkeeping after an object has been read from the wrapped
    * parser.  A fold separator advances {@link #fold} (manual policy only);
    * any other object advances {@link #exampleIndex} and, under the random
    * policy, {@link #shuffleIndex} when that object belongs to the pivot
    * fold.
    *
    * @param example  The previous example object.
   **/
  protected void increment(Object example) {
    if (example == FoldSeparator.separator) {
      if (splitPolicy == SplitPolicy.manual) ++fold;
      return;
    }

    if (splitPolicy == SplitPolicy.random
        && shuffleIndex < upperBound
        && shuffled[shuffleIndex] == exampleIndex)
      ++shuffleIndex;

    ++exampleIndex;
  }


  /**
    * Retrieves the next example that passes the filter, or
    * <code>null</code> once the wrapped parser has no more objects.
   **/
  public Object next() {
    Object result = parser.next();

    while (result != null && !filter(result)) {
      increment(result);
      result = parser.next();
    }

    if (result != null) increment(result);
    return result;
  }


  /** Frees any resources this parser may be holding. */
  public void close() { parser.close(); }


  /**
    * Immutable type representing the way in which examples are partitioned
    * into folds.  When LBJ's self imposed restriction to use Java 1.4 is
    * lifted, this class will be replaced by an <code>enum</code>.
    *
    * <p> The four implemented splitting strategies are described below.  Note
    * that in all cases except "Manual", the sizes of the folds are as equal
    * as possible, with any extra examples allocated to earlier folds.
    *
    * <blockquote>
    * <dl>
    *   <dt> <b>Sequential</b> </dt>
    *   <dd> The examples are simply partitioned into sequential folds. </dd>
    *   <dt> <b>k<sup>th</sup></b> </dt>
    *   <dd> Every k<sup>th</sup> example is in the same fold. </dd>
    *   <dt> <b>Random</b> </dt>
    *   <dd> Examples are randomly assigned to folds. </dd>
    *   <dt> <b>Manual</b> </dt>
    *   <dd>
    *      Same as sequential, except fold boundaries are indicated by an
    *      appearance of the {@link FoldSeparator} in place of an example
    *      object.
    *   </dd>
    * </dl>
    * </blockquote>
    *
    * @author Nick Rizzolo
   **/
  public static class SplitPolicy
  {
    /** Represents the random split policy. */
    public static final SplitPolicy random = new SplitPolicy(0);
    /** Represents the sequential split policy. */
    public static final SplitPolicy sequential = new SplitPolicy(1);
    /**
      * Represents the split policy in which every k<sup>th</sup> example is
      * part of the same fold.
     **/
    public static final SplitPolicy kth = new SplitPolicy(2);
    /**
      * Represents the split policy in which the user manually inserts fold
      * separation objects.
     **/
    public static final SplitPolicy manual = new SplitPolicy(3);

    /** The names of the different split strategies as strings. */
    private static final String[] names =
      { "random", "sequential", "kth", "manual" };


    /**
      * Position of this policy's name in {@link #names}; never changes
      * after construction.
     **/
    private final int index;


    /** Initializes the object with an index. */
    private SplitPolicy(int i) { index = i; }


    /** Retrieves the name of the policy represented by this object. */
    public String toString() { return names[index]; }
  }
}