package LBJ2.parse;
import java.util.*;
/**
* Useful when performing <i>k</i>-fold cross validation, this parser filters
* the examples coming from another parser. Conceptually, the examples from
* the original parser are first split into <i>k</i> "folds" (or partitions)
* depending on the selected splitting policy. A particular fold is then
* selected as the pivot, and this parser can be configured either to return
* all and only the examples from that fold, or all and only the examples
* from other folds.
*
* <p> The <i>k</i> folds are referred to by their indexes, which are 0, 1,
* ..., <i>k</i> - 1. This index is used to select the pivot fold.
*
* @see FoldParser.SplitPolicy
* @author Dan Muriello, Nick Rizzolo
**/
public class FoldParser implements Parser
{
  /** The wrapped parser that supplies the unfiltered stream of examples. */
  protected Parser parser;
  /** How many folds the examples are divided into. */
  protected int K;
  /** The strategy that decides which fold each example belongs to. */
  protected SplitPolicy splitPolicy;
  /**
    * Index of the fold whose examples are either the only ones returned or
    * the only ones withheld, depending on {@link #fromPivot}.
   **/
  protected int pivot;
  /**
    * <code>true</code> to return only pivot fold examples,
    * <code>false</code> to return only examples from the other folds.
   **/
  protected boolean fromPivot;
  /** The number of examples produced by {@link #parser}. */
  protected int examples;
  /**
    * Counts the non-separator objects read from {@link #parser} since the
    * last reset; i.e., the index of the example about to be considered.
   **/
  protected int exampleIndex;
  /** Index of the fold currently being read; manual splitting only. */
  protected int fold;
  /**
    * Inclusive start of the pivot fold's index range.  Under sequential
    * splitting the range is compared against {@link #exampleIndex}; under
    * random splitting it delimits a slice of {@link #shuffled} walked by
    * {@link #shuffleIndex}.
   **/
  protected int lowerBound;
  /**
    * Exclusive end of the pivot fold's index range.  Under sequential
    * splitting the range is compared against {@link #exampleIndex}; under
    * random splitting it delimits a slice of {@link #shuffled} walked by
    * {@link #shuffleIndex}.
   **/
  protected int upperBound;
  /**
    * Random splitting only: a permutation of the example indexes in which
    * each fold occupies a contiguous slice, sorted in ascending order.
   **/
  protected int[] shuffled;
  /** Random splitting only: the current position within {@link #shuffled}. */
  protected int shuffleIndex;


  /**
    * Creates a fold parser when the number of examples is not known in
    * advance.  Delegates to the full constructor with an example count of
    * -1, which makes the sequential and random policies count the examples
    * themselves with an extra pass over the data.
    *
    * @param parser  The parser whose examples are being filtered.
    * @param K       The total number of folds; recomputed from the data if
    *                the splitting policy is manual.
    * @param split   The way in which examples are partitioned into folds.
    * @param pivot   The index of the pivot fold.
    * @param f       Whether to extract examples from the pivot.
   **/
  public FoldParser(Parser parser, int K, SplitPolicy split, int pivot,
                    boolean f) {
    this(parser, K, split, pivot, f, -1);
  }


  /**
    * Creates a fold parser when neither the number of examples nor the
    * number of folds is known.  Intended for the manual splitting policy,
    * under which the number of folds is derived from the fold separators
    * found in the data.  Delegates to the full constructor with -1 for both
    * unknown quantities.
    *
    * @param parser  The parser whose examples are being filtered.
    * @param split   The way in which examples are partitioned into folds.
    * @param pivot   The index of the pivot fold.
    * @param f       Whether to extract examples from the pivot.
   **/
  public FoldParser(Parser parser, SplitPolicy split, int pivot, boolean f) {
    this(parser, -1, split, pivot, f, -1);
  }


  /**
    * Full constructor.  Depending on the splitting policy this may read
    * <code>parser</code> to exhaustion (to count examples or fold
    * separators) and then reset it.
    *
    * @param parser  The parser whose examples are being filtered.
    * @param K       The total number of folds; recomputed from the data if
    *                the splitting policy is manual.
    * @param split   The way in which examples are partitioned into folds.
    * @param pivot   The index of the pivot fold.
    * @param f       Whether to extract examples from the pivot.
    * @param e       The total number of examples coming from
    *                <code>parser</code>, or -1 if unknown.
   **/
  public FoldParser(Parser parser, int K, SplitPolicy split, int pivot,
                    boolean f, int e) {
    this.K = K;
    splitPolicy = split;
    fromPivot = f;
    examples = e;

    // Sequential and random splitting need the example count up front; when
    // the caller did not supply it, take one counting pass over the data.
    boolean countRequired =
      splitPolicy == SplitPolicy.sequential
      || splitPolicy == SplitPolicy.random;
    if (countRequired && examples == -1) {
      examples = 0;
      Object seen;
      while ((seen = parser.next()) != null) {
        if (seen != FoldSeparator.separator) ++examples;
      }
      parser.reset();
    }

    if (splitPolicy == SplitPolicy.random) {
      // Start from the identity permutation and shuffle it in place.
      int total = examples;
      shuffled = new int[total];
      for (int slot = 0; slot < total; ++slot) shuffled[slot] = slot;

      Random generator = new Random();
      for (int slot = 0; slot < total; ++slot) {
        int other = slot + generator.nextInt(total - slot);
        int held = shuffled[other];
        shuffled[other] = shuffled[slot];
        shuffled[slot] = held;
      }

      // Sort each fold's slice so that, while reading, a single forward
      // moving index can be matched against the ascending example indexes.
      for (int foldIndex = 0; foldIndex < K; ++foldIndex) {
        setPivot(foldIndex);
        Arrays.sort(shuffled, lowerBound, upperBound);
      }
    }

    if (splitPolicy == SplitPolicy.manual) {
      // One more fold than there are separators in the data.
      this.K = 1;
      Object seen;
      while ((seen = parser.next()) != null) {
        if (seen == FoldSeparator.separator) ++this.K;
      }
      parser.reset();
    }

    // The parser field is deliberately assigned after this call, so the
    // reset triggered here does not touch the underlying parser.
    setPivot(pivot);
    this.parser = parser;
  }


  /**
    * Returns the number of folds, which the constructor computes from the
    * data when the splitting policy is manual.
   **/
  public int getK() { return K; }


  /**
    * Chooses between returning the pivot fold's examples and returning the
    * examples of all other folds.
    *
    * @param f The new value for {@link #fromPivot}.
   **/
  public void setFromPivot(boolean f) { fromPivot = f; }


  /**
    * Selects a new pivot fold.  When the new index is less than {@link #K},
    * this parser is also {@link #reset() reset}; otherwise only the pivot
    * index is recorded and all other state is left as it was.
    *
    * @param p The index of the new pivot fold.
   **/
  public void setPivot(int p) {
    pivot = p;
    if (p >= K) return;
    reset();
  }


  /** Returns the index of the current pivot fold. */
  public int getPivot() { return pivot; }


  /** Returns the parser whose examples are being filtered. */
  public Parser getParser() { return parser; }


  /**
    * Rewinds to the start of the raw data and recomputes every piece of
    * state that depends on {@link #pivot}, which may have changed since the
    * previous pass.
    *
    * @see #setPivot(int)
   **/
  public void reset() {
    // parser is still null while the constructor is running.
    if (parser != null) parser.reset();

    boolean randomSplit = splitPolicy == SplitPolicy.random;
    if (randomSplit || splitPolicy == SplitPolicy.sequential) {
      // Every fold gets foldSize examples, and the first `leftover` folds
      // get one more each.
      int foldSize = examples / K;
      int leftover = examples % K;
      lowerBound = pivot * foldSize + Math.min(pivot, leftover);
      upperBound = (pivot + 1) * foldSize + Math.min(pivot + 1, leftover);
      if (randomSplit) shuffleIndex = lowerBound;
    }
    else if (splitPolicy == SplitPolicy.manual) fold = 0;

    exampleIndex = 0;
  }


  /**
    * Decides whether the object just read from {@link #parser} should be
    * handed to the caller.  Fold separators are never returned.
    *
    * @param example The next example object.
    * @return <code>true</code> iff the next example should be returned.
   **/
  protected boolean filter(Object example) {
    if (example == FoldSeparator.separator) return false;

    boolean inPivot;
    if (splitPolicy == SplitPolicy.sequential)
      inPivot = lowerBound <= exampleIndex && exampleIndex < upperBound;
    else if (splitPolicy == SplitPolicy.random)
      inPivot =
        shuffleIndex < upperBound && shuffled[shuffleIndex] == exampleIndex;
    else if (splitPolicy == SplitPolicy.kth)
      inPivot = exampleIndex % K == pivot;
    else  // manual splitting
      inPivot = fold == pivot;

    return fromPivot == inPivot;
  }


  /**
    * Advances the bookkeeping past an object that has been read from
    * {@link #parser}, whether or not it was returned to the caller.
    *
    * @param example The previous example object.
   **/
  protected void increment(Object example) {
    if (example == FoldSeparator.separator) {
      // Separators only matter to manual splitting, where they end a fold.
      if (splitPolicy == SplitPolicy.manual) ++fold;
      return;
    }

    // Move past the pivot slice entry that this example just matched.
    if (splitPolicy == SplitPolicy.random
        && shuffleIndex < upperBound
        && shuffled[shuffleIndex] == exampleIndex)
      ++shuffleIndex;

    ++exampleIndex;
  }


  /**
    * Returns the next example accepted by {@link #filter(Object)}, or
    * <code>null</code> once the underlying parser has no more examples.
   **/
  public Object next() {
    Object candidate;
    while ((candidate = parser.next()) != null) {
      boolean accepted = filter(candidate);
      increment(candidate);
      if (accepted) return candidate;
    }
    return null;
  }


  /** Closes the underlying parser. */
  public void close() { parser.close(); }


  /**
    * Immutable type naming the strategy used to partition examples into
    * folds.  It stands in for an <code>enum</code>, which is unavailable
    * under LBJ's self imposed restriction to Java 1.4.
    *
    * <p> The four implemented splitting strategies are described below.  Note
    * that in all cases except "Manual", the size of the folds are as equal as
    * possible, with any extra examples allocated to earlier folds.
    *
    * <blockquote>
    *   <dl>
    *     <dt> <b>Sequential</b> </dt>
    *     <dd> The examples are simply partitioned into sequential folds. </dd>
    *     <dt> <b>k<sup>th</sup></b> </dt>
    *     <dd> Every k<sup>th</sup> example is in the same fold. </dd>
    *     <dt> <b>Random</b> </dt>
    *     <dd> Examples are randomly assigned to folds. </dd>
    *     <dt> <b>Manual</b> </dt>
    *     <dd>
    *       Same as sequential, except fold boundaries are indicated by an
    *       appearance of the {@link FoldSeparator} in place of an example
    *       object.
    *     </dd>
    *   </dl>
    * </blockquote>
    *
    * @author Nick Rizzolo
   **/
  public static class SplitPolicy
  {
    /** Represents the random split policy. */
    public static final SplitPolicy random = new SplitPolicy("random");
    /** Represents the sequential split policy. */
    public static final SplitPolicy sequential =
      new SplitPolicy("sequential");
    /**
      * Represents the split policy in which every k<sup>th</sup> example is
      * part of the same fold.
     **/
    public static final SplitPolicy kth = new SplitPolicy("kth");
    /**
      * Represents the split policy in which the user manually inserts fold
      * separation objects.
     **/
    public static final SplitPolicy manual = new SplitPolicy("manual");


    /** The name of the split strategy represented by this object. */
    private final String name;


    /** Initializes the object with its name. */
    private SplitPolicy(String n) { name = n; }


    /** Retrieves the name of the policy represented by this object. */
    public String toString() { return name; }
  }
}