package LBJ2.learn;
import java.io.Serializable;
import java.net.URL;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import LBJ2.classify.Feature;
import LBJ2.util.ByteString;
import LBJ2.util.ClassUtils;
import LBJ2.util.ExceptionlessInputStream;
import LBJ2.util.ExceptionlessOutputStream;
import LBJ2.util.FVector;
import LBJ2.util.IVector;
import LBJ2.util.IVector2D;
import LBJ2.util.Sort;
import LBJ2.util.TableFormat;
/**
* A <code>Lexicon</code> contains a mapping from {@link Feature}s to
* integers. The integer key of a feature is returned by the
* {@link #lookup(Feature)} family of methods. When a lookup is made in
* training mode (see {@link #lookup(Feature,boolean)}), a feature that is not
* already in the lexicon is added to it, so such calls can be made without
* first checking whether an entry exists. Integer keys are assigned in
* ascending order starting from 0 as features are added to the lexicon.
*
* <p> The forward map is implemented as a <code>HashMap</code>. This class
* also maintains a vector mapping integer keys back to their associated
* features for fast reverse lookup using the {@link #lookupKey(int)} method.
*
* @author Michael Paul
**/
public class Lexicon implements Cloneable, Serializable
{
/**
* The size argument (1024) passed when allocating the count vectors
* {@link #featureCounts} and {@link #perClassFeatureCounts}; see
* {@link #countFeatures(CountPolicy)} and {@link #perClassToGlobalCounts()}.
**/
private static final int defaultCapacity = 1 << 10;
/** <!-- readLexicon(String) -->
 * Loads a feature lexicon from a file on the local file system.  The file
 * name is turned into a <code>file:</code> URL and handed to
 * {@link #readLexicon(URL)}.  Any exception raised along the way is reported
 * on <code>STDERR</code> and the JVM is terminated with exit status 1.
 *
 * @param filename  The name of the file containing the feature lexicon.
 * @return The lexicon read from the file.
 **/
public static Lexicon readLexicon(String filename) {
  Lexicon loaded = null;

  try {
    URL location = new URL("file:" + filename);
    loaded = readLexicon(location);
  }
  catch (Exception e) {
    System.err.println("Error constructing URL:");
    e.printStackTrace();
    System.exit(1);
  }

  return loaded;
}
/** <!-- readLexicon(URL) -->
 * Loads a feature lexicon, including any stored feature counts, from the
 * given location.  Equivalent to <code>readLexicon(url, true)</code>.
 *
 * @param url The location of the serialized feature lexicon.
 * @return The lexicon.
 **/
public static Lexicon readLexicon(URL url) {
  final boolean withCounts = true;
  return readLexicon(url, withCounts);
}
/** <!-- readLexicon(URL,boolean) -->
 * Reads a feature lexicon from the specified location, with the option to
 * ignore the feature counts by setting the second argument to
 * <code>false</code>.  The compressed stream opened on <code>url</code> is
 * always closed before this method returns, even if reading the lexicon
 * fails with a runtime exception.
 *
 * @param url         The location from which to read the feature lexicon.
 * @param readCounts  Whether or not to read the feature counts.
 * @return The lexicon.
 **/
public static Lexicon readLexicon(URL url, boolean readCounts) {
  ExceptionlessInputStream in =
    ExceptionlessInputStream.openCompressedStream(url);

  // The finally clause guarantees the underlying stream is released no
  // matter how the read terminates.
  try { return readLexicon(in, readCounts); }
  finally { in.close(); }
}
/** <!-- readLexicon(ExceptionlessInputStream) -->
 * Loads a feature lexicon, including any stored feature counts, from an
 * already opened stream.  Equivalent to <code>readLexicon(in, true)</code>.
 *
 * @param in  The stream supplying the serialized feature lexicon.
 * @return The lexicon.
 **/
public static Lexicon readLexicon(ExceptionlessInputStream in) {
  final boolean withCounts = true;
  return readLexicon(in, withCounts);
}
/** <!-- readLexicon(ExceptionlessInputStream,boolean) -->
 * Loads a feature lexicon from an already opened stream, optionally
 * skipping the feature counts.  The stream is expected to start with the
 * name of the lexicon's class; an instance of that class is created with its
 * no-argument constructor and then asked to
 * {@link #read(ExceptionlessInputStream,boolean) read} itself.  If the class
 * cannot be instantiated, a message is printed on <code>STDERR</code> and
 * the JVM exits with status 1.
 *
 * @param in          The stream supplying the serialized feature lexicon.
 * @param readCounts  Whether or not to read the feature counts.
 * @return The lexicon, or <code>null</code> if the class name read from the
 *         stream was <code>null</code>.
 **/
public static Lexicon readLexicon(ExceptionlessInputStream in,
                                  boolean readCounts) {
  String className = in.readString();
  if (className == null) return null;

  Lexicon result = null;
  Class type = ClassUtils.getClass(className);

  try { result = (Lexicon) type.newInstance(); }
  catch (Exception e) {
    System.err.println("Can't instantiate '" + className + "': " + e);
    System.exit(1);
  }

  result.read(in, readCounts);
  return result;
}
// Member variables.
/**
* The map of features to integer keys. It is set to <code>null</code> by
* {@link #read(ExceptionlessInputStream,boolean)} and rebuilt on demand
* from {@link #lexiconInv} by {@link #lazyMapCreation()}.
**/
protected Map lexicon;
/**
* The inverted map of integer keys to their features: the feature with key
* <code>i</code> is stored at position <code>i</code>.
**/
protected FVector lexiconInv;
/**
* The encoding to use for new features added to this lexicon; it is passed
* to <code>Feature.encode</code> in {@link #lookup(Feature,boolean,int)}.
**/
private String encoding;
/**
* This flag remembers whether {@link #encoding} has been assigned a value
* yet or not. Using this flag, we enforce the constraint that once an
* encoding has been set, it can never be changed. This way, a user will
* only be capable of using the same lexicon object in two different
* learners if they have the same encoding. See the implementation of
* {@link Learner#setLexicon(Lexicon)}.
**/
private boolean encodingSet;
/**
* Counts the number of occurrences of each feature, indexed by feature key.
* <code>null</code> unless the global counting policy is in effect (see
* {@link #getCountPolicy()}).
**/
protected IVector featureCounts;
/**
* Counts the number of occurrences of each feature on a class-by-class
* basis; the first index is the label and the second is the feature key.
* <code>null</code> unless per class counts are being kept.
**/
protected IVector2D perClassFeatureCounts;
/**
* Features at this index in {@link #lexiconInv} or higher have been
* pruned. <code>-1</code> indicates that no pruning has been done.
**/
protected int pruneCutoff;
/**
* Stores features that might appear repeatedly as children of other
* features, but which are not themselves given indexes in the lexicon.
* <code>null</code> until a child feature is first stored (see
* {@link #getChildFeature(Feature,int)}) or read from a stream.
**/
protected ChildLexicon lexiconChildren;
/**
 * Constructs a lexicon containing no features by delegating to
 * {@link #clear()}.  No encoding is established.
 **/
public Lexicon() {
  clear();
}
/**
 * Constructs a lexicon containing no features whose encoding is fixed to
 * the given value from the start.
 *
 * @param e The encoding to use when adding features to this lexicon.
 **/
public Lexicon(String e) {
  encodingSet = true;
  encoding = e;
  clear();
}
/**
 * Resets the feature maps of this instance: both the forward map and the
 * inverse vector are replaced with fresh empty containers, the child
 * lexicon is dropped, and the prune cutoff is reset to <code>-1</code>.
 * The encoding and any feature count vectors are left untouched.
 **/
public void clear() {
  pruneCutoff = -1;
  lexiconChildren = null;
  lexiconInv = new FVector();
  lexicon = new HashMap();
}
/**
 * Establishes the encoding used when adding features to this lexicon.
 * The encoding may be set only once; setting it again to an equal value is
 * harmless, but an attempt to change it prints an error with a stack trace
 * on <code>STDERR</code> and terminates the JVM with exit status 1.
 *
 * @param e The encoding.
 **/
public void setEncoding(String e) {
  if (encodingSet) {
    boolean unchanged = encoding == null ? e == null : encoding.equals(e);

    if (!unchanged) {
      System.err.println(
          "LBJ ERROR: Once established, the encoding of a lexicon cannot be "
          + "changed.");
      new Exception().printStackTrace();
      System.exit(1);
    }
  }

  encoding = e;
  encodingSet = true;
}
/**
 * Returns a read-only view of the feature-to-index map stored in
 * {@link #lexicon}, creating that map first if necessary.
 **/
public Map getMap() {
  lazyMapCreation();
  Map readOnlyView = Collections.unmodifiableMap(lexicon);
  return readOnlyView;
}
/**
 * Returns the number of features currently stored in this lexicon, as
 * given by the size of {@link #lexiconInv}.
 **/
public int size() {
  return lexiconInv.size();
}
/**
 * Returns the index of the first pruned feature: {@link #pruneCutoff} when
 * pruning has been performed, and {@link #size()} when it has not (i.e.,
 * when {@link #pruneCutoff} is -1).
 **/
public int getCutoff() {
  if (pruneCutoff == -1) return size();
  return pruneCutoff;
}
/** <!-- countFeatures(CountPolicy) -->
 * Installs a new feature counting policy.  All existing counts are
 * discarded.  If <code>policy</code> is {@link Lexicon.CountPolicy#global}
 * or {@link Lexicon.CountPolicy#perClass}, a fresh, empty count structure
 * of the corresponding kind is allocated so that subsequent calls to
 * <code>lookup(feature, true)</code> count feature occurrences (calls to
 * <code>lookup(feature, false)</code> never count).  For any other value,
 * in particular {@link Lexicon.CountPolicy#none}, counting is switched off.
 *
 * @see #lookup(Feature,boolean)
 * @param policy  The new feature counting policy.
 **/
public void countFeatures(CountPolicy policy) {
  perClassFeatureCounts = null;
  featureCounts = null;

  if (policy == CountPolicy.perClass) {
    perClassFeatureCounts = new IVector2D(8, defaultCapacity);
  }
  else if (policy == CountPolicy.global) {
    featureCounts = new IVector(defaultCapacity);
  }
}
/** <!-- getCountPolicy() -->
 * Reports which feature counting policy is in effect, inferred from which
 * count structure is currently allocated.  Global counts take precedence
 * over per class counts.
 *
 * @return The current feature counting policy.
 **/
public CountPolicy getCountPolicy() {
  CountPolicy current = CountPolicy.none;

  if (featureCounts != null) current = CountPolicy.global;
  else if (perClassFeatureCounts != null) current = CountPolicy.perClass;

  return current;
}
/** <!-- perClassToGlobalCounts() -->
 * Replaces the per class feature counts with global counts.  The new global
 * count of each feature is the sum of its counts over all classes; the per
 * class counts are discarded afterwards.
 *
 * @throws IllegalArgumentException If there are no per class counts.
 **/
public void perClassToGlobalCounts() {
  if (perClassFeatureCounts == null) {
    throw new IllegalArgumentException(
        "LBJ ERROR: Lexicon.perClassToGlobalCounts: Cannot be called if "
        + "there are not per-class counts.");
  }

  // The widest row determines how many features have counts at all.
  final int classes = perClassFeatureCounts.size();
  int features = 0;
  for (int c = 0; c < classes; ++c) {
    int width = perClassFeatureCounts.size(c);
    if (width > features) features = width;
  }

  featureCounts = new IVector(defaultCapacity);

  for (int f = 0; f < features; ++f) {
    int total = 0;
    for (int c = 0; c < classes; ++c)
      total += perClassFeatureCounts.get(c, f);
    featureCounts.set(f, total);
  }

  perClassFeatureCounts = null;
}
/** <!-- contains(Feature) -->
 * Tests whether the given feature has an index in this lexicon, regardless
 * of whether that index lies past the {@link #pruneCutoff}.  Nothing is
 * added to the lexicon and no counts change.
 *
 * @param f The feature to look up.
 * @return <code>true</code> iff the feature is currently in the lexicon.
 **/
public boolean contains(Feature f) {
  lazyMapCreation();
  boolean present = lexicon.containsKey(f);
  return present;
}
/** <!-- lookup(Feature) -->
 * Looks up a feature's index outside of training by calling
 * <code>lookup(f, false, -1)</code>.  Consequently the feature is neither
 * counted nor added to the lexicon.  See
 * {@link #lookup(Feature,boolean,int)} for more details.
 *
 * @param f The feature to look up.
 * @return The integer key that the feature maps to.
 **/
public int lookup(Feature f) {
  return lookup(f, false, -1);
}
/** <!-- lookup(Feature,boolean) -->
 * Looks up a feature's index without supplying a label by calling
 * <code>lookup(f, training, -1)</code>.  See
 * {@link #lookup(Feature,boolean,int)} for more details.
 *
 * @param f         The feature to look up.
 * @param training  Whether or not the learner is currently training.
 * @return The integer key that the feature maps to.
 **/
public int lookup(Feature f, boolean training) {
  final int noLabel = -1;
  return lookup(f, training, noLabel);
}
/** <!-- lookup(Feature,boolean,int) -->
 * Looks up the given feature in the lexicon, possibly counting it and/or
 * expanding the lexicon to accommodate it.  Feature counting and automatic
 * lexicon expansion happen only when <code>training</code> is
 * <code>true</code>.  Otherwise, <code>f</code> is not counted even if it is
 * already in the lexicon, and a previously unobserved feature causes this
 * method to return the value of {@link #getCutoff()} without expanding the
 * lexicon.
 *
 * <p> When a new feature is added, it is first encoded with this lexicon's
 * encoding.  If an equivalent feature is currently stored in
 * {@link #lexiconChildren}, it is removed from there and that object is the
 * one stored under the new index.
 *
 * @param f         The feature to look up.
 * @param training  Whether or not the learner is currently training.
 * @param label     The label of the example containing this feature, or -1
 *                  if we aren't doing per class feature counting.
 * @return The integer key that the feature maps to.
 * @throws IllegalArgumentException If a label is missing while training
 *         with per class counts, or if a label is supplied in any other
 *         situation.
 **/
public int lookup(Feature f, boolean training, int label) {
  // A label is required exactly when training with per class counts.
  boolean perClassTraining = training && perClassFeatureCounts != null;

  if (label < 0 && perClassTraining) {
    throw new IllegalArgumentException(
        "LBJ ERROR: Lexicon.lookup: Must supply a label when training "
        + "with per class feature counts.");
  }

  if (label >= 0 && !perClassTraining) {
    throw new IllegalArgumentException(
        "LBJ ERROR: Lexicon.lookup: A label has been supplied when not "
        + "training with per class feature counts.");
  }

  lazyMapCreation();
  Integer known = (Integer) lexicon.get(f);

  if (known != null) {
    int index = known.intValue();
    if (training) incrementCount(index, label);
    return index;
  }

  if (!training) return getCutoff();

  // Unseen feature during training: give it the next free index.
  Feature stored = f.encode(encoding);

  if (lexiconChildren != null) {
    Feature child = lexiconChildren.remove(stored);
    if (child != null) stored = child;
  }

  int key = lexiconInv.size();
  lexicon.put(stored, new Integer(key));
  lexiconInv.add(stored);
  incrementCount(key, label);
  return key;
}
/**
 * Used to look up the children of conjunctive and referring features
 * during training.  If the feature has an index in this lexicon, its count
 * is incremented and the stored feature object is returned.  Otherwise the
 * request is forwarded to {@link #lexiconChildren} (created on first use),
 * with -1 passed in place of the label.
 *
 * @param f     The feature to look up.
 * @param label The label of the example containing this feature, or -1 if
 *              we aren't doing per class feature counting.
 * @return A feature equivalent to <code>f</code> that is stored in this
 *         lexicon.
 **/
public Feature getChildFeature(Feature f, int label) {
  lazyMapCreation();
  Integer known = (Integer) lexicon.get(f);

  if (known == null) {
    if (lexiconChildren == null) lexiconChildren = new ChildLexicon(this);
    return lexiconChildren.getChildFeature(f, -1);
  }

  int index = known.intValue();
  incrementCount(index, label);
  return lexiconInv.get(index);
}
/**
 * Adds one to the count of the feature with the given index.  Global
 * counts are updated if they exist; otherwise per class counts are updated
 * if they exist; otherwise nothing happens.
 *
 * @param index The index of the feature.
 * @param label The label of the example containing this feature, which is
 *              ignored if we aren't doing per class feature counting.
 **/
protected void incrementCount(int index, int label) {
  if (featureCounts != null) {
    featureCounts.increment(index);
    return;
  }

  if (perClassFeatureCounts != null)
    perClassFeatureCounts.increment(label, index);
}
/**
 * Used to look up the children of conjunctive and referring features while
 * writing the lexicon.  The feature is sought first in {@link #lexicon} and
 * then in {@link #lexiconChildren}.
 *
 * @param f The feature to look up.
 * @return If the feature was found in {@link #lexicon}, its associated
 *         integer index is returned.  Otherwise, <code>-i - 1</code> is
 *         returned, where <code>i</code> is the index associated with the
 *         feature in {@link #lexiconChildren}.
 * @throws UnsupportedOperationException If the feature isn't in
 *         {@link #lexicon} and there is no child lexicon to consult.
 **/
public int lookupChild(Feature f) {
  lazyMapCreation();
  Integer known = (Integer) lexicon.get(f);
  if (known != null) return known.intValue();

  if (lexiconChildren == null) {
    throw new UnsupportedOperationException(
        "When calling Lexicon.lookupChild(Feature), the feature must be "
        + "present in the lexicon.");
  }

  int childIndex = lexiconChildren.lookupChild(f);
  return -childIndex - 1;
}
/** <!-- lookupKey(int) -->
 * Does a reverse lexicon lookup and returns the {@link Feature} associated
 * with the given integer key, and <code>null</code> if no such feature
 * exists.  In particular, a negative key yields <code>null</code> rather
 * than a <code>NullPointerException</code> when this lexicon has no child
 * lexicon.
 *
 * @param i The integer key to look up.  If <code>i</code> is negative,
 *          {@link #lexiconChildren} is queried (with key
 *          <code>-i - 1</code>) instead of {@link #lexiconInv}.
 * @return The feature that maps to the given integer.
 **/
public Feature lookupKey(int i) {
  if (i >= 0) return lexiconInv.get(i);

  // Negative keys address the child lexicon, which only exists once a child
  // feature has been stored or read.
  if (lexiconChildren == null) return null;
  return lexiconChildren.lookupKey(-i - 1);
}
/** <!-- isPruned(int,PruningPolicy) -->
 * Determines if the given feature index should be pruned according to the
 * given pruning policy, which must have its thresholds set already in the
 * case that it represents the "Percentage" policy.  This method is a
 * shorthand for <code>isPruned(i, -1, policy)</code>.
 *
 * @see #isPruned(int,int,Lexicon.PruningPolicy)
 * @param i       The feature index.
 * @param policy  The pruning policy.
 * @return <code>true</code> iff the feature should be pruned.
 **/
public boolean isPruned(int i, PruningPolicy policy) {
  final int noLabel = -1;
  return isPruned(i, noLabel, policy);
}
/** <!-- isPruned(int,int,PruningPolicy) -->
 * Determines if the given feature index should be pruned according to the
 * given pruning policy, which must have its thresholds set already in the
 * case that it represents the "Percentage" policy.
 *
 * <p> With the "None" policy nothing is ever pruned.  With global counts, a
 * feature is pruned iff its count is strictly less than the policy's first
 * threshold; the label is ignored.  With per class counts and a
 * non-negative label, the feature is pruned iff its count for that label is
 * strictly less than the threshold for that label.  With per class counts
 * and a label of -1, the feature is pruned iff its count is strictly less
 * than the corresponding threshold for <i>every</i> class.
 *
 * <p> In other words, passing -1 in the second argument gives the behavior
 * expected when pruning the lexicon as in
 * {@link #prune(Lexicon.PruningPolicy)}.  Passing a non-negative label in
 * the second argument gives the behavior expected when pruning the actual
 * examples.
 *
 * @param i       The feature index.
 * @param label   The label of the example containing this feature, or -1 if
 *                we want the lexicon pruning behavior.
 * @param policy  The pruning policy.
 * @return <code>true</code> iff the feature should be pruned.
 * @throws IllegalArgumentException If the policy isn't "None" but no
 *         feature counts are available.
 **/
public boolean isPruned(int i, int label, PruningPolicy policy) {
  if (policy.isNone()) return false;

  boolean globalCounting = featureCounts != null;

  if (!globalCounting && perClassFeatureCounts == null) {
    throw new IllegalArgumentException(
        "LBJ ERROR: Lexicon.isPruned: pruning policy wasn't 'None', but "
        + "there are no counts.");
  }

  if (globalCounting) return featureCounts.get(i) < policy.getThreshold(0);

  // Per class counting from here on.
  if (label >= 0) {
    int count = perClassFeatureCounts.get(label, i);
    return count < policy.getThreshold(label);
  }

  // No label: the feature survives as soon as one class keeps it.
  boolean belowEveryThreshold = true;
  for (int j = 0; belowEveryThreshold && j < perClassFeatureCounts.size();
       ++j)
    belowEveryThreshold =
      perClassFeatureCounts.get(j, i) < policy.getThreshold(j);

  return belowEveryThreshold;
}
/** <!-- prune(PruningPolicy) -->
* Rearranges the order in which features appear in the lexicon based on
* the compiled feature counts in {@link #featureCounts} or
* {@link #perClassFeatureCounts} so that pruned features are at the end of
* the feature space. This way, learning algorithms can allocate exactly
* enough space in their weight vectors for the unpruned features.
*
* <p> When the policy is "Percentage", its thresholds are (re)computed here
* as the ceiling of the largest count times the percentage: one threshold
* overall under global counting, or one per class under per class counting.
* An "Absolute" policy must already carry its threshold.
*
* <p> This method returns an array of integers which is a permutation of
* the integers from 0 (inclusive) to the number of features in the lexicon
* (exclusive). It represents a map from the features' original indexes to
* their new ones after pruning. The {@link #getCutoff()} method then
* returns the new index of the first pruned feature (or, equivalently, the
* number of unpruned features). All features with a new index greater
* than or equal to this index are considered pruned in the case of global
* pruning. In the case of per-class pruning, the cutoff represents the
* first feature whose count fell below the threshold for <i>every</i>
* class. Thus, in this case, features below the cutoff may still be
* pruned in any given class; just not all of them.
*
* @param policy The type of pruning to perform.
* @return A map from features' original indexes to their new ones, or
* <code>null</code> if <code>policy</code> indicates no pruning.
* @throws UnsupportedOperationException If pruning is requested but no
* feature counts exist, or if the policy is neither "None", "Absolute", nor
* "Percentage".
**/
public int[] prune(PruningPolicy policy) {
// "None" just forgets any previous pruning.
if (policy.isNone()) {
pruneCutoff = -1;
return null;
}
if (featureCounts == null && perClassFeatureCounts == null)
throw new UnsupportedOperationException(
"LBJ ERROR: Lexicon.prune: Can't prune if there's no feature "
+ "counts.");
// Set thresholds in the policy.
if (policy.isPercentage()) {
if (featureCounts != null) { // if global counting
long t =
Math.round(Math.ceil(featureCounts.max() * policy.getPercentage()));
policy.setThresholds(new int[]{ (int) t });
}
else { // if per class counting
int[] thresholds = new int[perClassFeatureCounts.size()];
int size = perClassFeatureCounts.size();
double p = policy.getPercentage();
for (int i = 0; i < size; ++i)
thresholds[i] =
(int) Math.round(Math.ceil(perClassFeatureCounts.max(i) * p));
policy.setThresholds(thresholds);
}
}
// there's no clause for policy.isAbsolute() here since the appropriate
// threshold must already be established in that case.
else if (!policy.isAbsolute())
throw new UnsupportedOperationException(
"LBJ ERROR: Lexicon.prune: Pruning policy '" + policy
+ "' is not supported.");
// Swap features around, remembering how it was done in swapMap.
pruneCutoff = size();
int[] swapMap = new int[pruneCutoff];
// If features at the end of the space are pruned, there's no need to swap
// anything; just decrement pruneCutoff.
while (pruneCutoff > 0 && isPruned(pruneCutoff - 1, policy)) {
--pruneCutoff;
swapMap[pruneCutoff] = pruneCutoff;
}
// Now we know the feature just below the prune cutoff does not need to be
// pruned (otherwise it would have been handled by the loop above), so we
// start the loop at pruneCutoff - 2 and do swaps for any feature that
// needs to be pruned.
if (pruneCutoff > 0) swapMap[pruneCutoff - 1] = pruneCutoff - 1;
for (int i = pruneCutoff - 2; i >= 0; --i) {
if (isPruned(i, policy)) {
// Exchange the pruned feature at i with the unpruned feature sitting just
// below the (newly lowered) cutoff.
pruneCutoff--;
Feature pruned = lexiconInv.get(i);
Feature f = lexiconInv.get(pruneCutoff);
// The forward map may not have been built yet (see lazyMapCreation()).
// Map.put returns the previous value, so the inner put hands f's old
// index (pruneCutoff) to the pruned feature.
if (lexicon != null)
lexicon.put(pruned, lexicon.put(f, new Integer(i)));
lexiconInv.set(i, f);
lexiconInv.set(pruneCutoff, pruned);
// The counts move with their features. NOTE(review): this nesting relies
// on the vectors' set methods returning the value they replace -- confirm
// against IVector / IVector2D.
if (featureCounts != null)
featureCounts.set(i,
featureCounts.set(pruneCutoff,
featureCounts.get(i)));
else {
for (int j = 0; j < perClassFeatureCounts.size(); ++j)
perClassFeatureCounts.set(
j, i,
perClassFeatureCounts.set(j, pruneCutoff,
perClassFeatureCounts.get(j, i)));
}
swapMap[i] = swapMap[pruneCutoff];
swapMap[pruneCutoff] = i;
}
else swapMap[i] = i;
}
// Invert swapMap.
// swapMap[i] currently stores the original index of the feature whose new
// index is i. but we want the inverse: swapMap[i] should store the new
// index of the feature whose original index was i. we also don't want to
// allocate another array as long as swapMap, even if it's only around
// temporarily. so we do this:
// Each pass of the outer loop walks one cycle of the permutation starting
// at i, storing the inverted entries negated to mark them as done; the
// inner for loop then flips already-inverted entries back to positive
// while skipping ahead to the next entry that still needs processing.
for (int i = 0; i < swapMap.length; ) {
int newIndex = 0, j = i;
do {
int original = swapMap[j];
swapMap[j] = -newIndex;
newIndex = j;
j = original;
} while (j != i);
swapMap[i] = newIndex;
for (i++; i < swapMap.length && swapMap[i] <= 0; ++i)
swapMap[i] = -swapMap[i];
}
return swapMap;
}
/**
 * Permanently discards any features that have been pruned via
 * {@link #prune(Lexicon.PruningPolicy)} as well as all feature counts.
 * Does nothing if no pruning has been performed.  Afterwards the lexicon is
 * once again in the unpruned state ({@link #pruneCutoff} is -1).
 **/
public void discardPrunedFeatures() {
  if (pruneCutoff != -1) {
    featureCounts = null;
    perClassFeatureCounts = null;

    // Remove from the back so that the remaining indexes stay valid.
    int last = lexiconInv.size() - 1;
    while (last >= pruneCutoff) {
      Feature dropped = lexiconInv.remove(last);
      if (lexicon != null) lexicon.remove(dropped);
      --last;
    }

    lexiconInv = new FVector(lexiconInv);
    pruneCutoff = -1;
  }
}
/** <!-- clone() -->
 * Returns a copy of this lexicon in which the forward map (when present),
 * the inverse vector, the count vectors, and the child lexicon are
 * themselves copies rather than shared references.  The forward map of the
 * copy is a <code>HashMap</code>.  If the underlying
 * <code>Object.clone()</code> call fails, the error is reported on
 * <code>STDERR</code> and the JVM exits with status 1.
 **/
public Object clone() {
  Lexicon copy = null;

  try { copy = (Lexicon) super.clone(); }
  catch (Exception e) {
    System.err.println("Error cloning Lexicon: " + e);
    e.printStackTrace();
    System.exit(1);
  }

  copy.lexiconInv = (FVector) lexiconInv.clone();

  if (lexicon != null) {
    Map duplicate = new HashMap();
    duplicate.putAll(lexicon);
    copy.lexicon = duplicate;
  }

  if (featureCounts != null) {
    copy.featureCounts = (IVector) featureCounts.clone();
  }

  if (perClassFeatureCounts != null) {
    copy.perClassFeatureCounts = (IVector2D) perClassFeatureCounts.clone();
  }

  if (lexiconChildren != null) {
    copy.lexiconChildren = (ChildLexicon) lexiconChildren.clone();
  }

  return copy;
}
/**
 * Returns whether the given object is a lexicon equal to this one.  Two
 * lexicons are equal when they are instances of exactly the same class and
 * agree on their prune cutoff, feature-to-index mapping, global counts, per
 * class counts, and child lexicon.
 *
 * <p> The forward maps of both lexicons are created first if necessary, so
 * that lexicons freshly read from a stream (whose maps have not been built
 * yet) are compared by their actual contents.
 *
 * @param o The object to compare against; may be <code>null</code>.
 * @return <code>true</code> iff <code>o</code> is an equal lexicon.
 **/
public boolean equals(Object o) {
  if (o == this) return true;
  // The equals contract requires false, not an exception, for null.
  if (o == null || !o.getClass().equals(getClass())) return false;
  Lexicon l = (Lexicon) o;

  // Without this, two lexicons whose maps are both still null would be
  // considered to have equal mappings no matter which features they hold.
  lazyMapCreation();
  l.lazyMapCreation();

  return
    pruneCutoff == l.pruneCutoff
    && lexicon.equals(l.lexicon)
    && (featureCounts == null
        ? l.featureCounts == null : featureCounts.equals(l.featureCounts))
    && (perClassFeatureCounts == null
        ? l.perClassFeatureCounts == null
        : perClassFeatureCounts.equals(l.perClassFeatureCounts))
    && (lexiconChildren == null
        ? l.lexiconChildren == null
        : lexiconChildren.equals(l.lexiconChildren));
}
/**
 * Returns a hash code for this lexicon, namely the hash code of
 * {@link #lexiconInv}.
 **/
public int hashCode() {
  return lexiconInv.hashCode();
}
/** <!-- write(ExceptionlessOutputStream) -->
 * Writes a binary representation of the lexicon.  The output consists of,
 * in order: the name of this object's class; the child lexicon (or a
 * <code>null</code> string if there is none); the number of features; the
 * prune cutoff; every feature, visited in the order induced by the
 * features' <code>compareTo</code> method, each followed by its integer
 * index; the global feature counts (or the integer 0 if there are none);
 * and the per class feature counts (or the integer 0 if there are none).
 * Each feature is written via <code>lexWrite</code>, which is handed
 * details of the previously written feature.
 *
 * @param out The output stream.
 **/
public void write(ExceptionlessOutputStream out) {
  out.writeString(getClass().getName());

  if (lexiconChildren != null) lexiconChildren.write(out);
  else out.writeString(null);

  // Sort the feature indexes by feature rather than moving the features.
  final FVector features = lexiconInv;
  final int total = features.size();
  int[] order = new int[total];
  for (int k = 0; k < total; ++k) order[k] = k;

  Sort.sort(order,
            new Sort.IntComparator() {
              public int compare(int a, int b) {
                return features.get(a).compareTo(features.get(b));
              }
            });

  out.writeInt(order.length);
  out.writeInt(pruneCutoff);

  String lastClassName = null;
  String lastPackage = null;
  String lastClassifier = null;
  String lastStringId = null;
  ByteString lastByteStringId = null;

  for (int k = 0; k < order.length; ++k) {
    int originalIndex = order[k];
    Feature f = features.get(originalIndex);

    lastClassName =
      f.lexWrite(out, this, lastClassName, lastPackage, lastClassifier,
                 lastStringId, lastByteStringId);
    lastPackage = f.getPackage();
    lastClassifier = f.getGeneratingClassifier();

    if (f.hasStringIdentifier()) {
      lastStringId = f.getStringIdentifier();
    }
    else if (f.hasByteStringIdentifier()) {
      lastByteStringId = f.getByteStringIdentifier();
    }

    out.writeInt(originalIndex);
  }

  if (featureCounts != null) featureCounts.write(out);
  else out.writeInt(0);

  if (perClassFeatureCounts != null) perClassFeatureCounts.write(out);
  else out.writeInt(0);
}
/** <!-- read(ExceptionlessInputStream) -->
 * Reads the binary representation of a lexicon, including its feature
 * counts, from the specified stream, overwriting the data in this object.
 * Equivalent to <code>read(in, true)</code>.
 *
 * @param in  The input stream.
 **/
public void read(ExceptionlessInputStream in) {
  final boolean withCounts = true;
  read(in, withCounts);
}
/** <!-- read(ExceptionlessInputStream,boolean) -->
 * Reads the binary representation of a lexicon from the specified stream,
 * overwriting the data in this object.  The child lexicon is read first,
 * followed by the number of features, the prune cutoff, and the features
 * themselves, each accompanied by the index at which it is stored.  The
 * forward map is not rebuilt here; it is left <code>null</code> until it is
 * needed (see {@link #lazyMapCreation()}).
 *
 * <p> Setting the second argument to <code>false</code> causes any feature
 * counts stored after the feature mappings to be ignored; this object then
 * has no counts.  Count vectors that turn out to be empty are likewise
 * replaced by <code>null</code>.
 *
 * @param in          The input stream.
 * @param readCounts  Whether or not to read the feature counts.
 **/
public void read(ExceptionlessInputStream in, boolean readCounts) {
  lexiconChildren = (ChildLexicon) Lexicon.readLexicon(in, readCounts);

  final int total = in.readInt();
  pruneCutoff = in.readInt();
  lexicon = null;
  lexiconInv = new FVector(total);

  Class lastClass = null;
  String lastPackage = null;
  String lastClassifier = null;
  String lastStringId = null;
  ByteString lastByteStringId = null;

  for (int remaining = total; remaining > 0; --remaining) {
    Feature f =
      Feature.lexReadFeature(in, this, lastClass, lastPackage,
                             lastClassifier, lastStringId, lastByteStringId);
    int index = in.readInt();
    lexiconInv.set(index, f);

    lastClass = f.getClass();
    lastPackage = f.getPackage();
    lastClassifier = f.getGeneratingClassifier();

    if (f.hasStringIdentifier()) {
      lastStringId = f.getStringIdentifier();
    }
    else if (f.hasByteStringIdentifier()) {
      lastByteStringId = f.getByteStringIdentifier();
    }
  }

  if (!readCounts) {
    featureCounts = null;
    perClassFeatureCounts = null;
  }
  else {
    featureCounts = new IVector();
    featureCounts.read(in);
    if (featureCounts.size() == 0) featureCounts = null;

    perClassFeatureCounts = new IVector2D();
    perClassFeatureCounts.read(in);
    if (perClassFeatureCounts.size() == 0) perClassFeatureCounts = null;
  }

  if (lexiconChildren != null) lexiconChildren.setParent(this);
}
/**
 * Ensures that {@link #lexicon} is populated.  Other methods of this class
 * call this before operating on the map.  If the map is missing, which is
 * the case after this object has been read from a stream, it is rebuilt by
 * associating every feature in {@link #lexiconInv} with its position.
 **/
protected void lazyMapCreation() {
  if (lexicon != null) return;

  lexicon = new HashMap();
  for (int i = 0, n = lexiconInv.size(); i < n; ++i) {
    lexicon.put(lexiconInv.get(i), new Integer(i));
  }
}
/** <!-- readPrunedSize(ExceptionlessInputStream) -->
 * Reads two integers from the specified stream and returns the second,
 * which is the value of {@link #pruneCutoff} when the stream is positioned
 * at the feature count written by
 * {@link #write(ExceptionlessOutputStream)}.  The first integer is
 * discarded.
 *
 * @param in  The input stream.
 * @return The second integer read from the stream.
 **/
public static int readPrunedSize(ExceptionlessInputStream in) {
  in.readInt();  // skipped: precedes the cutoff in the written format
  int cutoff = in.readInt();
  return cutoff;
}
/**
 * Returns a text representation of this lexicon (for debugging): a comma
 * separated list of <code>index: feature</code> pairs in index order, or
 * the empty string if the lexicon is empty.
 **/
public String toString() {
  StringBuffer text = new StringBuffer();

  for (int i = 0; i < lexiconInv.size(); ++i) {
    if (i > 0) text.append(", ");
    text.append(i);
    text.append(": ");
    text.append(lexiconInv.get(i).toString());
  }

  return text.toString();
}
/** <!-- printCountTable(boolean) -->
 * Produces on <code>STDOUT</code> a table with one row per feature.  The
 * first column holds the feature's index.  Under global counting a second
 * column holds the feature's count; under per class counting there is one
 * additional column per label instead; without counts the index is the only
 * column.  Row 0 and the row at {@link #pruneCutoff} are passed to the table
 * formatter as dash rows.  It's probably not a good idea to call this
 * method unless you know your lexicon is small.
 *
 * @param p Whether or not to include package names in the output.
 **/
public void printCountTable(boolean p) {
  final int rows = lexiconInv.size();

  // Row labels are the same no matter which counts are available.
  String[] rowLabels = new String[rows];
  for (int r = 0; r < rows; ++r) {
    Feature f = lexiconInv.get(r);
    rowLabels[r] = p ? f.toString() : f.toStringNoPackage();
  }

  String[] columnLabels;
  double[][] data;

  if (featureCounts != null) {
    columnLabels = new String[]{ "Index", "Count" };
    data = new double[rows][2];
    for (int r = 0; r < rows; ++r) {
      data[r][0] = r;
      data[r][1] = featureCounts.get(r);
    }
  }
  else if (perClassFeatureCounts != null) {
    int labels = perClassFeatureCounts.size();
    columnLabels = new String[labels + 1];
    columnLabels[0] = "Index";
    for (int c = 0; c < labels; ++c) columnLabels[c + 1] = "Label " + c;

    data = new double[rows][labels + 1];
    for (int r = 0; r < rows; ++r) {
      data[r][0] = r;
      for (int c = 0; c < labels; ++c)
        data[r][c + 1] = perClassFeatureCounts.get(c, r);
    }
  }
  else {
    columnLabels = new String[]{ "Index" };
    data = new double[rows][1];
    for (int r = 0; r < rows; ++r) data[r][0] = r;
  }

  int[] sigDigits = new int[columnLabels.length];
  int[] dashRows = { 0, pruneCutoff };

  TableFormat.printTableFormat(System.out, columnLabels, rowLabels, data,
                               sigDigits, dashRows);
}
// main(String[])
/**
 * Command line entry point: reads the lexicon stored in the named file and
 * prints its count table, followed by the count table of its child lexicon
 * if it has one.
 *
 * <p> Usage: <code>java LBJ2.learn.Lexicon &lt;lex file&gt;
 * [&lt;package names = true&gt;]</code>.  If the number of arguments is
 * wrong, the usage message is printed and the JVM exits with status 1.  The
 * same exit status is used if no lexicon could be read from the file.
 *
 * @param args  The command line arguments.
 **/
public static void main(String[] args) {
  // Validate the argument count explicitly instead of relying on an
  // ArrayIndexOutOfBoundsException.
  if (args.length < 1 || args.length > 2) {
    System.out.println(
      "usage: java LBJ2.learn.Lexicon <lex file> [<package names = true>]");
    System.exit(1);
    return;
  }

  String filename = args[0];
  boolean p = args.length == 2 ? Boolean.parseBoolean(args[1]) : true;

  Lexicon lexicon = readLexicon(filename);

  // readLexicon returns null when the stream holds no class name.
  if (lexicon == null) {
    System.err.println(
      "Error: no lexicon could be read from '" + filename + "'.");
    System.exit(1);
    return;
  }

  lexicon.printCountTable(p);

  if (lexicon.lexiconChildren != null) {
    System.out.println("\nChildren:");
    lexicon.lexiconChildren.printCountTable(p);
  }
}
/** <!-- class CountPolicy -->
 * Immutable type representing the feature counting policy of a lexicon.
 * Exactly three instances exist, exposed as the constants below; the
 * constructor is private.  When LBJ's self imposed restriction to use Java
 * 1.4 is lifted, this class will be replaced by an <code>enum</code>.
 *
 * <p> The three feature counting policies are described below.
 *
 * <blockquote>
 *   <dl>
 *     <dt> <b>None</b> </dt>
 *     <dd> Feature occurrences are not counted. </dd>
 *     <dt> <b>Global</b> </dt>
 *     <dd>
 *       The lexicon stores one integer count per feature, and every
 *       occurrence of the feature adds to this count regardless of the
 *       example it appears in.
 *     </dd>
 *     <dt> <b>Per Class</b> </dt>
 *     <dd>
 *       The lexicon stores one integer count for each (feature, prediction
 *       class) pair.  When a given feature appears in an example, this
 *       occurrence adds to the count associated with the example's label,
 *       assuming that examples have a single discrete label.
 *     </dd>
 *   </dl>
 * </blockquote>
 *
 * @author Nick Rizzolo
 **/
public static class CountPolicy
{
  /** Printable names of the policies, indexed by {@link #index}. */
  private static final String[] names = { "none", "global", "per class" };

  /** Represents no counting. */
  public static final CountPolicy none = new CountPolicy(0);
  /** Represents global counting. */
  public static final CountPolicy global = new CountPolicy(1);
  /** Represents per class counting. */
  public static final CountPolicy perClass = new CountPolicy(2);

  /** Position of this policy's name in the {@link #names} array. */
  private int index;

  /**
   * Initializes the object with an index.
   *
   * @param i The position of this policy's name in {@link #names}.
   **/
  private CountPolicy(int i) {
    index = i;
  }

  /** Retrieves the name of the policy represented by this object. */
  public String toString() {
    String name = names[index];
    return name;
  }
}
/** <!-- class PruningPolicy -->
 * Represents the feature pruning policy of a lexicon.  Objects of this
 * type are used to identify and describe a desired pruning policy.  In
 * particular, the description of a pruning policy includes feature count
 * thresholds which sometimes need to be computed in terms of data.  Space
 * is allocated within objects of this type for storing these thresholds
 * whenever they are computed.
 *
 * <p> The three pruning policies are described below.
 *
 * <blockquote>
 *   <dl>
 *     <dt> <b>None</b> </dt>
 *     <dd> No pruning is performed. </dd>
 *     <dt> <b>Absolute</b> </dt>
 *     <dd>
 *       Features whose counts within a given dataset fall below an absolute
 *       threshold are pruned from that dataset.
 *     </dd>
 *     <dt> <b>Percentage</b> </dt>
 *     <dd>
 *       Features whose counts within a given dataset are lower than a given
 *       percentage of the most common feature's count are pruned from that
 *       dataset.
 *     </dd>
 *   </dl>
 * </blockquote>
 *
 * @author Nick Rizzolo
 **/
public static class PruningPolicy
{
  /** Represents no pruning. */
  public static final int NONE = 0;
  /** Represents pruning with an absolute threshold. */
  public static final int ABSOLUTE = 1;
  /** Represents pruning with a percentage threshold. */
  public static final int PERCENTAGE = 2;
  /** The names of the different pruning policies as strings. */
  private static final String[] names =
    { "none", "absolute", "percentage" };

  /**
   * One of {@link #NONE}, {@link #ABSOLUTE}, or {@link #PERCENTAGE}; can be
   * used to index the {@link #names} array.
   **/
  private int index;
  /**
   * The percentage associated with the "Percentage" policy described
   * above.
   **/
  private double percentage;
  /**
   * Feature count thresholds which may either be specified by the policy
   * explicitly or computed in terms of data.  <code>null</code> until they
   * are established.
   **/
  private int[] thresholds;

  /** Creates a new pruning policy in which no features will be pruned. */
  public PruningPolicy() { index = NONE; }

  /**
   * Creates a new "Percentage" policy with the given percentage.  Its
   * thresholds remain unset until {@link #setThresholds(int[])} is called.
   *
   * @param p The percentage.
   **/
  public PruningPolicy(double p) {
    index = PERCENTAGE;
    percentage = p;
  }

  /**
   * Creates a new "Absolute" policy with the given threshold.
   *
   * @param t The threshold.
   **/
  public PruningPolicy(int t) {
    index = ABSOLUTE;
    thresholds = new int[]{ t };
  }

  /** <code>true</code> iff the policy is no pruning. */
  public boolean isNone() { return index == NONE; }
  /** <code>true</code> iff the policy is absolute thresholding. */
  public boolean isAbsolute() { return index == ABSOLUTE; }
  /** <code>true</code> iff the policy is percentage thresholding. */
  public boolean isPercentage() { return index == PERCENTAGE; }

  /**
   * Use this method to establish feature count thresholds in the
   * "Percentage" policy.  The given array is copied.
   *
   * @param t The new feature count thresholds.
   * @throws UnsupportedOperationException If the policy isn't
   *         "Percentage".
   **/
  public void setThresholds(int[] t) {
    if (index != PERCENTAGE)
      throw new UnsupportedOperationException(
          "LBJ ERROR: Lexicon.PruningPolicy.setThresholds should not be "
          + "called unless the policy is 'Percentage'.");
    thresholds = (int[]) t.clone();
  }

  /**
   * Returns the value of the <code>i</code><sup>th</sup> threshold in
   * {@link #thresholds} when in "Percentage" mode, but ignores the
   * parameter <code>i</code> and returns the first element of
   * {@link #thresholds} when in "Absolute" mode.
   *
   * @param i An index.
   * @return A feature count threshold.
   * @throws UnsupportedOperationException If the policy is "None".
   * @throws IllegalStateException If the policy is "Percentage" and its
   *         thresholds have not been established yet.
   **/
  public int getThreshold(int i) {
    if (index == NONE)
      throw new UnsupportedOperationException(
          "LBJ ERROR: Lexicon.PruningPolicy.getThreshold should never be "
          + "called if the pruning policy is 'None'.");
    // Only a "Percentage" policy can get here without thresholds; report
    // that clearly instead of failing with a bare NullPointerException.
    if (thresholds == null)
      throw new IllegalStateException(
          "LBJ ERROR: Lexicon.PruningPolicy.getThreshold: the thresholds "
          + "of a 'Percentage' policy must be set before they are used.");
    if (index == ABSOLUTE) return thresholds[0];
    return thresholds[i];
  }

  /**
   * Returns the value of {@link #percentage}.
   *
   * @throws UnsupportedOperationException If the policy isn't
   *         "Percentage".
   **/
  public double getPercentage() {
    if (index != PERCENTAGE)
      throw new UnsupportedOperationException(
          "LBJ ERROR: PruningPolicy: Can't get percentage when pruning "
          + "policy isn't 'Percentage'.");
    return percentage;
  }

  /**
   * Retrieves the name of the policy represented by this object, followed
   * by the percentage in parentheses for a "Percentage" policy and, when
   * thresholds have been established, by the bracketed list of thresholds.
   **/
  public String toString() {
    StringBuffer result = new StringBuffer(names[index]);
    if (index == PERCENTAGE)
      result.append("(").append(percentage).append(")");

    if (index != NONE && thresholds != null) {
      result.append(": [");
      for (int i = 0; i < thresholds.length; ++i) {
        if (i > 0) result.append(", ");
        result.append(thresholds[i]);
      }
      result.append("]");
    }

    return result.toString();
  }
}
}