package LBJ2.learn; import java.io.Serializable; import java.net.URL; import java.util.Collections; import java.util.HashMap; import java.util.Map; import LBJ2.classify.Feature; import LBJ2.util.ByteString; import LBJ2.util.ClassUtils; import LBJ2.util.ExceptionlessInputStream; import LBJ2.util.ExceptionlessOutputStream; import LBJ2.util.FVector; import LBJ2.util.IVector; import LBJ2.util.IVector2D; import LBJ2.util.Sort; import LBJ2.util.TableFormat; /** * A <code>Lexicon</code> contains a mapping from {@link Feature}s to * integers. The integer key of a feature is returned by the * {@link #lookup(Feature)} method. If the feature is not already in the * lexicon, then it will be added to the lexicon, and thus lookup calls can * be made without the need to check if an entry already exists. The integer * keys are incremented in ascending order starting from 0 as features are * added to the lexicon. * * <p> The map is implemented as a <code>HashMap</code> by default and the * <code>Lexicon</code> class has similar functionality. This class also * maintains a second <code>Vector</code> of integers to their associated * features for fast reverse lookup using the {@link #lookupKey(int)} method. * * @author Michael Paul **/ public class Lexicon implements Cloneable, Serializable { /** * The default capacity of {@link #lexiconInv} and {@link #featureCounts}. **/ private static final int defaultCapacity = 1 << 10; /** <!-- readLexicon(String) --> * Reads and returns a feature lexicon from the specified file. * * @param filename The name of the file from which to read the feature * lexicon. * @return The lexicon. **/ public static Lexicon readLexicon(String filename) { try { return readLexicon(new URL("file:" + filename)); } catch (Exception e) { System.err.println("Error constructing URL:"); e.printStackTrace(); System.exit(1); } return null; } /** <!-- readLexicon(URL) --> * Reads a feature lexicon from the specified location. * * @param url The location from which to read the feature lexicon. * @return The lexicon. **/ public static Lexicon readLexicon(URL url) { return readLexicon(url, true); } /** <!-- readLexicon(URL,boolean) --> * Reads a feature lexicon from the specified location, with the option to * ignore the feature counts by setting the second argument to * <code>false</code>. * * @param url The location from which to read the feature lexicon. * @param readCounts Whether or not to read the feature counts. * @return The lexicon. **/ public static Lexicon readLexicon(URL url, boolean readCounts) { ExceptionlessInputStream in = ExceptionlessInputStream.openCompressedStream(url); Lexicon result = readLexicon(in, readCounts); in.close(); return result; } /** <!-- readLexicon(ExceptionlessInputStream,boolean) --> * Reads a feature lexicon from the specified stream. * * @param in The stream from which to read the feature lexicon. * @return The lexicon. **/ public static Lexicon readLexicon(ExceptionlessInputStream in) { return readLexicon(in, true); } /** <!-- readLexicon(ExceptionlessInputStream,boolean) --> * Reads a feature lexicon from the specified stream, with the option to * ignore the feature counts by setting the second argument to * <code>false</code>. * * @param in The stream from which to read the feature lexicon. * @param readCounts Whether or not to read the feature counts. * @return The lexicon. **/ public static Lexicon readLexicon(ExceptionlessInputStream in, boolean readCounts) { String name = in.readString(); if (name == null) return null; Class clazz = ClassUtils.getClass(name); Lexicon lexicon = null; try { lexicon = (Lexicon) clazz.newInstance(); } catch (Exception e) { System.err.println("Can't instantiate '" + name + "': " + e); System.exit(1); } lexicon.read(in, readCounts); return lexicon; } // Member variables. /** The map of features to integer keys. */ protected Map lexicon; /** The inverted map of integer keys to their features. */ protected FVector lexiconInv; /** The encoding to use for new features added to this lexicon. */ private String encoding; /** * This flag remembers whether {@link #encoding} has been assigned a value * yet or not. Using this flag, we enforce the constraint that once an * encoding has been set, it can never be changed. This way, a user will * only be capable of using the same lexicon object in two different * learners if they have the same encoding. See the implementation of * {@link Learner#setLexicon(Lexicon)}. **/ private boolean encodingSet; /** Counts the number of occurrences of each feature. */ protected IVector featureCounts; /** * Counts the number of occurrences of each feature on a class-by-class * basis. **/ protected IVector2D perClassFeatureCounts; /** * Features at this index in {@link #lexiconInv} or higher have been * pruned. <code>-1</code> indicates that no pruning has been done. **/ protected int pruneCutoff; /** * Stores features that might appear repeatedly as children of other * features, but which are not themselves given indexes in the lexicon. **/ protected ChildLexicon lexiconChildren; /** Creates an empty lexicon. */ public Lexicon() { clear(); } /** * Creates an empty lexicon with the given encoding. * * @param e The encoding to use when adding features to this lexicon. **/ public Lexicon(String e) { encoding = e; encodingSet = true; clear(); } /** Clears the data structures associated with this instance. */ public void clear() { lexicon = new HashMap(); lexiconInv = new FVector(); lexiconChildren = null; pruneCutoff = -1; } /** * Sets the encoding used when adding features to this lexicon. * * @param e The encoding. **/ public void setEncoding(String e) { if (encodingSet && (encoding == null ? e != null : !encoding.equals(e))) { System.err.println( "LBJ ERROR: Once established, the encoding of a lexicon cannot be " + "changed."); new Exception().printStackTrace(); System.exit(1); } encoding = e; encodingSet = true; } /** Simply returns the map stored in {@link #lexicon}. */ public Map getMap() { lazyMapCreation(); return Collections.unmodifiableMap(lexicon); } /** Returns the number of features currently stored in {@link #lexicon}. */ public int size() { return lexiconInv.size(); } /** * Returns the value of {@link #pruneCutoff}, or {@link #size()} if * {@link #pruneCutoff} is -1. **/ public int getCutoff() { return pruneCutoff == -1 ? size() : pruneCutoff; } /** <!-- countFeatures(CountPolicy) --> * Call this method to initialize the lexicon to count feature occurrences * on each call to <code>lookup(feature, true)</code> (counting still won't * happen on a call to <code>lookup(feature, false)</code>). * Alternatively, this method can also cause the lexicon to discard all its * feature counts and cease counting features at any time in the future. * The former happens when <code>policy</code> is something other than * {@link Lexicon.CountPolicy#none}, and the latter happens when * <code>policy</code> is {@link Lexicon.CountPolicy#none}. * * @see #lookup(Feature,boolean) * @param policy The new feature counting policy. **/ public void countFeatures(CountPolicy policy) { featureCounts = null; perClassFeatureCounts = null; if (policy == CountPolicy.global) featureCounts = new IVector(defaultCapacity); else if (policy == CountPolicy.perClass) perClassFeatureCounts = new IVector2D(8, defaultCapacity); } /** <!-- getCountPolicy() --> * Returns the feature counting policy currently employed by this lexicon. **/ public CountPolicy getCountPolicy() { if (featureCounts != null) return CountPolicy.global; if (perClassFeatureCounts != null) return CountPolicy.perClass; return CountPolicy.none; } /** <!-- perClassToGlobalCounts() --> * Collapses per-class feature counts into global counts. **/ public void perClassToGlobalCounts() { if (perClassFeatureCounts == null) throw new IllegalArgumentException( "LBJ ERROR: Lexicon.perClassToGlobalCounts: Cannot be called if " + "there are not per-class counts."); int rows = perClassFeatureCounts.size(), columns = 0; for (int i = 0; i < rows; ++i) columns = Math.max(columns, perClassFeatureCounts.size(i)); featureCounts = new IVector(defaultCapacity); for (int j = 0; j < columns; ++j) { int count = 0; for (int i = 0; i < rows; ++i) count += perClassFeatureCounts.get(i, j); featureCounts.set(j, count); } perClassFeatureCounts = null; } /** <!-- contains(Feature) --> * Returns <code>true</code> if the given feature is already in the * lexicon (whether it's past the {@link #pruneCutoff} or not) and * <code>false</code> otherwise. This does not alter or add anything to * the lexicon. * * @param f The feature to look up. * @return A boolean indicating if the given feature is currently in the * lexicon. **/ public boolean contains(Feature f) { lazyMapCreation(); return lexicon.containsKey(f); } /** <!-- lookup(Feature) --> * Looks up a feature's index by calling <code>lookup(f, false)</code>. * See {@link #lookup(Feature,boolean,int)} for more details. * * @param f The feature to look up. * @return The integer key that the feature maps to. **/ public int lookup(Feature f) { return lookup(f, false, -1); } /** <!-- lookup(Feature,boolean) --> * Looks up a feature's index by calling <code>lookup(f, training, * -1)</code>. See {@link #lookup(Feature,boolean,int)} for more details. * * @param f The feature to look up. * @param training Whether or not the learner is currently training. * @return The integer key that the feature maps to. **/ public int lookup(Feature f, boolean training) { return lookup(f, training, -1); } /** <!-- lookup(Feature,boolean,int) --> * Looks up the given feature in the lexicon, possibly counting it and/or * expanding the lexicon to accomodate it. Feature counting and automatic * lexicon expansion happen when <code>training</code> is * <code>true</code>. Otherwise, <code>f</code> is not counted even if * already in the lexicon, and a previously unobserved feature will cause * this method to return the value of {@link #getCutoff()} without * expanding the lexicon to accomodate the new feature. * * @param f The feature to look up. * @param training Whether or not the learner is currently training. * @param label The label of the example containing this feature, or -1 * if we aren't doing per class feature counting. * @return The integer key that the feature maps to. **/ public int lookup(Feature f, boolean training, int label) { if (label < 0) { if (training && perClassFeatureCounts != null) throw new IllegalArgumentException( "LBJ ERROR: Lexicon.lookup: Must supply a label when training " + "with per class feature counts."); } else if (!training || perClassFeatureCounts == null) throw new IllegalArgumentException( "LBJ ERROR: Lexicon.lookup: A label has been supplied when not " + "training with per class feature counts."); lazyMapCreation(); Integer I = (Integer) lexicon.get(f); if (I == null) { if (!training) return getCutoff(); f = f.encode(encoding); if (lexiconChildren != null) { Feature c = lexiconChildren.remove(f); if (c != null) f = c; } int key = lexiconInv.size(); lexicon.put(f, new Integer(key)); lexiconInv.add(f); incrementCount(key, label); return key; } int index = I.intValue(); if (training) incrementCount(index, label); return index; } /** * Used to lookup the children of conjunctive and referring features during * training, this method checks {@link #lexiconChildren} if the feature * isn't present in {@link #lexicon} and {@link #lexiconInv}, and then * stores the given feature in {@link #lexiconChildren} if it wasn't * present anywhere. * * @param f The feature to look up. * @param label The label of the example containing this feature, or -1 if * we aren't doing per class feature counting. * @return A feature equivalent to <code>f</code> that is stored in this * lexicon. **/ public Feature getChildFeature(Feature f, int label) { lazyMapCreation(); Integer I = (Integer) lexicon.get(f); if (I != null) { int index = I.intValue(); incrementCount(index, label); return lexiconInv.get(index); } if (lexiconChildren == null) lexiconChildren = new ChildLexicon(this); return lexiconChildren.getChildFeature(f, -1); } /** * Increments the count of the feature with the given index(es). * * @param index The index of the feature. * @param label The label of the example containing this feature, which is * ignored if we aren't doing per class feature counting. **/ protected void incrementCount(int index, int label) { if (featureCounts != null) featureCounts.increment(index); else if (perClassFeatureCounts != null) perClassFeatureCounts.increment(label, index); } /** * Used to lookup the children of conjunctive and referring features while * writing the lexicon, this method checks {@link #lexiconChildren} if the * feature isn't present in {@link #lexicon} and {@link #lexiconInv}, and * will throw an exception if it still can't be found. * * @param f The feature to look up. * @return If the feature was found in {@link #lexicon}, its associated * integer index is returned. Otherwise, <code>-i - 1</code> is * returned, where <code>i</code> is the index associated with the * feature in {@link #lexiconChildren}. * @throws UnsupportedOperationException If the feature isn't found * anywhere in the lexicon. **/ public int lookupChild(Feature f) { lazyMapCreation(); Integer I = (Integer) lexicon.get(f); if (I != null) return I.intValue(); if (lexiconChildren == null) throw new UnsupportedOperationException( "When calling Lexicon.lookupChild(Feature), the feature must be " + "present in the lexicon."); return -lexiconChildren.lookupChild(f) - 1; } /** <!-- lookupKey(int) --> * Does a reverse lexicon lookup and returns the {@link Feature} associated * with the given integer key, and <code>null</code> if no such feature * exists. * * @param i The integer key to look up. If <code>i</code> is negative, * {@link #lexiconChildren} is queried instead of * {@link #lexiconInv}. * @return The feature that maps to the given integer. **/ public Feature lookupKey(int i) { if (i < 0) return lexiconChildren.lookupKey(-i - 1); return lexiconInv.get(i); } /** <!-- isPruned(int,PruningPolicy) --> * Determines if the given feature index should be pruned according to the * given pruning policy, which must have its thresholds set already in the * case that it represents the "Percentage" policy. This method behaves * equivalently to <code>isPruned(i, -1, p)</code>. * * @see #isPruned(int,int,Lexicon.PruningPolicy) * @param i The feature index. * @param policy The pruning policy. * @return <code>true</code> iff the feature should be pruned. **/ public boolean isPruned(int i, PruningPolicy policy) { return isPruned(i, -1, policy); } /** <!-- isPruned(int,int,PruningPolicy) --> * Determines if the given feature index should be pruned according to the * given pruning policy, which must have its thresholds set already in the * case that it represents the "Percentage" policy. The second argument to * this method represents the label of the example in which the specified * feature appeared. It is ignored unless per class feature counts are * present. If they are, then when the specified label is -1, all counts * for the given feature must be greater than or equal to the corresponding * threshold for this method to return <code>true</code>. When per class * feature counts are present and the label is non-negative, only the count * corresponding to that label must be greater than or equal to its * corresonding threshold. * * <p> In other words, passing -1 in the second argument gives the behavior * expected when pruning the lexicon as in * {@link #prune(Lexicon.PruningPolicy)}. Passing a non-negative label in * the second argument gives the behavior expected when pruning the actual * examples. * * @param i The feature index. * @param label The label of the example containing this feature, or -1 if * we want the lexicon pruning behavior. * @param policy The pruning policy. * @return <code>true</code> iff the feature should be pruned. **/ public boolean isPruned(int i, int label, PruningPolicy policy) { if (policy.isNone()) return false; if (featureCounts == null && perClassFeatureCounts == null) throw new IllegalArgumentException( "LBJ ERROR: Lexicon.isPruned: pruning policy wasn't 'None', but " + "there are no counts."); if (featureCounts != null) // if global counting return featureCounts.get(i) < policy.getThreshold(0); // otherwise, per class counting if (label >= 0) return perClassFeatureCounts.get(label, i) < policy.getThreshold(label); for (int j = 0; j < perClassFeatureCounts.size(); ++j) if (perClassFeatureCounts.get(j, i) >= policy.getThreshold(j)) return false; return true; } /** <!-- prune(PruningPolicy) --> * Rearranges the order in which features appear in the lexicon based on * the compiled feature counts in {@link #featureCounts} or * {@link #perClassFeatureCounts} so that pruned features are at the end of * the feature space. This way, learning algorithms can allocate exactly * enough space in their weight vectors for the unpruned features. * * <p> This method returns an array of integers which is a permutation of * the integers from 0 (inclusive) to the number of features in the lexicon * (exclusive). It represents a map from the features' original indexes to * their new ones after pruning. The {@link #getCutoff()} method then * returns the new index of the first pruned feature (or, equivalently, the * number of unpruned features). All features with a new index greater * than or equal to this index are considered pruned in the case of global * pruning. In the case of per-class pruning, the cutoff represents the * first feature whose count fell below the threshold for <i>every</i> * class. Thus, in this case, features below the cutoff may still be * pruned in any given class; just not all of them. * * @param policy The type of pruning to perform. * @return A map from features' original indexes to their new ones, or * <code>null</code> if <code>policy</code> indicates no pruning. **/ public int[] prune(PruningPolicy policy) { if (policy.isNone()) { pruneCutoff = -1; return null; } if (featureCounts == null && perClassFeatureCounts == null) throw new UnsupportedOperationException( "LBJ ERROR: Lexicon.prune: Can't prune if there's no feature " + "counts."); // Set thresholds in the policy. if (policy.isPercentage()) { if (featureCounts != null) { // if global counting long t = Math.round(Math.ceil(featureCounts.max() * policy.getPercentage())); policy.setThresholds(new int[]{ (int) t }); } else { // if per class counting int[] thresholds = new int[perClassFeatureCounts.size()]; int size = perClassFeatureCounts.size(); double p = policy.getPercentage(); for (int i = 0; i < size; ++i) thresholds[i] = (int) Math.round(Math.ceil(perClassFeatureCounts.max(i) * p)); policy.setThresholds(thresholds); } } // there's no clause for policy.isAbsolute() here since the appropriate // threshold must already be established in that case. else if (!policy.isAbsolute()) throw new UnsupportedOperationException( "LBJ ERROR: Lexicon.prune: Pruning policy '" + policy + "' is not supported."); // Swap features around, remembering how it was done in swapMap. pruneCutoff = size(); int[] swapMap = new int[pruneCutoff]; // If features at the end of the space are pruned, there's no need to swap // anything; just decrement pruneCutoff. while (pruneCutoff > 0 && isPruned(pruneCutoff - 1, policy)) { --pruneCutoff; swapMap[pruneCutoff] = pruneCutoff; } // Now we know the feature just below the prune cutoff does not need to be // pruned (otherwise it would have been handled by the loop above), so we // start the loop at pruneCutoff - 2 and do swaps for any feature that // needs to be pruned. if (pruneCutoff > 0) swapMap[pruneCutoff - 1] = pruneCutoff - 1; for (int i = pruneCutoff - 2; i >= 0; --i) { if (isPruned(i, policy)) { pruneCutoff--; Feature pruned = lexiconInv.get(i); Feature f = lexiconInv.get(pruneCutoff); if (lexicon != null) lexicon.put(pruned, lexicon.put(f, new Integer(i))); lexiconInv.set(i, f); lexiconInv.set(pruneCutoff, pruned); if (featureCounts != null) featureCounts.set(i, featureCounts.set(pruneCutoff, featureCounts.get(i))); else { for (int j = 0; j < perClassFeatureCounts.size(); ++j) perClassFeatureCounts.set( j, i, perClassFeatureCounts.set(j, pruneCutoff, perClassFeatureCounts.get(j, i))); } swapMap[i] = swapMap[pruneCutoff]; swapMap[pruneCutoff] = i; } else swapMap[i] = i; } // Invert swapMap. // swapMap[i] currently stores the original index of the feature whose new // index is i. but we want the inverse: swapMap[i] should store the new // index of the feature whose original index was i. we also don't want to // allocate another array as long as swapMap, even if it's only around // temporarily. so we do this: for (int i = 0; i < swapMap.length; ) { int newIndex = 0, j = i; do { int original = swapMap[j]; swapMap[j] = -newIndex; newIndex = j; j = original; } while (j != i); swapMap[i] = newIndex; for (i++; i < swapMap.length && swapMap[i] <= 0; ++i) swapMap[i] = -swapMap[i]; } return swapMap; } /** * Permanently discards any features that have been pruned via * {@link #prune(Lexicon.PruningPolicy)} as well as all feature counts. **/ public void discardPrunedFeatures() { if (pruneCutoff == -1) return; featureCounts = null; perClassFeatureCounts = null; for (int i = lexiconInv.size() - 1; i >= pruneCutoff; --i) { Feature f = lexiconInv.remove(i); if (lexicon != null) lexicon.remove(f); } lexiconInv = new FVector(lexiconInv); pruneCutoff = -1; } /** <!-- clone() --> * Returns a deep clone of this lexicon implemented as a * <code>HashMap</code>. **/ public Object clone() { Lexicon clone = null; try { clone = (Lexicon) super.clone(); } catch (Exception e) { System.err.println("Error cloning Lexicon: " + e); e.printStackTrace(); System.exit(1); } if (lexicon != null) { clone.lexicon = new HashMap(); clone.lexicon.putAll(lexicon); } clone.lexiconInv = (FVector) lexiconInv.clone(); if (featureCounts != null) clone.featureCounts = (IVector) featureCounts.clone(); if (perClassFeatureCounts != null) clone.perClassFeatureCounts = (IVector2D) perClassFeatureCounts.clone(); if (lexiconChildren != null) clone.lexiconChildren = (ChildLexicon) lexiconChildren.clone(); return clone; } /** Returns whether the given Lexicon object is equal to this one. */ public boolean equals(Object o) { if (!o.getClass().equals(getClass())) return false; Lexicon l = (Lexicon) o; return pruneCutoff == l.pruneCutoff && (lexicon == null ? l.lexicon == null : lexicon.equals(l.lexicon)) && (featureCounts == null ? l.featureCounts == null : featureCounts.equals(l.featureCounts)) && (perClassFeatureCounts == null ? l.perClassFeatureCounts == null : perClassFeatureCounts.equals(l.perClassFeatureCounts)) && (lexiconChildren == null ? l.lexiconChildren == null : lexiconChildren.equals(l.lexiconChildren)); } /** Returns a hash code for this lexicon. */ public int hashCode() { return lexiconInv.hashCode(); } /** <!-- write(ExceptionlessOutputStream) --> * Writes a binary representation of the lexicon. * * @param out The output stream. **/ public void write(ExceptionlessOutputStream out) { out.writeString(getClass().getName()); if (lexiconChildren == null) out.writeString(null); else lexiconChildren.write(out); final FVector inverse = lexiconInv; int[] indexes = new int[inverse.size()]; for (int i = 0; i < indexes.length; ++i) indexes[i] = i; Sort.sort(indexes, new Sort.IntComparator() { public int compare(int i1, int i2) { return inverse.get(i1).compareTo(inverse.get(i2)); } }); String previousClassName = null; String previousPackage = null; String previousClassifier = null; String previousSIdentifier = null; ByteString previousBSIdentifier = null; out.writeInt(indexes.length); out.writeInt(pruneCutoff); for (int i = 0; i < indexes.length; ++i) { Feature f = inverse.get(indexes[i]); previousClassName = f.lexWrite(out, this, previousClassName, previousPackage, previousClassifier, previousSIdentifier, previousBSIdentifier); previousPackage = f.getPackage(); previousClassifier = f.getGeneratingClassifier(); if (f.hasStringIdentifier()) previousSIdentifier = f.getStringIdentifier(); else if (f.hasByteStringIdentifier()) previousBSIdentifier = f.getByteStringIdentifier(); out.writeInt(indexes[i]); } if (featureCounts == null) out.writeInt(0); else featureCounts.write(out); if (perClassFeatureCounts == null) out.writeInt(0); else perClassFeatureCounts.write(out); } /** <!-- read(ExceptionlessInputStream) --> * Reads the binary representation of a lexicon from the specified stream, * overwriting the data in this object. * * @param in The input stream. **/ public void read(ExceptionlessInputStream in) { read(in, true); } /** <!-- read(ExceptionlessInputStream,boolean) --> * Reads the binary representation of a lexicon from the specified stream, * overwriting the data in this object. This method also gives the option * to ignore any feature counts stored after the feature mappings by * setting the second argument to <code>false</code>. * * @param in The input stream. * @param readCounts Whether or not to read the feature counts. **/ public void read(ExceptionlessInputStream in, boolean readCounts) { lexiconChildren = (ChildLexicon) Lexicon.readLexicon(in, readCounts); Class previousClass = null; String previousPackage = null; String previousClassifier = null; String previousSIdentifier = null; ByteString previousBSIdentifier = null; int N = in.readInt(); pruneCutoff = in.readInt(); lexicon = null; lexiconInv = new FVector(N); for (int i = 0; i < N; ++i) { Feature f = Feature.lexReadFeature(in, this, previousClass, previousPackage, previousClassifier, previousSIdentifier, previousBSIdentifier); int index = in.readInt(); lexiconInv.set(index, f); previousClass = f.getClass(); previousPackage = f.getPackage(); previousClassifier = f.getGeneratingClassifier(); if (f.hasStringIdentifier()) previousSIdentifier = f.getStringIdentifier(); else if (f.hasByteStringIdentifier()) previousBSIdentifier = f.getByteStringIdentifier(); } if (readCounts) { featureCounts = new IVector(); featureCounts.read(in); if (featureCounts.size() == 0) featureCounts = null; perClassFeatureCounts = new IVector2D(); perClassFeatureCounts.read(in); if (perClassFeatureCounts.size() == 0) perClassFeatureCounts = null; } else { featureCounts = null; perClassFeatureCounts = null; } if (lexiconChildren != null) lexiconChildren.setParent(this); } /** * Various other methods in this class call this method to ensure that * {@link #lexicon} is populated before performing operations on it. The * only reason it wouldn't be is if it had just been read off disk. **/ protected void lazyMapCreation() { if (lexicon == null) { lexicon = new HashMap(); int N = lexiconInv.size(); for (int i = 0; i < N; ++i) lexicon.put(lexiconInv.get(i), new Integer(i)); } } /** <!-- readPrunedSize(ExceptionlessInputStream) --> * Reads the value of {@link #pruneCutoff} from the specified stream, * discarding everything else. * * @param in The input stream. **/ public static int readPrunedSize(ExceptionlessInputStream in) { in.readInt(); return in.readInt(); } /** Returns a text representation of this lexicon (for debugging). */ public String toString() { StringBuffer result = new StringBuffer(); for (int i = 0; i < lexiconInv.size(); ++i) { result.append(", "); result.append(i); result.append(": "); result.append(lexiconInv.get(i).toString()); } if (lexiconInv.size() > 0) return result.substring(2); return result.toString(); } /** <!-- printCountTable(boolean) --> * Produces on <code>STDOUT</code> a table of feature counts including a * line indicating the position of {@link #pruneCutoff}. It's probably not * a good idea to call this method unless you know your lexicon is small. * * @param p Whether or not to include package names in the output. **/ public void printCountTable(boolean p) { int rows = lexiconInv.size(); String[] rowLabels = new String[rows]; String[] columnLabels = null; double[][] data = null; int[] sigDigits = null; int[] dashRows = { 0, pruneCutoff }; if (featureCounts != null) { data = new double[rows][2]; for (int i = 0; i < rows; ++i) { data[i][0] = i; data[i][1] = featureCounts.get(i); rowLabels[i] = p ? lexiconInv.get(i).toString() : lexiconInv.get(i).toStringNoPackage(); } columnLabels = new String[]{ "Index", "Count" }; sigDigits = new int[2]; } else if (perClassFeatureCounts != null) { int columns = perClassFeatureCounts.size() + 1; data = new double[rows][columns]; for (int i = 0; i < rows; ++i) { data[i][0] = i; for (int j = 0; j < columns - 1; ++j) data[i][j + 1] = perClassFeatureCounts.get(j, i); rowLabels[i] = p ? lexiconInv.get(i).toString() : lexiconInv.get(i).toStringNoPackage(); } columnLabels = new String[columns]; columnLabels[0] = "Index"; for (int i = 1; i < columns; ++i) columnLabels[i] = "Label " + (i - 1); sigDigits = new int[columns]; } else { data = new double[rows][1]; for (int i = 0; i < rows; ++i) { data[i][0] = i; rowLabels[i] = p ? lexiconInv.get(i).toString() : lexiconInv.get(i).toStringNoPackage(); } columnLabels = new String[]{ "Index" }; sigDigits = new int[1]; } TableFormat.printTableFormat(System.out, columnLabels, rowLabels, data, sigDigits, dashRows); } // main(String[]) public static void main(String[] args) { String filename = null; boolean p = true; try { filename = args[0]; if (args.length == 2) p = Boolean.parseBoolean(args[1]); if (args.length > 2) throw new Exception(); } catch (Exception e) { System.out.println( "usage: java LBJ2.learn.Lexicon <lex file> [<package names = true>]"); System.exit(1); } Lexicon lexicon = readLexicon(filename); lexicon.printCountTable(p); if (lexicon.lexiconChildren != null) { System.out.println("\nChildren:"); lexicon.lexiconChildren.printCountTable(p); } } /** <!-- class CountPolicy --> * Immutable type representing the feature counting policy of a lexicon. * When LBJ's self imposed restriction to use Java 1.4 is lifted, this * class will be replaced by an <code>enum</code>. * * <p> The three feature counting policies are described below. * * <blockquote> * <dl> * <dt> <b>None</b> </dt> * <dd> Features occurrences are not counted. </dd> * <dt> <b>Global</b> </dt> * <dd> * The lexicon stores one integer count per feature, and every * occurrence of the feature adds to this count regardless of the * example it appears in. * </dd> * <dt> <b>Per Class</b> </dt> * <dd> * The lexicon stores one integer count for each (feature, prediction * class) pair. When a given feature appears in example, this * occurrence adds to the count associated with the example's label, * assuming that examples have a single discrete label. * </dd> * </dl> * </blockquote> * * @author Nick Rizzolo **/ public static class CountPolicy { /** Represents no counting. */ public static final CountPolicy none = new CountPolicy(0); /** Represents global counting. */ public static final CountPolicy global = new CountPolicy(1); /** Represents per class counting. */ public static final CountPolicy perClass = new CountPolicy(2); /** The names of the different counting policies as strings. */ private static final String[] names = { "none", "global", "per class" }; /** Can be used to index the {@link #names} array. */ private int index; /** Initializes the object with an index. */ private CountPolicy(int i) { index = i; } /** Retrieves the name of the policy represented by this object. */ public String toString() { return names[index]; } } /** <!-- class PruningPolicy --> * Represents the feature counting policy of a lexicon. Objects of this * type are used to identify and describe a desired pruning policy. In * particular, the description of a pruning policy includes feature count * thresholds which sometimes need to be computed in terms of data. Space * is allocated within objects of this type for storing these thresholds * whenever they are computed. * * <p> The three pruning policies are described below. * * <blockquote> * <dl> * <dt> <b>None</b> </dt> * <dd> No pruning is performed. </dd> * <dt> <b>Absolute</b> </dt> * <dd> * Features whose counts within a given dataset fall below an absolute * threshold are pruned from that dataset. * </dd> * <dt> <b>Percentage</b> </dt> * <dd> * Features whose counts within a given dataset are lower than a given * percentage of the most common feature's count are pruned from that * dataset. * </dd> * </dl> * </blockquote> * * @author Nick Rizzolo **/ public static class PruningPolicy { /** Represents no pruning. */ public static final int NONE = 0; /** Represents pruning with an absolute threshold. */ public static final int ABSOLUTE = 1; /** Represents pruning with a percentage threshold. */ public static final int PERCENTAGE = 2; /** The names of the different counting policies as strings. */ private static final String[] names = { "none", "absolute", "percentage" }; /** Can be used to index the {@link #names} array. */ private int index; /** * The percentage associated with the "Percentage" policy described * above. **/ private double percentage; /** * Feature count thresholds which may either be specified by the policy * explicitly or computed in terms of data. **/ private int[] thresholds; /** Creates a new pruning policy in which no features will be pruned. */ public PruningPolicy() { index = NONE; } /** * Creates a new "Percentage" policy with the given percentage. * * @param p The percentage. **/ public PruningPolicy(double p) { index = PERCENTAGE; percentage = p; } /** * Creates a new "Absolute" policy with the given threshold. * * @param t The threshold. **/ public PruningPolicy(int t) { index = ABSOLUTE; thresholds = new int[]{ t }; } /** <code>true</code> iff the policy is no pruning. */ public boolean isNone() { return index == NONE; } /** <code>true</code> iff the policy is absolute thresholding. */ public boolean isAbsolute() { return index == ABSOLUTE; } /** <code>true</code> iff the policy is percentage thresholding. */ public boolean isPercentage() { return index == PERCENTAGE; } /** * Use this method to establish feature count thresholds in the * "Percentage" policy. * * @param t The new feature count thresholds. **/ public void setThresholds(int[] t) { if (index != PERCENTAGE) throw new UnsupportedOperationException( "LBJ ERROR: Lexicon.PruningPolicy.setThresholds should not be " + "called unless the policy is 'Percentage'."); thresholds = (int[]) t.clone(); } /** * Returns the value of the <code>i</code><sup>th</sup> threshold in * {@link #thresholds} when in "Percentage" mode, but ignores the * parameter <code>i</code> and returns the first element of * {@link #thresholds} when in "Absolute" mode. * * @param i An index. * @return A feature count threshold. **/ public int getThreshold(int i) { if (index == NONE) throw new UnsupportedOperationException( "LBJ ERROR: Lexicon.PruningPolicy.getThreshold should never be " + "called if the pruning policy is 'None'."); if (index == ABSOLUTE) return thresholds[0]; return thresholds[i]; } /** Returns the value of {@link #percentage}. */ public double getPercentage() { if (index != PERCENTAGE) throw new UnsupportedOperationException( "LBJ ERROR: PruningPolicy: Can't get percentage when pruning " + "policy isn't 'Percentage'."); return percentage; } /** Retrieves the name of the policy represented by this object. */ public String toString() { String result = names[index]; if (index == PERCENTAGE) result += "(" + percentage + ")"; if (index != NONE && thresholds != null) { result += ": ["; for (int i = 0; i < thresholds.length; ++i) result += thresholds[i] + (i + 1 < thresholds.length ? ", " : ""); result += "]"; } return result; } } }