/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.
   This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
   http://www.cs.umass.edu/~mccallum/mallet
   This software is provided under the terms of the Common Public License,
   version 1.0, as published by http://www.opensource.org.  For further
   information, see the file `LICENSE' included with this distribution. */

/**
   @author Andrew McCallum <a href="mailto:mccallum@cs.umass.edu">mccallum@cs.umass.edu</a>
 */

package cc.mallet.types;

import java.util.ArrayList;
import java.io.*;
import java.util.Iterator;
import java.util.HashMap;
import java.rmi.dgc.VMID;

/**
 *  A mapping between integers and objects where the mapping in each
 *  direction is efficient.  Integers are assigned consecutively, starting
 *  at zero, as objects are added to the Alphabet.  Objects can not be
 *  deleted from the Alphabet and thus the integers are never reused.
 *  <p>
 *  The most common use of an alphabet is as a dictionary of feature names
 *  associated with a {@link cc.mallet.types.FeatureVector} in an
 *  {@link cc.mallet.types.Instance}.  In a simple document
 *  classification usage,
 *  each unique word in a document would be a unique entry in the Alphabet
 *  with a unique integer associated with it.  FeatureVectors rely on
 *  the integer part of the mapping to efficiently represent the subset of
 *  the Alphabet present in the FeatureVector.
 *  <p>
 *  NOTE(review): every instance registers itself in a static registry
 *  (see {@link #readResolve()}); nothing in this class removes entries
 *  from that registry or synchronizes access to it.
 *
 *  @see FeatureVector
 *  @see Instance
 *  @see cc.mallet.pipe.Pipe
 */
public class Alphabet implements Serializable
{
	/** Entry -> index direction of the mapping. */
	gnu.trove.TObjectIntHashMap map;
	/** Index -> entry direction of the mapping; position i holds the entry with index i. */
	ArrayList entries;
	/** When true, lookups never add new entries. */
	boolean growthStopped = false;
	/** Class shared by all entries, or null until it is supplied or the first entry is added. */
	Class entryClass = null;
	/** Used in readResolve to identify persistent instances. */
	VMID instanceId = new VMID();

	/**
	 * Creates an empty Alphabet.
	 * @param capacity   initial capacity of the underlying map and list
	 * @param entryClass the class every entry must have, or null to adopt
	 *                   the class of the first entry that is added
	 */
	public Alphabet (int capacity, Class entryClass)
	{
		this.map = new gnu.trove.TObjectIntHashMap (capacity);
		this.entries = new ArrayList (capacity);
		this.entryClass = entryClass;
		// someone could try to deserialize us into this image (e.g., by RMI).  Handle this.
		deserializedEntries.put (instanceId, this);
	}

	/** Creates an empty Alphabet with a default capacity of 8 whose entries must be of the given class. */
	public Alphabet (Class entryClass)
	{
		this (8, entryClass);
	}

	/** Creates an empty Alphabet with the given initial capacity and no fixed entry class. */
	public Alphabet (int capacity)
	{
		this (capacity, null);
	}

	/** Creates an empty Alphabet with a default capacity of 8 and no fixed entry class. */
	public Alphabet ()
	{
		this (8, null);
	}

	/**
	 * Creates an Alphabet containing the given entries, added in array order.
	 * @param entries the objects to add; each is passed to {@link #lookupIndex(Object)}
	 */
	public Alphabet (Object[] entries)
	{
		this (entries.length);
		for (Object entry : entries)
			this.lookupIndex (entry);
	}

	/**
	 * Returns a new Alphabet holding copies of this one's map and entry list
	 * together with the same growth flag and entry class.  The copy is built
	 * through the no-argument constructor, so it has its own instance id.
	 */
	@Override
	public Object clone ()
	{
		// Wastes effort, because we over-write ivars we create
		Alphabet ret = new Alphabet ();
		ret.map = (gnu.trove.TObjectIntHashMap) map.clone();
		ret.entries = (ArrayList) entries.clone();
		ret.growthStopped = growthStopped;
		ret.entryClass = entryClass;
		return ret;
	}

	/**
	 * Returns the index of <code>entry</code>, optionally adding it.
	 * A new entry is appended (and receives the next unused index) only when
	 * it is absent, <code>addIfNotPresent</code> is true and growth has not
	 * been stopped.
	 *
	 * @param entry           the object to look up; must not be null
	 * @param addIfNotPresent whether an absent entry should be added
	 * @return the entry's index, or -1 if the entry isn't present and was not added
	 * @throws IllegalArgumentException if <code>entry</code> is null, or if its
	 *         class differs from the established entry class
	 */
	public int lookupIndex (Object entry, boolean addIfNotPresent)
	{
		if (entry == null)
			throw new IllegalArgumentException ("Can't lookup \"null\" in an Alphabet.");
		// Insist that all entries in the Alphabet are of the same
		// class.  This may not be strictly necessary, but will catch a
		// bunch of easily-made errors.
		if (entryClass != null && entry.getClass() != entryClass)
			throw new IllegalArgumentException ("Non-matching entry class, "+entry.getClass()+", was "+entryClass);

		if (map.containsKey (entry))
			return map.get (entry);
		if (growthStopped || !addIfNotPresent)
			return -1;

		// Fix the entry class only when an entry is really stored, so that a
		// read-only lookup cannot constrain what may be added later.
		if (entryClass == null)
			entryClass = entry.getClass();
		int newIndex = entries.size();
		map.put (entry, newIndex);
		entries.add (entry);
		return newIndex;
	}

	/** Returns the index of <code>entry</code>, adding it if it is absent (subject to {@link #growthStopped()}). */
	public int lookupIndex (Object entry)
	{
		return lookupIndex (entry, true);
	}

	/** Returns the entry stored at <code>index</code>. */
	public Object lookupObject (int index)
	{
		return entries.get (index);
	}

	/** Returns a new array containing all entries, ordered by index. */
	public Object[] toArray ()
	{
		return entries.toArray();
	}

	/**
	 * Returns an array containing all the entries in the Alphabet.
	 *  The runtime type of the returned array is the runtime type of in.
	 *  If in is large enough to hold everything in the alphabet, then it
	 *  is used.  The returned array is such that for all entries <tt>obj</tt>,
	 *  <tt>ret[lookupIndex(obj)] = obj</tt> .
	 */
	public Object[] toArray (Object[] in)
	{
		return entries.toArray (in);
	}

	/**
	 * Returns an iterator over the entries in index order.  The iterator is
	 * read-only: its <code>remove</code> method throws
	 * {@link UnsupportedOperationException}, because removing from the entry
	 * list alone would leave it inconsistent with the entry-to-index map.
	 */
	public Iterator iterator ()
	{
		return java.util.Collections.unmodifiableList (entries).iterator();
	}

	/**
	 * Returns the entries corresponding to the given indices.
	 * @param indices An array of indices to look up
	 * @return A new array whose i-th element is the entry with index <code>indices[i]</code>
	 */
	public Object[] lookupObjects (int[] indices)
	{
		Object[] ret = new Object[indices.length];
		for (int i = 0; i < indices.length; i++)
			ret[i] = entries.get (indices[i]);
		return ret;
	}

	/**
	 * Stores the entries corresponding to the given indices into <code>buf</code>.
	 * @param indices An array of indices to look up
	 * @param buf An array to store the returned objects in; it is written
	 *            at positions 0 through <code>indices.length - 1</code>
	 * @return <code>buf</code> itself, filled with values from this Alphabet
	 */
	public Object[] lookupObjects (int[] indices, Object[] buf)
	{
		for (int i = 0; i < indices.length; i++)
			buf[i] = entries.get (indices[i]);
		return buf;
	}

	/**
	 * Looks up the index of each object via {@link #lookupIndex(Object, boolean)}.
	 * @return an array whose i-th element is the index of <code>objects[i]</code>
	 */
	public int[] lookupIndices (Object[] objects, boolean addIfNotPresent)
	{
		int[] ret = new int[objects.length];
		for (int i = 0; i < objects.length; i++)
			ret[i] = lookupIndex (objects[i], addIfNotPresent);
		return ret;
	}

	/** Returns true if <code>entry</code> is a key of the entry-to-index map. */
	public boolean contains (Object entry)
	{
		return map.contains (entry);
	}

	/** Returns the number of entries. */
	public int size ()
	{
		return entries.size();
	}

	/** Prevents subsequent lookups from adding new entries. */
	public void stopGrowth ()
	{
		growthStopped = true;
	}

	/** Allows lookups to add new entries again. */
	public void startGrowth ()
	{
		growthStopped = false;
	}

	/** Returns true if growth is currently stopped. */
	public boolean growthStopped ()
	{
		return growthStopped;
	}

	/** Returns the class shared by all entries, or null if none has been established yet. */
	public Class entryClass ()
	{
		return entryClass;
	}

	/** Return String representation of all Alphabet entries, each
	 *  separated by a newline. */
	@Override
	public String toString()
	{
		StringBuilder sb = new StringBuilder();
		for (int i = 0; i < entries.size(); i++) {
			sb.append (entries.get(i).toString());
			sb.append ('\n');
		}
		return sb.toString();
	}

	/** Prints every "index => entry" pair to <code>System.out</code>. */
	public void dump () { dump (System.out); }

	/** Prints every "index => entry" pair to the given stream. */
	public void dump (PrintStream out)
	{
		// Auto-flushing writer, so each println reaches the stream.
		dump (new PrintWriter (new OutputStreamWriter (out), true));
	}

	/** Prints every "index => entry" pair, one per line, to the given writer. */
	public void dump (PrintWriter out)
	{
		for (int i = 0; i < entries.size(); i++) {
			out.println (i+" => "+entries.get (i));
		}
	}

	/**
	 * Convenience method that can often implement alphabetsMatch in classes
	 * that implement the AlphabetCarrying interface.  Two carriers match when
	 * they hold the same number of alphabets and each corresponding pair is
	 * either the same reference (including both null) or equal according to
	 * <code>equals</code>.
	 */
	public static boolean alphabetsMatch (AlphabetCarrying object1, AlphabetCarrying object2) {
		Alphabet[] a1 = object1.getAlphabets();
		Alphabet[] a2 = object2.getAlphabets();
		if (a1.length != a2.length) return false;
		for (int i = 0; i < a1.length; i++) {
			if (a1[i] == a2[i]) continue;
			if (a1[i] == null || a2[i] == null) return false;  // One is null, but the other isn't
			if (! a1[i].equals(a2[i])) return false;
		}
		return true;
	}

	/** Returns the id used to recognize this instance across serialization; for debugging. */
	public VMID getInstanceId() { return instanceId; }

	/** Replaces the instance id.  Does not update the registry of known instances. */
	public void setInstanceId(VMID id) { this.instanceId = id; }

	// Serialization

	private static final long serialVersionUID = 1;
	private static final int CURRENT_SERIAL_VERSION = 1;

	/**
	 * Writes, in order: the serial version, the entry count, each entry,
	 * the growth flag, the entry class and the instance id.  The map is not
	 * written; it is rebuilt from the entries on read.
	 */
	private void writeObject (ObjectOutputStream out) throws IOException
	{
		out.writeInt (CURRENT_SERIAL_VERSION);
		out.writeInt (entries.size());
		for (int i = 0; i < entries.size(); i++)
			out.writeObject (entries.get(i));
		out.writeBoolean (growthStopped);
		out.writeObject (entryClass);
		out.writeObject (instanceId);
	}

	/**
	 * Reads the state written by {@link #writeObject} and rebuilds the
	 * entry-to-index map from the entry order.
	 */
	private void readObject (ObjectInputStream in) throws IOException, ClassNotFoundException
	{
		int version = in.readInt ();
		int size = in.readInt();
		entries = new ArrayList (size);
		map = new gnu.trove.TObjectIntHashMap (size);
		for (int i = 0; i < size; i++) {
			Object o = in.readObject();
			map.put (o, i);
			entries.add (o);
		}
		growthStopped = in.readBoolean();
		entryClass = (Class) in.readObject();
		if (version > 0) {  // instance id added in version 1
			instanceId = (VMID) in.readObject();
		}
	}

	/** Registry of known Alphabets keyed by instance id; static, so never part of serialized state. */
	private static HashMap deserializedEntries = new HashMap();

	/**
	 * This gets called after readObject; it lets the object decide whether
	 * to return itself or return a previously read in version.
	 * We use a hashMap of instanceIds to determine if we have already read
	 * in this object.
	 * @return the previously registered instance with the same instance id
	 *         if there is one; otherwise this object, which is registered
	 *         first when its instance id is non-null
	 * @throws ObjectStreamException declared for the serialization contract; not thrown by this implementation
	 */
	public Object readResolve() throws ObjectStreamException
	{
		Object previous = deserializedEntries.get (instanceId);
		if (previous != null) {
			//System.out.println(" ***Alphabet ReadResolve:Resolving to previous instance. instance id= " + instanceId);
			return previous;
		}
		if (instanceId != null) {
			deserializedEntries.put (instanceId, this);
		}
		//System.out.println(" *** Alphabet ReadResolve: new instance. instance id= " + instanceId);
		return this;
	}
}