/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.
This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
http://www.cs.umass.edu/~mccallum/mallet
This software is provided under the terms of the Common Public License,
version 1.0, as published by http://www.opensource.org. For further
information, see the file `LICENSE' included with this distribution. */
/**
@author Andrew McCallum <a href="mailto:mccallum@cs.umass.edu">mccallum@cs.umass.edu</a>
*/
package cc.mallet.types;
import java.util.ArrayList;
import java.io.*;
import java.util.Iterator;
import java.util.HashMap;
import java.rmi.dgc.VMID;
/**
* A mapping between integers and objects where the mapping in each
* direction is efficient. Integers are assigned consecutively, starting
* at zero, as objects are added to the Alphabet. Objects can not be
* deleted from the Alphabet and thus the integers are never reused.
* <p>
* The most common use of an alphabet is as a dictionary of feature names
* associated with a {@link cc.mallet.types.FeatureVector} in an
* {@link cc.mallet.types.Instance}. In a simple document
* classification usage,
* each unique word in a document would be a unique entry in the Alphabet
* with a unique integer associated with it. FeatureVectors rely on
* the integer part of the mapping to efficiently represent the subset of
* the Alphabet present in the FeatureVector.
* @see FeatureVector
* @see Instance
* @see cc.mallet.pipe.Pipe
*/
public class Alphabet implements Serializable
{
    /** Maps each entry to its integer index; the inverse of {@link #entries}. */
    gnu.trove.TObjectIntHashMap map;
    /** Entries in insertion order; an entry's position in this list is its index. */
    ArrayList entries;
    /** While true, lookups never add new entries. */
    boolean growthStopped = false;
    /** Class shared by all entries; null until set by a constructor or by the first lookup. */
    Class entryClass = null;
    /** Used in readResolve to identify persistent instances. */
    VMID instanceId = new VMID();

    /**
     * Registry of Alphabets keyed by instance id, consulted by
     * {@link #readResolve()} so that deserializing an already-known Alphabet
     * yields the existing object. Nothing in this class removes entries from
     * this map, and access to it is not synchronized.
     */
    private static final HashMap deserializedEntries = new HashMap();

    /**
     * Creates an empty Alphabet.
     *
     * @param capacity   initial capacity of the underlying collections
     * @param entryClass class that every entry must have, or null to adopt
     *                   the class of the first object looked up
     */
    public Alphabet (int capacity, Class entryClass)
    {
        this.map = new gnu.trove.TObjectIntHashMap (capacity);
        this.entries = new ArrayList (capacity);
        this.entryClass = entryClass;
        // someone could try to deserialize us into this image (e.g., by RMI). Handle this.
        deserializedEntries.put (instanceId, this);
    }

    /** Creates an empty Alphabet with a default capacity of 8 and the given entry class. */
    public Alphabet (Class entryClass)
    {
        this (8, entryClass);
    }

    /** Creates an empty Alphabet with the given initial capacity and no fixed entry class. */
    public Alphabet (int capacity)
    {
        this (capacity, null);
    }

    /** Creates an empty Alphabet with a default capacity of 8 and no fixed entry class. */
    public Alphabet ()
    {
        this (8, null);
    }

    /**
     * Creates an Alphabet containing the given entries, indexed in array order.
     *
     * @param entries objects to add; each is passed to {@link #lookupIndex(Object)}
     */
    public Alphabet (Object[] entries) {
        this (entries.length);
        for (Object entry : entries)
            this.lookupIndex (entry);
    }

    /**
     * Returns a new Alphabet holding copies of this Alphabet's map and entry
     * list, with the same growth flag and entry class. The copy is built with
     * the no-argument constructor, so it receives its own instance id.
     */
    @Override
    public Object clone ()
    {
        // Wastes a little effort, because we overwrite the collections the constructor created.
        Alphabet ret = new Alphabet ();
        ret.map = (gnu.trove.TObjectIntHashMap) map.clone();
        ret.entries = (ArrayList) entries.clone();
        ret.growthStopped = growthStopped;
        ret.entryClass = entryClass;
        return ret;
    }

    /**
     * Returns the index of <tt>entry</tt>, optionally adding it.
     * <p>
     * If no entry class has been fixed yet, the class of <tt>entry</tt>
     * becomes the entry class, even when the entry itself is not added.
     *
     * @param entry           the object to look up; must not be null
     * @param addIfNotPresent if true and growth has not been stopped, an
     *                        absent entry is appended and its new index returned
     * @return the entry's index, or -1 if it is not present and was not added
     * @throws IllegalArgumentException if <tt>entry</tt> is null or its class
     *         differs from the entry class of this Alphabet
     */
    public int lookupIndex (Object entry, boolean addIfNotPresent)
    {
        if (entry == null)
            throw new IllegalArgumentException ("Can't lookup \"null\" in an Alphabet.");
        if (entryClass == null) {
            entryClass = entry.getClass();
        } else if (entry.getClass() != entryClass) {
            // Insist that all entries in the Alphabet are of the same
            // class. This may not be strictly necessary, but will catch a
            // bunch of easily-made errors.
            throw new IllegalArgumentException ("Non-matching entry class, "+entry.getClass()+", was "+entryClass);
        }
        int retIndex = -1;
        if (map.containsKey (entry)) {
            retIndex = map.get (entry);
        }
        else if (!growthStopped && addIfNotPresent) {
            // New entries always take the next consecutive index.
            retIndex = entries.size();
            map.put (entry, retIndex);
            entries.add (entry);
        }
        return retIndex;
    }

    /** Returns the index of <tt>entry</tt>, adding it if absent and growth is not stopped. */
    public int lookupIndex (Object entry)
    {
        return lookupIndex (entry, true);
    }

    /**
     * Returns the entry stored at <tt>index</tt>.
     *
     * @throws IndexOutOfBoundsException if <tt>index</tt> is not a valid index
     */
    public Object lookupObject (int index)
    {
        return entries.get (index);
    }

    /** Returns a new array containing all entries, ordered by index. */
    public Object[] toArray () {
        return entries.toArray();
    }

    /**
     * Returns an array containing all the entries in the Alphabet.
     * The runtime type of the returned array is the runtime type of in.
     * If in is large enough to hold everything in the alphabet, then it
     * is used. The returned array is such that for all entries <tt>obj</tt>,
     * <tt>ret[lookupIndex(obj)] = obj</tt> .
     */
    public Object[] toArray (Object[] in) {
        return entries.toArray (in);
    }

    /**
     * Returns a read-only iterator over the entries in index order.
     * Calling <tt>remove()</tt> on it throws
     * {@link UnsupportedOperationException}: removing from the entry list
     * alone would leave the entry-to-index map pointing at stale positions.
     */
    public Iterator iterator () {
        return java.util.Collections.unmodifiableList (entries).iterator();
    }

    /**
     * Returns a new array holding the entries at the given indices.
     *
     * @param indices indices to look up
     * @return an <tt>Object[]</tt> of the same length as <tt>indices</tt>
     */
    public Object[] lookupObjects (int[] indices)
    {
        Object[] ret = new Object[indices.length];
        for (int i = 0; i < indices.length; i++)
            ret[i] = entries.get (indices[i]);
        return ret;
    }

    /**
     * Stores the entries at the given indices into <tt>buf</tt>.
     * @param indices An array of indices to look up
     * @param buf An array to store the returned objects in; it is written at
     *            positions 0 to indices.length-1, so it must be at least that long.
     * @return <tt>buf</tt> itself, so the runtime type of the array is the same as buf
     */
    public Object[] lookupObjects (int[] indices, Object[] buf)
    {
        for (int i = 0; i < indices.length; i++)
            buf[i] = entries.get (indices[i]);
        return buf;
    }

    /**
     * Looks up every object in <tt>objects</tt> via
     * {@link #lookupIndex(Object, boolean)}.
     *
     * @return the resulting indices, in the same order as <tt>objects</tt>
     */
    public int[] lookupIndices (Object[] objects, boolean addIfNotPresent)
    {
        int[] ret = new int[objects.length];
        for (int i = 0; i < objects.length; i++)
            ret[i] = lookupIndex (objects[i], addIfNotPresent);
        return ret;
    }

    /** Returns true if <tt>entry</tt> is a key of the entry-to-index map. */
    public boolean contains (Object entry)
    {
        return map.contains (entry);
    }

    /** Returns the number of entries. */
    public int size ()
    {
        return entries.size();
    }

    /** Prevents lookups from adding new entries until {@link #startGrowth()} is called. */
    public void stopGrowth ()
    {
        growthStopped = true;
    }

    /** Allows lookups to add new entries again. */
    public void startGrowth ()
    {
        growthStopped = false;
    }

    /** Returns true if growth is currently stopped. */
    public boolean growthStopped ()
    {
        return growthStopped;
    }

    /** Returns the class required of all entries, or null if none has been fixed yet. */
    public Class entryClass ()
    {
        return entryClass;
    }

    /** Return String representation of all Alphabet entries, each
        followed by a newline. */
    @Override
    public String toString ()
    {
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < entries.size(); i++) {
            sb.append (entries.get (i).toString());
            sb.append ('\n');
        }
        return sb.toString();
    }

    /** Writes one "index => entry" line per entry to <tt>System.out</tt>. */
    public void dump () { dump (System.out); }

    /** Writes one "index => entry" line per entry to <tt>out</tt>. */
    public void dump (PrintStream out)
    {
        // Auto-flushing writer, so every println reaches the stream immediately.
        dump (new PrintWriter (new OutputStreamWriter (out), true));
    }

    /** Writes one "index => entry" line per entry to <tt>out</tt>. */
    public void dump (PrintWriter out)
    {
        for (int i = 0; i < entries.size(); i++) {
            out.println (i+" => "+entries.get (i));
        }
    }

    /**
     * Convenience method that can often implement alphabetsMatch in classes
     * that implement the AlphabetCarrying interface.
     *
     * @return true if both objects carry the same number of alphabets and
     *         each pair at the same position is either the same reference
     *         (including both null) or equal according to <tt>equals</tt>
     */
    public static boolean alphabetsMatch (AlphabetCarrying object1, AlphabetCarrying object2) {
        Alphabet[] a1 = object1.getAlphabets();
        Alphabet[] a2 = object2.getAlphabets();
        if (a1.length != a2.length) return false;
        for (int i = 0; i < a1.length; i++) {
            if (a1[i] == a2[i]) continue;
            if (a1[i] == null || a2[i] == null) return false; // One is null, but the other isn't
            if (! a1[i].equals (a2[i])) return false;
        }
        return true;
    }

    /** Returns the id that identifies this Alphabet across serialization (for debugging). */
    public VMID getInstanceId () { return instanceId; }

    /**
     * Replaces the instance id. This only assigns the field; it does not
     * re-register this Alphabet under the new id.
     */
    public void setInstanceId (VMID id) { this.instanceId = id; }

    // Serialization

    private static final long serialVersionUID = 1;
    private static final int CURRENT_SERIAL_VERSION = 1;

    /**
     * Writes the serial version, the entry count, each entry in index order,
     * the growth flag, the entry class and the instance id.
     */
    private void writeObject (ObjectOutputStream out) throws IOException {
        out.writeInt (CURRENT_SERIAL_VERSION);
        out.writeInt (entries.size());
        for (int i = 0; i < entries.size(); i++)
            out.writeObject (entries.get (i));
        out.writeBoolean (growthStopped);
        out.writeObject (entryClass);
        out.writeObject (instanceId);
    }

    /**
     * Reads the fields written by {@link #writeObject} and rebuilds the
     * entry-to-index map from the order in which the entries were written.
     */
    private void readObject (ObjectInputStream in) throws IOException, ClassNotFoundException {
        int version = in.readInt();
        int size = in.readInt();
        entries = new ArrayList (size);
        map = new gnu.trove.TObjectIntHashMap (size);
        for (int i = 0; i < size; i++) {
            Object o = in.readObject();
            map.put (o, i);
            entries.add (o);
        }
        growthStopped = in.readBoolean();
        entryClass = (Class) in.readObject();
        if (version > 0) { // instance id added in version 1
            instanceId = (VMID) in.readObject();
        }
    }

    /**
     * This gets called after readObject; it lets the object decide whether
     * to return itself or return a previously read in version.
     * We use a hashMap of instanceIds to determine if we have already read
     * in this object.
     * @return the Alphabet already registered under this instance id if
     *         there is one; otherwise this object, which is then registered
     *         (unless its instance id is null)
     * @throws ObjectStreamException declared for the serialization
     *         mechanism; this implementation does not throw it
     */
    public Object readResolve () throws ObjectStreamException {
        Object previous = deserializedEntries.get (instanceId);
        if (previous != null) {
            return previous;
        }
        if (instanceId != null) {
            deserializedEntries.put (instanceId, this);
        }
        return this;
    }
}