package edu.stanford.nlp.util;

import java.io.*;
import java.util.*;

/**
 * Open addressing backed index for arbitrary object types.
 * Includes support for both traditional matching of keys based on
 * the equals() method as well as 'identity' matching, whereby keys are
 * only compared with '=='.
 *
 * <p>Each distinct key is assigned a dense integer index (0, 1, 2, ...) in
 * the order in which it is first added. Keys cannot be removed, and
 * {@code null} keys are not supported because {@code null} marks an empty
 * slot in the backing table.
 *
 * <p>WARNING: This is currently experimental code. It exists since in
 * theory open addressing hashing should be more efficient
 * for hashing int values than bucket-chain hashing, which is used
 * to back java.util.HashMap.
 *
 * <p>To do list:
 * <ul>
 * <li>in fear of user stupidity, i think HashMap re-hashes the hash
 *     values returned by objects, should we?</li>
 * <li>finalize interface</li>
 * <li>rigorous benchmarks</li>
 * <li>unit tests</li>
 * </ul>
 *
 * @author <a href="mailto:daniel.cer@cs.colorado.edu">Daniel Cer</a>
 */
public class OAIndex<K> implements IndexInterface<K> {
  private static final long serialVersionUID = 127L;

  /** Initial number of slots; must be a power of two because probing uses a bit mask. */
  static final int INIT_SZ = 1<<10;

  /** Load ratio above which the table is doubled before the next insertion. */
  static final double MAX_LOAD = 0.60;

  /** When true keys are matched with '==', otherwise with equals(). */
  private final boolean identityHash;

  /** Slot table; a null entry marks an empty slot. */
  private Object[] keys;

  /** values[slot] is the index assigned to keys[slot]. */
  private int[] values;

  /** Always keys.length - 1. */
  private int mask;

  /** hashCodes[slot] caches the supplemental hash of keys[slot]. */
  private int[] hashCodes;

  /** reverseIndex[index] is the slot holding that index's key, or -1 if unassigned. */
  private int[] reverseIndex;

  /** The next index that will be handed out. */
  int maxIndex;

  /** Number of keys stored so far. */
  int load;

  /** Creates an index that matches keys with equals(). */
  public OAIndex() {
    identityHash = false;
    init();
  }

  /**
   * Creates an index.
   *
   * @param identityHash if true keys are matched with '==' instead of equals()
   */
  public OAIndex(boolean identityHash) {
    this.identityHash = identityHash;
    init();
  }

  /**
   * Returns a freshly built set holding every key currently stored.
   *
   * @return a new HashSet of the keys; changes to it do not affect this index
   */
  @SuppressWarnings("unchecked")
  public Set<K> keySet() {
    Set<K> keySet = new HashSet<K>();
    for (Object key : keys) {
      if (key != null) {
        keySet.add((K) key);
      }
    }
    return keySet;
  }

  /** Returns the next index that will be assigned, i.e. one past the largest index in use. */
  public int maxIndex() {
    return maxIndex;
  }

  /** Returns the current table capacity, which is always larger than any assigned index. */
  public int boundOnMaxIndex() {
    return keys.length;
  }

  /** Allocates the backing arrays at their initial size. */
  private void init() {
    keys = new Object[INIT_SZ]; // since we can't create an array of type K[];
    values = new int[INIT_SZ];
    hashCodes = new int[INIT_SZ];
    reverseIndex = new int[INIT_SZ];
    Arrays.fill(reverseIndex, -1);
    mask = INIT_SZ - 1;
  }

  /**
   * Scrambles a raw hashCode() so that keys differing only in their high
   * bits do not all land in the same slot once the value is masked.
   * According to the original author's note this mirrors the supplemental
   * hash function that java.util.HashMap used at the time.
   */
  private int supplementalHash(int h) {
    return ((h << 7) - h + (h >>> 9) + (h >>> 17));
  }

  /**
   * Locates a key by linear probing from its ideal slot.
   *
   * @param e the key to look for; must not be null
   * @param add unused; kept so existing call sites stay unchanged
   * @return the slot holding the key if present; otherwise
   *         {@code -(slot) - 1} where slot is the first empty slot met while
   *         probing, or {@code -keys.length - 1} if the table has no empty slot
   */
  private int findPos(Object e, boolean add) {
    int hashCode = supplementalHash(e.hashCode());
    int idx = hashCode & mask;
    for (int probes = 0; probes < keys.length; probes++) {
      Object candidate = keys[idx];
      if (candidate == null) {
        return -idx - 1;
      }
      if (identityHash) {
        if (candidate == e) {
          return idx;
        }
      } else if (hashCodes[idx] == hashCode && candidate.equals(e)) {
        // The cached hash is compared first so equals() only runs on likely matches.
        return idx;
      }
      idx = (idx + 1) & mask; // wraps to 0 because the capacity is a power of two
    }
    return -keys.length - 1;
  }

  /**
   * Returns the key that was assigned the given index.
   *
   * @param idx an index previously returned by {@link #indexOf(Object, boolean)}
   * @return the key, or null if no key has that index (including negative
   *         or otherwise out-of-range values)
   */
  @SuppressWarnings("unchecked")
  public K get(int idx) {
    // Out-of-range lookups answer "no such key" regardless of the current
    // table capacity instead of throwing only when idx exceeds the capacity.
    if (idx < 0 || idx >= reverseIndex.length) {
      return null;
    }
    int pos = reverseIndex[idx];
    if (pos == -1) {
      return null;
    }
    return (K) keys[pos];
  }

  /**
   * Doubles the table capacity and re-inserts every stored key.
   *
   * @throws IllegalStateException if the capacity cannot be doubled without
   *         overflowing an int
   */
  private void sizeUp() {
    int newSize = keys.length << 1;
    if (newSize <= 0) {
      // Checked before any field is touched so the index stays usable.
      throw new IllegalStateException("OAIndex cannot grow beyond " + keys.length + " slots");
    }

    Object[] oldKeys = keys;
    int[] oldValues = values;
    int[] oldHashCodes = hashCodes;

    keys = new Object[newSize];
    values = new int[newSize];
    hashCodes = new int[newSize];
    reverseIndex = new int[newSize];
    Arrays.fill(reverseIndex, -1);
    mask = newSize - 1;

    for (int i = 0; i < oldKeys.length; i++) {
      Object key = oldKeys[i];
      if (key == null) {
        continue;
      }
      // Stored keys are pairwise distinct, so the first free slot reached from
      // the cached hash is the right home; neither hashCode() nor equals()
      // needs to be called again.
      int pos = oldHashCodes[i] & mask;
      while (keys[pos] != null) {
        pos = (pos + 1) & mask;
      }
      keys[pos] = key;
      values[pos] = oldValues[i];
      hashCodes[pos] = oldHashCodes[i];
      reverseIndex[oldValues[i]] = pos;
    }
  }

  /**
   * Computes how many probe steps separate a slot from a key's ideal slot.
   *
   * @param pos the slot actually occupied by the key
   * @param key the key stored at pos
   * @return the number of forward steps (with wrap-around) from the ideal
   *         slot to pos; 0 when the key sits in its ideal slot
   */
  @SuppressWarnings("unused")
  private int getSearchOffset(int pos, Object key) {
    int idealIdx = supplementalHash(key.hashCode()) & mask;
    if (pos >= idealIdx) {
      return pos - idealIdx;
    }
    // Probing ran off the end of the table and wrapped around to the start.
    return pos + keys.length - idealIdx;
  }

  /**
   * Stores a key that is known to be absent and assigns it the next index.
   *
   * @param key the key to store; must not be null
   * @param pos the empty slot found for the key by {@link #findPos}
   * @return the index assigned to the key
   */
  private int add(K key, int pos) {
    if (load / (double) keys.length > MAX_LOAD) {
      sizeUp();
      pos = -findPos(key, true) - 1; // the slot changes once the table has grown
    }
    load++;
    int index = maxIndex++;
    keys[pos] = key;
    values[pos] = index;
    reverseIndex[index] = pos;
    hashCodes[pos] = supplementalHash(key.hashCode());
    return index;
  }

  /**
   * Renders the index as {@code [key:index key:index ]} with the keys in
   * their natural order.
   *
   * @throws ClassCastException if the keys are not mutually comparable,
   *         since they are sorted with a TreeSet
   */
  @Override
  @SuppressWarnings("unchecked")
  public String toString() {
    Set<Object> sortedKeys = new TreeSet<Object>();
    for (Object key : keys) {
      if (key != null) {
        sortedKeys.add(key);
      }
    }
    StringBuilder sb = new StringBuilder();
    sb.append("[");
    for (Object k : sortedKeys) {
      sb.append(k).append(":").append(indexOf((K) k)).append(" ");
    }
    sb.append("]");
    return sb.toString();
  }

  /**
   * Looks up the index of a key without adding it.
   *
   * @param key the key to look up
   * @return the key's index, or -1 if the key is absent or null
   */
  public int indexOf(K key) {
    if (key == null) {
      return -1; // null is never stored
    }
    int pos = findPos(key, false);
    return pos < 0 ? -1 : values[pos];
  }

  /**
   * Tests whether a key is present.
   *
   * @param key the key to look up
   * @return true if the key is stored; false if it is absent or null
   */
  public boolean contains(Object key) {
    return key != null && findPos(key, false) >= 0;
  }

  /**
   * Looks up the index of a key, optionally adding the key if it is absent.
   *
   * @param key the key to look up
   * @param add whether to add the key when it is not already present
   * @return the key's index, or -1 if the key is absent and add is false
   * @throws NullPointerException if key is null and add is true
   */
  public int indexOf(K key, boolean add) {
    if (key == null) {
      if (add) {
        throw new NullPointerException("OAIndex does not support null keys");
      }
      return -1;
    }
    int pos = findPos(key, add);
    if (pos >= 0) {
      return values[pos];
    }
    if (!add) {
      return -1;
    }
    return add(key, -pos - 1);
  }

  /**
   * Small demo: indexes every whitespace-separated token of a text file and
   * prints the resulting index.
   *
   * @param args a single element, the path of the text file to index
   * @throws IOException if the file cannot be opened or read
   */
  static public void main(String[] args) throws IOException {
    if (args.length != 1) {
      System.err.printf("Usage:\n\tjava ...OAIndex (text file to index)\n");
      System.exit(-1);
    }

    OAIndex<String> oaindex = new OAIndex<String>();
    BufferedReader breader = new BufferedReader(new FileReader(args[0]));
    try {
      System.out.printf("Inserting tokens:\n");
      for (String line; (line = breader.readLine()) != null; ) {
        String[] tokens = line.split("\\s");
        for (String token : tokens) {
          oaindex.indexOf(token, true);
          System.out.printf("%s: %d (get: %s)\n", token, oaindex.indexOf(token),
              oaindex.get(oaindex.indexOf(token)));
        }
      }
    } finally {
      breader.close(); // release the file handle even if reading fails
    }

    System.out.println();
    System.out.printf("Final Index:\n%s\n", oaindex);
  }

  /** Returns the number of keys stored in this index. */
  public int size() {
    return load;
  }
}