package org.cdlib.xtf.util; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; import java.util.Iterator; import java.util.TreeMap; import java.util.Map; /** * Copyright (c) 2004, Regents of the University of California * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of the University of California nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /** * Maintains an in-memory, one-to-one mapping from words in one set to words in * another. The list is read from a disk file, which may be sorted or unsorted. * The format of file entries should be one pair per line, separated by a bar * ("|") character. The first word is considered the "key", the second is the * "value". * * For speed, an in-memory cache of recently mapped words is maintained. */ public class WordMap { /** How many recent mappings to maintain */ private static final int CACHE_SIZE = 5000; /** Keep a cache of lookups performed to-date */ private FastStringCache cache = new FastStringCache(CACHE_SIZE); /** Map of blocks, keyed by the first word in each block */ private HashMap blockMap = new HashMap(100); /** Sorted list of the block keys, for fast binary searching */ private ArrayList blockHeads = new ArrayList(100); /** Construct a word map by reading in a file. */ public WordMap(File f, CharMap charMap) throws IOException { readFile(new BufferedReader(new InputStreamReader(new FileInputStream(f), "UTF-8")), charMap); } /** * Construct a word map by reading from an InputStream. If a non-null * character map is specified, all entries are filtered through it. */ public WordMap(InputStream s, CharMap charMap) throws IOException { readFile(new BufferedReader(new InputStreamReader(s, "UTF-8")), charMap); } /** Look up a word, and return the corresponding value, or null if none. */ public synchronized String lookup(String word) { // Have we already looked up this word? If so, save time. String val = null; if (cache.contains(word)) { val = (String)cache.get(word); return val; } // Find the appropriate block. int blockNum = Collections.binarySearch(blockHeads, word); if (blockNum < 0) blockNum = -blockNum - 2; if (blockNum < 0) { cache.put(word, null); return null; } // Search that block. String prev = (String)blockHeads.get(blockNum); String block = (String)blockMap.get(prev); int pos = 0; while (pos < block.length()) { int keyShare = block.charAt(pos) - '0'; int barPos = block.indexOf('|', pos + 1); String key = prev.substring(0, keyShare) + block.substring(pos + 1, barPos); int end = block.indexOf('\n', barPos + 1); int comp = key.compareTo(word); if (comp > 0) break; if (comp != 0) { pos = end + 1; if (pos < 0) break; prev = key; continue; } int valShare = block.charAt(barPos + 1) - '0'; val = key.substring(0, valShare) + block.substring(barPos + 2, end); cache.put(word, val); return val; } // Not found. cache.put(word, null); return null; } // lookup() /** * Read in the contents of a word file, forming blocks of 128 entries per * block. The file need not be in sorted order. * * @param reader Reader to get the data from * @param charMap Accent map to filter entries with, or null for none. * @throws IOException */ private void readFile(BufferedReader reader, CharMap charMap) throws IOException { TreeMap entries = new TreeMap(); HashMap randomCheck = new HashMap(); while (true) { String line = reader.readLine(); if (line == null) break; // Strip off any trailing comment. if (line.indexOf("//") >= 0) line = line.substring(0, line.indexOf("//")); if (line.indexOf("#") >= 0) line = line.substring(0, line.indexOf("#")); if (line.indexOf(";") >= 0) line = line.substring(0, line.indexOf(";")); // Break out the two fields int barPos = line.indexOf('|'); if (barPos < 0) continue; String key = line.substring(0, barPos).trim(); String val = line.substring(barPos + 1).trim(); if (key.length() == 0 || val.length() == 0) continue; // Map characters if a mapping was specified. if (charMap != null) { String newKey = charMap.mapWord(key); if (newKey != null) key = newKey; String newVal = charMap.mapWord(val); if (newVal != null) val = newVal; } // Record the entry. entries.put(key, val); } // while // Divide the entries into sets of 128, and prefix-encode each block. StringBuffer buf = new StringBuffer(); int nEntries = 0; String prev = ""; String firstKey = ""; for (Iterator iter = entries.entrySet().iterator(); iter.hasNext();) { Map.Entry entry = (Map.Entry)iter.next(); String key = (String)entry.getKey(); String val = (String)entry.getValue(); if (firstKey.length() == 0) { firstKey = key; prev = firstKey; } // Figure out how many characters the key has in common with // the previous one. // int keyShare; for (keyShare = 0; keyShare < key.length(); keyShare++) { if (keyShare == prev.length()) break; if (key.charAt(keyShare) != prev.charAt(keyShare)) break; } // Figure out how many characters the value has in common with // the key. // int valShare; for (valShare = 0; valShare < key.length(); valShare++) { if (valShare == val.length()) break; if (key.charAt(valShare) != val.charAt(valShare)) break; } // Now create the entry. buf.append( ((char)(keyShare + '0')) + key.substring(keyShare) + '|' + ((char)(valShare + '0')) + val.substring(valShare) + "\n"); // Record this key for the next time round. prev = key; if (Math.random() < .01) { randomCheck.put(key, val); } // If we've reached 128 entries in this block, or we've reached // the end of the entries, store the block. // nEntries++; if (nEntries == 128 || !iter.hasNext()) { String block = buf.toString(); blockMap.put(firstKey, block); blockHeads.add(firstKey); // Reset for next block nEntries = 0; firstKey = prev = ""; buf.setLength(0); } } // while // Do some random checks to make sure we set things up correctly. for (Iterator iter = randomCheck.entrySet().iterator(); iter.hasNext();) { Map.Entry ent = (Map.Entry)iter.next(); String result = lookup((String)ent.getKey()); assert result.equals(ent.getValue()); } assert lookup("a") == null; assert lookup("zzzzzz") == null; // Clear out the random checks from the cache, to start with a nice // small memory footprint. // cache.clear(); } // readFile() } // class WordMap