package org.cdlib.xtf.util; import java.io.BufferedReader; import java.io.File; import java.io.FileReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.util.StringTokenizer; /** * Copyright (c) 2004, Regents of the University of California * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of the University of California nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /** * Maintains an in-memory, one-to-one mapping from characters in one set to * characters in another. The list is read from a disk file, which may be * sorted or unsorted. * * The format of file entries should be one pair per line, separated by a bar * ("|") character. The first word is considered the "key", the second is the * "value". Each should be a four-digit hex number representing a Unicode * code point. * * For speed, an in-memory cache of recently mapped words is maintained. */ public class CharMap { /** The mapping of chars. */ private char[] map = new char[65536]; /** Special character to denote null list */ private static final char NULL_CHAR = '\uEE00'; /** Size of supplimental mapping of chars... typically there are few */ private static final int SUPP_HASH_SIZE = 100; /** Supplemental mapping of characters after the first */ private IntHash supplementalCharsMap = new IntHash(SUPP_HASH_SIZE); /** How many recent mappings to maintain */ private static final int CACHE_SIZE = 5000; /** Keep a cache of lookups performed to-date */ private FastStringCache cache = new FastStringCache(CACHE_SIZE); /** Construct a char map by reading in a file. */ public CharMap(File f) throws IOException { readFile(new BufferedReader(new FileReader(f))); } /** Construct a char map by reading from an InputStream. */ public CharMap(InputStream s) throws IOException { readFile(new BufferedReader(new InputStreamReader(s))); } /** Map the characters in a word and return the mapped resulting word, * or null if no mappings found. */ public synchronized String mapWord(String word) { // Have we already looked up this word? If so, save time. String val = null; if (cache.contains(word)) { val = (String)cache.get(word); return val; } // Do a quick scan to see if there are any mappable chars. Usually // there are none, so this saves time. // int i; for (i = 0; i < word.length(); i++) { if (map[word.charAt(i)] != 0) break; } if (i == word.length()) { cache.put(word, null); return null; } // Okay, we need to map at least one character. This might result in // the string changing size, so we need to use a buffer. // StringBuffer buf = new StringBuffer(word.length() + 2); buf.append(word); i = 0; int nIterations = 0; while (i < buf.length()) { char c = buf.charAt(i); // Check for infinite loop (can happen if char X maps to Y // and Y maps back to X, or if X maps to XY) // if (++nIterations > 100000) throw new RuntimeException("Probable infinite loop detected in word map"); // If no mapping, go on to the next character. if (map[c] == 0) { ++i; continue; } // If mapping to null, delete the character. if (map[c] == NULL_CHAR) { buf.deleteCharAt(i); continue; } // Replace the existing char with the new mapped char. buf.setCharAt(i, map[c]); // If there is a supplemental string to add, put it in. String suppChars = (String)supplementalCharsMap.get(c); if (suppChars != null) buf.insert(i + 1, suppChars); // Don't increment, since one of the new chars might need // additional mapping. // ; } // Reconstitute the new word and cache it. Then we're done. String newWord = buf.toString(); cache.put(word, newWord); return newWord; } // mapWord() /** * Read in the contents of a char file. The file need not be in sorted * order. * * @param reader Reader to get the data from * @throws IOException */ private void readFile(BufferedReader reader) throws IOException { while (true) { String line = reader.readLine(); if (line == null) break; // Strip off any trailing comment. if (line.indexOf("//") >= 0) line = line.substring(0, line.indexOf("//")); if (line.indexOf("#") >= 0) line = line.substring(0, line.indexOf("#")); if (line.indexOf(";") >= 0) line = line.substring(0, line.indexOf(";")); // Break out the two fields. If no bar, skip this line. int barPos = line.indexOf('|'); if (barPos < 0) continue; String key = line.substring(0, barPos).trim(); String val = line.substring(barPos + 1).trim(); // The key should be exactly four hex digits. int keyCode = -1; try { keyCode = Integer.parseInt(key, 16); } catch (NumberFormatException e) { } if (keyCode < 0 || keyCode > 65535 || key.length() != 4) { Trace.warning( "Warning: Invalid key in char mapping: " + "key '" + key + "' must be exactly four hex digits"); continue; } // The value should be zero or more sets of four hex digits. StringTokenizer st = new StringTokenizer(val); StringBuffer valBuf = new StringBuffer(3); while (st.hasMoreTokens()) { String tok = st.nextToken(); int valCode = -1; try { valCode = Integer.parseInt(tok, 16); } catch (NumberFormatException e) { } if (valCode < 0 || valCode > 65535) { Trace.warning( "Warning: Invalid key/val char mapping: " + "'" + key + "' -> '" + val + "' (value must be series of 4-digit hex numbers)"); continue; } valBuf.append((char)valCode); } // Record the entry. if (valBuf.length() == 0) { // Record null entry using a special marker character map[keyCode] = NULL_CHAR; } else { // Record the first character of the mapping (most mappings // only have one anyway.) // map[keyCode] = valBuf.charAt(0); // In the unusual case of a mapping that has more than one // character, record the remaining chars in a quick-access // hash. // if (valBuf.length() > 1) supplementalCharsMap.put(keyCode, valBuf.substring(1)); } } // while } // readFile() } // class WordMap