package org.apache.lucene.spelt;

/**
 * Copyright 2006-2007 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * Acknowledgements:
 *
 * A significant amount of new and/or modified code in this module
 * was made possible by a grant from the Andrew W. Mellon Foundation,
 * as part of the Melvyl Recommender Project.
 */

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.util.ArrayList;
import java.util.Set;
import java.util.regex.Pattern;
import org.apache.lucene.util.Hash64;
import org.apache.lucene.util.IntList;
import org.apache.lucene.util.LongSet;
import org.apache.lucene.util.PriorityQueue;
import org.apache.lucene.util.StringUtil;

/**
 * <p>
 * Reads a spelling dictionary created by {@link SpellWriter}, and provides
 * fast single- and multi-word spelling suggestions. Typical usage:
 * </p>
 * <ol>
 * <li>First, {@linkplain #open(File) open} a new reader.</li>
 * <li>For each potentially misspelled query, gather the keywords
 *     and {@linkplain #suggestKeywords(String[]) get suggestions}
 *     for them.
 * </li>
 * <li>When done with all queries, {@linkplain #close()} the reader.
* </ol> * <p> * Inspired by and very distantly based on Nicolas Maisonneuve / David Spencer * code. * </p> * * @author Martin Haye */ public class SpellReader { /** Keys in the edit map file */ private IntList edMapKeys; /** Positions in the edit map file */ private IntList edMapPosns; /** File for reading edit map entries */ private RandomAccessFile edMapFile; /** Charset decoder for reading edit map entries */ private CharsetDecoder edMapDecoder; /** Pair frequency data */ private FreqData pairFreqs; /** Word frequency data */ private FreqData wordFreqs; /** Frequencies from the term data, sampled at 5 levels */ private int[] freqSamples; /** Where to send debugging info (or null for none) */ private PrintWriter debugWriter = null; /** Pattern used for splitting up lines delimited by bars */ private final Pattern splitPat = Pattern.compile("\\||\n"); /** Set of stop-words to use during spell correction, or null for none */ private Set stopSet; /** Word equivalency checker */ private WordEquiv wordEquiv; /** Private constructor -- use {@link #open(File)} instead. */ private SpellReader() { } /** Check if there's a valid dictionary in the given directory */ public static boolean isValidDictionary(File spellDir) { if (!spellDir.isDirectory() || !spellDir.canRead()) return false; File file = new File(spellDir, "pairs.dat"); return file.canRead(); } /** * Open a reader for the given spelling index directory. Does no stop word * processing, and uses default word equivalency (just case insensitive.) * To specify a stopword set (which you must if you did when building the * dictionary), call {@link #setStopwords(Set)}. To specify a non-default * word equivalency, call {@link #setWordEquiv(WordEquiv)}. 
* * @param spellDir directory containing the spelling dictionary */ public static SpellReader open(File spellDir) throws IOException { SpellReader reader = new SpellReader(); reader.stopSet = null; reader.wordEquiv = WordEquiv.DEFAULT; reader.openEdmap(spellDir); reader.loadFreqSamples(spellDir); reader.loadWordFreqs(spellDir); reader.openPairFreqs(spellDir); return reader; } /** * Establishes a list of stopwords (e.g. "the", "and", "an", etc.). This * list should be identical to that which was used to create the * dictionary. * * @param set Set of stop-words; all should be lower-case. */ public void setStopwords(Set set) { this.stopSet = set; } /** * Establishes a word equivalency checker. This is used to prevent the * correction algorithm from making suggestions that won't change the * query result. For instance, if words in the main index are all * converted from plural to singular, it would be silly for the checker * to suggest "cats" to replace "cat". * * @param eq the equivalency checker to use */ public void setWordEquiv(WordEquiv eq) { this.wordEquiv = eq; } /** Read the index for the edit map file */ private void openEdmap(File spellDir) throws IOException { long startTime = System.currentTimeMillis(); File file = new File(spellDir, "edmap.dat"); try { // First, open the map file. At the end, we'll find the position of the index. FileInputStream in = new FileInputStream(file); in.skip(file.length() - 20); BufferedReader reader = new BufferedReader(new InputStreamReader(in)); String line = reader.readLine(); int indexPos = Integer.parseInt(line.trim()); // Now re-open and read the index. reader.close(); in = new FileInputStream(file); in.skip(indexPos); reader = new BufferedReader(new InputStreamReader(in)); // Check that we're really looking at a valid index line = reader.readLine(); if (!line.equals("edMap index")) throw new IOException("edmap file corrupt"); // Find out how many keys there are and allocate our lists. 
line = reader.readLine(); int nKeys = Integer.parseInt(line); edMapKeys = new IntList(nKeys); edMapPosns = new IntList(nKeys + 1); // And read each key/size line int prevKey = 0; int pos = 0; for (int i = 0; i < nKeys; i++) { line = reader.readLine(); String[] tokens = splitPat.split(line); if (tokens.length != 2) throw new IOException("edmap file corrupt"); if (tokens[0].length() != 4) throw new IOException("edmap file corrupt"); int key = comboKey(tokens[0], 0, 1, 2, 3); assert key >= prevKey : "edmap file out of order or corrupt"; prevKey = key; edMapKeys.add(key); int size = Integer.parseInt(tokens[1]); edMapPosns.add(pos); pos += size; } reader.close(); if (edMapKeys.size() != nKeys) throw new IOException("edmap file index truncated"); // Make one extra position entry, and record the index start (as it's // the end of the last key entry) // edMapPosns.add(indexPos); } catch (NumberFormatException e) { throw new IOException("edmap file corrupt"); } // Make a charset decoder that will be used to decode the UTF-8 data edMapDecoder = Charset.forName("UTF-8").newDecoder(); // Finally, open a random-access version of the file for the actual // spellcheck process. // edMapFile = new RandomAccessFile(file, "r"); // Print stats if (debugWriter != null) { debugWriter.println( "EdMap index load time: " + (System.currentTimeMillis() - startTime)); debugWriter.println(" nKeys: " + edMapKeys.size()); } } /** Closes any open files and/or resources associated with the SpellReader */ public void close() throws IOException { if (edMapFile != null) { edMapFile.close(); edMapFile = null; } } /** Establishes a destination for detailed debugging output */ public void setDebugWriter(PrintWriter w) { debugWriter = w; } /** * Read the list of edit-map words for the given 4-character key. 
* * @param orig the original word being considered * @param key the 4-char key to look up * @param minFreq minimum frequency of words to be queued * @param checked set of words that have already been considered * @param queue receives the resulting words * @return true iff the key was found */ private boolean readEdKey(Word orig, int key, int minFreq, LongSet checked, WordQueue queue) throws IOException { // Look up this key in our index. int idxNum = edMapKeys.binarySearch(key); if (idxNum < 0) return false; // Read in the corresponding chunk of data int startPos = edMapPosns.get(idxNum); int endPos = edMapPosns.get(idxNum + 1); byte[] bytes = new byte[endPos - startPos]; edMapFile.seek(startPos); if (edMapFile.read(bytes) != bytes.length) throw new IOException("error reading from edMap file"); // Decode the string data from UTF-8 String line = edMapDecoder.decode(ByteBuffer.wrap(bytes)).toString().trim(); // Break up all the tokens, and validate the amount. String[] tokens = splitPat.split(line); if (tokens.length < 2) throw new IOException("edmap file corrupt"); // Make sure we got the right key! if (key != comboKey(tokens[0], 0, 1, 2, 3)) throw new IOException("edmap index incorrect"); // Record each word in the list (and their frequencies) String prev = null; for (int j = 1; j < tokens.length; j++) { String word = tokens[j]; // Handle prefix compression if (prev != null) { int overlap = word.charAt(0) - '0'; word = prev.substring(0, overlap) + word.substring(1); } prev = word; // Don't consider any word twice. long hash = Hash64.hash(word); if (checked.contains(hash)) continue; checked.add(hash); // If the frequency is too low, skip it. int freq = wordFreqs.get(hash); if (freq < minFreq) continue; // Eliminate suggestions that are too distant from the original. 
In // testing, this has the effect of increasing accuracy for the #1 // spot, and in general getting rid of many "ridiculous" suggestions, // but it does eliminate certain distant suggestions way down the // list. // if (orig.wordDist(word) > 4) continue; // Add the new word to the queue. Word w = new Word(orig, word, freq); queue.insert(w); } // All done. return true; } /** * Find words "close" to the given one, and add them to a queue. * In this case, "close" means that the first six characters have an * edit distance of 2 or less. Well, it means approximately that * anyway. * * More precisely, we iterate all possible 4-letter keys that can be * constructed by deleting two of the first six characters in the * word. For each key, we add all words that share it. */ private void findCloseWords(Word orig, int minFreq, WordQueue queue) throws IOException { LongSet checked = new LongSet(100); readEdKey(orig, comboKey(orig.word, 0, 1, 2, 3), minFreq, checked, queue); readEdKey(orig, comboKey(orig.word, 0, 1, 2, 4), minFreq, checked, queue); readEdKey(orig, comboKey(orig.word, 0, 1, 2, 5), minFreq, checked, queue); readEdKey(orig, comboKey(orig.word, 0, 1, 3, 4), minFreq, checked, queue); readEdKey(orig, comboKey(orig.word, 0, 1, 3, 5), minFreq, checked, queue); readEdKey(orig, comboKey(orig.word, 0, 1, 4, 5), minFreq, checked, queue); readEdKey(orig, comboKey(orig.word, 0, 2, 3, 4), minFreq, checked, queue); readEdKey(orig, comboKey(orig.word, 0, 2, 3, 5), minFreq, checked, queue); readEdKey(orig, comboKey(orig.word, 0, 2, 4, 5), minFreq, checked, queue); readEdKey(orig, comboKey(orig.word, 0, 3, 4, 5), minFreq, checked, queue); readEdKey(orig, comboKey(orig.word, 1, 2, 3, 4), minFreq, checked, queue); readEdKey(orig, comboKey(orig.word, 1, 2, 3, 5), minFreq, checked, queue); readEdKey(orig, comboKey(orig.word, 1, 2, 4, 5), minFreq, checked, queue); readEdKey(orig, comboKey(orig.word, 1, 3, 4, 5), minFreq, checked, queue); readEdKey(orig, comboKey(orig.word, 2, 3, 4, 
5), minFreq, checked, queue); } /** * Calculate a four letter key for the given word, by sticking together * characters from the given positions. */ private int comboKey(String word, int p0, int p1, int p2, int p3) { int[] ch = new int[4]; ch[0] = word.length() > p0 ? comboChar(word.charAt(p0)) : ' '; ch[1] = word.length() > p1 ? comboChar(word.charAt(p1)) : ' '; ch[2] = word.length() > p2 ? comboChar(word.charAt(p2)) : ' '; ch[3] = word.length() > p3 ? comboChar(word.charAt(p3)) : ' '; return (ch[0] << 24) | (ch[1] << 16) | (ch[2] << 8) | (ch[3] << 0); } private int comboChar(int c) { if (c >= 0x20 && (c & ~0x7f) == 0) return c; c = (char)((c & 0x7f) | 0x20); return (c == '|') ? '*' : c; } /** Check if the given word is in the spelling dictionary */ public boolean inDictionary(String word) throws IOException { return wordFreqs.get(word.toLowerCase()) > 0; } /** * Suggest similar words to a given original word, but not including the * word itself. */ public synchronized String[] suggestSimilar(String str, int numSugg) throws IOException { // Get suggestions, including the original word Word[] suggs = suggestSimilar(new Word(str), numSugg + 1, 1); // Make an array, not including the original word ArrayList<String> out = new ArrayList<String>(); for (int i = 0; i < suggs.length; i++) { if (suggs[i].word.equals(str)) continue; out.add(suggs[i].word); } return out.toArray(new String[out.size()]); } /** * Suggest similar words to a given original word. A minimum frequency limit * is enforced. */ private Word[] suggestSimilar(Word word, int numSugg, int minFreq) throws IOException { int queueSize = numSugg + 10; final WordQueue queue = new WordQueue(queueSize); // Find all words that are close to the original and queue them. 
findCloseWords(word, minFreq, queue); // Pop everything out of the queue and convert to an array Word[] array = new Word[Math.min(numSugg, queue.size())]; if (debugWriter != null) debugWriter.println(" Consider: "); for (int i = queue.size() - 1; i >= 0; i--) { Word sugg = (Word)queue.pop(); if (debugWriter != null) { debugWriter.print(" "); sugg.debug(debugWriter); } if (i < array.length) array[i] = sugg; } if (debugWriter != null) { debugWriter.println(" Final suggestion(s):"); for (int i = 0; i < array.length; i++) { debugWriter.print(" "); array[i].debug(debugWriter); } } return array; } /** * Keyword-oriented spelling suggestion mechanism. For an ordered list of * terms, come up with suggestions that have a good chance of improving * the precision and/or recall. * * @param terms Ordered list of query terms * @return One suggestion per term. If unchanged, there * was no better suggestion. If null, it is * suggested that the term be deleted. * If the array returned is null, there were * no suggestions at all. */ public synchronized String[] suggestKeywords(String[] terms) throws IOException { // No terms? Then we can't suggest anything. if (terms.length == 0) return null; // Must have already opened frequency data file. assert pairFreqs != null; // Start with a null change, but reduce its score so we hopefully end // up suggesting something. // Phrase in = new Phrase(); in.words = new Word[terms.length]; in.baseScore = -0.2f; for (int i = 0; i < terms.length; i++) in.words[i] = new Word(null, terms[i].toLowerCase(), wordFreqs.get(terms[i].toLowerCase())); in.calcScore(); if (debugWriter != null) { debugWriter.append("Original: "); in.calcScore(debugWriter); } // If there's just one word, our work is simple: just find the best // replacement for that word. 
// Phrase bestPhrase = in; if (terms.length == 1) { bestPhrase = max(bestPhrase, subWord(in, 0)); bestPhrase = max(bestPhrase, subSplit(in, 0)); } else bestPhrase = subPairs(in); if (debugWriter != null) { debugWriter.append(" Final : "); bestPhrase.calcScore(debugWriter); } // Convert to a string array, and recover the original case mapping. Also, // if any requivalent replacements were made, just use the original word. // String[] out = bestPhrase.toStringArray(); boolean anyChange = false; for (int i = 0; i < out.length; i++) { if (out[i] == null) { anyChange = true; continue; } if (wordEquiv.isEquivalent(terms[i], out[i])) out[i] = terms[i]; else { anyChange = true; out[i] = StringUtil.copyCase(terms[i], out[i]); } } if (debugWriter != null) debugWriter.flush(); // If no changes were made, signal that to the caller. if (!anyChange) return null; return out; } // suggestKeywords() /** * Substitute a single word at the given position, trying to improve the score. * * @param in the best we've done so far * @param pos position to substitute at * @return the best we can do at that position */ private Phrase subWord(Phrase in, int pos) throws IOException { // Don't suggest anything for stop words (which aren't in the dictionary) if (stopSet != null && stopSet.contains(in.words[pos].word)) return in; // Get a suggestion for replacing the word. int origFreq = wordFreqs.get(in.words[pos].word); Word[] suggs = suggestSimilar(in.words[pos], 1, origFreq + 1); if (suggs.length == 0) return in; Word sugg = suggs[0]; assert !sugg.word.equals(in.words[pos].word); // If no improvement, return the original. if (sugg == in.words[pos]) return in; // If the word is "equivalent" (e.g. just a change of plurality) then // just return the original. // if (wordEquiv.isEquivalent(sugg.word, in.words[pos].word)) return in; // Make a new phrase. 
Phrase out = (Phrase)in.clone(); out.words[0] = sugg; out.calcScore(); return max(in, out); } /** * Return the better of two phrases (an original phase vs. a test phrase). * If a debug stream has been specified, output debug info too. */ private Phrase max(Phrase orig, Phrase test) throws IOException { // Output debugging info if (debugWriter != null && test.score != orig.score) { debugWriter.append( (test.score > orig.score) ? " Better: " : " Worse : "); test.calcScore(debugWriter); } // Now pick the best one and return it. if (test.score > orig.score) return test; else return orig; } /** * Consider pair-wise changes at each position. */ private Phrase subPairs(Phrase in) throws IOException { Phrase bestPhrase = in; // Consider two-word changes at each position, but skip stop-words. for (int pass = 1; pass <= 2; pass++) { if (debugWriter != null) { debugWriter.println(" ---- Pass " + pass + " ----"); debugWriter.print(" Starting with: "); in.calcScore(debugWriter); } int prev = -1; for (int i = 0; i < in.words.length; i++) { Word w = in.words[i]; // Skip words removed by joining if (w == null) continue; // Skip stop words if (stopSet != null && stopSet.contains(w.word)) continue; // Skip words that are the product of splitting. if (w.word.indexOf(' ') >= 0) continue; // Consider operations on a single word (as long as we haven't changed // this word already) // if (in.words[i].orig == in.words[i]) bestPhrase = max(bestPhrase, subSplit(in, i)); // Consider operations on multiple words (as long as we haven't changed // both of them already.) // if (prev >= 0) { if (in.words[i].orig == in.words[i] || in.words[prev].orig == in.words[prev]) { bestPhrase = max(bestPhrase, subPair(in, prev, i)); bestPhrase = max(bestPhrase, subJoin(in, prev, i)); } } prev = i; } if (in == bestPhrase) break; in = bestPhrase; } return bestPhrase; } /** * Consider a set of changes to the pair of words at the given position. 
* * @param in the current best we've found * @param pos1 first position to consider * @param pos2 second position to consider * @return new best */ private Phrase subPair(Phrase in, int pos1, int pos2) throws IOException { Word word1 = in.words[pos1]; Word word2 = in.words[pos2]; if (debugWriter != null) { debugWriter.println( " subPair(" + pos1 + ", " + pos2 + "): " + in.words[pos1].word + " " + in.words[pos2].word); } // Get a list of independent suggestions for both words. If we've already // made a choice, don't override it. // final int NUM_SUG = 100; Word[] list1 = (word1.orig == word1) ? suggestSimilar(word1, NUM_SUG, 0) : null; Word[] list2 = (word2.orig == word2) ? suggestSimilar(word2, NUM_SUG, 0) : null; // If either list is empty, substitute the original. if (list1 == null || list1.length == 0) list1 = new Word[] { in.words[pos1] }; if (list2 == null || list2.length == 0) list2 = new Word[] { in.words[pos2] }; // Now score all possible combinations, looking for the best one. float bestScore = 0.0f; Word bestSugg1 = null; Word bestSugg2 = null; for (int p1 = 0; p1 < list1.length; p1++) { Word sugg1 = list1[p1]; boolean change1 = !wordEquiv.isEquivalent(in.words[pos1].word, sugg1.word); if (!change1) sugg1 = word1; for (int p2 = 0; p2 < list2.length; p2++) { Word sugg2 = list2[p2]; boolean change2 = !wordEquiv.isEquivalent(in.words[pos2].word, sugg2.word); if (!change2) sugg2 = word2; // Change at least one word if (!change1 && !change2) continue; float pairScore = scorePair(sugg1, sugg2); float totalScore = pairScore + sugg1.score + sugg2.score; if (debugWriter != null) { debugWriter.format( " Pair-replace \"%s %s\" with \"%s %s\": %.2f (%.2f + %.2f + %.2f)\n", word1, word2, sugg1, sugg2, totalScore, pairScore, sugg1.score, sugg2.score); } if (totalScore > bestScore) { bestScore = totalScore; bestSugg1 = sugg1; bestSugg2 = sugg2; } } } // If we couldn't find any pair that results in improvement, do nothing. 
if (bestSugg1 == null) return in; // If we found something better than doing nothing, record it. Phrase bestPhrase = (Phrase)in.clone(); bestPhrase.words[pos1] = bestSugg1; bestPhrase.words[pos2] = bestSugg2; bestPhrase.calcScore(); return bestPhrase; } /** * Consider splitting a word */ private Phrase subSplit(Phrase in, int pos) throws IOException { Phrase bestPhrase = in; // Only consider splits where both pieces are >= 2 chars in length. String origStr = in.words[pos].word; for (int i = 2; i < origStr.length() - 1; i++) { // Extract the pieces String leftStr = origStr.substring(0, i); String rightStr = origStr.substring(i); // Make sure both parts are real words int leftFreq = wordFreqs.get(leftStr); int rightFreq = wordFreqs.get(rightStr); if (leftFreq <= 0 || rightFreq <= 0) continue; // Get the frequency. It must be greater than the original. int pairFreq = pairFreqs.get(leftStr, rightStr); if (debugWriter != null) { debugWriter.format(" split-replace: '%s' with '%s' '%s': freq %d\n", origStr, leftStr, rightStr, pairFreq); } // Okay, this is a candidate. 
Score it for real Phrase testPhrase = (Phrase)in.clone(); testPhrase.words[pos] = new Word(in.words[pos], leftStr + " " + rightStr, pairFreq + 1); testPhrase.calcScore(); bestPhrase = max(bestPhrase, testPhrase); } return bestPhrase; } /** * Consider joining the first two words together */ private Phrase subJoin(Phrase in, int pos1, int pos2) throws IOException { Word origWord = new Word(in.words[pos1].word + " " + in.words[pos2].word); int origFreq = pairFreqs.get(in.words[pos1].word, in.words[pos2].word); String joinedStr = in.words[pos1].word + in.words[pos2].word; int joinedFreq = wordFreqs.get(joinedStr); if (joinedFreq == 0) return in; if (debugWriter != null) { debugWriter.format(" join-replace: \"%s %s\" with \"%s\": freq %d\n", in.words[pos1].word, in.words[pos2].word, joinedStr, joinedFreq); } if (joinedFreq <= origFreq) return in; Phrase testPhrase = (Phrase)in.clone(); testPhrase.words[pos1] = new Word(origWord, joinedStr, joinedFreq); testPhrase.words[pos1].score = in.words[pos1].score + in.words[pos2].score + scorePair(in.words[pos1], in.words[pos2]); testPhrase.words[pos2] = null; testPhrase.calcScore(); return testPhrase; } /** * Calculate a score for a suggested replacement for a given word. */ private float scorePair(Word sugg1, Word sugg2) throws IOException { int origPairFreq = pairFreqs.get(sugg1.orig.word, sugg2.orig.word); int suggPairFreq = pairFreqs.get(sugg1.word, sugg2.word); if (suggPairFreq <= origPairFreq) return 0.0f; double freqFactor = (suggPairFreq + 1.0) / (origPairFreq + 1.0); float freqBoost = (float)(Math.log(freqFactor) / Math.log(100.0)) / 2.0f; return freqBoost; } /** Get the term frequency sample array for our dictionary. 
*/ private void loadFreqSamples(File spellDir) throws IOException { // Default if no frequencies found will be to turn off frequency boosting int[] res = new int[5]; res[0] = res[1] = res[2] = res[3] = res[4] = Integer.MAX_VALUE; // Find the frequency samples file and open it File freqSamplesFile = new File(spellDir, "freqSamples.dat"); if (!freqSamplesFile.canRead()) throw new IOException( "Cannot open frequency samples file '" + freqSamplesFile + "'"); BufferedReader reader = new BufferedReader(new FileReader(freqSamplesFile)); int nSamples = 0; int[] samples = null; try { // If there were less than 500 terms to sample, turn off frequency // boosting. // int nTerms = Integer.parseInt(reader.readLine()); if (nTerms >= 500) { // Read in the samples. nSamples = Integer.parseInt(reader.readLine()); samples = new int[nSamples]; for (int i = 0; i < nSamples; i++) samples[i] = Integer.parseInt(reader.readLine()); } } catch (NumberFormatException e) { throw new IOException("term frequencies file corrupt"); } finally { reader.close(); } // Pick out the levels of most interest to us if (samples != null) { res[0] = samples[(int)(nSamples * 0.99)]; // top 1% res[1] = samples[(int)(nSamples * 0.90)]; // top 10% res[2] = samples[(int)(nSamples * 0.50)]; // top 50% res[3] = samples[(int)(nSamples * 0.25)]; // top 75% res[4] = samples[0]; // all above-avg words } // All done. freqSamples = res; } /** Get the term frequency sample array for our dictionary. */ private void loadWordFreqs(File spellDir) throws IOException { // Find the word frequency file and open it File freqFile = new File(spellDir, "words.dat"); if (!freqFile.canRead()) throw new IOException("Cannot open word frequency file '" + freqFile + "'"); // Read in each word and its frequency. 
wordFreqs = new FreqData(); BufferedReader reader = new BufferedReader(new FileReader(freqFile)); try { while (true) { String line = reader.readLine(); if (line == null) break; String[] toks = splitPat.split(line); String word = toks[0]; int freq = Integer.parseInt(toks[1]); wordFreqs.add(word, freq); } } catch (NumberFormatException e) { throw new IOException("term frequencies file corrupt"); } finally { reader.close(); } } private void openPairFreqs(File spellDir) throws IOException { if (pairFreqs == null) { pairFreqs = new FreqData(); pairFreqs.add(new File(spellDir, "pairs.dat")); } } protected void finalize() throws Throwable { close(); } private String calcMetaphone(String word) { String mph = SpellWriter.calcMetaphone(word); if (mph == null) return ""; return mph; } /** * Keeps track of a single word, either an original or suggested word. */ private final class Word { public String word; public Word orig; public int freq; public String metaphone; private TRStringDistance2 wordDist; private TRStringDistance2 mphDist; public float score; public float freqBoost; /** Contructor for original words */ public Word(String word) throws IOException { this(null, word, 0); } /** Constructor for suggested replacement words */ public Word(Word inOrig, String word, int freq) throws IOException { this.word = word; this.orig = (inOrig == null) ? this : inOrig; this.freq = freq; metaphone = calcMetaphone(word); wordDist = mphDist = null; // lazily created if necessary // If equivalent to the original word, inherit the score. if (orig != this && wordEquiv.isEquivalent(word, orig.word)) { freqBoost = orig.freqBoost; score = orig.score; return; } // Calculate the edit distance and turn it into the base score float dist = orig.wordDist(word) / 2.0f; score = 1.0f - (dist / orig.length()); // If the metaphone matches, nudge the score if (metaphone.equals(orig.metaphone)) score += 0.1f; // If the first and last letters match, nudge the score. 
if (word.length() > 0 && orig.word.length() > 0 && word.charAt(0) == orig.word.charAt(0) && word.charAt(word.length() - 1) == orig.word.charAt( orig.word.length() - 1)) score += 0.1f; // If this word is more frequent than normal, give it a nudge up. freqBoost = calcFreqBoost(freqSamples, freq); score += freqBoost; } public int length() { return word.length(); } public boolean equals(Word other) { return word.equals(other.word); } public int wordDist(String other) { if (wordDist == null) wordDist = new TRStringDistance2(word); return wordDist.getDistance(other); } public int mphDist(String other) { if (mphDist == null) mphDist = new TRStringDistance2(metaphone); return mphDist.getDistance(other); } public String toString() { return word; } /** Dump debugging output about this word */ public void debug(PrintWriter w) { align(w, "word=" + word + "[" + orig.wordDist(word) + "]", 22); align(w, "mph=" + metaphone + "[" + orig.mphDist(metaphone) + "]", 13); align(w, "freq=" + freq, 12); // If equivalent to the original word, inherit the score. if (orig != this && wordEquiv.isEquivalent(word, orig.word)) { align(w, "copyScore=" + orig.score, 20); w.println(); return; } // Calculate the edit distance and turn it into the base score float dist = orig.wordDist(word) / 2.0f; align(w, "base=" + (1.0f - (dist / orig.length())), 14); // If the metaphone matches, nudge the score String mphStr = "0"; if (metaphone.equals(orig.metaphone)) mphStr = "0.1"; align(w, "mphBoost=" + mphStr, 13); // If the first and last letters match, nudge the score. String matchStr = "0"; if (word.charAt(0) == orig.word.charAt(0) && word.charAt(word.length() - 1) == orig.word.charAt( orig.word.length() - 1)) matchStr = "" + 0.1f; align(w, "matchBoost=" + matchStr, 15); // If any frequency boost appplied, print it. 
align(w, "freqBoost=" + freqBoost, 20); // Total score align(w, "totalScore=" + score, 22); w.println(); } private void align(PrintWriter w, String s, int width) { w.print(s); for (int i = 0; i < (width - s.length()); i++) w.print(" "); w.print(" "); } /** * Calculate a boost factor based on the frequency of a term. */ private float calcFreqBoost(int[] termFreqs, int freq) { if (freq == 0) return -0.2f; // If this word is more frequent than normal, give it a nudge up. int i = 0; while (i < 5 && freq < termFreqs[i]) i++; if (i == 0) return 0.25f; int loFreq = (i < 5) ? termFreqs[i] : 0; int hiFreq = termFreqs[i - 1]; float loBoost = (5 - i) * 0.05f; float boost = (((freq - loFreq) * 50 / (hiFreq - loFreq)) / 1000.0f) + loBoost; return boost; } } /** * Queue of words, ordered by score and then frequency */ private static final class WordQueue extends PriorityQueue { WordQueue(int size) { initialize(size); } protected final boolean lessThan(Object a, Object b) { Word wa = (Word)a; Word wb = (Word)b; //first criteria: the edit distance if (wa.score > wb.score) return false; if (wa.score < wb.score) return true; //second criteria (if first criteria is equal): the popularity if (wa.freq > wb.freq) return false; if (wa.freq < wb.freq) return true; return false; } } /** * Track an ordered group of words. 
*/ private class Phrase implements Cloneable { Word[] words; float baseScore = 0.0f; float score; public Object clone() { try { Phrase out = (Phrase)super.clone(); out.words = new Word[words.length]; out.baseScore = 0.0f; System.arraycopy(words, 0, out.words, 0, words.length); return out; } catch (CloneNotSupportedException e) { return null; } } public void calcScore() throws IOException { calcScore(null); } public void calcScore(PrintWriter debugWriter) throws IOException { float wordScore = 0.0f; float pairScore = 0.0f; int prev = -1; for (int i = 0; i < words.length; i++) { // Skip words that have been removed by joining if (words[i] == null) continue; // Skip stop words if (stopSet != null && stopSet.contains(words[i].word.toLowerCase())) { if (debugWriter != null) debugWriter.append(words[i].word + " "); continue; } // Okay, score it. wordScore += words[i].score; // Do pair scoring, except for words created by splitting if (prev >= 0 && words[i].word.indexOf(' ') < 0) { pairScore += scorePair(words[prev], words[i]); if (debugWriter != null) debugWriter.format("+%.2f ", scorePair(words[prev], words[i])); } prev = i; // Print the word after the pair score (if any) if (debugWriter != null) debugWriter.format("%s[%.2f] ", words[i].word, words[i].score); } score = baseScore + wordScore + pairScore; if (debugWriter != null) { if (baseScore != 0.0f) debugWriter.format("... base: %.2f ", baseScore); debugWriter.format("... Total: %.2f\n", score); } } public String[] toStringArray() { String[] out = new String[words.length]; for (int i = 0; i < words.length; i++) out[i] = (words[i] == null) ? null : words[i].word; return out; } } } // class SpellReader