package org.apache.lucene.spelt;

/**
 * Copyright 2006-2007 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * Acknowledgements:
 *
 * A significant amount of new and/or modified code in this module
 * was made possible by a grant from the Andrew W. Mellon Foundation,
 * as part of the Melvyl Recommender Project.
 */

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.util.ArrayList;
import java.util.Set;
import java.util.regex.Pattern;
import org.apache.lucene.util.Hash64;
import org.apache.lucene.util.IntList;
import org.apache.lucene.util.LongSet;
import org.apache.lucene.util.PriorityQueue;
import org.apache.lucene.util.StringUtil;

/**
 * <p>
 * Reads a spelling dictionary created by {@link SpellWriter}, and provides
 * fast single- and multi-word spelling suggestions. Typical usage:
 * </p>
 * <ol>
 * <li>First, {@linkplain #open(File) open} a new reader.</li>
 * <li>For each potentially misspelled query, gather the keywords
 *     and {@linkplain #suggestKeywords(String[]) get suggestions}
 *     for them.
 * </li>
 * <li>When done with all queries, {@linkplain #close()} the reader.
* </ol> * <p> * Inspired by and very distantly based on Nicolas Maisonneuve / David Spencer * code. * </p> * * @author Martin Haye */ public class SpellReader { /** Keys in the edit map file */ private IntList edMapKeys; /** Positions in the edit map file */ private IntList edMapPosns; /** File for reading edit map entries */ private RandomAccessFile edMapFile; /** Charset decoder for reading edit map entries */ private CharsetDecoder edMapDecoder; /** Pair frequency data */ private FreqData pairFreqs; /** Word frequency data */ private FreqData wordFreqs; /** Frequencies from the term data, sampled at 5 levels */ private int[] freqSamples; /** Where to send debugging info (or null for none) */ private PrintWriter debugWriter = null; /** Pattern used for splitting up lines delimited by bars */ private final Pattern splitPat = Pattern.compile("\\||\n"); /** Set of stop-words to use during spell correction, or null for none */ private Set stopSet; /** Word equivalency checker */ private WordEquiv wordEquiv; /** Private constructor -- use {@link #open(File)} instead. */ private SpellReader() { } /** Check if there's a valid dictionary in the given directory */ public static boolean isValidDictionary(File spellDir) { if (!spellDir.isDirectory() || !spellDir.canRead()) return false; File file = new File(spellDir, "pairs.dat"); return file.canRead(); } /** * Open a reader for the given spelling index directory. Does no stop word * processing, and uses default word equivalency (just case insensitive.) * To specify a stopword set (which you must if you did when building the * dictionary), call {@link #setStopwords(Set)}. To specify a non-default * word equivalency, call {@link #setWordEquiv(WordEquiv)}. 
* * @param spellDir directory containing the spelling dictionary */ public static SpellReader open(File spellDir) throws IOException { SpellReader reader = new SpellReader(); reader.stopSet = null; reader.wordEquiv = WordEquiv.DEFAULT; reader.openEdmap(spellDir); reader.loadFreqSamples(spellDir); reader.loadWordFreqs(spellDir); reader.openPairFreqs(spellDir); return reader; } /** * Establishes a list of stopwords (e.g. "the", "and", "an", etc.). This * list should be identical to that which was used to create the * dictionary. * * @param set Set of stop-words; all should be lower-case. */ public void setStopwords(Set set) { this.stopSet = set; } /** * Establishes a word equivalency checker. This is used to prevent the * correction algorithm from making suggestions that won't change the * query result. For instance, if words in the main index are all * converted from plural to singular, it would be silly for the checker * to suggest "cats" to replace "cat". * * @param eq the equivalency checker to use */ public void setWordEquiv(WordEquiv eq) { this.wordEquiv = eq; } /** Read the index for the edit map file */ private void openEdmap(File spellDir) throws IOException { long startTime = System.currentTimeMillis(); File file = new File(spellDir, "edmap.dat"); try { // First, open the map file. At the end, we'll find the position of the index. FileInputStream in = new FileInputStream(file); in.skip(file.length() - 20); BufferedReader reader = new BufferedReader(new InputStreamReader(in)); String line = reader.readLine(); int indexPos = Integer.parseInt(line.trim()); // Now re-open and read the index. reader.close(); in = new FileInputStream(file); in.skip(indexPos); reader = new BufferedReader(new InputStreamReader(in)); // Check that we're really looking at a valid index line = reader.readLine(); if (!line.equals("edMap index")) throw new IOException("edmap file corrupt"); // Find out how many keys there are and allocate our lists. 
line = reader.readLine(); int nKeys = Integer.parseInt(line); edMapKeys = new IntList(nKeys); edMapPosns = new IntList(nKeys + 1); // And read each key/size line int prevKey = 0; int pos = 0; for (int i = 0; i < nKeys; i++) { line = reader.readLine(); String[] tokens = splitPat.split(line); if (tokens.length != 2) throw new IOException("edmap file corrupt"); if (tokens[0].length() != 4) throw new IOException("edmap file corrupt"); int key = comboKey(tokens[0], 0, 1, 2, 3); assert key >= prevKey : "edmap file out of order or corrupt"; prevKey = key; edMapKeys.add(key); int size = Integer.parseInt(tokens[1]); edMapPosns.add(pos); pos += size; } reader.close(); if (edMapKeys.size() != nKeys) throw new IOException("edmap file index truncated"); // Make one extra position entry, and record the index start (as it's // the end of the last key entry) // edMapPosns.add(indexPos); } catch (NumberFormatException e) { throw new IOException("edmap file corrupt"); } // Make a charset decoder that will be used to decode the UTF-8 data edMapDecoder = Charset.forName("UTF-8").newDecoder(); // Finally, open a random-access version of the file for the actual // spellcheck process. // edMapFile = new RandomAccessFile(file, "r"); // Print stats if (debugWriter != null) { debugWriter.println( "EdMap index load time: " + (System.currentTimeMillis() - startTime)); debugWriter.println(" nKeys: " + edMapKeys.size()); } } /** Closes any open files and/or resources associated with the SpellReader */ public void close() throws IOException { if (edMapFile != null) { edMapFile.close(); edMapFile = null; } } /** Establishes a destination for detailed debugging output */ public void setDebugWriter(PrintWriter w) { debugWriter = w; } /** * Read the list of edit-map words for the given 4-character key. 
* * @param orig the original word being considered * @param key the 4-char key to look up * @param minFreq minimum frequency of words to be queued * @param checked set of words that have already been considered * @param queue receives the resulting words * @return true iff the key was found */ private boolean readEdKey(Word orig, int key, int minFreq, LongSet checked, WordQueue queue) throws IOException { // Look up this key in our index. int idxNum = edMapKeys.binarySearch(key); if (idxNum < 0) return false; // Read in the corresponding chunk of data int startPos = edMapPosns.get(idxNum); int endPos = edMapPosns.get(idxNum + 1); byte[] bytes = new byte[endPos - startPos]; edMapFile.seek(startPos); if (edMapFile.read(bytes) != bytes.length) throw new IOException("error reading from edMap file"); // Decode the string data from UTF-8 String line = edMapDecoder.decode(ByteBuffer.wrap(bytes)).toString().trim(); // Break up all the tokens, and validate the amount. String[] tokens = splitPat.split(line); if (tokens.length < 2) throw new IOException("edmap file corrupt"); // Make sure we got the right key! if (key != comboKey(tokens[0], 0, 1, 2, 3)) throw new IOException("edmap index incorrect"); // Record each word in the list (and their frequencies) String prev = null; for (int j = 1; j < tokens.length; j++) { String word = tokens[j]; // Handle prefix compression if (prev != null) { int overlap = word.charAt(0) - '0'; word = prev.substring(0, overlap) + word.substring(1); } prev = word; // Don't consider any word twice. long hash = Hash64.hash(word); if (checked.contains(hash)) continue; checked.add(hash); // If the frequency is too low, skip it. int freq = wordFreqs.get(hash); if (freq < minFreq) continue; // Eliminate suggestions that are too distant from the original. 
In // testing, this has the effect of increasing accuracy for the #1 // spot, and in general getting rid of many "ridiculous" suggestions, // but it does eliminate certain distant suggestions way down the // list. // if (orig.wordDist(word) > 4) continue; // Add the new word to the queue. Word w = new Word(orig, word, freq); queue.insert(w); } // All done. return true; } /** * Find words "close" to the given one, and add them to a queue. * In this case, "close" means that the first six characters have an * edit distance of 2 or less. Well, it means approximately that * anyway. * * More precisely, we iterate all possible 4-letter keys that can be * constructed by deleting two of the first six characters in the * word. For each key, we add all words that share it. */ private void findCloseWords(Word orig, int minFreq, WordQueue queue) throws IOException { LongSet checked = new LongSet(100); readEdKey(orig, comboKey(orig.word, 0, 1, 2, 3), minFreq, checked, queue); readEdKey(orig, comboKey(orig.word, 0, 1, 2, 4), minFreq, checked, queue); readEdKey(orig, comboKey(orig.word, 0, 1, 2, 5), minFreq, checked, queue); readEdKey(orig, comboKey(orig.word, 0, 1, 3, 4), minFreq, checked, queue); readEdKey(orig, comboKey(orig.word, 0, 1, 3, 5), minFreq, checked, queue); readEdKey(orig, comboKey(orig.word, 0, 1, 4, 5), minFreq, checked, queue); readEdKey(orig, comboKey(orig.word, 0, 2, 3, 4), minFreq, checked, queue); readEdKey(orig, comboKey(orig.word, 0, 2, 3, 5), minFreq, checked, queue); readEdKey(orig, comboKey(orig.word, 0, 2, 4, 5), minFreq, checked, queue); readEdKey(orig, comboKey(orig.word, 0, 3, 4, 5), minFreq, checked, queue); readEdKey(orig, comboKey(orig.word, 1, 2, 3, 4), minFreq, checked, queue); readEdKey(orig, comboKey(orig.word, 1, 2, 3, 5), minFreq, checked, queue); readEdKey(orig, comboKey(orig.word, 1, 2, 4, 5), minFreq, checked, queue); readEdKey(orig, comboKey(orig.word, 1, 3, 4, 5), minFreq, checked, queue); readEdKey(orig, comboKey(orig.word, 2, 3, 4, 
5), minFreq, checked, queue); } /** * Calculate a four letter key for the given word, by sticking together * characters from the given positions. */ private int comboKey(String word, int p0, int p1, int p2, int p3) { int[] ch = new int[4]; ch[0] = word.length() > p0 ? comboChar(word.charAt(p0)) : ' '; ch[1] = word.length() > p1 ? comboChar(word.charAt(p1)) : ' '; ch[2] = word.length() > p2 ? comboChar(word.charAt(p2)) : ' '; ch[3] = word.length() > p3 ? comboChar(word.charAt(p3)) : ' '; return (ch[0] << 24) | (ch[1] << 16) | (ch[2] << 8) | (ch[3] << 0); } private int comboChar(int c) { if (c >= 0x20 && (c & ~0x7f) == 0) return c; c = (char)((c & 0x7f) | 0x20); return (c == '|') ? '*' : c; } /** Check if the given word is in the spelling dictionary */ public boolean inDictionary(String word) throws IOException { return wordFreqs.get(word.toLowerCase()) > 0; } /** * Suggest similar words to a given original word, but not including the * word itself. */ public synchronized String[] suggestSimilar(String str, int numSugg) throws IOException { // Get suggestions, including the original word Word[] suggs = suggestSimilar(new Word(str), numSugg + 1, 1); // Make an array, not including the original word ArrayList<String> out = new ArrayList<String>(); for (int i = 0; i < suggs.length; i++) { if (suggs[i].word.equals(str)) continue; out.add(suggs[i].word); } return out.toArray(new String[out.size()]); } /** * Suggest similar words to a given original word. A minimum frequency limit * is enforced. */ private Word[] suggestSimilar(Word word, int numSugg, int minFreq) throws IOException { int queueSize = numSugg + 10; final WordQueue queue = new WordQueue(queueSize); // Find all words that are close to the original and queue them. 
findCloseWords(word, minFreq, queue); // Pop everything out of the queue and convert to an array Word[] array = new Word[Math.min(numSugg, queue.size())]; if (debugWriter != null) debugWriter.println(" Consider: "); for (int i = queue.size() - 1; i >= 0; i--) { Word sugg = (Word)queue.pop(); if (debugWriter != null) { debugWriter.print(" "); sugg.debug(debugWriter); } if (i < array.length) array[i] = sugg; } if (debugWriter != null) { debugWriter.println(" Final suggestion(s):"); for (int i = 0; i < array.length; i++) { debugWriter.print(" "); array[i].debug(debugWriter); } } return array; } /** * Keyword-oriented spelling suggestion mechanism. For an ordered list of * terms, come up with suggestions that have a good chance of improving * the precision and/or recall. * * @param terms Ordered list of query terms * @return One suggestion per term. If unchanged, there * was no better suggestion. If null, it is * suggested that the term be deleted. * If the array returned is null, there were * no suggestions at all. */ public synchronized String[] suggestKeywords(String[] terms) throws IOException { // No terms? Then we can't suggest anything. if (terms.length == 0) return null; // Must have already opened frequency data file. assert pairFreqs != null; // Start with a null change, but reduce its score so we hopefully end // up suggesting something. // Phrase in = new Phrase(); in.words = new Word[terms.length]; in.baseScore = -0.2f; for (int i = 0; i < terms.length; i++) in.words[i] = new Word(null, terms[i].toLowerCase(), wordFreqs.get(terms[i].toLowerCase())); in.calcScore(); if (debugWriter != null) { debugWriter.append("Original: "); in.calcScore(debugWriter); } // If there's just one word, our work is simple: just find the best // replacement for that word. 
// Phrase bestPhrase = in; if (terms.length == 1) { bestPhrase = max(bestPhrase, subWord(in, 0)); bestPhrase = max(bestPhrase, subSplit(in, 0)); } else bestPhrase = subPairs(in); if (debugWriter != null) { debugWriter.append(" Final : "); bestPhrase.calcScore(debugWriter); } // Convert to a string array, and recover the original case mapping. Also, // if any requivalent replacements were made, just use the original word. // String[] out = bestPhrase.toStringArray(); boolean anyChange = false; for (int i = 0; i < out.length; i++) { if (out[i] == null) { anyChange = true; continue; } if (wordEquiv.isEquivalent(terms[i], out[i])) out[i] = terms[i]; else { anyChange = true; out[i] = StringUtil.copyCase(terms[i], out[i]); } } if (debugWriter != null) debugWriter.flush(); // If no changes were made, signal that to the caller. if (!anyChange) return null; return out; } // suggestKeywords() /** * Substitute a single word at the given position, trying to improve the score. * * @param in the best we've done so far * @param pos position to substitute at * @return the best we can do at that position */ private Phrase subWord(Phrase in, int pos) throws IOException { // Don't suggest anything for stop words (which aren't in the dictionary) if (stopSet != null && stopSet.contains(in.words[pos].word)) return in; // Get a suggestion for replacing the word. int origFreq = wordFreqs.get(in.words[pos].word); Word[] suggs = suggestSimilar(in.words[pos], 1, origFreq + 1); if (suggs.length == 0) return in; Word sugg = suggs[0]; assert !sugg.word.equals(in.words[pos].word); // If no improvement, return the original. if (sugg == in.words[pos]) return in; // If the word is "equivalent" (e.g. just a change of plurality) then // just return the original. // if (wordEquiv.isEquivalent(sugg.word, in.words[pos].word)) return in; // Make a new phrase. 
Phrase out = (Phrase)in.clone(); out.words[0] = sugg; out.calcScore(); return max(in, out); } /** * Return the better of two phrases (an original phase vs. a test phrase). * If a debug stream has been specified, output debug info too. */ private Phrase max(Phrase orig, Phrase test) throws IOException { // Output debugging info if (debugWriter != null && test.score != orig.score) { debugWriter.append( (test.score > orig.score) ? " Better: " : " Worse : "); test.calcScore(debugWriter); } // Now pick the best one and return it. if (test.score > orig.score) return test; else return orig; } /** * Consider pair-wise changes at each position. */ private Phrase subPairs(Phrase in) throws IOException { Phrase bestPhrase = in; // Consider two-word changes at each position, but skip stop-words. for (int pass = 1; pass <= 2; pass++) { if (debugWriter != null) { debugWriter.println(" ---- Pass " + pass + " ----"); debugWriter.print(" Starting with: "); in.calcScore(debugWriter); } int prev = -1; for (int i = 0; i < in.words.length; i++) { Word w = in.words[i]; // Skip words removed by joining if (w == null) continue; // Skip stop words if (stopSet != null && stopSet.contains(w.word)) continue; // Skip words that are the product of splitting. if (w.word.indexOf(' ') >= 0) continue; // Consider operations on a single word (as long as we haven't changed // this word already) // if (in.words[i].orig == in.words[i]) bestPhrase = max(bestPhrase, subSplit(in, i)); // Consider operations on multiple words (as long as we haven't changed // both of them already.) // if (prev >= 0) { if (in.words[i].orig == in.words[i] || in.words[prev].orig == in.words[prev]) { bestPhrase = max(bestPhrase, subPair(in, prev, i)); bestPhrase = max(bestPhrase, subJoin(in, prev, i)); } } prev = i; } if (in == bestPhrase) break; in = bestPhrase; } return bestPhrase; } /** * Consider a set of changes to the pair of words at the given position. 
* * @param in the current best we've found * @param pos1 first position to consider * @param pos2 second position to consider * @return new best */ private Phrase subPair(Phrase in, int pos1, int pos2) throws IOException { Word word1 = in.words[pos1]; Word word2 = in.words[pos2]; if (debugWriter != null) { debugWriter.println( " subPair(" + pos1 + ", " + pos2 + "): " + in.words[pos1].word + " " + in.words[pos2].word); } // Get a list of independent suggestions for both words. If we've already // made a choice, don't override it. // final int NUM_SUG = 100; Word[] list1 = (word1.orig == word1) ? suggestSimilar(word1, NUM_SUG, 0) : null; Word[] list2 = (word2.orig == word2) ? suggestSimilar(word2, NUM_SUG, 0) : null; // If either list is empty, substitute the original. if (list1 == null || list1.length == 0) list1 = new Word[] { in.words[pos1] }; if (list2 == null || list2.length == 0) list2 = new Word[] { in.words[pos2] }; // Now score all possible combinations, looking for the best one. float bestScore = 0.0f; Word bestSugg1 = null; Word bestSugg2 = null; for (int p1 = 0; p1 < list1.length; p1++) { Word sugg1 = list1[p1]; boolean change1 = !wordEquiv.isEquivalent(in.words[pos1].word, sugg1.word); if (!change1) sugg1 = word1; for (int p2 = 0; p2 < list2.length; p2++) { Word sugg2 = list2[p2]; boolean change2 = !wordEquiv.isEquivalent(in.words[pos2].word, sugg2.word); if (!change2) sugg2 = word2; // Change at least one word if (!change1 && !change2) continue; float pairScore = scorePair(sugg1, sugg2); float totalScore = pairScore + sugg1.score + sugg2.score; if (debugWriter != null) { debugWriter.format( " Pair-replace \"%s %s\" with \"%s %s\": %.2f (%.2f + %.2f + %.2f)\n", word1, word2, sugg1, sugg2, totalScore, pairScore, sugg1.score, sugg2.score); } if (totalScore > bestScore) { bestScore = totalScore; bestSugg1 = sugg1; bestSugg2 = sugg2; } } } // If we couldn't find any pair that results in improvement, do nothing. 
if (bestSugg1 == null) return in; // If we found something better than doing nothing, record it. Phrase bestPhrase = (Phrase)in.clone(); bestPhrase.words[pos1] = bestSugg1; bestPhrase.words[pos2] = bestSugg2; bestPhrase.calcScore(); return bestPhrase; } /** * Consider splitting a word */ private Phrase subSplit(Phrase in, int pos) throws IOException { Phrase bestPhrase = in; // Only consider splits where both pieces are >= 2 chars in length. String origStr = in.words[pos].word; for (int i = 2; i < origStr.length() - 1; i++) { // Extract the pieces String leftStr = origStr.substring(0, i); String rightStr = origStr.substring(i); // Make sure both parts are real words int leftFreq = wordFreqs.get(leftStr); int rightFreq = wordFreqs.get(rightStr); if (leftFreq <= 0 || rightFreq <= 0) continue; // Get the frequency. It must be greater than the original. int pairFreq = pairFreqs.get(leftStr, rightStr); if (debugWriter != null) { debugWriter.format(" split-replace: '%s' with '%s' '%s': freq %d\n", origStr, leftStr, rightStr, pairFreq); } // Okay, this is a candidate. 
Score it for real Phrase testPhrase = (Phrase)in.clone(); testPhrase.words[pos] = new Word(in.words[pos], leftStr + " " + rightStr, pairFreq + 1); testPhrase.calcScore(); bestPhrase = max(bestPhrase, testPhrase); } return bestPhrase; } /** * Consider joining the first two words together */ private Phrase subJoin(Phrase in, int pos1, int pos2) throws IOException { Word origWord = new Word(in.words[pos1].word + " " + in.words[pos2].word); int origFreq = pairFreqs.get(in.words[pos1].word, in.words[pos2].word); String joinedStr = in.words[pos1].word + in.words[pos2].word; int joinedFreq = wordFreqs.get(joinedStr); if (joinedFreq == 0) return in; if (debugWriter != null) { debugWriter.format(" join-replace: \"%s %s\" with \"%s\": freq %d\n", in.words[pos1].word, in.words[pos2].word, joinedStr, joinedFreq); } if (joinedFreq <= origFreq) return in; Phrase testPhrase = (Phrase)in.clone(); testPhrase.words[pos1] = new Word(origWord, joinedStr, joinedFreq); testPhrase.words[pos1].score = in.words[pos1].score + in.words[pos2].score + scorePair(in.words[pos1], in.words[pos2]); testPhrase.words[pos2] = null; testPhrase.calcScore(); return testPhrase; } /** * Calculate a score for a suggested replacement for a given word. */ private float scorePair(Word sugg1, Word sugg2) throws IOException { int origPairFreq = pairFreqs.get(sugg1.orig.word, sugg2.orig.word); int suggPairFreq = pairFreqs.get(sugg1.word, sugg2.word); if (suggPairFreq <= origPairFreq) return 0.0f; double freqFactor = (suggPairFreq + 1.0) / (origPairFreq + 1.0); float freqBoost = (float)(Math.log(freqFactor) / Math.log(100.0)) / 2.0f; return freqBoost; } /** Get the term frequency sample array for our dictionary. 
*/ private void loadFreqSamples(File spellDir) throws IOException { // Default if no frequencies found will be to turn off frequency boosting int[] res = new int[5]; res[0] = res[1] = res[2] = res[3] = res[4] = Integer.MAX_VALUE; // Find the frequency samples file and open it File freqSamplesFile = new File(spellDir, "freqSamples.dat"); if (!freqSamplesFile.canRead()) throw new IOException( "Cannot open frequency samples file '" + freqSamplesFile + "'"); BufferedReader reader = new BufferedReader(new FileReader(freqSamplesFile)); int nSamples = 0; int[] samples = null; try { // If there were less than 500 terms to sample, turn off frequency // boosting. // int nTerms = Integer.parseInt(reader.readLine()); if (nTerms >= 500) { // Read in the samples. nSamples = Integer.parseInt(reader.readLine()); samples = new int[nSamples]; for (int i = 0; i < nSamples; i++) samples[i] = Integer.parseInt(reader.readLine()); } } catch (NumberFormatException e) { throw new IOException("term frequencies file corrupt"); } finally { reader.close(); } // Pick out the levels of most interest to us if (samples != null) { res[0] = samples[(int)(nSamples * 0.99)]; // top 1% res[1] = samples[(int)(nSamples * 0.90)]; // top 10% res[2] = samples[(int)(nSamples * 0.50)]; // top 50% res[3] = samples[(int)(nSamples * 0.25)]; // top 75% res[4] = samples[0]; // all above-avg words } // All done. freqSamples = res; } /** Get the term frequency sample array for our dictionary. */ private void loadWordFreqs(File spellDir) throws IOException { // Find the word frequency file and open it File freqFile = new File(spellDir, "words.dat"); if (!freqFile.canRead()) throw new IOException("Cannot open word frequency file '" + freqFile + "'"); // Read in each word and its frequency. 
wordFreqs = new FreqData(); BufferedReader reader = new BufferedReader(new FileReader(freqFile)); try { while (true) { String line = reader.readLine(); if (line == null) break; String[] toks = splitPat.split(line); String word = toks[0]; int freq = Integer.parseInt(toks[1]); wordFreqs.add(word, freq); } } catch (NumberFormatException e) { throw new IOException("term frequencies file corrupt"); } finally { reader.close(); } } private void openPairFreqs(File spellDir) throws IOException { if (pairFreqs == null) { pairFreqs = new FreqData(); pairFreqs.add(new File(spellDir, "pairs.dat")); } } protected void finalize() throws Throwable { close(); } private String calcMetaphone(String word) { String mph = SpellWriter.calcMetaphone(word); if (mph == null) return ""; return mph; } /** * Keeps track of a single word, either an original or suggested word. */ private final class Word { public String word; public Word orig; public int freq; public String metaphone; private TRStringDistance2 wordDist; private TRStringDistance2 mphDist; public float score; public float freqBoost; /** Contructor for original words */ public Word(String word) throws IOException { this(null, word, 0); } /** Constructor for suggested replacement words */ public Word(Word inOrig, String word, int freq) throws IOException { this.word = word; this.orig = (inOrig == null) ? this : inOrig; this.freq = freq; metaphone = calcMetaphone(word); wordDist = mphDist = null; // lazily created if necessary // If equivalent to the original word, inherit the score. if (orig != this && wordEquiv.isEquivalent(word, orig.word)) { freqBoost = orig.freqBoost; score = orig.score; return; } // Calculate the edit distance and turn it into the base score float dist = orig.wordDist(word) / 2.0f; score = 1.0f - (dist / orig.length()); // If the metaphone matches, nudge the score if (metaphone.equals(orig.metaphone)) score += 0.1f; // If the first and last letters match, nudge the score. 
if (word.length() > 0 && orig.word.length() > 0 && word.charAt(0) == orig.word.charAt(0) && word.charAt(word.length() - 1) == orig.word.charAt( orig.word.length() - 1)) score += 0.1f; // If this word is more frequent than normal, give it a nudge up. freqBoost = calcFreqBoost(freqSamples, freq); score += freqBoost; } public int length() { return word.length(); } public boolean equals(Word other) { return word.equals(other.word); } public int wordDist(String other) { if (wordDist == null) wordDist = new TRStringDistance2(word); return wordDist.getDistance(other); } public int mphDist(String other) { if (mphDist == null) mphDist = new TRStringDistance2(metaphone); return mphDist.getDistance(other); } public String toString() { return word; } /** Dump debugging output about this word */ public void debug(PrintWriter w) { align(w, "word=" + word + "[" + orig.wordDist(word) + "]", 22); align(w, "mph=" + metaphone + "[" + orig.mphDist(metaphone) + "]", 13); align(w, "freq=" + freq, 12); // If equivalent to the original word, inherit the score. if (orig != this && wordEquiv.isEquivalent(word, orig.word)) { align(w, "copyScore=" + orig.score, 20); w.println(); return; } // Calculate the edit distance and turn it into the base score float dist = orig.wordDist(word) / 2.0f; align(w, "base=" + (1.0f - (dist / orig.length())), 14); // If the metaphone matches, nudge the score String mphStr = "0"; if (metaphone.equals(orig.metaphone)) mphStr = "0.1"; align(w, "mphBoost=" + mphStr, 13); // If the first and last letters match, nudge the score. String matchStr = "0"; if (word.charAt(0) == orig.word.charAt(0) && word.charAt(word.length() - 1) == orig.word.charAt( orig.word.length() - 1)) matchStr = "" + 0.1f; align(w, "matchBoost=" + matchStr, 15); // If any frequency boost appplied, print it. 
align(w, "freqBoost=" + freqBoost, 20); // Total score align(w, "totalScore=" + score, 22); w.println(); } private void align(PrintWriter w, String s, int width) { w.print(s); for (int i = 0; i < (width - s.length()); i++) w.print(" "); w.print(" "); } /** * Calculate a boost factor based on the frequency of a term. */ private float calcFreqBoost(int[] termFreqs, int freq) { if (freq == 0) return -0.2f; // If this word is more frequent than normal, give it a nudge up. int i = 0; while (i < 5 && freq < termFreqs[i]) i++; if (i == 0) return 0.25f; int loFreq = (i < 5) ? termFreqs[i] : 0; int hiFreq = termFreqs[i - 1]; float loBoost = (5 - i) * 0.05f; float boost = (((freq - loFreq) * 50 / (hiFreq - loFreq)) / 1000.0f) + loBoost; return boost; } } /** * Queue of words, ordered by score and then frequency */ private static final class WordQueue extends PriorityQueue { WordQueue(int size) { initialize(size); } protected final boolean lessThan(Object a, Object b) { Word wa = (Word)a; Word wb = (Word)b; //first criteria: the edit distance if (wa.score > wb.score) return false; if (wa.score < wb.score) return true; //second criteria (if first criteria is equal): the popularity if (wa.freq > wb.freq) return false; if (wa.freq < wb.freq) return true; return false; } } /** * Track an ordered group of words. 
*/ private class Phrase implements Cloneable { Word[] words; float baseScore = 0.0f; float score; public Object clone() { try { Phrase out = (Phrase)super.clone(); out.words = new Word[words.length]; out.baseScore = 0.0f; System.arraycopy(words, 0, out.words, 0, words.length); return out; } catch (CloneNotSupportedException e) { return null; } } public void calcScore() throws IOException { calcScore(null); } public void calcScore(PrintWriter debugWriter) throws IOException { float wordScore = 0.0f; float pairScore = 0.0f; int prev = -1; for (int i = 0; i < words.length; i++) { // Skip words that have been removed by joining if (words[i] == null) continue; // Skip stop words if (stopSet != null && stopSet.contains(words[i].word.toLowerCase())) { if (debugWriter != null) debugWriter.append(words[i].word + " "); continue; } // Okay, score it. wordScore += words[i].score; // Do pair scoring, except for words created by splitting if (prev >= 0 && words[i].word.indexOf(' ') < 0) { pairScore += scorePair(words[prev], words[i]); if (debugWriter != null) debugWriter.format("+%.2f ", scorePair(words[prev], words[i])); } prev = i; // Print the word after the pair score (if any) if (debugWriter != null) debugWriter.format("%s[%.2f] ", words[i].word, words[i].score); } score = baseScore + wordScore + pairScore; if (debugWriter != null) { if (baseScore != 0.0f) debugWriter.format("... base: %.2f ", baseScore); debugWriter.format("... Total: %.2f\n", score); } } public String[] toStringArray() { String[] out = new String[words.length]; for (int i = 0; i < words.length; i++) out[i] = (words[i] == null) ? null : words[i].word; return out; } } } // class SpellReader