CompassSpellChecker.java example

Explorer

compass-fork-master
- samples
  - library
    - src
      - java
        org
        compass
        sample
        library
        Article.java
        Author.java
        Book.java
        Identifiable.java
        Name.java
      - test
        org
        compass
        sample
        library
        LibraryTests.java
  - petclinic
    - src
      - java
        org
        compass
        sample
        petclinic
        Clinic.java
        Entity.java
        NamedEntity.java
        Owner.java
        Person.java
        Pet.java
        PetType.java
        Petclinic.java
        Specialty.java
        Vet.java
        Visit.java
        hibernate
        HibernateClinic.java
        jdbc
        AbstractJdbcClinic.java
        CachingClinic.java
        HsqlJdbcClinic.java
        JdbcPet.java
        MySQLJdbcClinic.java
        jmx
        CallMonitor.java
        CallMonitoringInterceptor.java
        ojb
        PersistenceBrokerClinic.java
        util
        EntityUtils.java
        validation
        OwnerValidator.java
        PetValidator.java
        VisitValidator.java
        web
        AbstractClinicForm.java
        AddOwnerForm.java
        AddPetForm.java
        AddVisitForm.java
        ClinicController.java
        EditOwnerForm.java
        EditPetForm.java
        FindOwnersForm.java
      - test
        org
        compass
        sample
        petclinic
        AbstractClinicTests.java
        OwnerTests.java
        SetUpDatabase.java
        hibernate
        HibernateClinicTests.java
        jdbc
        JdbcClinicTests.java
        ojb
        PersistenceBrokerClinicTests.java
- src
  - main

package org.apache.lucene.search.spell;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.Iterator;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.Directory;

/**
 * <p>
 *   Spell Checker class  (Main class) <br/>
 *  (initially inspired by the David Spencer code).
 * </p>
 *
 * <p>Example Usage:
 *
 * <pre>
 *  CompassSpellChecker spellcheck = new CompassSpellChecker(spellIndexDirectory);
 *  // To index a field of a user index (the IndexWriter is supplied by the caller
 *  // and is not closed by indexDictionary):
 *  spellcheck.indexDictionary(writer, new LuceneDictionary(my_lucene_reader, a_field));
 *  // To index a file containing words:
 *  spellcheck.indexDictionary(writer, new PlainTextDictionary(new File("myfile.txt")));
 *  String[] suggestions = spellcheck.suggestSimilar("misspelt", 5);
 * </pre>
 *
 *
 * @version 1.0
 */


// Specialized SpellChecker for Compass.

// List of changes (mainly to separate two use cases: one for indexing and one for searching):

// 1. Added a constructor that accepts a searcher and reader ("searching" spell checker)
// 2. Changed searcher type from IndexSearcher to Searcher
// 3. Added close method
// 4. In indexDictionary, if the searcher is null, don't reopen it
// 5. Added a constructor that won't open an index searcher ("indexing" spell checker)
// 6. Added indexDictionary that accepts a dictionary and IndexWriter so we can configure it

// LUCENE MONITOR
public class CompassSpellChecker {

  /**
   * Field name for each word in the ngram index.
   */
  public static final String F_WORD = "word";

  /**
   * The spell index. May be <code>null</code> when this checker was created
   * from an existing searcher and reader.
   */
  Directory spellIndex;

  /**
   * Boost value for start grams.
   */
  private float bStart = 2.0f;

  /**
   * Boost value for end grams.
   */
  private float bEnd = 1.0f;

  /**
   * Reader used by {@link #exist(String)}; opened lazily when <code>null</code>.
   */
  private IndexReader reader;

  /**
   * Searcher used to fetch suggestions; <code>null</code> for an "indexing"
   * spell checker.
   */
  private Searcher searcher;

  /**
   * Minimum score for hits generated by the spell checker query.
   */
  private float minScore = 0.5f;

  private StringDistance sd;

  /**
   * Creates a "searching" spell checker on top of an already opened searcher
   * and reader. Both are closed by {@link #close()}.
   *
   * @param searcher the searcher used to fetch suggestions
   * @param reader the reader used by {@link #exist(String)}
   */
  public CompassSpellChecker(Searcher searcher, IndexReader reader) {
    this.searcher = searcher;
    this.reader = reader;
    setStringDistance(new LevensteinDistance());
  }

  /**
   * Use the given directory as a spell checker index. The directory
   * is created if it doesn't exist yet.
   *
   * @param spellIndex the directory holding the spell index
   * @throws IOException if the index cannot be created or opened
   */
  public CompassSpellChecker(Directory spellIndex) throws IOException {
    this.setSpellIndex(spellIndex);
    setStringDistance(new LevensteinDistance());
  }

  /**
   * Creates a spell checker over the given directory.
   *
   * @param spellIndex the directory holding the spell index
   * @param indexing if <code>true</code>, only the directory is remembered and no
   * searcher is opened ("indexing" spell checker); if <code>false</code>, behaves
   * like {@link #CompassSpellChecker(Directory)}
   * @throws IOException if the index cannot be created or opened
   */
  public CompassSpellChecker(Directory spellIndex, boolean indexing) throws IOException {
    if (indexing) {
      this.spellIndex = spellIndex;
    } else {
      setSpellIndex(spellIndex);
    }
    setStringDistance(new LevensteinDistance());
  }

  /**
   * Closes the searcher and the reader, if they are open. I/O errors raised while
   * closing are deliberately ignored (best-effort close). Calling this method on a
   * checker that has no searcher or no reader, or calling it twice, is a no-op.
   */
  public void close() {
    // An "indexing" spell checker never opens a searcher, so guard against null.
    if (searcher != null) {
      try {
        searcher.close();
      } catch (IOException ignored) {
        // best-effort close: nothing useful can be done here
      }
      searcher = null;
    }

    // The reader is only opened lazily by exist(), so it may be null as well.
    if (reader != null) {
      try {
        reader.close();
      } catch (IOException ignored) {
        // best-effort close: nothing useful can be done here
      }
      reader = null;
    }
  }

  /**
   * Sets the string distance implementation used to score suggestions.
   *
   * @param sd the string distance to use
   */
  public void setStringDistance(StringDistance sd) {
    this.sd = sd;
  }

  /**
   * @return the string distance implementation used to score suggestions
   */
  public StringDistance getStringDistance() {
    return sd;
  }

  /**
   * Use a different index as the spell checker index or re-open
   * the existing index if <code>spellIndex</code> is the same value
   * as given in the constructor.
   *
   * @param spellIndex the directory holding the spell index
   * @throws IOException if the index cannot be created or opened
   */
  public void setSpellIndex(Directory spellIndex) throws IOException {
    this.spellIndex = spellIndex;
    if (!IndexReader.indexExists(spellIndex)) {
      // create an empty index so that a searcher can be opened on it
      IndexWriter writer = new IndexWriter(spellIndex, null, true);
      writer.close();
    }
    // close the old searcher, if there was one
    if (searcher != null) {
      searcher.close();
    }
    searcher = new IndexSearcher(this.spellIndex);
  }

  /**
   * Sets the accuracy 0 &lt; minScore &lt; 1; default 0.5
   *
   * @param minScore the minimum string-distance score a suggestion must reach
   */
  public void setAccuracy(float minScore) {
    this.minScore = minScore;
  }

  /**
   * Suggest similar words.
   *
   * <p>As the Lucene similarity that is used to fetch the most relevant n-grammed terms
   * is not the same as the edit distance strategy used to calculate the best
   * matching spell-checked word from the hits that Lucene found, one usually has
   * to retrieve a couple of numSug's in order to get the true best match.
   *
   * <p>I.e. if numSug == 1, don't count on that suggestion being the best one.
   * Thus, you should set this value to <b>at least</b> 5 for a good suggestion.
   *
   * @param word the word you want a spell check done on
   * @param numSug the number of suggested words
   * @throws IOException
   * @throws IllegalStateException if this checker has no searcher
   * @return String[]
   */
  public String[] suggestSimilar(String word, int numSug) throws IOException {
    return this.suggestSimilar(word, numSug, null, null, false);
  }

  /**
   * Suggest similar words (optionally restricted to a field of an index).
   *
   * <p>As the Lucene similarity that is used to fetch the most relevant n-grammed terms
   * is not the same as the edit distance strategy used to calculate the best
   * matching spell-checked word from the hits that Lucene found, one usually has
   * to retrieve a couple of numSug's in order to get the true best match.
   *
   * <p>I.e. if numSug == 1, don't count on that suggestion being the best one.
   * Thus, you should set this value to <b>at least</b> 5 for a good suggestion.
   *
   * @param word the word you want a spell check done on
   * @param numSug the number of suggested words
   * @param ir the indexReader of the user index (can be null see field param)
   * @param field the field of the user index: if field is not null, the suggested
   * words are restricted to the words present in this field.
   * @param morePopular return only the suggest words that are as frequent or more frequent than the searched word
   * (only if restricted mode = (indexReader!=null and field!=null)
   * @throws IOException
   * @throws IllegalStateException if a search is required but this checker has no searcher
   * (it was created as an "indexing" spell checker or has been closed)
   * @return String[] the sorted list of the suggest words with these 2 criteria:
   * first criteria: the edit distance, second criteria (only if restricted mode): the popularity
   * of the suggest words in the field of the user index
   */
  public String[] suggestSimilar(String word, int numSug, IndexReader ir,
      String field, boolean morePopular) throws IOException {

    float min = this.minScore;
    final boolean restricted = ir != null && field != null;

    final int freq = restricted ? ir.docFreq(new Term(field, word)) : 0;
    final int goalFreq = (morePopular && restricted) ? freq : 0;
    // if the word exists in the real index and we don't care for word frequency, return the word itself
    if (!morePopular && freq > 0) {
      return new String[] { word };
    }

    BooleanQuery query = buildGramQuery(word);

    // Checked only here so that the early return above keeps working without a searcher.
    if (searcher == null) {
      throw new IllegalStateException(
          "No searcher available: spell checker was opened for indexing only or has been closed");
    }
    Hits hits = searcher.search(query);
    SuggestWordQueue sugQueue = new SuggestWordQueue(numSug);

    // go through more than numSug matches in case the distance filter triggers
    int stop = Math.min(hits.length(), 10 * numSug);
    SuggestWord sugWord = new SuggestWord();
    for (int i = 0; i < stop; i++) {

      sugWord.string = hits.doc(i).get(F_WORD); // get orig word

      // don't suggest a word for itself, that would be silly
      if (sugWord.string.equals(word)) {
        continue;
      }

      // edit distance
      sugWord.score = sd.getDistance(word, sugWord.string);
      if (sugWord.score < min) {
        continue;
      }

      if (restricted) { // use the user index
        sugWord.freq = ir.docFreq(new Term(field, sugWord.string)); // freq in the index
        // don't suggest a word that is not present in the field
        if ((morePopular && goalFreq > sugWord.freq) || sugWord.freq < 1) {
          continue;
        }
      }
      sugQueue.insert(sugWord);
      if (sugQueue.size() == numSug) {
        // if queue full, maintain the minScore score
        min = ((SuggestWord) sugQueue.top()).score;
      }
      // the instance just inserted now belongs to the queue; start a fresh one
      sugWord = new SuggestWord();
    }

    // convert to array string, filling from the back as the queue is popped
    String[] list = new String[sugQueue.size()];
    for (int i = sugQueue.size() - 1; i >= 0; i--) {
      list[i] = ((SuggestWord) sugQueue.pop()).string;
    }

    return list;
  }

  /**
   * Builds the n-gram query for a word: one optional clause per gram, plus
   * boosted clauses for the first and last gram of each gram size.
   */
  private BooleanQuery buildGramQuery(String word) {
    final int lengthWord = word.length();
    BooleanQuery query = new BooleanQuery();

    for (int ng = getMin(lengthWord); ng <= getMax(lengthWord); ng++) {
      String key = "gram" + ng; // form key
      String[] grams = formGrams(word, ng); // form word into ngrams (allow dups too)

      if (grams.length == 0) {
        continue; // word is shorter than this gram size
      }

      if (bStart > 0) { // should we boost prefixes?
        add(query, "start" + ng, grams[0], bStart); // matches start of word
      }
      if (bEnd > 0) { // should we boost suffixes
        add(query, "end" + ng, grams[grams.length - 1], bEnd); // matches end of word
      }
      for (int i = 0; i < grams.length; i++) {
        add(query, key, grams[i]);
      }
    }
    return query;
  }

  /**
   * Add a boosted optional term clause to a boolean query.
   */
  private static void add(BooleanQuery q, String name, String value, float boost) {
    Query tq = new TermQuery(new Term(name, value));
    tq.setBoost(boost);
    q.add(new BooleanClause(tq, BooleanClause.Occur.SHOULD));
  }

  /**
   * Add an optional term clause to a boolean query.
   */
  private static void add(BooleanQuery q, String name, String value) {
    q.add(new BooleanClause(new TermQuery(new Term(name, value)), BooleanClause.Occur.SHOULD));
  }

  /**
   * Form all ngrams for a given word.
   * @param text the word to parse
   * @param ng the ngram length e.g. 3
   * @return an array of all ngrams in the word and note that duplicates are not removed;
   * an empty array if the word is shorter than <code>ng</code>
   */
  private static String[] formGrams(String text, int ng) {
    // Clamp at zero: a word shorter than ng - 1 would otherwise ask for a negative array size.
    int count = Math.max(0, text.length() - ng + 1);
    String[] res = new String[count];
    for (int i = 0; i < count; i++) {
      res[i] = text.substring(i, i + ng);
    }
    return res;
  }

  /**
   * Removes all terms from the spell check index.
   * @throws IOException
   */
  public void clearIndex() throws IOException {
    IndexWriter writer = new IndexWriter(spellIndex, null, true);
    writer.close();

    // COMPASS: closing and re-opening the searcher was removed from this method.
  }

  /**
   * Check whether the word exists in the index.
   * @param word
   * @throws IOException
   * @return true iff the word exists in the index
   */
  public boolean exist(String word) throws IOException {
    // COMPASS: Adding check for index reader (opened lazily, read-only)
    if (reader == null) {
      reader = IndexReader.open(spellIndex, true);
    }
    return reader.docFreq(new Term(F_WORD, word)) > 0;
  }

  /**
   * Indexes the data from the given {@link Dictionary} using the supplied writer.
   * Words shorter than 3 characters and words for which {@link #exist(String)}
   * returns true are skipped. The writer is neither optimized nor closed by this
   * method.
   *
   * @param writer the index writer the word documents are added to
   * @param dict Dictionary to index
   * @throws IOException
   */
  public void indexDictionary(IndexWriter writer, Dictionary dict) throws IOException {
    Iterator iter = dict.getWordsIterator();
    while (iter.hasNext()) {
      String word = (String) iter.next();

      int len = word.length();
      if (len < 3) {
        continue; // too short we bail but "too long" is fine...
      }

      if (this.exist(word)) { // if the word already exist in the gramindex
        continue;
      }

      // ok index the word
      Document doc = createDocument(word, getMin(len), getMax(len));
      writer.addDocument(doc);
    }
    // closing the writer was REMOVED IN COMPASS; it is left to the caller

    // close reader so it will be re-opened (and see the new content) when exist()
    // is called the next time:
    if (reader != null) {
      reader.close();
      reader = null;
    }
    // also re-open the spell index to see our own changes when the next suggestion
    // is fetched (only when a searcher was open in the first place):
    if (searcher != null) {
      searcher.close();
      searcher = new IndexSearcher(this.spellIndex);
    }
  }

  /**
   * Smallest gram size used for a word of length <code>l</code>.
   */
  private int getMin(int l) {
    if (l > 5) {
      return 3;
    }
    if (l == 5) {
      return 2;
    }
    return 1;
  }

  /**
   * Largest gram size used for a word of length <code>l</code>.
   */
  private int getMax(int l) {
    if (l > 5) {
      return 4;
    }
    if (l == 5) {
      return 3;
    }
    return 2;
  }

  /**
   * Creates the index document for a word: the stored original word plus its
   * grams for every gram size from <code>ng1</code> to <code>ng2</code>.
   */
  private static Document createDocument(String text, int ng1, int ng2) {
    Document doc = new Document();
    doc.add(new Field(F_WORD, text, Field.Store.YES, Field.Index.NOT_ANALYZED)); // orig term
    addGram(text, doc, ng1, ng2);
    return doc;
  }

  /**
   * Adds the "gramN", "startN" and "endN" fields of a word to a document for
   * every gram size N from <code>ng1</code> to <code>ng2</code>.
   */
  private static void addGram(String text, Document doc, int ng1, int ng2) {
    int len = text.length();
    for (int ng = ng1; ng <= ng2; ng++) {
      String key = "gram" + ng;
      String end = null;
      for (int i = 0; i < len - ng + 1; i++) {
        String gram = text.substring(i, i + ng);
        doc.add(new Field(key, gram, Field.Store.NO, Field.Index.NOT_ANALYZED));
        if (i == 0) {
          doc.add(new Field("start" + ng, gram, Field.Store.NO, Field.Index.NOT_ANALYZED));
        }
        end = gram;
      }
      if (end != null) { // null only when the word is shorter than ng (no grams produced)
        doc.add(new Field("end" + ng, end, Field.Store.NO, Field.Index.NOT_ANALYZED));
      }
    }
  }
}