LuceneSpellChecker.java example

Explorer
jcr-master
- jcr-develop
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.exoplatform.services.jcr.impl.core.query.lucene.spell;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.search.spell.Dictionary;
import org.apache.lucene.search.spell.LuceneDictionary;
import org.apache.lucene.search.spell.SpellChecker;
import org.apache.lucene.search.spell.SuggestMode;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Version;
import org.exoplatform.commons.utils.SecurityHelper;
import org.exoplatform.services.jcr.impl.core.query.QueryHandler;
import org.exoplatform.services.jcr.impl.core.query.QueryRootNode;
import org.exoplatform.services.jcr.impl.core.query.RelationQueryNode;
import org.exoplatform.services.jcr.impl.core.query.TraversingQueryNodeVisitor;
import org.exoplatform.services.jcr.impl.core.query.lucene.FieldNames;
import org.exoplatform.services.jcr.impl.core.query.lucene.SearchIndex;
import org.exoplatform.services.jcr.impl.core.query.lucene.Util;
import org.exoplatform.services.log.ExoLogger;
import org.exoplatform.services.log.Log;

import java.io.IOException;
import java.io.StringReader;
import java.security.PrivilegedExceptionAction;
import java.util.ArrayList;
import java.util.List;

import javax.jcr.RepositoryException;

/**
 * <code>LuceneSpellChecker</code> implements a spell checker based on the terms
 * present in a lucene index.
 */
public class LuceneSpellChecker implements org.exoplatform.services.jcr.impl.core.query.lucene.SpellChecker
{

   /**
    * Logger instance for this class.
    */
   private static final Log LOG = ExoLogger.getLogger("exo.jcr.component.core.LuceneSpellChecker");

   /**
    * Spell checker whose spell index refresh interval is five seconds.
    */
   public static final class FiveSecondsRefreshInterval extends LuceneSpellChecker
   {
      /**
       * Passes an interval of 5,000 milliseconds to the parent constructor.
       */
      public FiveSecondsRefreshInterval()
      {
         super(5L * 1000L);
      }
   }

   /**
    * Spell checker whose spell index refresh interval is one minute.
    */
   public static final class OneMinuteRefreshInterval extends LuceneSpellChecker
   {
      /**
       * Passes an interval of 60,000 milliseconds to the parent constructor.
       */
      public OneMinuteRefreshInterval()
      {
         super(60L * 1000L);
      }
   }

   /**
    * Spell checker whose spell index refresh interval is five minutes.
    */
   public static final class FiveMinutesRefreshInterval extends LuceneSpellChecker
   {
      /**
       * Passes an interval of 300,000 milliseconds to the parent constructor.
       */
      public FiveMinutesRefreshInterval()
      {
         super(5L * 60L * 1000L);
      }
   }

   /**
    * Spell checker whose spell index refresh interval is thirty minutes.
    */
   public static final class ThirtyMinutesRefreshInterval extends LuceneSpellChecker
   {
      /**
       * Passes an interval of 1,800,000 milliseconds to the parent
       * constructor.
       */
      public ThirtyMinutesRefreshInterval()
      {
         super(30L * 60L * 1000L);
      }
   }

   /**
    * Spell checker whose spell index refresh interval is one hour.
    */
   public static final class OneHourRefreshInterval extends LuceneSpellChecker
   {
      /**
       * Passes an interval of 3,600,000 milliseconds to the parent
       * constructor.
       */
      public OneHourRefreshInterval()
      {
         super(60L * 60L * 1000L);
      }
   }

   /**
    * Spell checker whose spell index refresh interval is six hours.
    */
   public static final class SixHoursRefreshInterval extends LuceneSpellChecker
   {
      /**
       * Passes an interval of 21,600,000 milliseconds to the parent
       * constructor.
       */
      public SixHoursRefreshInterval()
      {
         super(6L * 60L * 60L * 1000L);
      }
   }

   /**
    * Spell checker whose spell index refresh interval is twelve hours.
    */
   public static final class TwelveHoursRefreshInterval extends LuceneSpellChecker
   {
      /**
       * Passes an interval of 43,200,000 milliseconds to the parent
       * constructor.
       */
      public TwelveHoursRefreshInterval()
      {
         super(12L * 60L * 60L * 1000L);
      }
   }

   /**
    * Spell checker whose spell index refresh interval is one day.
    */
   public static final class OneDayRefreshInterval extends LuceneSpellChecker
   {
      /**
       * Passes an interval of 86,400,000 milliseconds to the parent
       * constructor.
       */
      public OneDayRefreshInterval()
      {
         super(24L * 60L * 60L * 1000L);
      }
   }

   /**
    * The internal spell checker.
    */
   private InternalSpellChecker spellChecker;

   /**
    * The refresh interval.
    */
   private final long refreshInterval;

   /**
    * Creates a spell checker that uses the default refresh interval of one
    * hour.
    */
   public LuceneSpellChecker()
   {
      // one hour expressed in milliseconds (3,600,000)
      this(60L * 60L * 1000L);
   }

   /**
    * Creates a spell checker with the given refresh interval.
    * 
    * @param refreshInterval
    *            the time that has to elapse after the last refresh before
    *            the spell index is refreshed again. The value is added to a
    *            <code>System.currentTimeMillis()</code> timestamp, so it is
    *            expressed in milliseconds.
    */
   protected LuceneSpellChecker(long refreshInterval)
   {
      this.refreshInterval = refreshInterval;
   }

   /**
    * {@inheritDoc}
    * <p>
    * Only handlers of type {@link SearchIndex} are accepted; any other
    * handler is rejected with an <code>IOException</code>.
    */
   public void init(QueryHandler handler, float minDistance, boolean morePopular) throws IOException
   {
      if (!(handler instanceof SearchIndex))
      {
         throw new IOException("LuceneSpellChecker only works with " + SearchIndex.class.getName());
      }

      SearchIndex searchIndex = (SearchIndex)handler;
      this.spellChecker = new InternalSpellChecker(searchIndex, minDistance, morePopular);
   }

   /**
    * {@inheritDoc}
    * <p>
    * Returns <code>null</code> when the query tree contains no spellcheck
    * operation.
    * 
    * @throws RepositoryException
    *             if it is thrown while looking up the fulltext statement in
    *             the query tree.
    */
   public String check(QueryRootNode aqt) throws IOException, RepositoryException
   {
      String fulltext = getFulltextStatement(aqt);
      // a missing statement means there is nothing to spell check
      return fulltext == null ? null : spellChecker.suggest(fulltext);
   }

   /**
    * Closes the internal spell checker. Does nothing if no internal spell
    * checker has been created, i.e. {@link #init} was never called or failed
    * before the internal spell checker was assigned.
    */
   public void close()
   {
      // guard against close() on an uninitialized instance, which would
      // otherwise fail with a NullPointerException
      if (spellChecker != null)
      {
         spellChecker.close();
      }
   }

   // ------------------------------< internal >--------------------------------

   /**
    * Traverses the abstract query tree looking for a spellcheck relation
    * node and returns its string value. Once a statement has been captured,
    * later spellcheck nodes are ignored.
    * 
    * @param aqt
    *            the abstract query tree.
    * @return the fulltext statement, or <code>null</code> if the tree does
    *         not contain one.
    * @throws RepositoryException
    *             if it is thrown while traversing the query tree.
    */
   private String getFulltextStatement(QueryRootNode aqt) throws RepositoryException
   {
      // one-element holder so that the anonymous visitor can hand back a result
      final String[] found = new String[1];

      TraversingQueryNodeVisitor collector = new TraversingQueryNodeVisitor()
      {
         @Override
         public Object visit(RelationQueryNode node, Object o) throws RepositoryException
         {
            if (found[0] == null)
            {
               if (node.getOperation() == RelationQueryNode.OPERATION_SPELLCHECK)
               {
                  found[0] = node.getStringValue();
               }
            }
            // always continue the traversal into the node's children
            return super.visit(node, o);
         }
      };

      aqt.accept(collector, null);
      return found[0];
   }

   private final class InternalSpellChecker
   {

      /**
       * Timestamp when the last refresh was done.
       */
      private long lastRefresh;

      /**
       * Set to true while a refresh is done in a separate thread.
       */
      private boolean refreshing = false;

      /**
       * The query handler associated with this spell checker.
       */
      private final SearchIndex handler;

      /**
       * The directory where the spell index is stored.
       */
      private Directory spellIndexDirectory;

      /**
       * The underlying spell checker.
       */
      private SpellChecker spellChecker;

      /**
       * Selects the suggest mode used when looking up similar words:
       * <code>SuggestMode.SUGGEST_MORE_POPULAR</code> when <code>true</code>,
       * <code>SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX</code> otherwise.
       */
      private final boolean morePopular;

      /**
       * Creates a new internal spell checker backed by the
       * <code>"spellchecker"</code> directory of the handler's directory
       * manager. If that directory already contains an index, the
       * last-refresh timestamp is set to the current time; a refresh check
       * is performed at the end of construction in any case.
       * 
       * @param handler
       *            the associated query handler.
       * @param minDistance
       *            minimal distance between a word and a proposed close word,
       *            passed to the underlying spell checker as its accuracy.
       *            Float value 0..1.
       * @param morePopular
       *            return only the suggested words that are as frequent or
       *            more frequent than the searched word.
       * @throws IOException
       *             if the spell index directory or the underlying spell
       *             checker cannot be created.
       */
      InternalSpellChecker(final SearchIndex handler, float minDistance, boolean morePopular) throws IOException
      {
         this.handler = handler;
         this.morePopular = morePopular;

         PrivilegedExceptionAction<Object> openDirectory = new PrivilegedExceptionAction<Object>()
         {
            public Object run() throws Exception
            {
               spellIndexDirectory = handler.getDirectoryManager().getDirectory("spellchecker");

               // an already existing index is considered fresh as of now
               boolean indexPresent = IndexReader.indexExists(spellIndexDirectory);
               if (indexPresent)
               {
                  lastRefresh = System.currentTimeMillis();
               }
               return null;
            }
         };
         SecurityHelper.doPrivilegedIOExceptionAction(openDirectory);

         SpellChecker checker = new SpellChecker(spellIndexDirectory);
         checker.setAccuracy(minDistance);
         this.spellChecker = checker;

         refreshSpellChecker();
      }

      /**
       * Spell checks a fulltext query statement and builds a corrected
       * version of it. <code>null</code> is returned when the spell checker
       * has no suggestion or when the corrected statement equals the
       * original one (ignoring case).
       * 
       * @param statement
       *            the fulltext query statement.
       * @return a suggestion or <code>null</code>.
       * @throws IOException
       *             if tokenizing or spell checking the statement fails.
       */
      String suggest(String statement) throws IOException
      {
         // split the statement into words and their positions
         List<String> words = new ArrayList<String>();
         List<TokenData> tokens = new ArrayList<TokenData>();
         tokenize(statement, words, tokens);

         String[] corrected = check(words.toArray(new String[words.size()]));
         if (corrected == null)
         {
            return null;
         }

         // walk backwards: replacing a word changes the length of the
         // statement, which would invalidate the offsets of later tokens
         StringBuilder rewritten = new StringBuilder(statement);
         for (int idx = corrected.length; idx-- > 0;)
         {
            TokenData token = tokens.get(idx);
            String replacement = corrected[idx];
            // only replace if the word actually changed
            boolean unchanged = token.word.equalsIgnoreCase(replacement);
            if (!unchanged)
            {
               rewritten.replace(token.startOffset, token.endOffset, replacement);
            }
         }

         // a suggestion identical to the statement is no suggestion at all
         String result = rewritten.toString();
         return statement.equalsIgnoreCase(result) ? null : result;
      }

      /**
       * Closes the underlying lucene spell checker and the spell index
       * directory. I/O errors are logged at trace level and otherwise
       * ignored. Calling this method again after the spell checker has been
       * released only closes the directory.
       */
      void close()
      {
         // snapshot: the anonymous action needs a final reference, and the
         // field is cleared below
         final SpellChecker checker = spellChecker;
         try
         {
            SecurityHelper.doPrivilegedIOExceptionAction(new PrivilegedExceptionAction<Object>()
            {
               public Object run() throws Exception
               {
                  try
                  {
                     // the lucene spell checker is Closeable; closing it
                     // releases its internal searcher instead of leaving
                     // that to finalization
                     if (checker != null)
                     {
                        checker.close();
                     }
                  }
                  finally
                  {
                     // always close the directory, even if closing the
                     // spell checker failed
                     spellIndexDirectory.close();
                  }
                  return null;
               }
            });
         }
         catch (IOException e)
         {
            if (LOG.isTraceEnabled())
            {
               LOG.trace("An exception occurred: " + e.getMessage());
            }
         }
         spellChecker = null;
      }

      /**
       * Tokenizes the statement with the handler's text analyzer and fills
       * <code>words</code> and <code>tokens</code> in parallel: both lists
       * get one entry per token position.
       * <p>
       * Tokens with a position increment of zero share the position of the
       * previous token. For those, the term whose length is closer to the
       * length of the original text span is kept.
       * 
       * @param statement
       *            the fulltext query statement.
       * @param words
       *            this list will be filled with the words extracted from
       *            the statement.
       * @param tokens
       *            this list will be filled with the tokens parsed from the
       *            statement.
       * @throws IOException
       *             if an error occurs while parsing the statement.
       */
      private void tokenize(String statement, List<String> words, List<TokenData> tokens) throws IOException
      {
         // the field name doesn't actually matter for tokenizing
         TokenStream ts = handler.getTextAnalyzer().tokenStream(FieldNames.FULLTEXT, new StringReader(statement));
         try
         {
            // looked up inside the try block so that the stream is closed
            // even if one of the attributes is missing
            CharTermAttribute term = ts.getAttribute(CharTermAttribute.class);
            PositionIncrementAttribute positionIncrement = ts.getAttribute(PositionIncrementAttribute.class);
            OffsetAttribute offset = ts.getAttribute(OffsetAttribute.class);

            // consumer contract: reset() before the first incrementToken()
            ts.reset();
            while (ts.incrementToken())
            {
               String word = term.toString();
               int start = offset.startOffset();
               int end = offset.endOffset();

               // a leading token with increment 0 has no predecessor to
               // compete with, so it is treated as a regular token
               if (positionIncrement.getPositionIncrement() > 0 || tokens.isEmpty())
               {
                  words.add(word);
                  tokens.add(new TokenData(start, end, word));
               }
               else
               {
                  // very simple implementation: use the term with length
                  // closer to the original word
                  int last = tokens.size() - 1;
                  int origLength = end - start;
                  int currentDelta = Math.abs(origLength - tokens.get(last).termLength());
                  int candidateDelta = Math.abs(origLength - word.length());
                  if (currentDelta > candidateDelta)
                  {
                     // replace current token and word
                     words.set(last, word);
                     tokens.set(last, new TokenData(start, end, word));
                  }
               }
            }
         }
         finally
         {
            try
            {
               ts.end();
            }
            finally
            {
               // close the stream even if end() fails
               ts.close();
            }
         }
      }

      /**
       * Simple holder for a token: the term text together with the start
       * and end offsets it was created with.
       */
      class TokenData
      {
         /** Start offset of the token. */
         int startOffset;

         /** End offset of the token. */
         int endOffset;

         /** The term text of the token. */
         String word;

         /**
          * Creates a new token holder.
          * 
          * @param startOffset
          *            the start offset of the token.
          * @param endOffset
          *            the end offset of the token.
          * @param word
          *            the term text of the token.
          */
         public TokenData(int startOffset, int endOffset, String word)
         {
            this.word = word;
            this.startOffset = startOffset;
            this.endOffset = endOffset;
         }

         /**
          * @return the number of characters of {@link #word}.
          */
         public int termLength()
         {
            return this.word.length();
         }

      }

      /**
       * Checks the spelling of the passed <code>words</code> and returns a
       * suggestion. For every word the first similar word reported by the
       * underlying spell checker is used; words without a similar word are
       * kept as they are.
       * <p>
       * The lookup is retried up to 100 times when the underlying spell
       * checker throws an <code>AlreadyClosedException</code>.
       * 
       * @param words
       *            the words to check.
       * @return a suggestion of correctly spelled <code>words</code> or
       *         <code>null</code> if this spell checker thinks
       *         <code>words</code> are spelled correctly, or if all
       *         attempts failed.
       * @throws IOException
       *             if an error occurs while spell checking.
       */
      private String[] check(final String[] words) throws IOException
      {
         final int maxAttempts = 100;

         refreshSpellChecker();
         final IndexReader reader = handler.getIndexReader();
         try
         {
            for (int retries = 0; retries < maxAttempts; retries++)
            {
               try
               {
                  // tracked per attempt: a value set during an attempt that
                  // was aborted half-way must not leak into the next one
                  boolean hasSuggestion = false;
                  String[] suggestion = new String[words.length];
                  for (int i = 0; i < words.length; i++)
                  {
                     final String word = words[i];
                     String[] similar =
                        SecurityHelper.doPrivilegedIOExceptionAction(new PrivilegedExceptionAction<String[]>()
                        {
                           public String[] run() throws Exception
                           {
                              SuggestMode mode =
                                 morePopular ? SuggestMode.SUGGEST_MORE_POPULAR
                                    : SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX;
                              return spellChecker.suggestSimilar(word, 5, reader, FieldNames.FULLTEXT, mode);
                           }
                        });

                     if (similar.length > 0)
                     {
                        suggestion[i] = similar[0];
                        hasSuggestion = true;
                     }
                     else
                     {
                        suggestion[i] = word;
                     }
                  }

                  if (!hasSuggestion)
                  {
                     return null;
                  }
                  LOG.debug("Successful after " + retries + " retries");
                  return suggestion;
               }
               catch (AlreadyClosedException e)
               {
                  // it may happen that the index reader inside the
                  // spell checker is closed while searching for
                  // suggestions. this is actually a design flaw in the
                  // lucene spell checker, but for now we simply retry
                  if (LOG.isTraceEnabled())
                  {
                     LOG.trace("An exception occurred: " + e.getMessage());
                  }
               }
            }
            // unsuccessful after retries
            return null;
         }
         finally
         {
            SecurityHelper.doPrivilegedIOExceptionAction(new PrivilegedExceptionAction<Object>()
            {
               public Object run() throws Exception
               {
                  Util.closeOrRelease(reader);
                  return null;
               }
            });
         }
      }

      /**
       * Refreshes the underlying spell checker in a background thread if
       * more than the configured refresh interval has passed since the last
       * refresh. Synchronization is done on this
       * <code>InternalSpellChecker</code> instance. While the refresh takes
       * place {@link #refreshing} is set to <code>true</code>, and calls made
       * in the meantime return without starting another refresh.
       * <p>
       * The flag is cleared when the background task ends, whether it
       * succeeded or not, so that a failed refresh does not block all future
       * refreshes.
       */
      private void refreshSpellChecker()
      {
         if (lastRefresh + refreshInterval >= System.currentTimeMillis())
         {
            // refresh interval has not elapsed yet
            return;
         }

         synchronized (this)
         {
            if (refreshing)
            {
               return;
            }
            refreshing = true;

            Runnable refresh = new Runnable()
            {
               public void run()
               {
                  try
                  {
                     SecurityHelper.doPrivilegedIOExceptionAction(new PrivilegedExceptionAction<Object>()
                     {
                        public Object run() throws Exception
                        {
                           IndexReader reader = handler.getIndexReader();
                           try
                           {
                              long started = System.currentTimeMillis();
                              Dictionary dict = new LuceneDictionary(reader, FieldNames.FULLTEXT);
                              LOG.debug("Starting spell checker index refresh");
                              spellChecker.indexDictionary(dict, new IndexWriterConfig(Version.LUCENE_36,
                                 new StandardAnalyzer(Version.LUCENE_36)), true);
                              long seconds = (System.currentTimeMillis() - started) / 1000;
                              LOG.info("Spell checker index refreshed in: " + seconds + " s.");
                           }
                           finally
                           {
                              Util.closeOrRelease(reader);
                           }
                           return null;
                        }
                     });
                  }
                  catch (IOException e)
                  {
                     if (LOG.isTraceEnabled())
                     {
                        LOG.trace("An exception occurred: " + e.getMessage());
                     }
                  }
                  finally
                  {
                     // cleared on every exit path, including failures to
                     // obtain or release the index reader; otherwise the
                     // flag would stay set and no refresh would ever be
                     // started again
                     synchronized (InternalSpellChecker.this)
                     {
                        refreshing = false;
                     }
                  }
               }
            };

            boolean threadStarted = false;
            try
            {
               new Thread(refresh, "SpellChecker Refresh").start();
               threadStarted = true;
            }
            finally
            {
               if (!threadStarted)
               {
                  // no background task will run to clear the flag
                  refreshing = false;
               }
            }

            lastRefresh = System.currentTimeMillis();
         }
      }
   }
}