SpellingQueryConverter.java example

Explorer
solrcene-master
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.solr.spelling;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;


/**
 * Converts the query string to a Collection of Lucene tokens using a regular expression.
 * Boolean operators AND and OR are skipped.
 *
 * @since solr 1.3
 **/
public class SpellingQueryConverter extends QueryConverter  {

  /*
  * The following builds up a regular expression that matches productions
  * of the syntax for NMTOKEN as per the W3C XML Recommendation - with one
  * important exception (see below).
  *
  * http://www.w3.org/TR/2008/REC-xml-20081126/ - version used as reference
  *
  * http://www.w3.org/TR/REC-xml/#NT-Nmtoken
  *
  * An NMTOKEN is a series of one or more NAMECHAR characters, which is an
  * extension of the NAMESTARTCHAR character class.
  *
  * The EXCEPTION referred to above concerns the colon, which is legal in an
  * NMTOKEN, but cannot currently be used as a valid field name within Solr,
  * as it is used to delimit the field name from the query string.
  */

  final static String[] NAMESTARTCHAR_PARTS = {
          "A-Z_a-z", "\\xc0-\\xd6", "\\xd8-\\xf6", "\\xf8-\\u02ff",
          "\\u0370-\\u037d", "\\u037f-\\u1fff",
          "\\u200c-\\u200d", "\\u2070-\\u218f",
          "\\u2c00-\\u2fef", "\\u2001-\\ud7ff",
          "\\uf900-\\ufdcf", "\\ufdf0-\\ufffd"
  };
  final static String[] ADDITIONAL_NAMECHAR_PARTS = {
          "\\-.0-9\\xb7", "\\u0300-\\u036f", "\\u203f-\\u2040"
  };
  final static String SURROGATE_PAIR = "\\p{Cs}{2}";
  final static String NMTOKEN;

  static {
    StringBuilder sb = new StringBuilder();
    for (String part : NAMESTARTCHAR_PARTS)
      sb.append(part);
    for (String part : ADDITIONAL_NAMECHAR_PARTS)
      sb.append(part);
    NMTOKEN = "([" + sb.toString() + "]|" + SURROGATE_PAIR + ")+";
  }

  final static String PATTERN = "(?:(?!(" + NMTOKEN + ":|\\d+)))[\\p{L}_\\-0-9]+";
  // previous version: Pattern.compile("(?:(?!(\\w+:|\\d+)))\\w+");
  protected Pattern QUERY_REGEX = Pattern.compile(PATTERN);


  /**
   * Converts the original query string to a collection of Lucene Tokens.
   * @param original the original query string
   * @return a Collection of Lucene Tokens
   */
  public Collection<Token> convert(String original) {
    if (original == null) { // this can happen with q.alt = and no query
      return Collections.emptyList();
    }
    Collection<Token> result = new ArrayList<Token>();
    //TODO: Extract the words using a simple regex, but not query stuff, and then analyze them to produce the token stream
    Matcher matcher = QUERY_REGEX.matcher(original);
    TokenStream stream;
    while (matcher.find()) {
      String word = matcher.group(0);
      if (word.equals("AND") == false && word.equals("OR") == false) {
        try {
          stream = analyzer.reusableTokenStream("", new StringReader(word));
          // TODO: support custom attributes
          CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
          FlagsAttribute flagsAtt = stream.addAttribute(FlagsAttribute.class);
          TypeAttribute typeAtt = stream.addAttribute(TypeAttribute.class);
          PayloadAttribute payloadAtt = stream.addAttribute(PayloadAttribute.class);
          PositionIncrementAttribute posIncAtt = stream.addAttribute(PositionIncrementAttribute.class);
          stream.reset();
          while (stream.incrementToken()) {
            Token token = new Token();
            token.copyBuffer(termAtt.buffer(), 0, termAtt.length());
            token.setStartOffset(matcher.start());
            token.setEndOffset(matcher.end());
            token.setFlags(flagsAtt.getFlags());
            token.setType(typeAtt.type());
            token.setPayload(payloadAtt.getPayload());
            token.setPositionIncrement(posIncAtt.getPositionIncrement());
            result.add(token);
          }
        } catch (IOException e) {
        }
      }
    }
    return result;
  }

}