/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.solr.spelling; import java.io.IOException; import java.io.StringReader; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.FlagsAttribute; import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.TypeAttribute; /** * Converts the query string to a Collection of Lucene tokens using a regular expression. * Boolean operators AND and OR are skipped. * * @since solr 1.3 **/ public class SpellingQueryConverter extends QueryConverter { /* * The following builds up a regular expression that matches productions * of the syntax for NMTOKEN as per the W3C XML Recommendation - with one * important exception (see below). * * http://www.w3.org/TR/2008/REC-xml-20081126/ - version used as reference * * http://www.w3.org/TR/REC-xml/#NT-Nmtoken * * An NMTOKEN is a series of one or more NAMECHAR characters, which is an * extension of the NAMESTARTCHAR character class. * * The EXCEPTION referred to above concerns the colon, which is legal in an * NMTOKEN, but cannot currently be used as a valid field name within Solr, * as it is used to delimit the field name from the query string. */ final static String[] NAMESTARTCHAR_PARTS = { "A-Z_a-z", "\\xc0-\\xd6", "\\xd8-\\xf6", "\\xf8-\\u02ff", "\\u0370-\\u037d", "\\u037f-\\u1fff", "\\u200c-\\u200d", "\\u2070-\\u218f", "\\u2c00-\\u2fef", "\\u2001-\\ud7ff", "\\uf900-\\ufdcf", "\\ufdf0-\\ufffd" }; final static String[] ADDITIONAL_NAMECHAR_PARTS = { "\\-.0-9\\xb7", "\\u0300-\\u036f", "\\u203f-\\u2040" }; final static String SURROGATE_PAIR = "\\p{Cs}{2}"; final static String NMTOKEN; static { StringBuilder sb = new StringBuilder(); for (String part : NAMESTARTCHAR_PARTS) sb.append(part); for (String part : ADDITIONAL_NAMECHAR_PARTS) sb.append(part); NMTOKEN = "([" + sb.toString() + "]|" + SURROGATE_PAIR + ")+"; } final static String PATTERN = "(?:(?!(" + NMTOKEN + ":|\\d+)))[\\p{L}_\\-0-9]+"; // previous version: Pattern.compile("(?:(?!(\\w+:|\\d+)))\\w+"); protected Pattern QUERY_REGEX = Pattern.compile(PATTERN); /** * Converts the original query string to a collection of Lucene Tokens. * @param original the original query string * @return a Collection of Lucene Tokens */ public Collection<Token> convert(String original) { if (original == null) { // this can happen with q.alt = and no query return Collections.emptyList(); } Collection<Token> result = new ArrayList<Token>(); //TODO: Extract the words using a simple regex, but not query stuff, and then analyze them to produce the token stream Matcher matcher = QUERY_REGEX.matcher(original); TokenStream stream; while (matcher.find()) { String word = matcher.group(0); if (word.equals("AND") == false && word.equals("OR") == false) { try { stream = analyzer.reusableTokenStream("", new StringReader(word)); // TODO: support custom attributes CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class); FlagsAttribute flagsAtt = stream.addAttribute(FlagsAttribute.class); TypeAttribute typeAtt = stream.addAttribute(TypeAttribute.class); PayloadAttribute payloadAtt = stream.addAttribute(PayloadAttribute.class); PositionIncrementAttribute posIncAtt = stream.addAttribute(PositionIncrementAttribute.class); stream.reset(); while (stream.incrementToken()) { Token token = new Token(); token.copyBuffer(termAtt.buffer(), 0, termAtt.length()); token.setStartOffset(matcher.start()); token.setEndOffset(matcher.end()); token.setFlags(flagsAtt.getFlags()); token.setType(typeAtt.type()); token.setPayload(payloadAtt.getPayload()); token.setPositionIncrement(posIncAtt.getPositionIncrement()); result.add(token); } } catch (IOException e) { } } } return result; } }