/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.solr.spelling; import java.io.IOException; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.TypeAttribute; /** * Converts the query string to a Collection of Lucene tokens using a regular expression. * Boolean operators AND, OR, NOT are skipped. * * Each term is checked to determine if it is optional, required or prohibited. Required * terms output a {@link Token} with the {@link QueryConverter#REQUIRED_TERM_FLAG} set. * Prohibited terms output a {@link Token} with the {@link QueryConverter#PROHIBITED_TERM_FLAG} * set. If the query uses the plus (+) and minus (-) to denote required and prohibited, this * determination will be accurate. In the case boolean AND/OR/NOTs are used, this * converter makes an uninformed guess as to whether the term would likely behave as if it * is Required or Prohibited and sets the flags accordingly. These flags are used downstream * to generate collations for {@link WordBreakSolrSpellChecker}, in cases where an original * term is split up into multiple Tokens. * * @since solr 1.3 **/ public class SpellingQueryConverter extends QueryConverter { /* * The following builds up a regular expression that matches productions * of the syntax for NMTOKEN as per the W3C XML Recommendation - with one * important exception (see below). * * http://www.w3.org/TR/2008/REC-xml-20081126/ - version used as reference * * http://www.w3.org/TR/REC-xml/#NT-Nmtoken * * An NMTOKEN is a series of one or more NAMECHAR characters, which is an * extension of the NAMESTARTCHAR character class. * * The EXCEPTION referred to above concerns the colon, which is legal in an * NMTOKEN, but cannot currently be used as a valid field name within Solr, * as it is used to delimit the field name from the query string. */ final static String[] NAMESTARTCHAR_PARTS = { "A-Z_a-z", "\\xc0-\\xd6", "\\xd8-\\xf6", "\\xf8-\\u02ff", "\\u0370-\\u037d", "\\u037f-\\u1fff", "\\u200c-\\u200d", "\\u2070-\\u218f", "\\u2c00-\\u2fef", "\\u2001-\\ud7ff", "\\uf900-\\ufdcf", "\\ufdf0-\\ufffd" }; final static String[] ADDITIONAL_NAMECHAR_PARTS = { "\\-.0-9\\xb7", "\\u0300-\\u036f", "\\u203f-\\u2040" }; final static String SURROGATE_PAIR = "\\p{Cs}{2}"; final static String NMTOKEN; static { StringBuilder sb = new StringBuilder(); for (String part : NAMESTARTCHAR_PARTS) sb.append(part); for (String part : ADDITIONAL_NAMECHAR_PARTS) sb.append(part); NMTOKEN = "([" + sb.toString() + "]|" + SURROGATE_PAIR + ")+"; } final static String PATTERN = "(?:(?!(" + NMTOKEN + ":|[\\^.]\\d+)))[^^.:(\\s][\\p{L}_\\-0-9]+"; // previous version: Pattern.compile("(?:(?!(\\w+:|\\d+)))\\w+"); protected Pattern QUERY_REGEX = Pattern.compile(PATTERN); /** * Converts the original query string to a collection of Lucene Tokens. * @param original the original query string * @return a Collection of Lucene Tokens */ @Override public Collection<Token> convert(String original) { if (original == null) { // this can happen with q.alt = and no query return Collections.emptyList(); } boolean mightContainRangeQuery = (original.indexOf('[') != -1 || original.indexOf('{') != -1) && (original.indexOf(']') != -1 || original.indexOf('}') != -1); Collection<Token> result = new ArrayList<>(); Matcher matcher = QUERY_REGEX.matcher(original); String nextWord = null; int nextStartIndex = 0; String lastBooleanOp = null; while (nextWord!=null || matcher.find()) { String word = null; int startIndex = 0; if(nextWord != null) { word = nextWord; startIndex = nextStartIndex; nextWord = null; } else { word = matcher.group(0); startIndex = matcher.start(); } if(matcher.find()) { nextWord = matcher.group(0); nextStartIndex = matcher.start(); } if(mightContainRangeQuery && "TO".equals(word)) { continue; } if("AND".equals(word) || "OR".equals(word) || "NOT".equals(word)) { lastBooleanOp = word; continue; } // treat "AND NOT" as "NOT"... if ("AND".equals(nextWord) && original.length() > nextStartIndex + 7 && original.substring(nextStartIndex, nextStartIndex + 7).equals( "AND NOT")) { nextWord = "NOT"; } int flagValue = 0; if (word.charAt(0) == '-' || (startIndex > 0 && original.charAt(startIndex - 1) == '-')) { flagValue = PROHIBITED_TERM_FLAG; } else if (word.charAt(0) == '+' || (startIndex > 0 && original.charAt(startIndex - 1) == '+')) { flagValue = REQUIRED_TERM_FLAG; //we don't know the default operator so just assume the first operator isn't new. } else if (nextWord != null && lastBooleanOp != null && !nextWord.equals(lastBooleanOp) && ("AND".equals(nextWord) || "OR".equals(nextWord) || "NOT".equals(nextWord))) { flagValue = TERM_PRECEDES_NEW_BOOLEAN_OPERATOR_FLAG; //...unless the 1st boolean operator is a NOT, because only AND/OR can be default. } else if (nextWord != null && lastBooleanOp == null && !nextWord.equals(lastBooleanOp) && ("NOT".equals(nextWord))) { flagValue = TERM_PRECEDES_NEW_BOOLEAN_OPERATOR_FLAG; } try { analyze(result, word, startIndex, flagValue); } catch (IOException e) { // TODO: shouldn't we log something? } } if(lastBooleanOp != null) { for(Token t : result) { int f = t.getFlags(); t.setFlags(f |= QueryConverter.TERM_IN_BOOLEAN_QUERY_FLAG); } } return result; } protected void analyze(Collection<Token> result, String text, int offset, int flagsAttValue) throws IOException { TokenStream stream = analyzer.tokenStream("", text); // TODO: support custom attributes CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class); TypeAttribute typeAtt = stream.addAttribute(TypeAttribute.class); PayloadAttribute payloadAtt = stream.addAttribute(PayloadAttribute.class); PositionIncrementAttribute posIncAtt = stream.addAttribute(PositionIncrementAttribute.class); OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class); stream.reset(); while (stream.incrementToken()) { Token token = new Token(); token.copyBuffer(termAtt.buffer(), 0, termAtt.length()); token.setOffset(offset + offsetAtt.startOffset(), offset + offsetAtt.endOffset()); token.setFlags(flagsAttValue); //overwriting any flags already set... token.setType(typeAtt.type()); token.setPayload(payloadAtt.getPayload()); token.setPositionIncrement(posIncAtt.getPositionIncrement()); result.add(token); } stream.end(); stream.close(); } }