package org.apache.lucene.queryparser.analyzing;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.Version;
/**
* Overrides Lucene's default QueryParser so that Fuzzy-, Prefix-, Range-, and WildcardQuerys
* are also passed through the given analyzer, but wildcard characters <code>*</code> and
* <code>?</code> don't get removed from the search terms.
*
* <p><b>Warning:</b> This class should only be used with analyzers that do not use stopwords
* or that add tokens. Also, several stemming analyzers are inappropriate: for example, GermanAnalyzer
* will turn <code>Häuser</code> into <code>hau</code>, but <code>H?user</code> will
* become <code>h?user</code> when using this parser and thus no match would be found (i.e.
* using this parser will be no improvement over QueryParser in such cases).
*/
public class AnalyzingQueryParser extends org.apache.lucene.queryparser.classic.QueryParser {
  // Gobble escaped chars or find a run of wildcard characters.
  // Group 1 matches an escape sequence (a backslash followed by any character,
  // e.g. "\*") so it can be skipped; group 2 matches one or more unescaped
  // wildcard characters (? or *).
  // Note: the Java string "(\\\\.)" compiles to the regex "(\\.)" — backslash
  // plus any char. The previous pattern "(\\.)" matched only a literal dot,
  // so escaped wildcards such as "\*" were incorrectly treated as wildcards.
  // The pattern is immutable, so compile it once per class, not per instance.
  private static final Pattern WILDCARD_PATTERN = Pattern.compile("(\\\\.)|([?*]+)");

  /**
   * Creates a parser that, unlike the classic {@code QueryParser}, also runs
   * fuzzy, prefix, range, and wildcard terms through the given analyzer.
   *
   * @param matchVersion Lucene version to enable backwards-compatible behavior for
   * @param field default field for query terms
   * @param analyzer analyzer applied to all terms, including wildcard/fuzzy/prefix terms
   */
  public AnalyzingQueryParser(Version matchVersion, String field, Analyzer analyzer) {
    super(matchVersion, field, analyzer);
    // Range endpoints should be analyzed too, consistent with this parser's purpose.
    setAnalyzeRangeTerms(true);
  }

  /**
   * Called when parser parses an input term that contains one or more wildcard
   * characters (like <code>*</code>), but is not a prefix term (one that has
   * just a single <code>*</code> character at the end).
   * <p>
   * Example: will be called for <code>H?user</code> or for <code>H*user</code>.
   * <p>
   * Depending on analyzer and settings, a wildcard term may (most probably will)
   * be lower-cased automatically. It <b>will</b> go through the default Analyzer.
   * <p>
   * Overrides super class, by passing terms through analyzer: the text between
   * wildcard characters is analyzed chunk by chunk, while the wildcard
   * characters themselves are passed through untouched.
   *
   * @param field Name of the field query will use.
   * @param termStr Term that contains one or more wildcard
   *                characters (? or *), but is not simple prefix term
   *
   * @return Resulting {@link Query} built for the term
   * @throws ParseException if {@code termStr} is null, starts with a disallowed
   *                        leading wildcard, or a chunk fails analysis
   */
  @Override
  protected Query getWildcardQuery(String field, String termStr) throws ParseException {
    if (termStr == null){
      //can't imagine this would ever happen
      throw new ParseException("Passed null value as term to getWildcardQuery");
    }
    if ( ! getAllowLeadingWildcard() && (termStr.startsWith("*") || termStr.startsWith("?"))) {
      throw new ParseException("'*' or '?' not allowed as first character in WildcardQuery"
          + " unless getAllowLeadingWildcard() returns true");
    }

    Matcher wildcardMatcher = WILDCARD_PATTERN.matcher(termStr);
    StringBuilder sb = new StringBuilder();
    int last = 0;  // start of the next not-yet-analyzed chunk

    while (wildcardMatcher.find()){
      // Escaped char (group 1): leave it inside the current chunk; it will be
      // analyzed together with the surrounding text. Note that `last` is not
      // advanced here on purpose.
      if (wildcardMatcher.group(1) != null){
        continue;
      }

      // Analyze the text preceding this run of wildcards, if any.
      if (wildcardMatcher.start() > 0){
        String chunk = termStr.substring(last, wildcardMatcher.start());
        String analyzed = analyzeSingleChunk(field, termStr, chunk);
        sb.append(analyzed);
      }
      //append the wildcard character
      sb.append(wildcardMatcher.group(2));

      last = wildcardMatcher.end();
    }
    // Trailing chunk after the final wildcard (or the whole term if no match).
    if (last < termStr.length()){
      sb.append(analyzeSingleChunk(field, termStr, termStr.substring(last)));
    }
    return super.getWildcardQuery(field, sb.toString());
  }

  /**
   * Called when parser parses an input term
   * that uses prefix notation; that is, contains a single '*' wildcard
   * character as its last character. Since this is a special case
   * of generic wildcard term, and such a query can be optimized easily,
   * this usually results in a different query object.
   * <p>
   * Depending on analyzer and settings, a prefix term may (most probably will)
   * be lower-cased automatically. It <b>will</b> go through the default Analyzer.
   * <p>
   * Overrides super class, by passing terms through analyzer.
   *
   * @param field Name of the field query will use.
   * @param termStr Term to use for building term for the query
   *                (<b>without</b> trailing '*' character!)
   *
   * @return Resulting {@link Query} built for the term
   * @throws ParseException if the analyzer returns other than one token for the term
   */
  @Override
  protected Query getPrefixQuery(String field, String termStr) throws ParseException {
    String analyzed = analyzeSingleChunk(field, termStr, termStr);
    return super.getPrefixQuery(field, analyzed);
  }

  /**
   * Called when parser parses an input term that has the fuzzy suffix (~) appended.
   * <p>
   * Depending on analyzer and settings, a fuzzy term may (most probably will)
   * be lower-cased automatically. It <b>will</b> go through the default Analyzer.
   * <p>
   * Overrides super class, by passing terms through analyzer.
   *
   * @param field Name of the field query will use.
   * @param termStr Term to use for building term for the query
   * @param minSimilarity minimum similarity for the fuzzy match
   *
   * @return Resulting {@link Query} built for the term
   * @throws ParseException if the analyzer returns other than one token for the term
   */
  @Override
  protected Query getFuzzyQuery(String field, String termStr, float minSimilarity)
      throws ParseException {
    String analyzed = analyzeSingleChunk(field, termStr, termStr);
    return super.getFuzzyQuery(field, analyzed, minSimilarity);
  }

  /**
   * Returns the analyzed form for the given chunk
   *
   * If the analyzer produces more than one output token from the given chunk,
   * a ParseException is thrown.
   *
   * @param field The target field
   * @param termStr The full term from which the given chunk is excerpted
   * @param chunk The portion of the given termStr to be analyzed
   * @return The result of analyzing the given chunk
   * @throws ParseException when analysis returns other than one output token
   */
  protected String analyzeSingleChunk(String field, String termStr, String chunk) throws ParseException{
    String analyzed = null;
    try (TokenStream stream = getAnalyzer().tokenStream(field, chunk)) {
      stream.reset();
      CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
      // get first and hopefully only output token
      if (stream.incrementToken()) {
        analyzed = termAtt.toString();

        // try to increment again, there should only be one output token.
        // Collect any extras so the error message can list them all.
        StringBuilder multipleOutputs = null;
        while (stream.incrementToken()) {
          if (null == multipleOutputs) {
            multipleOutputs = new StringBuilder();
            multipleOutputs.append('"');
            multipleOutputs.append(analyzed);
            multipleOutputs.append('"');
          }
          multipleOutputs.append(',');
          multipleOutputs.append('"');
          multipleOutputs.append(termAtt.toString());
          multipleOutputs.append('"');
        }
        stream.end();
        if (null != multipleOutputs) {
          throw new ParseException(
              String.format(getLocale(),
                  "Analyzer created multiple terms for \"%s\": %s", chunk, multipleOutputs.toString()));
        }
      } else {
        // nothing returned by analyzer. Was it a stop word and the user accidentally
        // used an analyzer with stop words?
        stream.end();
        throw new ParseException(String.format(getLocale(), "Analyzer returned nothing for \"%s\"", chunk));
      }
    } catch (IOException e){
      // ParseException has no cause-taking constructor, so chain the cause
      // explicitly rather than dropping the underlying I/O failure.
      ParseException parseException = new ParseException(
          String.format(getLocale(), "IO error while trying to analyze single term: \"%s\"", termStr));
      parseException.initCause(e);
      throw parseException;
    }
    return analyzed;
  }
}