package org.apache.lucene.queryParser.analyzing;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.Version;
/**
 * Overrides Lucene's default QueryParser so that Fuzzy-, Prefix-, Range-, and WildcardQueries
* are also passed through the given analyzer, but wild card characters (like <code>*</code>)
* don't get removed from the search terms.
*
* <p><b>Warning:</b> This class should only be used with analyzers that do not use stopwords
* or that add tokens. Also, several stemming analyzers are inappropriate: for example, GermanAnalyzer
* will turn <code>Häuser</code> into <code>hau</code>, but <code>H?user</code> will
* become <code>h?user</code> when using this parser and thus no match would be found (i.e.
* using this parser will be no improvement over QueryParser in such cases).
*
* @version $Revision$, $Date$
*/
public class AnalyzingQueryParser extends org.apache.lucene.queryParser.QueryParser {

  /**
   * Constructs a query parser.
   *
   * @param matchVersion Lucene version to match.
   * @param field the default field for query terms.
   * @param analyzer used to find terms in the query text.
   */
  public AnalyzingQueryParser(Version matchVersion, String field, Analyzer analyzer) {
    super(matchVersion, field, analyzer);
  }

  /**
   * Called when the parser parses an input term token that contains one or more wildcard
   * characters (like <code>*</code>), but is not a prefix term token (one
   * that has just a single * character at the end).
   * <p>
   * Example: will be called for <code>H?user</code> or for <code>H*user</code>.
   * <p>
   * Depending on analyzer and settings, a wildcard term may (most probably will)
   * be lower-cased automatically. It <b>will</b> go through the default Analyzer.
   * <p>
   * Overrides the superclass by passing the term through the analyzer while
   * keeping the wildcard characters in place.
   *
   * @param field Name of the field query will use.
   * @param termStr Term token that contains one or more wild card
   *   characters (? or *), but is not a simple prefix term
   *
   * @return Resulting {@link Query} built for the term, or {@code null} if the
   *   analyzer consumed every token (e.g. a stopword)
   * @throws ParseException if the analyzer added or removed tokens, making it
   *   impossible to line the tokens back up with the wildcards
   */
  @Override
  protected Query getWildcardQuery(String field, String termStr) throws ParseException {
    /* Split termStr into alternating text chunks (tlist) and wildcard runs (wlist),
     * so the wildcards can be spliced back in after the text has been analyzed.
     * Remember whether the term starts with a wildcard: that determines the
     * interleaving order when rebuilding (a leading wildcard is only reachable
     * when setAllowLeadingWildcard(true) has been called on this parser). */
    List<String> tlist = new ArrayList<String>();
    List<String> wlist = new ArrayList<String>();
    boolean leadingWildcard = termStr.startsWith("?") || termStr.startsWith("*");
    boolean isWithinToken = !leadingWildcard;
    StringBuilder tmpBuffer = new StringBuilder();
    char[] chars = termStr.toCharArray();
    for (int i = 0; i < chars.length; i++) {
      boolean isWildcardChar = (chars[i] == '?' || chars[i] == '*');
      if (isWildcardChar == isWithinToken) {
        // Chunk type switches here: flush the chunk that just ended.
        (isWithinToken ? tlist : wlist).add(tmpBuffer.toString());
        tmpBuffer.setLength(0);
        isWithinToken = !isWildcardChar;
      }
      tmpBuffer.append(chars[i]);
    }
    // Flush the trailing chunk.
    (isWithinToken ? tlist : wlist).add(tmpBuffer.toString());

    // Run the whole term through the analyzer and substitute the produced
    // tokens for the text chunks, position by position.
    TokenStream source = getAnalyzer().tokenStream(field, new StringReader(termStr));
    CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
    int countTokens = 0;
    while (true) {
      try {
        if (!source.incrementToken()) break;
      } catch (IOException e) {
        break; // best effort: treat an analysis failure as end-of-stream
      }
      String term = termAtt.toString();
      if (term.length() == 0) {
        continue;
      }
      if (countTokens >= 0 && countTokens < tlist.size()) {
        tlist.set(countTokens++, term);
      } else {
        // The analyzer produced more tokens than there are text chunks;
        // flag it so the size check below fails.
        countTokens = -1;
      }
    }
    try {
      source.close();
    } catch (IOException e) {
      // ignore: nothing sensible to do if close fails
    }
    if (countTokens != tlist.size()) {
      /* The analyzer either added or consumed (common for a stemmer) tokens,
       * so the tokens can't be mapped back onto the wildcard layout. */
      throw new ParseException("Cannot build WildcardQuery with analyzer "
          + getAnalyzer().getClass() + " - tokens added or lost");
    }
    if (tlist.isEmpty()) {
      return null; // every token was consumed (e.g. a stopword)
    }
    if (wlist.isEmpty()) {
      /* We should never get here: this method is only called for terms
       * that actually contain a wildcard character. */
      throw new IllegalArgumentException("getWildcardQuery called without wildcard");
    }
    /* Rebuild a single term by interleaving analyzed chunks and wildcard runs
     * in their original order. Starting with the wildcard side when the term
     * had a leading wildcard keeps e.g. "*user" from being rebuilt as "user*". */
    StringBuilder sb = new StringBuilder();
    int t = 0;
    int w = 0;
    boolean wildcardTurn = leadingWildcard;
    while (t < tlist.size() || w < wlist.size()) {
      if (wildcardTurn) {
        if (w < wlist.size()) {
          sb.append(wlist.get(w++));
        }
      } else {
        if (t < tlist.size()) {
          sb.append(tlist.get(t++));
        }
      }
      wildcardTurn = !wildcardTurn;
    }
    return super.getWildcardQuery(field, sb.toString());
  }

  /**
   * Called when the parser parses an input term
   * token that uses prefix notation; that is, contains a single '*' wildcard
   * character as its last character. Since this is a special case
   * of generic wildcard term, and such a query can be optimized easily,
   * this usually results in a different query object.
   * <p>
   * Depending on analyzer and settings, a prefix term may (most probably will)
   * be lower-cased automatically. It <b>will</b> go through the default Analyzer.
   * <p>
   * Overrides the superclass by passing the term through the analyzer.
   *
   * @param field Name of the field query will use.
   * @param termStr Term token to use for building term for the query
   *   (<b>without</b> trailing '*' character!)
   *
   * @return Resulting {@link Query} built for the term
   * @throws ParseException if the analyzer did not produce exactly one token
   */
  @Override
  protected Query getPrefixQuery(String field, String termStr) throws ParseException {
    // Tokenize the term with the analyzer from the superclass.
    TokenStream source = getAnalyzer().tokenStream(field, new StringReader(termStr));
    CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
    List<String> tlist = new ArrayList<String>();
    while (true) {
      try {
        if (!source.incrementToken()) break;
      } catch (IOException e) {
        break; // best effort: treat an analysis failure as end-of-stream
      }
      tlist.add(termAtt.toString());
    }
    try {
      source.close();
    } catch (IOException e) {
      // ignore: nothing sensible to do if close fails
    }
    if (tlist.size() == 1) {
      return super.getPrefixQuery(field, tlist.get(0));
    }
    /* The analyzer either added or consumed (common for a stemmer) tokens,
     * so a PrefixQuery can't be built. */
    throw new ParseException("Cannot build PrefixQuery with analyzer "
        + getAnalyzer().getClass()
        + (tlist.size() > 1 ? " - token(s) added" : " - token consumed"));
  }

  /**
   * Called when the parser parses an input term token that has the fuzzy suffix (~) appended.
   * <p>
   * Depending on analyzer and settings, a fuzzy term may (most probably will)
   * be lower-cased automatically. It <b>will</b> go through the default Analyzer.
   * <p>
   * Overrides the superclass by passing the term through the analyzer.
   *
   * @param field Name of the field query will use.
   * @param termStr Term token to use for building term for the query
   * @param minSimilarity minimum similarity for the fuzzy match
   *
   * @return Resulting {@link Query} built for the term, or {@code null} if the
   *   analyzer produced no token
   * @exception ParseException if the analyzer produced more than one token
   */
  @Override
  protected Query getFuzzyQuery(String field, String termStr, float minSimilarity)
      throws ParseException {
    String token = analyzeSingleToken(field, termStr, "FuzzyQuery", "");
    return (token == null) ? null : super.getFuzzyQuery(field, token, minSimilarity);
  }

  /**
   * Overrides the superclass by passing both range endpoints through the analyzer.
   *
   * @param field Name of the field query will use.
   * @param part1 lower endpoint text (used as-is if the analyzer yields no token)
   * @param part2 upper endpoint text (used as-is if the analyzer yields no token)
   * @param inclusive whether the endpoints are included in the range
   * @return Resulting {@link Query} built for the range
   * @exception ParseException if the analyzer produced more than one token
   *   for either endpoint
   */
  @Override
  protected Query getRangeQuery(String field, String part1, String part2, boolean inclusive)
      throws ParseException {
    String lower = analyzeSingleToken(field, part1, "RangeQuery", " to part1");
    if (lower != null) {
      part1 = lower;
    }
    String upper = analyzeSingleToken(field, part2, "RangeQuery", " to part2");
    if (upper != null) {
      part2 = upper;
    }
    return super.getRangeQuery(field, part1, part2, inclusive);
  }

  /**
   * Analyzes {@code text} with this parser's analyzer, expecting at most one token.
   * Shared by {@link #getFuzzyQuery} and {@link #getRangeQuery}, which previously
   * duplicated this logic.
   *
   * @param field field name the token stream is created for
   * @param text raw text to analyze
   * @param queryName query class name used in the error message (e.g. "FuzzyQuery")
   * @param errorSuffix extra text appended to the error message (e.g. " to part1")
   * @return the single token produced, or {@code null} if analysis produced no
   *   token or failed with an {@link IOException} (best effort)
   * @throws ParseException if the analyzer produced more than one token
   */
  private String analyzeSingleToken(String field, String text, String queryName,
      String errorSuffix) throws ParseException {
    TokenStream source = getAnalyzer().tokenStream(field, new StringReader(text));
    CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
    String token = null;
    boolean multipleTokens = false;
    try {
      if (source.incrementToken()) {
        token = termAtt.toString();
      }
      multipleTokens = source.incrementToken();
    } catch (IOException e) {
      token = null; // best effort: caller falls back to the unanalyzed text
    }
    try {
      source.close();
    } catch (IOException e) {
      // ignore: nothing sensible to do if close fails
    }
    if (multipleTokens) {
      throw new ParseException("Cannot build " + queryName + " with analyzer "
          + getAnalyzer().getClass() + " - tokens were added" + errorSuffix);
    }
    return token;
  }
}