package org.apache.lucene.spelt; /** * Copyright 2007 The Apache Software Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.File; import java.io.IOException; import java.io.StringReader; import java.util.HashMap; import java.util.HashSet; import java.util.LinkedHashSet; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.index.Term; import org.apache.lucene.queryParser.ParseException; import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.search.Query; /** * Handles spelling correction for simple queries produced by the Lucene * {@link QueryParser}. Allows a custom {@link QueryParser} to be supplied, * though it must retain the case of the input tokens, so that we can supply * spelling corrections using the same case the user did. * * @author Martin Haye */ public class QuerySpeller extends SimpleQueryRewriter { /** Used to get spelling suggestions */ private SpellReader spellReader; /** Set of fields we're allowed to collect terms for */ private HashSet<String> fieldSet; /** List of terms collected */ private LinkedHashSet<String> terms; /** Mapping of terms to replace */ private HashMap<String, String> suggestMap; /** Used to parse queries */ private QueryParser queryParser; /** * Construct a new speller using a given dictionary reader. The queries * will be parsed with a {@link MinimalAnalyzer}, and the default field * name will be "text". * * @param spellReader source for spelling suggestions -- see * {@link SpellReader#open(File)}. */ public QuerySpeller(SpellReader spellReader) { this(spellReader, new QueryParser("text", new MinimalAnalyzer())); } /** * Construct a new speller using a given dictionary reader and analyzer (note * that the analyzer should do MINIMAL token filtering, without any case * conversion). * * @param spellReader source for spelling suggestions -- see * {@link SpellReader#open(File)}. * @param queryParser used to parse queries; note that the analyzer it uses * should do only MINIMAL token filtering, not even conversion to * lower case, so that suggestions can be made in the same case the * user typed them. In particular, StandardAnalyzer should not be * used. */ public QuerySpeller(SpellReader spellReader, QueryParser queryParser) { this.spellReader = spellReader; this.queryParser = queryParser; // Test out the query parser's analyzer to make sure it preserves // the case of input tokens. // validateAnalyzer(); } /** * Make sure the analyzer preserves the case of input tokens. If it didn't, * we would be unable to make spelling suggestions that match the case of * user queries. */ private void validateAnalyzer() { TokenStream toks = queryParser.getAnalyzer().tokenStream( queryParser.getField(), new StringReader("MixedCaseToken")); try { Token t; while ((t = toks.next()) != null) { if (t.termText().equals("MixedCaseToken")) return; } } catch (IOException e) { throw new RuntimeException(e); } throw new IllegalArgumentException( "Unacceptable analyzer passed to QuerySpeller - must not convert to lower case"); } /** * Suggest alternate spellings for terms in a Lucene query. By default, * we consider terms in any field. If you need to specify a subset of fields * to consider, use the * {@linkplain #suggest(String, String[]) alternate method} below. * * @param inQuery the original query to scan * @return an query with some suggested spelling corrections, or * null if no suggestions could be found. */ public synchronized String suggest(String inQuery) throws ParseException, IOException { return suggest(inQuery, null); } /** * Suggest alternate spellings for terms in a Lucene query, limiting * suggestions to the specified fields only. * * @param inQuery the original query to scan * @param fields to consider for correction, or null for all * @return a query with some suggested spelling corrections, or * null if no suggestions could be found. */ public synchronized String suggest(String inQuery, String[] fields) throws ParseException, IOException { // Record the set of fields to consider. if (fields == null) fieldSet = null; else { fieldSet = new HashSet(fields.length); for (String f : fields) fieldSet.add(f); } // Okay, traverse the query once, but don't make any changes. Just collect // the terms. // Query inQueryParsed = queryParser.parse(inQuery); suggestMap = new HashMap<String, String>(); terms = new LinkedHashSet<String>(); rewriteQuery(inQueryParsed); // No terms found? Then we can't make a suggestion. if (terms.isEmpty()) return null; // Get some suggestions for these terms. If none found, we're outta here. String[] oldTerms = terms.toArray(new String[0]); String[] suggTerms = spellReader.suggestKeywords(oldTerms); if (suggTerms == null) return null; // Make a mapping of the suggestions. for (int i=0; i<oldTerms.length; i++) suggestMap.put(oldTerms[i], suggTerms[i]); // Rewrite the query, replacing the suggested words. Query rewritten = rewriteQuery(inQueryParsed); // Finally, convert the query back to a string, and we're done. return rewritten.toString(queryParser.getField()); } /** This is the way we slip in to grab or rewrite terms */ protected @Override Term rewrite(Term t) { // Skip fields we're not supposed to look at. if (fieldSet != null && !fieldSet.contains(t.field())) return t; // Add this term to our accumulating list (if it's not already there) String text = t.text(); terms.add(text); // If there's a suggestion, implement it. if (suggestMap.containsKey(text)) { String suggText = suggestMap.get(text); if (suggText != null) return new Term(t.field(), suggText); else return null; } else return t; } }