/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.nutch.spell; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.lucene.search.IndexSearcher; import org.apache.nutch.searcher.Query; import org.apache.nutch.util.LogUtil; import org.apache.nutch.util.NutchConfiguration; /** * Parses queries and sends them to NGramSpeller for spell checking. * * @author Andy Liu <andyliu1227@gmail.com> */ public class SpellCheckerBean { public static final Log LOG = LogFactory.getLog(SpellCheckerBean.class); IndexSearcher spellingSearcher; // // Configuration parameters used by NGramSpeller . Hardcoded for now. // final int minThreshold = 5; final int ng1 = 3; final int ng2 = 4; final int maxr = 10; final int maxd = 5; final float bStart = 2.0f; final float bEnd = 1.0f; final float bTransposition = 6.5f; // configuration variable names public static final String SPELLING_INDEX_LOCATION = "spell.index.dir"; public static final String SPELLING_DOCFREQ_THRESHOLD = "spell.docfreq.threshold"; public static final String SPELLING_DOCFREQ_THRESHOLD_FACTOR = "spell.docfreq.threshold.factor"; String indexLocation; int threshold; int thresholdFactor; Configuration conf; public SpellCheckerBean(Configuration conf) { this.conf=conf; indexLocation = conf.get(SPELLING_INDEX_LOCATION, "./spelling"); threshold = conf.getInt(SPELLING_DOCFREQ_THRESHOLD, 100); thresholdFactor = conf.getInt(SPELLING_DOCFREQ_THRESHOLD_FACTOR, 10); try { spellingSearcher = new IndexSearcher(indexLocation); } catch (IOException ioe) { LOG.info("error opening spell checking index"); ioe.printStackTrace(LogUtil.getInfoStream(LOG)); } } /** Cache in Configuration. */ public static SpellCheckerBean get(Configuration conf) { SpellCheckerBean spellCheckerBean = (SpellCheckerBean) conf .getObject(SpellCheckerBean.class.getName()); if (spellCheckerBean == null) { LOG.info("creating new spell checker bean"); spellCheckerBean = new SpellCheckerBean(conf); conf.setObject(SpellCheckerBean.class.getName(), spellCheckerBean); } return spellCheckerBean; } public SpellCheckerTerms checkSpelling(Query query, String queryString) { return checkSpelling(query, queryString, threshold, thresholdFactor); } /** * Parses original query, retrieves suggestions from ngrams spelling index * * @param originalQuery * Query to be spell-checked * @param docFreqThreshold * Terms in query that have a docFreq lower than this threshold * qualify as "mispelled" * @param factorThreshold * The suggested term must have a docFreq at least factorThreshold * times more than the mispelled term. Set to 1 to disable. * @return terms with corrected spelling */ public SpellCheckerTerms checkSpelling(Query query, String queryString, int docFreqThreshold, int factorThreshold) { SpellCheckerTerms spellCheckerTerms = null; try { spellCheckerTerms = parseOriginalQuery(query, queryString); for (int i = 0; i < spellCheckerTerms.size(); i++) { SpellCheckerTerm currentTerm = spellCheckerTerms.getSpellCheckerTerm(i); String originalTerm = currentTerm.getOriginalTerm(); spellCheckerTerms.getSpellCheckerTerm(i).setOriginalDocFreq( getDocFreq(originalTerm)); // // Spell checking is not effective for words under 4 letters long // Any words over 25 letters long isn't worth checking either. // if (originalTerm.length() < 4) continue; if (originalTerm.length() > 25) continue; List lis = new ArrayList(maxr); String[] suggestions = NGramSpeller.suggestUsingNGrams(spellingSearcher , originalTerm, ng1, ng2, maxr, bStart, bEnd, bTransposition, maxd, lis, true); if (suggestions.length > 0) { currentTerm.setSuggestedTerm(suggestions[0]); if (lis != null) { NGramSpeller.SpellSuggestionDetails detail = (NGramSpeller.SpellSuggestionDetails) lis .get(0); currentTerm.setSuggestedTermDocFreq(detail.docFreq); } // We use document frequencies of the original term and the suggested // term to guess // whether or not a term is mispelled. The criteria is as follows: // // 1. The term's document frequency must be under a constant threshold // 2. The suggested term's docFreq must be greater than the original // term's docFreq * constant factor // if ((currentTerm.originalDocFreq < docFreqThreshold) && ((currentTerm.originalDocFreq * factorThreshold) < (currentTerm.suggestedTermDocFreq))) { spellCheckerTerms.setHasMispelledTerms(true); currentTerm.setMispelled(true); } } } } catch (Throwable t) { t.printStackTrace(); } return spellCheckerTerms; } /** * * Parses the query and preserves characters and formatting surrounding terms * to be spell-checked. This is done so that we can present the query in the * "Did you mean: XYZ" message in the same format the user originally typed * it. * * @param originalQuery * text to be parsed * @return spell checker terms */ public SpellCheckerTerms parseOriginalQuery(Query query, String queryString) throws IOException { String[] terms = query.getTerms(); SpellCheckerTerms spellCheckerTerms = new SpellCheckerTerms(); int previousTermPos = 0; for (int i = 0; i < terms.length; i++) { int termPos = queryString.toLowerCase().indexOf(terms[i]); String charsBefore = ""; String charsAfter = ""; // Is this the first term? If so, we need to check for characters // before the first term. if (i == 0) { if (termPos > 0) { charsBefore = queryString.substring(0, termPos); } // We're in-between terms... } else { int endOfLastTerm = previousTermPos + terms[i - 1].length(); if (endOfLastTerm < termPos) { charsBefore = queryString.substring(endOfLastTerm, termPos); } } // Is this the last term? If so, we need to check for characters // after the last term. if (i == (terms.length - 1)) { int endOfCurrentTerm = termPos + terms[i].length(); if (endOfCurrentTerm < queryString.length()) { charsAfter = queryString.substring(endOfCurrentTerm, queryString .length()); } } previousTermPos = termPos; spellCheckerTerms.add(new SpellCheckerTerm(terms[i], charsBefore, charsAfter)); } return spellCheckerTerms; } public SpellCheckerTerms parseOriginalQuery(String queryString) throws IOException { return parseOriginalQuery(Query.parse(queryString, conf), queryString); } /** * Retrieves docFreq as stored within spelling index. Alternatively, we could * simply consult the main index for a docFreq() of a term (which would be * faster) but it's nice to have a separate, spelling index that can stand on * its own. * * @param term * @return document frequency of term */ private int getDocFreq(String term) throws IOException { /* * Hits hits = this.spellingSearcher.getLuceneSearcher().search(new * TermQuery(new Term( NGramSpeller.F_WORD, term))); if (hits.length() > 0) { * Document doc = hits.doc(0); String docFreq = * doc.get(NGramSpeller.F_FREQ); return Integer.parseInt(docFreq); } */ return 0; } public static void main(String[] args) throws Throwable { if (args.length < 1) { System.out.println("usage: SpellCheckerBean [ngrams spelling index]"); return; } Configuration conf = NutchConfiguration.create(); conf.set("spell.index.dir", args[0]); SpellCheckerBean checker = new SpellCheckerBean(conf); BufferedReader in = new BufferedReader(new InputStreamReader(System.in)); String line; while ((line = in.readLine()) != null) { Query query = Query.parse(line, conf); SpellCheckerTerms terms = checker.checkSpelling(query, line); StringBuffer buf = new StringBuffer(); for (int i = 0; i < terms.size(); i++) { SpellCheckerTerm currentTerm = terms.getSpellCheckerTerm(i); buf.append(currentTerm.getCharsBefore()); if (currentTerm.isMispelled()) { buf.append(currentTerm.getSuggestedTerm()); } else { buf.append(currentTerm.getOriginalTerm()); } } System.out.println("Spell checked: " + buf); } } public void init() { //do initialization here } public String[] suggest(Query query) { // TODO Auto-generated method stub return null; } public String getID() { return "SPELLER"; } }