/*
Jazzy - a Java library for Spell Checking
Copyright (C) 2001 Mindaugas Idzelis
Full text of license can be found in LICENSE.txt

This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.

This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
/* Created by bgalbs on Jan 30, 2003 at 11:45:25 PM */
package com.swabunga.spell.engine;

import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.security.InvalidParameterException;
import java.util.Collections;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Vector;

/**
 * Container for various methods that any <code>SpellDictionary</code> will use. Based on the original Jazzy
 * <a href="http://aspell.net/">aspell</a> port.
 * <p>
 * Subclasses supply the actual word storage through {@link #getWords(String)}; this class implements
 * suggestion generation and correctness checks on top of it using phonetic codes.
 */
public abstract class SpellDictionaryASpell implements SpellDictionary {

    /** The reference to a Transformator, used to transform a word into its phonetic code. */
    protected Transformator tf;

    /**
     * Creates the dictionary with a phonetic transformator.
     *
     * @param phonetic
     *            the phonetic rules file; when <code>null</code>, a {@link DoubleMeta} transformator is used
     * @throws IOException
     *             declared for failures while setting up the transformator from the file
     */
    public SpellDictionaryASpell(File phonetic) throws IOException {
        if (phonetic == null) {
            tf = new DoubleMeta();
        } else {
            tf = new GenericTransformator(phonetic);
        }
    }

    /**
     * Creates the dictionary with a phonetic transformator.
     *
     * @param phonetic
     *            the phonetic rules file; when <code>null</code>, a {@link DoubleMeta} transformator is used
     * @param encoding
     *            the encoding handed to the {@link GenericTransformator} together with the file
     * @throws IOException
     *             declared for failures while setting up the transformator from the file
     */
    public SpellDictionaryASpell(File phonetic, String encoding) throws IOException {
        if (phonetic == null) {
            tf = new DoubleMeta();
        } else {
            tf = new GenericTransformator(phonetic, encoding);
        }
    }

    /**
     * Creates the dictionary with a phonetic transformator.
     *
     * @param phonetic
     *            a reader over the phonetic rules; when <code>null</code>, a {@link DoubleMeta} transformator is
     *            used
     * @throws IOException
     *             declared for failures while setting up the transformator from the reader
     */
    public SpellDictionaryASpell(Reader phonetic) throws IOException {
        if (phonetic == null) {
            tf = new DoubleMeta();
        } else {
            tf = new GenericTransformator(phonetic);
        }
    }

    /**
     * Returns a list of Word objects that are the suggestions to an incorrect word.
     * <p>
     * Words sharing the phonetic code of <code>word</code> are listed first, followed by words whose phonetic
     * code matches a "near miss" of <code>word</code> (adjacent characters interchanged, one character changed,
     * one character added, one character deleted). Each group is sorted with the {@link Word} comparator. If
     * neither group yields anything, the closest words with the same phonetic code are returned instead.
     * <p>
     * NOTE: the <code>threshold</code> argument is currently not consulted by this implementation; candidates
     * are filtered against <code>Configuration.SPELL_THRESHOLD</code> instead.
     *
     * @param word
     *            the misspelt word to find suggestions for
     * @param threshold
     *            the lower boundary of similarity to the misspelt word (see note above)
     * @return a List of suggestions (a Vector), phonetic matches first
     */
    @Override
    public List<Word> getSuggestions(String word, int threshold) {
        String code = getCode(word);

        // add all words that have the same phonetics
        Hashtable<String, String> nearmisscodes = new Hashtable<String, String>();
        nearmisscodes.put(code, code);
        Vector<Word> phoneticList = getWordsFromCode(word, nearmisscodes);

        // do some transformations to pick up more results
        nearmisscodes = new Hashtable<String, String>();
        addInterchangeCodes(word, nearmisscodes);
        char[] replacelist = tf.getReplaceList();
        addChangeCodes(word, replacelist, nearmisscodes);
        addInsertCodes(word, replacelist, nearmisscodes);

        // From here on the whitespace-trimmed form of the word is used, both for the
        // deletion variants and for scoring the near-miss candidates.
        String trimmed = word.trim();
        addDeleteCodes(trimmed, nearmisscodes);

        nearmisscodes.remove(code); // already accounted for in phoneticList

        Vector<Word> wordlist = getWordsFromCode(trimmed, nearmisscodes);

        if (wordlist.size() == 0 && phoneticList.size() == 0) {
            addBestGuess(trimmed, phoneticList);
        }

        // We sort a Vector at the end instead of maintaining a
        // continuously sorted TreeSet because every time you add a collection
        // to a TreeSet it has to be resorted. It's better to do this operation
        // once at the end.
        Collections.sort(phoneticList, new Word()); // always sort phonetic matches along the top
        Collections.sort(wordlist, new Word()); // the non-phonetic matches can be listed below
        phoneticList.addAll(wordlist);
        return phoneticList;
    }

    /** Records the phonetic code of every variant of <code>word</code> with two adjacent characters swapped. */
    private void addInterchangeCodes(String word, Hashtable<String, String> codes) {
        char[] chars = word.toCharArray();
        for (int i = 0; i < chars.length - 1; i++) {
            char first = chars[i];
            char second = chars[i + 1];
            chars[i] = second;
            chars[i + 1] = first;
            String s = getCode(new String(chars));
            codes.put(s, s);
            // restore the original order before moving on to the next pair
            chars[i] = first;
            chars[i + 1] = second;
        }
    }

    /** Records the phonetic code of every variant of <code>word</code> with one character replaced. */
    private void addChangeCodes(String word, char[] replacelist, Hashtable<String, String> codes) {
        char[] chars = word.toCharArray();
        for (int i = 0; i < chars.length; i++) {
            char original = chars[i];
            for (int j = 0; j < replacelist.length; j++) {
                chars[i] = replacelist[j];
                String s = getCode(new String(chars));
                codes.put(s, s);
            }
            chars[i] = original;
        }
    }

    /** Records the phonetic code of every variant of <code>word</code> with one character inserted. */
    private void addInsertCodes(String word, char[] replacelist, Hashtable<String, String> codes) {
        // One extra slot at the end acts as the insertion gap; it is walked from the
        // last position down to the first by shifting the preceding character right.
        char[] chars = (word + " ").toCharArray();
        int gap = chars.length - 1;
        while (true) {
            for (int j = 0; j < replacelist.length; j++) {
                chars[gap] = replacelist[j];
                String s = getCode(new String(chars));
                codes.put(s, s);
            }
            if (gap == 0) {
                break;
            }
            chars[gap] = chars[gap - 1];
            --gap;
        }
    }

    /**
     * Records the phonetic code of every variant of <code>word</code> with one character deleted.
     * <p>
     * Words shorter than two characters are skipped: an empty word has nothing to delete, and the only
     * deletion of a one-character word is the empty string.
     */
    private void addDeleteCodes(String word, Hashtable<String, String> codes) {
        char[] full = word.toCharArray();
        if (full.length < 2) {
            return;
        }
        // Start with the last character removed, then move the "hole" towards the front:
        // "held" always contains the character that is currently left out.
        char[] shortened = new char[full.length - 1];
        System.arraycopy(full, 0, shortened, 0, shortened.length);
        char held = full[full.length - 1];
        int pos = shortened.length;
        while (true) {
            // encode the shortened variant (not the full word, whose code is already known)
            String s = getCode(new String(shortened));
            codes.put(s, s);
            if (pos == 0) {
                break;
            }
            char restored = held;
            held = shortened[pos - 1];
            shortened[pos - 1] = restored;
            --pos;
        }
    }

    /**
     * When we don't come up with any suggestions (probably because the threshold was too strict), then pick the
     * best guesses from those words that have the same phonetic code.
     *
     * @param word
     *            the word we are trying to spell correct
     * @param wordList
     *            the (empty) vector that will receive the best guesses
     * @throws InvalidParameterException
     *             if <code>wordList</code> is not empty
     */
    private void addBestGuess(String word, Vector<Word> wordList) {
        if (wordList.size() != 0) {
            throw new InvalidParameterException("the wordList vector must be empty");
        }

        int bestScore = Integer.MAX_VALUE;
        List<Word> candidates = new LinkedList<Word>();

        // keep every word that is at least as close as the best one seen so far
        for (String similar : getWords(getCode(word))) {
            int distance = EditDistance.getDistance(word, similar);
            if (distance <= bestScore) {
                bestScore = distance;
                candidates.add(new Word(similar, distance));
            }
        }

        // now, only pull out the guesses that had the best score
        for (Word candidate : candidates) {
            if (candidate.getCost() == bestScore) {
                wordList.add(candidate);
            }
        }
    }

    /**
     * Collects the dictionary words for the given phonetic codes whose edit distance to <code>word</code> is
     * strictly below <code>Configuration.SPELL_THRESHOLD</code>.
     *
     * @param word
     *            the word the candidates are compared against
     * @param codes
     *            the phonetic codes to look up (only the keys are used)
     * @return the matching words together with their distance, in lookup order
     */
    private Vector<Word> getWordsFromCode(String word, Hashtable<String, String> codes) {
        Configuration config = Configuration.getConfiguration();
        final int configDistance = config.getInteger(Configuration.SPELL_THRESHOLD);
        Vector<Word> result = new Vector<Word>();

        for (Enumeration<String> keys = codes.keys(); keys.hasMoreElements();) {
            String code = keys.nextElement();
            for (String similar : getWords(code)) {
                int distance = EditDistance.getDistance(word, similar);
                if (distance < configDistance) {
                    result.addElement(new Word(similar, distance));
                }
            }
        }
        return result;
    }

    /**
     * Returns the phonetic code representing the word.
     */
    public String getCode(String word) {
        return tf.transform(word);
    }

    /**
     * Returns a list of words that have the same phonetic code.
     */
    protected abstract List<String> getWords(String phoneticCode);

    /**
     * Returns true if the word is correctly spelled against the current word list.
     * <p>
     * The word is accepted when either the word itself or its lower-case form is among the words sharing its
     * phonetic code.
     */
    @Override
    public boolean isCorrect(String word) {
        List<String> possible = getWords(getCode(word));
        return possible.contains(word) || possible.contains(word.toLowerCase());
    }
}