/*
 * SpellDictionary.java example
 *
 * Explorer
 * ekit-master
 * - src
 *   - com
 */
/*
 * SpellDictionary: in-memory word list for the spell-check engine, indexed by
 * the phonetic code of each word.
 * formatted with JxBeauty (c) johann.langhofer@nextra.at
 */

package com.swabunga.spell.engine;

import java.io.*;
import java.util.*;

/**
 * The SpellDictionary class holds the instance of the dictionary.
 * <p>
 * Every word is stored in {@link #mainDictionary} under the code returned by
 * {@link #getCode(String)}, so all words that share a code live in the same
 * list.
 * </p>
 * <p>
 * This class performs no synchronization. The backing map is a plain
 * <code>HashMap</code> that {@link #addWord(String)} mutates, so callers must
 * synchronize externally if words are added while other threads use the
 * dictionary.
 * </p>
 * <p>
 * There are many open source dictionary files. For just a few see:
 * http://wordlist.sourceforge.net/
 * </p>
 * <p>
 * This dictionary class reads words one per line. Make sure that your word list
 * is formatted in this way (most are).
 * </p>
 */
public class SpellDictionary
{

	/**
	 * Replacement characters tried by the "change" and "add" transformations
	 * of {@link #getSuggestions(String, int)}.
	 */
	private static final char[] replacelist =
		{
			'A',
			'B',
			'X',
			'S',
			'K',
			'J',
			'T',
			'F',
			'H',
			'L',
			'M',
			'N',
			'P',
			'R',
			'0' };

	/**
	 * Initial capacity (16 * 1024 buckets, not bytes) of the main dictionary
	 * hash map.
	 */
	private final static int INITIAL_CAPACITY = 16 * 1024;

	/**
	 * The hashmap that contains the word dictionary. The key is the code
	 * returned by {@link #getCode(String)}; the value is a LinkedList of the
	 * words (Strings) that share that code.
	 */
	protected HashMap mainDictionary = new HashMap(INITIAL_CAPACITY);

	/** The Transformator used to turn a word into its code. */
	private Transformator tf = null;

	/**
	 * The dictionary file that {@link #addWord(String)} appends to, or
	 * <code>null</code> when the dictionary was not loaded from a file.
	 */
	private File dictFile = null;

	/**
	 * Builds a dictionary from a reader, using a DoubleMeta transformator.
	 * <p>
	 * The reader is read to the end but is not closed; closing it remains the
	 * caller's responsibility.
	 * </p>
	 * @param wordList source of words, one per line
	 * @throws IOException if reading from <code>wordList</code> fails
	 */
	public SpellDictionary(Reader wordList) throws IOException {
		tf = new DoubleMeta();
		createDictionary(new BufferedReader(wordList));
	}

	/**
	 * Dictionary Constructor for JAR files. The word list is looked up with
	 * <code>getClass().getResourceAsStream("dictionary/" + wordListResource)</code>.
	 * @author Howard Kistler
	 * @param wordListResource resource name below the <code>dictionary/</code> folder
	 * @throws FileNotFoundException if the resource cannot be found
	 * @throws IOException if reading the resource fails
	 */
	public SpellDictionary(String wordListResource) throws IOException
	{
		tf = new DoubleMeta();
		String resourceName = "dictionary/" + wordListResource;
		InputStream is = this.getClass().getResourceAsStream(resourceName);
		// getResourceAsStream reports a missing resource by returning null
		if (is == null) {
			throw new FileNotFoundException("Dictionary resource not found: " + resourceName);
		}
		try {
			createDictionary(new BufferedReader(new InputStreamReader(is)));
		} finally {
			is.close();
		}
	}

	/**
	 * Dictionary Convenience Constructor. Loads the word list from a file with
	 * a DoubleMeta transformator and remembers the file so that
	 * {@link #addWord(String)} can append to it.
	 * @param wordList file containing one word per line
	 * @throws FileNotFoundException if <code>wordList</code> cannot be opened
	 * @throws IOException if reading the file fails
	 */
	public SpellDictionary(File wordList)
		throws FileNotFoundException, IOException {
		tf = new DoubleMeta();
		loadFromFile(wordList);
		dictFile = wordList;
	}

	/**
	 * Dictionary constructor that uses an aspell phonetic file to
	 * build the transformation table.
	 * @param wordList file containing one word per line
	 * @param phonetic file handed to the GenericTransformator
	 * @throws FileNotFoundException if <code>wordList</code> cannot be opened
	 * @throws IOException if reading the word list fails
	 */
	public SpellDictionary(File wordList, File phonetic)
		throws FileNotFoundException, IOException {
		tf = new GenericTransformator(phonetic);
		dictFile = wordList;
		loadFromFile(wordList);
	}

	/**
	 * Reads every line of the given file into the dictionary and closes the
	 * file again, whether or not loading succeeds.
	 */
	private void loadFromFile(File wordList) throws IOException {
		BufferedReader in = new BufferedReader(new FileReader(wordList));
		try {
			createDictionary(in);
		} finally {
			in.close();
		}
	}

	/**
	 * Add a word permanently to the dictionary (and the dictionary file).
	 * <p>
	 * The word is always added to the in-memory dictionary. When the
	 * dictionary was loaded from a file the word is also appended to that
	 * file; a failure to write is reported on standard output and otherwise
	 * ignored.
	 * </p>
	 * <p>This needs to be made thread safe (synchronized)</p>
	 * @param word the word to add
	 */
	public void addWord(String word) {
		putWord(word);
		if (dictFile == null)
			return;
		try {
			// Open with append.
			FileWriter w = new FileWriter(dictFile.toString(), true);
			try {
				w.write(word);
				w.write("\n");
			} finally {
				// always release the file handle, even when a write fails
				w.close();
			}
		} catch (IOException ex) {
			System.out.println("Error writing to dictionary file");
		}
	}

	/**
	 * Constructs the dictionary from a word list.
	 * <p>
	 * Each word in the reader should be on a separate line. Every line,
	 * including an empty one, is passed to {@link #putWord(String)}.
	 * </p>
	 * @param in the reader to consume until end of input; it is not closed here
	 * @throws IOException if reading fails
	 */
	protected void createDictionary(BufferedReader in) throws IOException {
		String line;
		while ((line = in.readLine()) != null) {
			// Store a fresh copy of the line. NOTE(review): presumably this is
			// meant to avoid retaining an oversized backing buffer on old
			// JVMs -- confirm before removing.
			putWord(new String(line.toCharArray()));
		}
	}

	/**
	 * Returns the code representing the word, as produced by the
	 * Transformator this dictionary was built with.
	 */
	public String getCode(String word) {
		return tf.transform(word);
	}

	/**
	 * Stores a word in the in-memory dictionary under its code. Duplicates
	 * are not filtered.
	 */
	protected void putWord(String word) {
		String code = getCode(word);
		LinkedList list = (LinkedList) mainDictionary.get(code);
		if (list == null) {
			// first word seen for this code
			list = new LinkedList();
			mainDictionary.put(code, list);
		}
		list.add(word);
	}

	/**
	 * Returns a list of strings (words) for the code.
	 * <p>
	 * For a known code this is the dictionary's own list, not a copy, so
	 * callers must not modify it. For an unknown code a new empty list is
	 * returned.
	 * </p>
	 */
	public LinkedList getWords(String code) {
		//Check the main dictionary.
		LinkedList mainDictResult = (LinkedList) mainDictionary.get(code);
		if (mainDictResult == null)
			return new LinkedList();
		return mainDictResult;
	}

	/**
	 * Returns true if the word is correctly spelled against the current word list.
	 * <p>
	 * The word is looked up as given and, failing that, in lower case, so
	 * that capitalised forms of listed words are accepted. Both lookups use
	 * the list for the code of the word as given.
	 * </p>
	 */
	public boolean isCorrect(String word) {
		LinkedList possible = getWords(getCode(word));
		//JMH should we always try the lowercase version. If I dont then capitalised
		//words are always returned as incorrect.
		return possible.contains(word) || possible.contains(word.toLowerCase());
	}

	/**
	 * Returns a linked list of Word objects that are the suggestions to an
	 * incorrect word.
	 * <p>
	 * Candidate codes are collected from the word itself and from every
	 * variant produced by swapping two adjacent characters, replacing one
	 * character, inserting one character, or deleting one character. The
	 * words stored under those codes are then filtered by edit distance.
	 * </p>
	 * @param word Suggestions for given misspelt word
	 * @param threshold The lower boundary of similarity to misspelt word.
	 *        NOTE(review): this value is currently not consulted; the cut-off
	 *        is read from <code>Configuration.SPELL_THRESHOLD</code>. Confirm
	 *        with callers before switching to the parameter.
	 * @return LinkedList a List of suggestions, sorted with a
	 *         <code>Word</code> instance as the comparator
	 */
	public LinkedList getSuggestions(String word, int threshold) {

		HashSet nearmisscodes = new HashSet();

		// add all words that have the same codeword
		nearmisscodes.add(getCode(word));

		// do some transformations to pick up more results
		addInterchangeCodes(word, nearmisscodes);
		addChangeCodes(word, nearmisscodes);
		addInsertCodes(word, nearmisscodes);
		addDeleteCodes(word, nearmisscodes);

		LinkedList wordlist = getWordsFromCode(word, nearmisscodes);
		// We sort a linkedlist at the end instead of maintaining a
		// continuously sorted TreeSet because everytime you add a collection
		// to a treeset it has to be resorted. It's better to do this operation
		// once at the end.
		Collections.sort( wordlist, new Word());
		return wordlist;
	}

	/** Adds the code of every variant made by swapping two adjacent characters. */
	private void addInterchangeCodes(String word, Set codes) {
		char[] chars = word.toCharArray();
		for (int i = 0; i < chars.length - 1; i++) {
			char first = chars[i];
			char second = chars[i + 1];
			chars[i] = second;
			chars[i + 1] = first;
			codes.add(getCode(new String(chars)));
			// undo the swap before moving on
			chars[i] = first;
			chars[i + 1] = second;
		}
	}

	/** Adds the code of every variant made by replacing one character with a replacelist entry. */
	private void addChangeCodes(String word, Set codes) {
		char[] chars = word.toCharArray();
		for (int i = 0; i < chars.length; i++) {
			char original = chars[i];
			for (int j = 0; j < replacelist.length; j++) {
				chars[i] = replacelist[j];
				codes.add(getCode(new String(chars)));
			}
			chars[i] = original;
		}
	}

	/** Adds the code of every variant made by inserting one replacelist entry at any position. */
	private void addInsertCodes(String word, Set codes) {
		int length = word.length();
		char[] chars = new char[length + 1];
		word.getChars(0, length, chars, 0);
		// the free slot starts behind the last character and moves to the front
		for (int gap = length; gap >= 0; gap--) {
			for (int j = 0; j < replacelist.length; j++) {
				chars[gap] = replacelist[j];
				codes.add(getCode(new String(chars)));
			}
			if (gap > 0)
				chars[gap] = chars[gap - 1];
		}
	}

	/** Adds the code of every variant made by deleting one character. */
	private void addDeleteCodes(String word, Set codes) {
		int length = word.length();
		// nothing would be left to encode after deleting from a word this short
		if (length < 2)
			return;
		char[] shorter = new char[length - 1];
		// start with the last character removed ...
		word.getChars(0, length - 1, shorter, 0);
		for (int removed = length - 1; removed >= 0; removed--) {
			codes.add(getCode(new String(shorter)));
			// ... then put that character back and drop the one before it
			if (removed > 0)
				shorter[removed - 1] = word.charAt(removed);
		}
	}

	/**
	 * Collects, as Word objects, all dictionary words stored under the given
	 * codes whose edit distance to <code>word</code> is strictly below the
	 * configured <code>Configuration.SPELL_THRESHOLD</code>.
	 */
	private LinkedList getWordsFromCode(String word, Collection codes) {
		Configuration config = Configuration.getConfiguration();
		// read the cut-off once rather than for every candidate word
		int maxDistance = config.getInteger(Configuration.SPELL_THRESHOLD);
		LinkedList result = new LinkedList();
		for (Iterator i = codes.iterator(); i.hasNext();) {
			String code = (String) i.next();
			LinkedList simwordlist = getWords(code);
			for (Iterator j = simwordlist.iterator(); j.hasNext();) {
				String similar = (String) j.next();
				int distance = EditDistance.getDistance(word, similar);
				if (distance < maxDistance) {
					result.add(new Word(similar, distance));
				}
			}
		}
		return result;
	}


	/** Added to free up the class memory and resources,
	  * which otherwise trash the system quickly (code by Steve Birmingham).
	  * <p>
	  * Drops the references to the word map, the transformator and the
	  * dictionary file. The instance must not be used afterwards: lookups
	  * would dereference the cleared fields.
	  * </p>
	  */
	public void dispose()
	{
		mainDictionary   = null;
		tf               = null;
		dictFile         = null;
	}

}