/*
* Word-list backed spelling dictionary, indexed by the phonetic code of each word.
* formatted with JxBeauty (c) johann.langhofer@nextra.at
*/
package com.swabunga.spell.engine;
import java.io.*;
import java.util.*;
/**
 * The SpellDictionary class holds the instance of the dictionary.
 * <p>
 * Words are grouped by the phonetic code produced by a {@link Transformator};
 * each code maps to the list of words sharing it.
 * </p>
 * <p>
 * Thread safety: the dictionary is stored in an unsynchronized
 * <code>HashMap</code> and {@link #addWord(String)} mutates it without any
 * locking. Callers that add words while other threads are reading must
 * synchronize externally. Derived classes should keep this in mind.
 * </p>
 * <p>
 * There are many open source dictionary files. For just a few see:
 * http://wordlist.sourceforge.net/
 * </p>
 * <p>
 * This dictionary class reads words one per line. Make sure that your word list
 * is formatted in this way (most are).
 * </p>
 */
public class SpellDictionary
{
  /**
   * Characters substituted / inserted at each position of a misspelt word by
   * {@link #getSuggestions(String, int)} to generate near-miss codes.
   */
  private static final char[] replacelist =
  {
    'A',
    'B',
    'X',
    'S',
    'K',
    'J',
    'T',
    'F',
    'H',
    'L',
    'M',
    'N',
    'P',
    'R',
    '0' };

  /**
   * Initial capacity (16 * 1024 entries) of the main dictionary hash map.
   * Interested to see what the performance of a smaller initial capacity
   * is like.
   */
  private final static int INITIAL_CAPACITY = 16 * 1024;

  /**
   * The hashmap that contains the word dictionary. The map is keyed on the
   * phonetic code returned by {@link #getCode(String)}. Each map entry is a
   * LinkedList of the words (Strings) that share that code.
   */
  protected HashMap mainDictionary = new HashMap(INITIAL_CAPACITY);

  /**
   * The reference to a Transformator, used to transform a word into its
   * phonetic code.
   */
  private Transformator tf = null;

  /** Holds the dictionary file for appending; null when there is none. */
  private File dictFile = null;

  /**
   * Dictionary Constructor.
   * <p>
   * The reader is consumed until end of input but is NOT closed; it remains
   * owned by the caller.
   * </p>
   *
   * @param wordList reader supplying one word per line
   * @throws IOException if reading the word list fails
   */
  public SpellDictionary(Reader wordList) throws IOException {
    tf = new DoubleMeta();
    createDictionary(new BufferedReader(wordList));
  }

  /**
   * Dictionary Constructor for JAR files.
   * <p>
   * The word list is looked up as the class resource
   * <code>"dictionary/" + wordListResource</code>, relative to this object's
   * runtime class. The resource stream is closed before returning.
   * </p>
   *
   * @param wordListResource name of the resource below <code>dictionary/</code>
   * @throws FileNotFoundException if the resource cannot be found
   * @throws IOException if reading the word list fails
   * @author Howard Kistler
   */
  public SpellDictionary(String wordListResource) throws IOException
  {
    tf = new DoubleMeta();
    String resource = "dictionary/" + wordListResource;
    InputStream is = this.getClass().getResourceAsStream(resource);
    if (is == null) {
      // getResourceAsStream signals "not found" with null; report it as an
      // I/O problem instead of failing later with a NullPointerException.
      throw new FileNotFoundException("Dictionary resource not found: " + resource);
    }
    try {
      createDictionary(new BufferedReader(new InputStreamReader(is)));
    } finally {
      is.close();
    }
  }

  /**
   * Dictionary Convenience Constructor.
   * <p>
   * The file is remembered so that {@link #addWord(String)} can append to it.
   * </p>
   *
   * @param wordList file containing one word per line
   * @throws FileNotFoundException if the file cannot be opened
   * @throws IOException if reading the word list fails
   */
  public SpellDictionary(File wordList)
    throws FileNotFoundException, IOException {
    tf = new DoubleMeta();
    loadFromFile(wordList);
    dictFile = wordList;
  }

  /**
   * Dictionary constructor that uses an aspell phonetic file to
   * build the transformation table.
   *
   * @param wordList file containing one word per line
   * @param phonetic phonetic definition file handed to GenericTransformator
   * @throws FileNotFoundException if the word list cannot be opened
   * @throws IOException if reading fails
   */
  public SpellDictionary(File wordList, File phonetic)
    throws FileNotFoundException, IOException {
    tf = new GenericTransformator(phonetic);
    dictFile = wordList;
    loadFromFile(wordList);
  }

  /**
   * Reads every word of the given file into the dictionary and always closes
   * the file afterwards.
   */
  private void loadFromFile(File wordList) throws IOException {
    BufferedReader in = new BufferedReader(new FileReader(wordList));
    try {
      createDictionary(in);
    } finally {
      in.close();
    }
  }

  /**
   * Add a word permanently to the dictionary (and the dictionary file).
   * <p>
   * The word is always added to the in-memory dictionary. If this dictionary
   * was built from a file, the word is also appended to that file followed by
   * a newline; a failure to write is reported on standard output and
   * otherwise ignored (best effort).
   * </p>
   * <p>This needs to be made thread safe (synchronized)</p>
   *
   * @param word the word to add
   */
  public void addWord(String word) {
    putWord(word);
    if (dictFile == null) {
      return;
    }
    try {
      // Open with append.
      FileWriter w = new FileWriter(dictFile.toString(), true);
      try {
        w.write(word);
        w.write("\n");
      } finally {
        // Close even when a write fails so the file handle is not leaked.
        w.close();
      }
    } catch (IOException ex) {
      System.out.println("Error writing to dictionary file");
    }
  }

  /**
   * Constructs the dictionary from a word list.
   * <p>
   * Each word in the reader should be on a separate line. The reader is read
   * to the end but not closed here.
   * <p>
   * This is a very slow function. On my machine it takes quite a while to
   * load the data in. I suspect that we could speed this up quite a lot.
   *
   * @param in source of the word list
   * @throws IOException if reading fails
   */
  protected void createDictionary(BufferedReader in) throws IOException {
    String line;
    while ((line = in.readLine()) != null) {
      // NOTE(review): the explicit copy is kept from the original code;
      // presumably it was meant to keep each stored word compact - confirm
      // whether it is still needed on current JVMs.
      putWord(new String(line.toCharArray()));
    }
  }

  /**
   * Returns the code representing the word.
   *
   * @param word the word to transform
   * @return the code produced by the configured Transformator
   */
  public String getCode(String word) {
    return tf.transform(word);
  }

  /**
   * Allocates a word in the dictionary, under the code returned by
   * {@link #getCode(String)}. Duplicates are not filtered.
   *
   * @param word the word to store
   */
  protected void putWord(String word) {
    String code = getCode(word);
    LinkedList list = (LinkedList) mainDictionary.get(code);
    if (list == null) {
      list = new LinkedList();
      mainDictionary.put(code, list);
    }
    list.add(word);
  }

  /**
   * Returns a list of strings (words) for the code.
   * <p>
   * When the code is known, the returned list is the dictionary's own list
   * (not a copy); otherwise a new empty list is returned.
   * </p>
   *
   * @param code the phonetic code to look up
   * @return the words stored under the code, never null
   */
  public LinkedList getWords(String code) {
    //Check the main dictionary.
    LinkedList mainDictResult = (LinkedList) mainDictionary.get(code);
    if (mainDictResult == null) {
      return new LinkedList();
    }
    return mainDictResult;
  }

  /**
   * Returns true if the word is correctly spelled against the current word list.
   * <p>
   * The word is looked up as given and, failing that, in lower case, so that
   * capitalised words are not always reported as incorrect.
   * </p>
   *
   * @param word the word to check
   * @return true if the word (or its lower-case form) is in the dictionary
   */
  public boolean isCorrect(String word) {
    LinkedList possible = getWords(getCode(word));
    //JMH should we always try the lowercase version. If I dont then capitalised
    //words are always returned as incorrect.
    return possible.contains(word) || possible.contains(word.toLowerCase());
  }

  /**
   * Returns a linked list of Word objects that are the suggestions to an
   * incorrect word.
   * <p>
   * Near-miss spellings are generated by swapping adjacent characters,
   * replacing a character, inserting a character and deleting a character.
   * The codes of all these variants are collected, and every dictionary word
   * stored under one of those codes is considered as a candidate.
   * </p>
   *
   * @param word Suggestions for given misspelt word
   * @param threshold The lower boundary of similarity to misspelt word.
   *        NOTE: this value is currently not consulted; the cut-off actually
   *        applied is read from <code>Configuration.SPELL_THRESHOLD</code>.
   * @return LinkedList a List of suggestions, sorted with a Word comparator
   */
  public LinkedList getSuggestions(String word, int threshold) {
    HashSet nearmisscodes = new HashSet();
    // add all words that have the same codeword
    nearmisscodes.add(getCode(word));
    // do some transformations to pick up more results
    addInterchangeCodes(word, nearmisscodes);
    addChangeCodes(word, nearmisscodes);
    addInsertCodes(word, nearmisscodes);
    // Deletions and the final distance check work on the trimmed word.
    String trimmed = word.trim();
    addDeleteCodes(trimmed, nearmisscodes);
    LinkedList wordlist = getWordsFromCode(trimmed, nearmisscodes);
    // We sort a linkedlist at the end instead of maintaining a
    // continuously sorted TreeSet because every time you add a collection
    // to a treeset it has to be resorted. It's better to do this operation
    // once at the end.
    Collections.sort(wordlist, new Word());
    return wordlist;
  }

  /** Adds the code of every variant made by swapping two adjacent characters. */
  private void addInterchangeCodes(String word, Set codes) {
    char[] chars = word.toCharArray();
    for (int i = 0; i < chars.length - 1; i++) {
      char first = chars[i];
      char second = chars[i + 1];
      chars[i] = second;
      chars[i + 1] = first;
      codes.add(getCode(new String(chars)));
      // restore the original order before moving on
      chars[i] = first;
      chars[i + 1] = second;
    }
  }

  /** Adds the code of every variant made by replacing one character. */
  private void addChangeCodes(String word, Set codes) {
    char[] chars = word.toCharArray();
    for (int i = 0; i < chars.length; i++) {
      char original = chars[i];
      for (int j = 0; j < replacelist.length; j++) {
        chars[i] = replacelist[j];
        codes.add(getCode(new String(chars)));
      }
      chars[i] = original;
    }
  }

  /** Adds the code of every variant made by inserting one character. */
  private void addInsertCodes(String word, Set codes) {
    // One extra slot; it starts at the end and is walked towards the front.
    char[] chars = (word + " ").toCharArray();
    for (int slot = chars.length - 1; slot >= 0; slot--) {
      for (int j = 0; j < replacelist.length; j++) {
        chars[slot] = replacelist[j];
        codes.add(getCode(new String(chars)));
      }
      if (slot > 0) {
        // shift the preceding character right so the slot moves one left
        chars[slot] = chars[slot - 1];
      }
    }
  }

  /**
   * Adds the code of every variant made by deleting one character. Words
   * shorter than two characters are skipped, since deleting from them would
   * leave nothing to look up.
   */
  private void addDeleteCodes(String word, Set codes) {
    char[] chars = word.toCharArray();
    if (chars.length < 2) {
      return;
    }
    // Start with the last character removed, then move the gap leftwards.
    char[] shorter = new char[chars.length - 1];
    System.arraycopy(chars, 0, shorter, 0, shorter.length);
    char held = chars[chars.length - 1];
    int gap = shorter.length;
    while (true) {
      codes.add(getCode(new String(shorter)));
      if (gap == 0) {
        break;
      }
      // put the held character back and take out the one before it
      char next = shorter[gap - 1];
      shorter[gap - 1] = held;
      held = next;
      --gap;
    }
  }

  /**
   * Collects, for all the given codes, the dictionary words whose edit
   * distance to <code>word</code> is below the configured
   * <code>Configuration.SPELL_THRESHOLD</code>.
   *
   * @param word the word the candidates are compared against
   * @param codes phonetic codes (Strings) to look up
   * @return a list of Word objects pairing each accepted candidate with its
   *         distance
   */
  private LinkedList getWordsFromCode(String word, Collection codes) {
    Configuration config = Configuration.getConfiguration();
    // Read the cut-off once instead of once per candidate word.
    int maxDistance = config.getInteger(Configuration.SPELL_THRESHOLD);
    LinkedList result = new LinkedList();
    for (Iterator i = codes.iterator(); i.hasNext();) {
      String code = (String) i.next();
      LinkedList simwordlist = getWords(code);
      for (Iterator j = simwordlist.iterator(); j.hasNext();) {
        String similar = (String) j.next();
        int distance = EditDistance.getDistance(word, similar);
        if (distance < maxDistance) {
          result.add(new Word(similar, distance));
        }
      }
    }
    return result;
  }

  /**
   * Added to free up the class memory and resources,
   * which otherwise trash the system quickly (code by Steve Birmingham).
   * <p>
   * Drops the references to the dictionary map, the transformator and the
   * dictionary file. The instance must not be used afterwards.
   * </p>
   */
  public void dispose()
  {
    mainDictionary = null;
    tf = null;
    dictFile = null;
  }
}