package spelling.norvig;
import nlp.com.knowledgebooks.nlp.util.Tokenizer;
import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.util.*;
/**
* A spelling-correction suggestion utility based on Peter Norvig's
* Python spelling corrector: http://norvig.com/spell-correct.html
*/
public class SpellingSuggestions {
private SpellingSuggestions() { } // disable default constructor
private static List<String> edits(String word) {
int wordL = word.length(), wordLm1 = wordL - 1;
List<String> possible = new ArrayList<String>();
// drop a character:
for (int i=0; i < wordL; ++i) {
possible.add(word.substring(0, i) + word.substring(i+1));
}
// reverse order of 2 characters:
for (int i=0; i < wordLm1; ++i) {
possible.add(word.substring(0, i) + word.substring(i+1, i+2) +
word.substring(i, i+1) + word.substring(i+2));
}
// replace a character in each location in the word:
for (int i=0; i < wordL; ++i) {
for (char ch='a'; ch <= 'z'; ++ch) {
possible.add(word.substring(0, i) + ch +
word.substring(i+1));
}
}
// add in a character in each location in the word:
for (int i=0; i <= wordL; ++i) {
for (char ch='a'; ch <= 'z'; ++ch) {
possible.add(word.substring(0, i) + ch +
word.substring(i));
}
}
return possible;
}
public static String correct(String word) {
if(wordCounts.containsKey(word)) return word;
List<String> list = edits(word);
// candidate hash has as word counts as keys, word as value:
HashMap<Integer, String> candidates = new HashMap<Integer, String>();
for (String testWord : list) {
if(wordCounts.containsKey(testWord)) {
candidates.put(wordCounts.get(testWord), testWord);
}
}
/**
* If candidates is not empty, then return the word with
* the largest key (word count) value:
*/
if(candidates.size() > 0) {
return candidates.get(Collections.max(candidates.keySet()));
}
/**
* If the edits method does not provide a candidate word that matches
* then we will call edits again with each previous permutation words.
* Note: this case occurs only about 20% of the time and obviously
* increases the runtime of method correct.
*/
candidates.clear();
for (String editWords : list) {
for (String wrd : edits(editWords)) {
if(wordCounts.containsKey(wrd)) {
candidates.put(wordCounts.get(wrd),wrd);
}
}
}
//System.out.println(candidates);
if (candidates.size() > 0) {
return candidates.get(Collections.max(candidates.keySet()));
}
return word;
}
/**
* main test method
*/
public static void main(String[] args) {
System.out.println(SpellingSuggestions.correct("baank"));
System.out.println(SpellingSuggestions.correct("hown"));
System.out.println(SpellingSuggestions.correct("watr"));
System.out.println(SpellingSuggestions.correct("thee"));
System.out.println(SpellingSuggestions.correct("thhe"));
System.out.println(SpellingSuggestions.correct("smth"));
System.out.println(SpellingSuggestions.correct("joonees"));
System.out.println(SpellingSuggestions.correct("waateer"));
}
private static Map<String, Integer> wordCounts = new HashMap<String, Integer>();
static {
// Use Peter Norvig's training file:
// http://www.norvig.com/spell-correct.html
try {
FileInputStream fstream = new FileInputStream("/tmp/big.txt");
DataInputStream in = new DataInputStream(fstream);
BufferedReader br = new BufferedReader(new InputStreamReader(in));
String line;
while ((line = br.readLine()) != null) {
List<String> words = Tokenizer.wordsToList(line);
for (String word : words) {
if (wordCounts.containsKey(word)) {
Integer count = wordCounts.get(word);
wordCounts.put(word, count + 1);
} else {
wordCounts.put(word, 1);
}
}
}
in.close();
} catch (Exception e) {
System.err.println("Error: " + e.getMessage());
}
}
}