package spelling.norvig;
import java.io.File;
import java.io.FileNotFoundException;
import java.util.*;
import java.util.regex.Pattern;
/**
* A spelling-correction suggestion utility based on Peter Norvig's
* Python spelling program: http://norvig.com/spell-correct.html
*
* Note: this version uses word pairs (greatly increasing memory and processing requirements)
*
*/
public class SpellingSuggestionsWordPairs {

    /** Corpus file read by the no-argument constructor. */
    private static final String DEFAULT_CORPUS_PATH = "/tmp/small.txt";

    /** Token delimiter: one or more whitespace or common punctuation characters. */
    private static final Pattern TOKEN_DELIMITER = Pattern.compile("[,.()'\";:\\s]+");

    /** Number of occurrences of each token in the corpus. */
    private final Map<String, Integer> wordCounts = new HashMap<String, Integer>();

    /** Number of occurrences of each adjacent token pair, keyed as "first second". */
    private final Map<String, Integer> wordPairCounts = new HashMap<String, Integer>();

    /**
     * Builds the word and word-pair statistics from the default corpus
     * file ({@code /tmp/small.txt}).
     *
     * @throws FileNotFoundException if the default corpus file cannot be opened
     */
    public SpellingSuggestionsWordPairs() throws FileNotFoundException {
        this(new File(DEFAULT_CORPUS_PATH));
    }

    /**
     * Builds the word and word-pair statistics from the given corpus file,
     * then prints every word pair that occurs more than once.
     *
     * @param corpus text file to read; it is split into tokens on runs of
     *               whitespace and the punctuation characters {@code , . ( ) ' " ; :}
     * @throws FileNotFoundException if {@code corpus} cannot be opened
     */
    public SpellingSuggestionsWordPairs(File corpus) throws FileNotFoundException {
        Scanner scanner = new Scanner(corpus);
        try {
            scanner.useDelimiter(TOKEN_DELIMITER);
            // null means "no previous token yet": the first word of the corpus
            // must not contribute a pair.
            String previous = null;
            while (scanner.hasNext()) {
                String word = scanner.next();
                increment(wordCounts, word);
                if (previous != null) {
                    increment(wordPairCounts, previous + " " + word);
                }
                previous = word;
            }
        } finally {
            // Release the file handle even if reading fails part-way through.
            scanner.close();
        }
        printRepeatedPairs();
    }

    /** Adds one to the count stored under {@code key}, starting from zero if absent. */
    private static void increment(Map<String, Integer> counts, String key) {
        Integer current = counts.get(key);
        counts.put(key, current == null ? 1 : current + 1);
    }

    /** Returns the count stored under {@code key}, or zero if the key is absent. */
    private static int countOf(Map<String, Integer> counts, String key) {
        Integer current = counts.get(key);
        return current == null ? 0 : current;
    }

    /** Prints each word pair seen more than once, as "pair: count". */
    private void printRepeatedPairs() {
        for (Map.Entry<String, Integer> entry : wordPairCounts.entrySet()) {
            if (entry.getValue() > 1) {
                System.out.println(entry.getKey() + ": " + entry.getValue());
            }
        }
    }

    /**
     * Generates every string that is one edit away from {@code word}:
     * single-character deletions, swaps of two adjacent characters,
     * replacement of one character by each of 'a'..'z', and insertion of
     * each of 'a'..'z' at every position. The result may contain duplicates
     * and may contain {@code word} itself.
     *
     * @param word the word to edit
     * @return the list of edited strings, in generation order
     */
    private List<String> edits(String word) {
        int wordL = word.length(), wordLm1 = wordL - 1;
        List<String> possible = new ArrayList<String>();
        // drop a character:
        for (int i = 0; i < wordL; ++i) {
            possible.add(word.substring(0, i) + word.substring(i + 1));
        }
        // reverse order of 2 adjacent characters:
        for (int i = 0; i < wordLm1; ++i) {
            possible.add(word.substring(0, i) + word.substring(i + 1, i + 2) +
                    word.substring(i, i + 1) + word.substring(i + 2));
        }
        // replace a character in each location in the word:
        for (int i = 0; i < wordL; ++i) {
            for (char ch = 'a'; ch <= 'z'; ++ch) {
                possible.add(word.substring(0, i) + ch + word.substring(i + 1));
            }
        }
        // add in a character in each location in the word (including the end):
        for (int i = 0; i <= wordL; ++i) {
            for (char ch = 'a'; ch <= 'z'; ++ch) {
                possible.add(word.substring(0, i) + ch + word.substring(i));
            }
        }
        return possible;
    }

    /**
     * Suggests a correction for {@code word} given the word next to it.
     * A word already present in the corpus is returned unchanged. Otherwise
     * each one-edit variant is scored as its corpus count plus the counts of
     * the pairs "previous_word variant" and "variant previous_word", and the
     * highest-scoring variant is returned.
     *
     * @param word          the possibly misspelled word
     * @param previous_word the neighbouring word used for pair statistics
     * @return the best-scoring variant, or {@code word} itself when it is
     *         known or no variant has a positive score
     */
    public String correct(String word, String previous_word) {
        if (wordCounts.containsKey(word)) {
            return word;
        }
        String best = null;
        int bestScore = 0;
        for (String testWord : edits(word)) {
            String forwardPair = previous_word + " " + testWord;
            String reversedPair = testWord + " " + previous_word;
            int singleCount = countOf(wordCounts, testWord);
            int pairCount = countOf(wordPairCounts, forwardPair)
                    + countOf(wordPairCounts, reversedPair);
            int score = singleCount + pairCount;
            // ">=" lets a later candidate win a tie, matching the earlier
            // score-keyed map where a later put replaced an equal-score entry.
            if (score > 0 && score >= bestScore) {
                best = testWord;
                bestScore = score;
            }
            // Per-candidate trace, kept so console output is unchanged.
            System.out.println(reversedPair + " : " + singleCount + ", " + pairCount + " " + testWord);
        }
        return best != null ? best : word;
    }

    /**
     * Returns how many times {@code test_word} directly followed
     * {@code previous_word} in the corpus.
     *
     * @param test_word     the second word of the pair
     * @param previous_word the first word of the pair
     * @return the pair count, or 0 if the pair was never seen
     */
    public int usePairStatistics(String test_word, String previous_word) {
        return countOf(wordPairCounts, previous_word + " " + test_word);
    }

    /**
     * main test method
     * @throws FileNotFoundException if the default corpus file cannot be opened
     */
    public static void main(String[] args) throws FileNotFoundException {
        SpellingSuggestionsWordPairs test = new SpellingSuggestionsWordPairs();
        System.out.println(test.edits("Doyyle"));
        System.out.println(test.edits("Doyyle").size());
        System.out.println(test.correct("Doyyle", "Conan"));
    }
}