/*
 * File:                SimpleStatisticalSpellingCorrector.java
 * Authors:             Justin Basilico
 * Company:             Sandia National Laboratories
 * Project:             Cognitive Foundry
 *
 * Copyright January 14, 2010, Sandia Corporation.
 * Under the terms of Contract DE-AC04-94AL85000, there is a non-exclusive
 * license for use of this work by or on behalf of the U.S. Government. Export
 * of this program may require a license from the United States Government.
 * See CopyrightHistory.txt for complete details.
 *
 */

package gov.sandia.cognition.text.spelling;

import gov.sandia.cognition.annotation.PublicationReference;
import gov.sandia.cognition.annotation.PublicationType;
import gov.sandia.cognition.evaluator.Evaluator;
import gov.sandia.cognition.learning.algorithm.AbstractBatchAndIncrementalLearner;
import gov.sandia.cognition.statistics.distribution.DefaultDataDistribution;
import gov.sandia.cognition.util.AbstractCloneableSerializable;
import java.util.Collection;
import java.util.HashSet;
import java.util.Set;

/**
 * A simple statistical spelling corrector based on word counts that looks at
 * possible one and two-character edits.
 * <p>
 * Words are stored and looked up in lower case: both {@code add} methods
 * lower-case the word before counting it, and {@link #evaluate(String)}
 * lower-cases its input before searching for a correction.
 *
 * @author  Justin Basilico
 * @since   3.0
 */
@PublicationReference(
    author="Peter Norvig",
    title="How to Write a Spelling Corrector",
    year=2009,
    type=PublicationType.WebPage,
    url="http://norvig.com/spell-correct.html"
)
public class SimpleStatisticalSpellingCorrector
    extends AbstractCloneableSerializable
    implements Evaluator<String, String>
{

    /**
     * Creates the default alphabet, which are the lower-case English letters.
     *
     * @return
     *      The default alphabet. A new array is created on every call.
     */
    public static char[] createDefaultAlphabet()
    {
        return "abcdefghijklmnopqrstuvwxyz".toCharArray();
    }

    /** Maps known words to the number of times they've been seen. */
    protected DefaultDataDistribution<String> wordCounts;

    /** The alphabet of lower case characters. */
    protected char[] alphabet;

    /**
     * Creates a new, default {@code SimpleStatisticalSpellingCorrector} with
     * a default alphabet.
     */
    public SimpleStatisticalSpellingCorrector()
    {
        this(createDefaultAlphabet());
    }

    /**
     * Creates a new {@code SimpleStatisticalSpellingCorrector} with a given
     * alphabet and an initially empty set of word counts.
     *
     * @param   alphabet
     *      The alphabet to use.
     */
    public SimpleStatisticalSpellingCorrector(
        final char[] alphabet)
    {
        this(new DefaultDataDistribution<String>(), alphabet);
    }

    /**
     * Creates a new {@code SimpleStatisticalSpellingCorrector}.
     *
     * @param   wordCounts
     *      The initial word counts. The object is stored as given, not copied.
     * @param   alphabet
     *      The alphabet to use. The array is stored as given, not copied.
     */
    public SimpleStatisticalSpellingCorrector(
        final DefaultDataDistribution<String> wordCounts,
        final char[] alphabet)
    {
        super();

        this.setWordCounts(wordCounts);
        this.setAlphabet(alphabet);
    }

    /**
     * Adds a word to the dictionary of counts for the spelling corrector.
     * The word is converted to lower case before it is counted.
     *
     * @param   word
     *      The word to add an occurrence of. Must not be null.
     */
    public void add(
        final String word)
    {
        this.wordCounts.increment(word.toLowerCase());
    }

    /**
     * Adds a given number of counts for a word to the dictionary of counts for
     * the spelling corrector. The word is converted to lower case before it
     * is counted, exactly as in {@link #add(String)}.
     *
     * @param   word
     *      The word to add. Must not be null.
     * @param   count
     *      The count of occurrences.
     */
    public void add(
        final String word,
        final int count)
    {
        // Normalize the case here as well: evaluate() lower-cases its input
        // before looking it up, so a count stored under a mixed-case key
        // would never be found.
        this.wordCounts.increment(word.toLowerCase(), count);
    }

    /**
     * Attempts to correct the spelling of the given word. The input is
     * lower-cased first. If that form is empty or has a positive count in the
     * dictionary it is returned as-is. Otherwise the one-character edit with
     * the highest positive count is returned; failing that, the known
     * two-character edit with the highest count; failing that, the
     * lower-cased input itself.
     *
     * @param   word
     *      The word to correct. May be null.
     * @return
     *      The lower-case correction (or the lower-cased input if no
     *      correction was found), or null if the given word was null.
     */
    @Override
    public String evaluate(
        final String word)
    {
        if (word == null)
        {
            // Bad word.
            return null;
        }

        final String input = word.toLowerCase();
        if (input.isEmpty() || this.wordCounts.get(input) > 0)
        {
            // This is a known word (or empty), so nothing to correct.
            return input;
        }

        // Compute the one-character edits.
        final HashSet<String> oneCharacterEdits = new HashSet<String>();
        this.possibleOneCharacterEdits(input, oneCharacterEdits);

        // Find the best one-character edit. The null default lets us detect
        // that none of them is a known word.
        final String bestOneCharacterEdit =
            this.findBest(oneCharacterEdits, null);
        if (bestOneCharacterEdit != null)
        {
            // There was a good one-character edit, so return it.
            return bestOneCharacterEdit;
        }
        // else - No known one-character edits.

        // Now compute all the known edits of the one-character edits.
        final Set<String> twoCharacterEdits =
            this.knownTwoCharacterEdits(oneCharacterEdits);

        // Find the best known two-character edit, falling back to the
        // (lower-cased) input word when there is none.
        return this.findBest(twoCharacterEdits, input);
    }

    /**
     * Finds the best word from a given list of words by finding the one with
     * the highest count in the dictionary. If no words are in the dictionary,
     * the given default best word is returned.
     *
     * @param   words
     *      The list of words.
     * @param   defaultBestWord
     *      The default word to return if none are in the dictionary.
     * @return
     *      The word with the highest count. When several words share the
     *      highest count, the first one encountered in iteration order wins.
     */
    public String findBest(
        final Iterable<String> words,
        final String defaultBestWord)
    {
        String bestWord = defaultBestWord;
        double bestCount = 0;

        // Go through the words.
        for (String word : words)
        {
            // Get the count.
            final double count = this.wordCounts.get(word);

            // Strictly greater: words with a zero count never replace the
            // default, and ties keep the earlier word.
            if (count > bestCount)
            {
                // Best found so far.
                bestWord = word;
                bestCount = count;
            }
        }

        // Return the best word found.
        return bestWord;
    }

    /**
     * Lists all possible one-character edits for a given word by looking at
     * character deletes, transposes, replaces, and inserts.
     *
     * @param   word
     *      The word to get the edits for.
     * @param   result
     *      The collection to write the edits into.
     */
    protected void possibleOneCharacterEdits(
        final String word,
        final Collection<String> result)
    {
        // Cache information about the word we will reuse. For every split
        // position i in [0, wordLength], prefixes[i] holds the first i
        // characters and suffixes[i] holds the remainder.
        final int wordLength = word.length();
        final char[] characters = word.toCharArray();
        final String[] prefixes = new String[wordLength + 1];
        final String[] suffixes = new String[wordLength + 1];
        for (int i = 0; i < wordLength; i++)
        {
            prefixes[i] = word.substring(0, i);
            suffixes[i] = word.substring(i);
        }
        prefixes[wordLength] = word;
        suffixes[wordLength] = "";

        // Deletes: drop the character at position i.
        for (int i = 0; i < wordLength; i++)
        {
            result.add(prefixes[i] + suffixes[i + 1]);
        }

        // Transposes: swap the characters at positions i and i + 1.
        for (int i = 0; i < wordLength - 1; i++)
        {
            result.add(prefixes[i] + characters[i + 1] + characters[i]
                + suffixes[i + 2]);
        }

        // Replaces: substitute each alphabet character at position i.
        for (int i = 0; i < wordLength; i++)
        {
            for (char c : this.alphabet)
            {
                result.add(prefixes[i] + c + suffixes[i + 1]);
            }
        }

        // Inserts: put each alphabet character before position i, including
        // the position just past the end of the word.
        for (int i = 0; i <= wordLength; i++)
        {
            for (char c : this.alphabet)
            {
                result.add(prefixes[i] + c + suffixes[i]);
            }
        }
    }

    /**
     * Creates the set of known two character edits for a given list of one
     * character edits.
     *
     * @param   oneCharacterEdits
     *      The list of one character edits.
     * @return
     *      The set of known two-character edits, which are the two-character
     *      edits that are in the dictionary.
     */
    protected Set<String> knownTwoCharacterEdits(
        final Iterable<String> oneCharacterEdits)
    {
        // Create a hash set for the unique results.
        final HashSet<String> result = new HashSet<String>();

        // Create a hash set to locally keep track of the possible edits. It
        // is reused across iterations rather than reallocated.
        final HashSet<String> possible = new HashSet<String>();
        for (String word : oneCharacterEdits)
        {
            // Clear out the set of possible edits.
            possible.clear();

            // Get the possible edits for the word.
            this.possibleOneCharacterEdits(word, possible);

            // Go through the possible edits and keep only the known words.
            for (String editedWord : possible)
            {
                if (this.wordCounts.get(editedWord) > 0)
                {
                    // This word is known.
                    result.add(editedWord);
                }
            }
        }

        // Return the result.
        return result;
    }

    /**
     * Gets the dictionary of word counts.
     *
     * @return
     *      The word counts.
     */
    public DefaultDataDistribution<String> getWordCounts()
    {
        return this.wordCounts;
    }

    /**
     * Sets the dictionary of word counts.
     *
     * @param   wordCounts
     *      The dictionary of word counts.
     */
    public void setWordCounts(
        final DefaultDataDistribution<String> wordCounts)
    {
        this.wordCounts = wordCounts;
    }

    /**
     * Gets the alphabet of lower-case characters that can be used for replaces
     * and inserts.
     *
     * @return
     *      The alphabet of lower-case characters.
     */
    public char[] getAlphabet()
    {
        return this.alphabet;
    }

    /**
     * Sets the alphabet of lower-case characters that can be used for replaces
     * and inserts.
     *
     * @param   alphabet
     *      The alphabet of lower-case characters.
     */
    public void setAlphabet(
        final char[] alphabet)
    {
        this.alphabet = alphabet;
    }

    /**
     * A learner for the {@code SimpleStatisticalSpellingCorrector}. Each
     * example it is given is a single word, which is added to the corrector
     * being learned.
     */
    public static class Learner
        extends AbstractBatchAndIncrementalLearner<String, SimpleStatisticalSpellingCorrector>
    {

        /** The alphabet of lower case characters. */
        protected char[] alphabet;

        /**
         * Creates a new simple statistical spelling corrector learner with the
         * default alphabet.
         */
        public Learner()
        {
            this(createDefaultAlphabet());
        }

        /**
         * Creates a new simple statistical spelling corrector learner with the
         * given alphabet.
         *
         * @param   alphabet
         *      The alphabet of lower-case characters to use.
         */
        public Learner(
            final char[] alphabet)
        {
            super();

            this.setAlphabet(alphabet);
        }

        /**
         * Creates an empty spelling corrector that uses this learner's
         * alphabet.
         *
         * @return
         *      A new spelling corrector.
         */
        @Override
        public SimpleStatisticalSpellingCorrector createInitialLearnedObject()
        {
            return new SimpleStatisticalSpellingCorrector(this.getAlphabet());
        }

        /**
         * Updates the given spelling corrector by adding one occurrence of
         * the given word to it.
         *
         * @param   target
         *      The spelling corrector to update.
         * @param   word
         *      The word to add.
         */
        @Override
        public void update(
            final SimpleStatisticalSpellingCorrector target,
            final String word)
        {
            // Add each word.
            target.add(word);
        }

        /**
         * Gets the alphabet of lower-case characters that can be used for
         * replaces and inserts.
         *
         * @return
         *      The alphabet of lower-case characters.
         */
        public char[] getAlphabet()
        {
            return this.alphabet;
        }

        /**
         * Sets the alphabet of lower-case characters that can be used for
         * replaces and inserts.
         *
         * @param   alphabet
         *      The alphabet of lower-case characters.
         */
        public void setAlphabet(
            final char[] alphabet)
        {
            this.alphabet = alphabet;
        }

    }

}