// SimpleStatisticalSpellingCorrector.java example

/*
 * File:                SimpleStatisticalSpellingCorrector.java
 * Authors:             Justin Basilico
 * Company:             Sandia National Laboratories
 * Project:             Cognitive Foundry
 * 
 * Copyright January 14, 2010, Sandia Corporation.
 * Under the terms of Contract DE-AC04-94AL85000, there is a non-exclusive 
 * license for use of this work by or on behalf of the U.S. Government. Export 
 * of this program may require a license from the United States Government. 
 * See CopyrightHistory.txt for complete details.
 * 
 */

package gov.sandia.cognition.text.spelling;

import gov.sandia.cognition.annotation.PublicationReference;
import gov.sandia.cognition.annotation.PublicationType;
import gov.sandia.cognition.evaluator.Evaluator;
import gov.sandia.cognition.learning.algorithm.AbstractBatchAndIncrementalLearner;
import gov.sandia.cognition.statistics.distribution.DefaultDataDistribution;
import gov.sandia.cognition.util.AbstractCloneableSerializable;
import java.util.Collection;
import java.util.HashSet;
import java.util.Set;

/**
 * A simple statistical spelling corrector based on word counts that looks at
 * possible one and two-character edits.
 *
 * Words are lower-cased (using {@link String#toLowerCase()}, which applies the
 * rules of the default locale) both when they are added to the dictionary and
 * when they are looked up, so the dictionary keys and the corrections that are
 * returned are always lower-case.
 * 
 * @author  Justin Basilico
 * @since   3.0
 */
@PublicationReference(
    author="Peter Norvig",
    title="How to Write a Spelling Corrector",
    year=2009,
    type=PublicationType.WebPage,
    url="http://norvig.com/spell-correct.html"
)
public class SimpleStatisticalSpellingCorrector
    extends AbstractCloneableSerializable
    implements Evaluator<String, String>
{
    /**
     * Creates the default alphabet, which are the lower-case English letters.
     * A new array is created on every call.
     *
     * @return
     *      The default alphabet.
     */
    public static char[] createDefaultAlphabet()
    {
        return "abcdefghijklmnopqrstuvwxyz".toCharArray();
    }

    /** Maps known words to the number of times they've been seen. */
    protected DefaultDataDistribution<String> wordCounts;

    /** The alphabet of lower case characters. */
    protected char[] alphabet;

    /**
     * Creates a new, default {@code SimpleStatisticalSpellingCorrector} with
     * a default alphabet.
     */
    public SimpleStatisticalSpellingCorrector()
    {
        this(createDefaultAlphabet());
    }

    /**
     * Creates a new {@code SimpleStatisticalSpellingCorrector} with a given
     * alphabet and an empty dictionary of word counts.
     *
     * @param   alphabet
     *      The alphabet to use.
     */
    public SimpleStatisticalSpellingCorrector(
        final char[] alphabet)
    {
        this(new DefaultDataDistribution<String>(), alphabet);
    }

    /**
     * Creates a new {@code SimpleStatisticalSpellingCorrector}.
     *
     * @param   wordCounts
     *      The initial word counts. The given object is stored as-is, not
     *      copied.
     * @param   alphabet
     *      The alphabet to use. The given array is stored as-is, not copied.
     */
    public SimpleStatisticalSpellingCorrector(
        final DefaultDataDistribution<String> wordCounts,
        final char[] alphabet)
    {
        super();

        this.setWordCounts(wordCounts);
        this.setAlphabet(alphabet);
    }

    /**
     * Adds a word to the dictionary of counts for the spelling corrector.
     * The word is lower-cased before it is counted.
     *
     * @param   word
     *      The word to add an occurrence of.
     */
    public void add(
        final String word)
    {
        this.wordCounts.increment(word.toLowerCase());
    }

    /**
     * Adds a given number of counts for a word to the dictionary of counts for
     * the spelling corrector. The word is lower-cased before it is counted, so
     * that it is stored under the same key that {@link #add(String)} uses and
     * that {@link #evaluate(String)} looks up.
     *
     * @param   word
     *      The word to add.
     * @param count
     *      The count of occurrences.
     */
    public void add(
        final String word,
        final int count)
    {
        // Lower-case here as well: every lookup in this class is done on a
        // lower-cased word, so a mixed-case key would never be found.
        this.wordCounts.increment(word.toLowerCase(), count);
    }

    /**
     * Attempts to correct the spelling of the given word.
     *
     * The word is lower-cased first. If it is empty or already in the
     * dictionary it is returned (lower-cased) unchanged. Otherwise the known
     * word with the highest count among the one-character edits is returned;
     * if there is none, the known word with the highest count among the
     * two-character edits is returned; if there is none of those either, the
     * lower-cased input is returned.
     *
     * @param   word
     *      The word to correct. May be null.
     * @return
     *      The lower-case correction for the word, the lower-cased word itself
     *      if no correction was found, or null if the given word was null.
     */
    @Override
    public String evaluate(
        final String word)
    {
        if (word == null)
        {
            // Bad word.
            return null;
        }

        final String input = word.toLowerCase();
        if (input.isEmpty() || this.wordCounts.get(input) > 0)
        {
            // Either there is nothing to correct or this is a known word.
            return input;
        }

        // Compute the one-character edits.
        final HashSet<String> oneCharacterEdits = new HashSet<String>();
        this.possibleOneCharacterEdits(input, oneCharacterEdits);

        // Find the best one-character edits.
        String result = this.findBest(oneCharacterEdits, null);
        if (result != null)
        {
            // There was a good one-character edit, so return it.
            return result;
        }
        // else - No known one-character edits.

        // Now compute all the possible edits from the one character edits.
        final Set<String> twoCharacterEdits =
            this.knownTwoCharacterEdits(oneCharacterEdits);

        // Find the best known two character edits with a default of using the
        // input word.
        result = this.findBest(twoCharacterEdits, input);

        // Return the result.
        return result;
    }

    /**
     * Finds the best word from a given list of words by finding the one with
     * the highest count in the dictionary. If no words are in the dictionary,
     * the given default best word is returned. If several words share the
     * highest count, the first of them in the iteration order of the given
     * list is returned.
     *
     * @param words
     *      The list of words.
     * @param defaultBestWord
     *      The default word to return if none are in the dictionary.
     * @return
     *      The word with the highest count.
     */
    public String findBest(
        final Iterable<String> words,
        final String defaultBestWord)
    {
        String bestWord = defaultBestWord;
        double bestCount = 0;

        // Go through the words.
        for (String word : words)
        {
            // Get the count.
            final double count = this.wordCounts.get(word);

            // Strictly greater: a word must have a positive count to replace
            // the default, and a later tie does not replace an earlier best.
            if (count > bestCount)
            {
                // Best found so far.
                bestWord = word;
                bestCount = count;
            }
        }

        // Return the best word found so far.
        return bestWord;
    }

    /**
     * Lists all possible one-character edits for a given word by looking at
     * character deletes, transposes, replaces, and inserts. Replaces and
     * inserts use the characters of the alphabet.
     *
     * @param   word
     *      The word to get the edits for.
     * @param result
     *      The collection to write the edits into.
     */
    protected void possibleOneCharacterEdits(
        final String word,
        final Collection<String> result)
    {
        // Cache information about the word we will reuse:
        // prefixes[i] is the first i characters of the word and suffixes[i]
        // is the word from index i onward, for i in [0, wordLength].
        final int wordLength = word.length();
        final char[] characters = word.toCharArray();
        final String[] prefixes = new String[wordLength + 1];
        final String[] suffixes = new String[wordLength + 1];
        for (int i = 0; i < wordLength; i++)
        {
            prefixes[i] = word.substring(0, i);
            suffixes[i] = word.substring(i);
        }
        prefixes[wordLength] = word;
        suffixes[wordLength] = "";

        // Deletes: drop the character at index i.
        for (int i = 0; i < wordLength; i++)
        {
            result.add(prefixes[i] + suffixes[i + 1]);
        }

        // Transposes: swap the characters at indices i and i + 1.
        for (int i = 0; i < wordLength - 1; i++)
        {
            result.add(prefixes[i]
                + characters[i + 1] + characters[i]
                + suffixes[i + 2]);
        }

        // Replaces: substitute the character at index i.
        for (int i = 0; i < wordLength; i++)
        {
            for (char c : this.alphabet)
            {
                result.add(prefixes[i] + c + suffixes[i + 1]);
            }
        }

        // Inserts: put a character before index i, including at the end.
        for (int i = 0; i <= wordLength; i++)
        {
            for (char c : this.alphabet)
            {
                result.add(prefixes[i] + c + suffixes[i]);
            }
        }
    }

    /**
     * Creates the set of known two character edits for a given list of one
     * character edits.
     *
     * @param   oneCharacterEdits
     *      The list of one character edits.
     * @return
     *      The set of known two-character edits, which are the two-character
     *      edits that are in the dictionary.
     */
    protected Set<String> knownTwoCharacterEdits(
        final Iterable<String> oneCharacterEdits)
    {
        // Create a hash set for the unique results.
        final HashSet<String> result = new HashSet<String>();

        // Create a hash set to locally keep track of the possible edits. It is
        // reused across iterations so only the known words are retained.
        final HashSet<String> possible = new HashSet<String>();
        for (String word : oneCharacterEdits)
        {
            // Clear out the set of possible edits.
            possible.clear();

            // Get the possible edits for the word.
            possibleOneCharacterEdits(word, possible);

            // Go through the possible edits.
            for (String editedWord : possible)
            {
                // See if this is a known word.
                if (this.wordCounts.get(editedWord) > 0)
                {
                    // This word is known.
                    result.add(editedWord);
                }
            }
        }

        // Return the result.
        return result;
    }

    /**
     * Gets the dictionary of word counts.
     *
     * @return
     *      The word counts.
     */
    public DefaultDataDistribution<String> getWordCounts()
    {
        return this.wordCounts;
    }

    /**
     * Sets the dictionary of words counts.
     *
     * @param   wordCounts
     *      The dictionary of word counts.
     */
    public void setWordCounts(
        final DefaultDataDistribution<String> wordCounts)
    {
        this.wordCounts = wordCounts;
    }

    /**
     * Gets the alphabet of lower-case characters that can be used for replaces
     * and inserts.
     *
     * @return
     *      The alphabet of lower-case characters.
     */
    public char[] getAlphabet()
    {
        return this.alphabet;
    }

    /**
     * Sets the alphabet of lower-case characters that can be used for replaces
     * and inserts.
     *
     * @param   alphabet
     *      The alphabet of lower-case characters.
     */
    public void setAlphabet(
        final char[] alphabet)
    {
        this.alphabet = alphabet;
    }

    /**
     * A learner for the {@code SimpleStatisticalSpellingCorrector}. Each
     * training example is a word, which is added to the word counts of the
     * corrector being learned.
     */
    public static class Learner
        extends AbstractBatchAndIncrementalLearner<String, SimpleStatisticalSpellingCorrector>
    {

        /** The alphabet of lower case characters. */
        protected char[] alphabet;

        /**
         * Creates a new simple statistical spelling corrector learner with the
         * default alphabet.
         */
        public Learner()
        {
            this(createDefaultAlphabet());
        }

        /**
         * Creates a new simple statistical spelling corrector learner with the
         * given alphabet.
         *
         * @param   alphabet
         *      The alphabet of lower-case characters to use.
         */
        public Learner(
            final char[] alphabet)
        {
            super();
            
            this.setAlphabet(alphabet);
        }
        
        /**
         * Creates a new spelling corrector that uses the alphabet of this
         * learner and has no word counts yet.
         *
         * @return
         *      A new, empty spelling corrector.
         */
        @Override
        public SimpleStatisticalSpellingCorrector createInitialLearnedObject()
        {
            return new SimpleStatisticalSpellingCorrector(this.getAlphabet());
        }

        /**
         * Updates the spelling corrector by adding one occurrence of the
         * given word to it.
         *
         * @param   target
         *      The spelling corrector to update.
         * @param   word
         *      The word to add an occurrence of.
         */
        @Override
        public void update(
            final SimpleStatisticalSpellingCorrector target,
            final String word)
        {
            // Add each word.
            target.add(word);
        }

        /**
         * Gets the alphabet of lower-case characters that can be used for
         * replaces and inserts.
         *
         * @return
         *      The alphabet of lower-case characters.
         */
        public char[] getAlphabet()
        {
            return this.alphabet;
        }

        /**
         * Sets the alphabet of lower-case characters that can be used for
         * replaces and inserts.
         *
         * @param   alphabet
         *      The alphabet of lower-case characters.
         */
        public void setAlphabet(
            final char[] alphabet)
        {
            this.alphabet = alphabet;
        }

    }

}