package edu.northwestern.at.utils.spellcheck;
import java.io.*;
import java.util.*;
import edu.northwestern.at.utils.*;
import edu.northwestern.at.utils.corpuslinguistics.phonetics.*;
import edu.northwestern.at.utils.corpuslinguistics.stringsimilarity.*;
/** SimpleMindedSpellingChecker -- A very simple minded spelling checker.
*
* <p>
* Implements a very simple minded spelling checker using HashMaps
* to hold the dictionaries. The dictionaries are read as combined
* word lists and metaphone codes from text files into two hashmaps.
* One hashmap contains each word as a key and has the metaphone code
* for that word as a value. The second hashmap uses the metaphone values
* as keys and the list of words mapping to that metaphone as values.
* This allows presentation of suggested spellings for misspelled words.
* The list of suggestions may optionally be pruned by using
* a measure of the Levenstein distance between the original
* misspelling and each suggested spelling.
* </p>
*
* <p>
* Words beginning with a digit are assumed to be numbers and
* therefore spelled correctly.
* </p>
*
* <p>
* When creating a list of suggested replacements for a misspelled
* word, the first letter of the suggestions is capitalized if
* the first letter of the original misspelled word is capitalized.
* </p>
*
* <p>
* This class is not intended for production use because of
* the time needed to load and save dictionaries and the amount
* of memory used to hold the dictionaries in memory. This class
* serves as a reference implementation for the SpellingChecker
* interface as well as a testbed for applications needing a
* spelling checker during development.
* </p>
*/
public class SimpleMindedSpellingChecker implements SpellingChecker
{
/** The global dictionary. */
private SpellingDictionary globalDictionary = null;
/** The local dictionary. */
private SpellingDictionary localDictionary = null;
/** Ignore list. */
private SpellingDictionary ignoreList = new HashMapSpellingDictionary();
/** Metaphone encoder instance. */
private DoubleMetaphone metaphone = new DoubleMetaphone();
/** Create spelling checker specifying global and local dictionaries.
*/
public SimpleMindedSpellingChecker( SpellingDictionary globalDictionary ,
SpellingDictionary localDictionary )
{
this.globalDictionary = globalDictionary;
this.localDictionary = localDictionary;
}
/** Create spelling checker specifying global dictionary.
*/
public SimpleMindedSpellingChecker( SpellingDictionary globalDictionary )
{
this.globalDictionary = globalDictionary;
this.localDictionary = null;
}
/** Create spelling checked without loading any dictionaries. */
public SimpleMindedSpellingChecker()
{
this.globalDictionary = null;
this.localDictionary = null;
}
/** Select global dictionary to use to check spelling.
*
* @param dictionary Identifies dictionary.
* The dictionary class must implement
* the SpellingDictionary interface.
*
* <p>
* The global dictionary is usually shared among many users.
* Typically it is created by a system administrator as a
* shareable resource.
* </p>
*/
public void useGlobalDictionary( SpellingDictionary dictionary )
{
this.globalDictionary = dictionary;
}
/** Select local dictionary to use to check spelling.
*
* @param dictionary The dictionary to use.
* If the name is null, no local
* dictionary is used.
* The dictionary class must implement
* the SpellingDictionary interface.
*
* <p>
* The local dictionary is usually limited to access by
* a single individual. The local dictionary generally
* contains words added by an individual while checking
* the spelling of one or more documents.
* </p>
*/
public void useLocalDictionary( SpellingDictionary dictionary )
{
this.localDictionary = dictionary;
}
/** Add word to a dictionary.
*
* @param word The word to add.
* @param dictionary The dictionary.
*
* @return True if word added.
*/
private boolean addWordToDictionary( String word ,
SpellingDictionary dictionary )
{
return dictionary.addWord( word );
}
/** Check spelling of word.
*
* @param word The word whose spelling should be checked.
*
* @return True if the word was found in the
* global or local dictionaries.
*/
public boolean checkSpelling( String word )
{
// Consider null or empty word to be spelled
// correctly.
if ( ( word == null ) || ( word.length() <= 0 ) ) return true;
// Assume word which is all upper case to be
// spelled correctly.
if ( word.equals( word.toUpperCase() ) ) return true;
// If word starts with a digit,
// assume it is a number and
// say it is spelled OK.
if ( Character.isDigit( word.charAt( 0 ) ) )
return true;
// Convert word to lower case for
// dictionary lookups.
String lowerCaseWord = word.toLowerCase();
// Otherwise look up word in
// ignores list, local dictionary,
// and finally, global dictionary.
return ignoreList.lookupWord( lowerCaseWord ) ||
( ( localDictionary == null ) ?
false : localDictionary.lookupWord( lowerCaseWord ) ) ||
( ( globalDictionary == null ) ?
false : globalDictionary.lookupWord( lowerCaseWord ) );
}
/** Prunes suggestions to those most like the original word.
*
* @param word The original misspelled word.
* @param suggestions String collection of suggested spellings.
*
* @return Possibly pruned String set of suggested words.
* The suggestions are pruned using the
* Levenstein distance.
*/
private Set<String> pruneSuggestions
(
String word ,
Collection<String> suggestions
)
{
int nSuggestions = suggestions.size();
Set<String> result = SetFactory.createNewSet();
Iterator<String> iterator = suggestions.iterator();
while ( iterator.hasNext() )
{
String suggestedWord = iterator.next();
if ( LevensteinDistance.areAlike( word , suggestedWord ) )
result.add( suggestedWord );
}
return result;
}
/** Adds words to a map.
*
* @param map The map with string keys and integer values.
* @param words Collection of words to add to the map.
*/
private void addWords
(
Map<String, Integer> map ,
Collection<String> words
)
{
if ( ( words == null ) || ( words.size() == 0 ) ) return;
Iterator<String> iterator = words.iterator();
while ( iterator.hasNext() )
{
String word = iterator.next();
if ( !map.containsKey( word ) )
{
map.put( word , new Integer( 1 ) );
}
}
}
/** Suggest alternative words for misspelled item.
*
* @param word The misspelled word for which
* possible alternatives are desired.
*
* @param prune True to prune suggestions using
* Levenstein distance.
*
* @return Sorted array of correctly spelled
* words which are similar in some sense
* to the misspelled word.
*
* <p>
* The suggested words are typically words which are similar
* in sound or spelling to the misspelled word. If the misspelled
* word begins with a capital letter, the suggestions are also
* capitalized to match.
* </p>
*
*/
public String[] suggest( String word , boolean prune )
{
// Convert word to lower case for
// dictionary lookups.
String lowerCaseWord = word.toLowerCase();
// Remember if word starts with a capital letter.
boolean startsWithCapital = Character.isUpperCase( word.charAt( 0 ) );
String[] result = null;
try
{
// Get metaphone values for misspelled word.
String metaphoneValue = metaphone.encode( lowerCaseWord );
String metaphoneValue2 = metaphone.getAlternate();
// Pick up list of words with matching
// metaphone values from the dictionaries.
// These are the suggested replacements
// for the misspelled word.
TreeMap<String, Integer> combinedSuggestions =
new TreeMap<String, Integer>();
if ( globalDictionary != null )
addWords( combinedSuggestions ,
globalDictionary.getRelatedWords( metaphoneValue ) );
if ( localDictionary != null )
addWords( combinedSuggestions ,
localDictionary.getRelatedWords( metaphoneValue ) );
addWords( combinedSuggestions ,
ignoreList.getRelatedWords( metaphoneValue ) );
if ( !metaphoneValue2.equals( metaphoneValue ) )
{
if ( globalDictionary != null )
addWords( combinedSuggestions ,
globalDictionary.getRelatedWords( metaphoneValue2 ) );
if ( localDictionary != null )
addWords( combinedSuggestions ,
localDictionary.getRelatedWords( metaphoneValue2 ) );
addWords( combinedSuggestions ,
ignoreList.getRelatedWords( metaphoneValue2 ) );
}
// Copy unique suggested words to
// suggestions list. Words will be in
// sorted ascending alphabetical order.
Set<String> suggestions =
SetFactory.createNewSet( combinedSuggestions.size() );
List<String> keys =
ListFactory.createNewList();
keys.addAll( combinedSuggestions.keySet() );
Iterator<String> iterator = keys.iterator();
while ( iterator.hasNext() )
{
suggestions.add( iterator.next() );
}
// If no suggestions, just return.
if ( suggestions != null )
{
// Optionally prune the list of suggestions
// to those whose Levenstein distance is
// close to the original word.
if ( prune )
{
suggestions =
pruneSuggestions( lowerCaseWord , suggestions );
}
// Copy the possibly pruned list to the result.
// If the initial character of the original
// misspelled word was capitalized, make sure
// the first letter of the suggested replacements
// is also capitalized.
int nSuggestions = suggestions.size();
result = new String[ nSuggestions ];
iterator = suggestions.iterator();
int i = 0;
while ( iterator.hasNext() )
{
result[ i ] = iterator.next();
if ( startsWithCapital )
{
result[ i ] =
Character.toUpperCase( result[ i ].charAt( 0 ) ) +
result[ i ].substring( 1 );
}
i++;
}
}
}
catch ( Exception e )
{
}
return result;
}
/** Suggest alternative words for misspelled item.
*
* @param word The misspelled word for which
* possible alternatives are desired.
*
* <p>
* The suggested words are typically words which are similar
* in sound or spelling to the misspelled word.
* </p>
*
*/
public String[] suggest( String word )
{
return suggest( word , true );
}
/** Add a word to the local dictionary.
*
* @param word The word to add to the local dictionary.
*
* @return True if word added successfully.
*/
public boolean addWordToLocalDictionary( String word )
{
if ( localDictionary != null )
{
return localDictionary.addWord( word );
}
else
{
return false;
}
}
/** Add a word to the global dictionary.
*
* @param word The word to add to the global dictionary.
*
* @return True if word added successfully.
*
* <p>
* Typically this function is retricted to system administrators.
* </p>
*/
public boolean addWordToGlobalDictionary( String word )
{
// return globalDictionary.addWord( word );
return false;
}
/** Add word to ignore list.
*
* @param word Word to ignore.
*
* @return True if word successfully added to ignore list.
*
* <p>
* The ignore list contains words which an individual marks as ignorable
* while checking the spelling of one of more documents. The ignore
* list is transient. If an ignored word is to persist across multiple
* spelling check sessions, the word should be added to the local
* dictionary.
* </p>
*/
public boolean addWordToIgnoreList( String word )
{
return ignoreList.addWord( word );
}
/** Empties the ignore list.
*
* <p>
* Removes all words from the ignore list. Typically this is
* invoked at the start of a spelling checker session for a
* new document.
* </p>
*/
public void emptyIgnoreList()
{
ignoreList.clear();
}
}