package edu.northwestern.at.utils.spellcheck;
import java.io.*;
import java.util.*;
import edu.northwestern.at.utils.*;
/** TernaryTrieSpellingDictionary -- implements spelling dictionary using ternary trie.
*
* <p>
* This class provides the basic methods for a spelling checker
* dictionary implemented using a ternary trie.
* </p>
*/
public class TernaryTrieSpellingDictionary implements SpellingDictionary
{
/** Holds the ternary search trie. */
protected TernaryTrie trie;
/** Word list file which populates dictionary. */
protected String wordListFileName;
/** Debugging flag. True to debug. */
protected boolean debug = false;
/** Maximum edit distance. */
protected final static int MAXDIFFS = 2;
/** Create TernaryTrieSpellingDictionary from a map containing words.
*
* @param wordsMap Map with words to add to the dictionary.
*/
public TernaryTrieSpellingDictionary( Map<String, String> wordsMap )
{
this.wordListFileName = "";
try
{
this.trie = new TernaryTrie( wordsMap , true );
}
catch ( Exception e )
{
e.printStackTrace();
}
}
/** Create TernaryTrieSpellingDictionary from a set containing words.
*
* @param wordsSet Set with words to add to the dictionary.
*/
public TernaryTrieSpellingDictionary( Set<String> wordsSet )
{
this.wordListFileName = "";
try
{
this.trie = new TernaryTrie( wordsSet );
}
catch ( Exception e )
{
e.printStackTrace();
}
}
/** Create TernaryTrieSpellingDictionary from a list containing words.
*
* @param wordsList List with words to add to the dictionary.
*/
public TernaryTrieSpellingDictionary( List<String> wordsList )
{
this.wordListFileName = "";
try
{
this.trie = new TernaryTrie( wordsList );
}
catch ( Exception e )
{
e.printStackTrace();
}
}
/** Create TernaryTrieSpellingDictionary from a tagged strings list.
*
* @param wordsList Tagged strings list with words
* to add to the dictionary.
*/
public TernaryTrieSpellingDictionary( TaggedStrings wordsList )
{
this.wordListFileName = "";
try
{
this.trie = new TernaryTrie( wordsList , true );
}
catch ( Exception e )
{
e.printStackTrace();
}
}
/** Create TernaryTrieSpellingDictionary from an existing trie.
*
* @param trie Trie to use for dictionary.
*/
public TernaryTrieSpellingDictionary( TernaryTrie trie )
{
this.trie = trie;
}
/** Lookup word in dictionary.
*
* @param word The word to lookup.
*
* @return True if the word was found in the
* dictionary.
*
* <p>
* <strong>Note:</strong>
* </p>
*
* <p>
* Any processing of the word (conversion to lower case, etc.)
* should be done before calling this routine.
* </p>
*/
public boolean lookupWord( String word )
{
// Consider null or empty word to be spelled
// correctly.
if ( ( word == null ) || ( word.length() <= 0 ) ) return true;
// Look for an exact match in the
// ternary trie for the lower-case
// version of the word.
String lowerCaseWord = word.toLowerCase();
return trie.containsString( lowerCaseWord );
}
/** Add a word to the dictionary.
*
* @param word The word to add to the dictionary.
*
* @return True if word added successfully.
*/
private boolean addWordPrivate( String word )
{
trie.put( word , new Double( 0 ) );
return true;
}
/** Add a word to the dictionary.
*
* @param word The word to add to the dictionary.
*
* @return True if word added successfully.
*/
public boolean addWord( String word )
{
return addWordPrivate( word );
}
/** Add multiple words to the dictionary.
*
* @param words The words to add to the dictionary.
*
* @return True if all words added successfully.
*/
public boolean addWords( String[] words )
{
boolean result = true;
for ( int i = 0; i < words.length; i++ )
{
boolean added = addWord( words[ i ] );
result = result && added;
}
return result;
}
/** Get list of words which almost match given word.
*
* @param word The word for which to find similar words.
*
* @return Set of words which are similar
* to specified word.
*/
public Set<String> getRelatedWords( String word )
{
Set<String> result = null;
if ( ( word != null ) && ( word.length() > 0 ) )
{
result = findMostSimilarSet( word );
}
return result;
}
public Set<String> findMostSimilarSet( String word )
{
Set<String> result = SetFactory.createNewSet();
int size = word.length();
if ( size == 0 ) return null;
int maxSuggestions = 1;
int i;
int j;
String suggestion;
String suggestion2;
String savedWord = word;
// Start by looking for words with a
// maximum of MAXDIFFS letters different.
for ( maxSuggestions = 1;
( maxSuggestions <= MAXDIFFS ) &&
( size >= maxSuggestions * 2 );
maxSuggestions++
)
{
result.addAll
(
trie.nearSearch( word , maxSuggestions )
);
}
// Check for 1 letter removed or added,
// and 1 letter removed or added and
// 1 different.
if ( ( result.size() == 0 ) && ( size > 2 ) )
{
for ( maxSuggestions = 0 ;
( maxSuggestions <= 1 ) ;
maxSuggestions++
)
{
for ( i = size - 1 ; i >= 0 ; i-- )
{
result.addAll
(
trie.nearSearch
(
word.substring( 0 , i ) +
word.substring( i + 1 , size ) ,
maxSuggestions
)
);
for ( j = 'a' ; j < 'z' ; j++ )
{
result.addAll
(
trie.nearSearch
(
word.substring( 0 , i ) + ((char)j ) +
word.substring( i , size ) ,
maxSuggestions
)
);
}
}
}
}
if ( result.size() == 0 )
{
for ( i = j = 1 ; i < size ; i++ )
{
if ( word.charAt( i ) == word.charAt( i - 1 ) )
{
suggestion =
word.substring( 0 , j ) +
word.substring( i + 1 , size );
if ( trie.get( suggestion ) != null )
{
result.add( suggestion );
}
}
else
{
j = i + 1;
}
}
}
// Two consecutive letters exchanged
// and 1 character different.
if ( result.size() == 0 )
{
StringBuffer sugBuf;
for ( i = 0 ; i < size - 1 ; i++ )
{
sugBuf = new StringBuffer( word );
char ci = sugBuf.charAt( i );
char ci1 = sugBuf.charAt( i + 1 );
sugBuf.setCharAt( i + 1 , ci );
sugBuf.setCharAt( i , ci1 );
suggestion = sugBuf.toString();
result.addAll( trie.nearSearch( suggestion , 1 ) );
}
}
// Prefixes.
if ( result.size() == 0 )
{
result.addAll( trie.prefixSearch( word ) );
}
// Repeated characters removed and
// 1 character different.
if ( result.size() == 0 )
{
for ( i = 1 ; i < size - 1 ; i++ )
{
if ( word.charAt( i ) == word.charAt( i - 1 ) )
{
suggestion =
word.substring( 0 , i ) +
word.substring( i + 1 , size );
result.addAll( trie.nearSearch( suggestion , 1 ) );
}
}
}
return result;
}
/** Retrieves all words in dictionary.
*
* @return ArrayList of all words in dictionary.
*/
public Set<String> getAllWords()
{
return trie.getAllStrings();
}
/** Retrieves number of words in dictionary.
*
* @return Number of words in dictionary.
*/
public int getNumberOfWords()
{
return trie.getAllStrings().size();
}
/** Clear dictionary of all words and metaphone values. */
public void clear()
{
}
/** Get set of words which almost match given word.
*
* @param word The word for which to find similar words.
*
* @return Set of words which are similar
* to specified word.
*/
public Set<String> getMoreRelatedWords( String word )
{
Set<String> result = SetFactory.createNewSet();
if ( ( word != null ) && ( word.length() > 0 ) )
{
int l = word.length();
int k = 0;
for ( int i = ( MAXDIFFS + 1 ) ; i < Math.min( l / 2 , 10 ) ; i++ )
{
List<String> nearSearches =
trie.nearSearch( word.toLowerCase() , i );
if ( nearSearches.size() > 0 )
{
result.addAll( nearSearches );
k++;
if ( k >= 2 ) break;
}
}
}
return result;
}
}