package info.ephyra.util;
import info.ephyra.nlp.NETagger;
import info.ephyra.nlp.SnowballStemmer;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashSet;
import java.util.Iterator;
/**
* <p>A <code>Dictionary</code> that is based on a hash set and allows lookups
* in constant time.</p>
*
* <p>All words are converted to lower case, tokenized and stemmed. E.g. there
* is no distinction between "Internet" and "internets".</p>
*
* <p>This class implements the interface <code>Dictionary</code>.</p>
*
* @author Nico Schlaefer
* @version 2007-02-06
*/
public class HashDictionary implements Dictionary {

    /** Normalized words (lower-cased, tokenized and stemmed). */
    private HashSet<String> words;

    /** Individual tokens of all words in the dictionary. */
    private HashSet<String> tokens;

    /** Maximum number of tokens of any word in the dictionary. */
    private int maxTokens = 1;

    /**
     * Creates an empty <code>HashDictionary</code>.
     */
    public HashDictionary() {
        this.words = new HashSet<String>();
        this.tokens = new HashSet<String>();
    }

    /**
     * Creates a <code>HashDictionary</code> from a list of words in a file,
     * one word per line. Lines starting with "//" are treated as comments and
     * skipped.
     *
     * @param fileName file containing a list of words
     * @throws IOException if the list could not be read from the file
     */
    public HashDictionary(String fileName) throws IOException {
        this();
        if (fileName == null) return;
        // NOTE(review): FileReader uses the platform default charset — verify
        // the word list files are encoded accordingly
        BufferedReader in = new BufferedReader(new FileReader(new File(fileName)));
        try {
            // readLine() returns null at end of stream; the previous
            // BufferedReader.ready() test is not a reliable EOF check and
            // could truncate the list
            String line;
            while ((line = in.readLine()) != null) {
                String word = line.trim();
                if (word.startsWith("//")) continue;  // skip comments
                addNormalized(word);
            }
        } finally {
            in.close();  // release the file handle even if reading fails
        }
    }

    /**
     * Normalizes a word (trimmed, lower-cased, tokenized and stemmed) and adds
     * it together with its tokens to the dictionary. Common code of the
     * file-based constructor and {@link #add(String)}.
     *
     * @param word the raw word to add (not <code>null</code>)
     */
    private void addNormalized(String word) {
        word = NETagger.tokenizeWithSpaces(word.trim().toLowerCase());
        word = SnowballStemmer.stemAllTokens(word);
        // add whole word
        if (word.length() > 0) words.add(word);
        // add the tokens of the word and remember the largest token count
        String[] wordTokens = word.split(" ");
        if (wordTokens.length > maxTokens) maxTokens = wordTokens.length;
        for (int p = 0; p < wordTokens.length; p++)
            if (wordTokens[p].length() > 0) this.tokens.add(wordTokens[p]);
    }

    /**
     * Adds a word to the dictionary. <code>null</code> is ignored.
     *
     * @param word the word to add
     */
    public void add(String word) {
        if (word != null) addNormalized(word);
    }

    /**
     * Looks up a word. The word is normalized in the same way as the
     * dictionary entries before the lookup.
     *
     * @param word the word to look up
     * @return <code>true</code> iff the word was found
     */
    public boolean contains(String word) {
        if (word == null) return false;
        word = NETagger.tokenizeWithSpaces(word.trim().toLowerCase());
        word = SnowballStemmer.stemAllTokens(word);
        return words.contains(word);
    }

    /**
     * Looks up a word token.
     *
     * @param token the word token to look up
     * @return <code>true</code> iff a word in the dictionary contains the token
     */
    public boolean containsToken(String token) {
        if (token == null) return false;
        token = SnowballStemmer.stem(token.trim().toLowerCase());
        return tokens.contains(token);
    }

    /**
     * Does a fuzzy lookup for a word. The specified word w is considered as
     * contained in the dictionary if there is a word W in the dictionary such
     * that <code>LevenshteinDistance(w, W) &lt;= maxDistance</code>.
     *
     * @param word the word to look up
     * @param maxDistance the maximum Levenshtein edit distance for fuzzy
     *                    comparison
     * @return <code>true</code> iff the word was found
     */
    public boolean fuzzyContains(String word, int maxDistance) {
        if (word == null) return false;
        word = NETagger.tokenizeWithSpaces(word.trim().toLowerCase());
        word = SnowballStemmer.stemAllTokens(word);
        // try the cheap exact lookup first
        if (words.contains(word)) return true;
        if (maxDistance == 0) return false;
        for (String entry : words)
            if (getLevenshteinDistance(word, entry, maxDistance, true, 1, 1)
                    <= maxDistance) return true;
        return false;
    }

    /**
     * Does a fuzzy lookup for a token. The specified token t is considered as
     * contained in the dictionary if there is a token T in the dictionary such
     * that <code>LevenshteinDistance(t, T) &lt;= maxDistance</code>.
     *
     * @param token the token to look up
     * @param maxDistance the maximum Levenshtein edit distance for fuzzy
     *                    comparison
     * @return <code>true</code> iff a word in the dictionary contains the token
     */
    public boolean fuzzyContainsToken(String token, int maxDistance) {
        if (token == null) return false;
        token = SnowballStemmer.stem(token.trim().toLowerCase());
        // try the cheap exact lookup first
        if (tokens.contains(token)) return true;
        if (maxDistance == 0) return false;
        for (String entry : tokens)
            if (getLevenshteinDistance(token, entry, maxDistance, true, 1, 1)
                    <= maxDistance) return true;
        return false;
    }

    /**
     * Computes the Levenshtein distance of two strings. If a positive
     * threshold is given, the computation is aborted with a result of
     * <code>threshold + 1</code> as soon as the distance provably exceeds the
     * threshold; a threshold of 0 computes the exact distance without early
     * termination.
     *
     * @param string1 the first string (<code>null</code> is treated as empty)
     * @param string2 the second string (<code>null</code> is treated as empty)
     * @param threshold the maximum distance of interest
     *                  (0 disables early termination)
     * @param caseSensitive use case sensitive or case insensitive comparison
     * @param insertCost the cost for inserting a character
     * @param deleteCost the cost for deleting a character
     * @return the Levenshtein distance of the two strings, or
     *         <code>threshold + 1</code> if the distance exceeds a positive
     *         threshold
     */
    public static int getLevenshteinDistance(String string1, String string2,
            int threshold, boolean caseSensitive, int insertCost,
            int deleteCost) {
        int length1 = (string1 == null) ? 0 : string1.length();
        int length2 = (string2 == null) ? 0 : string2.length();
        // the distance is at least the difference of the lengths
        if ((Math.abs(length1 - length2) > threshold) && (threshold > 0))
            return (threshold + 1);
        // matrix[c][l] = distance between the first c chars of string1 and
        // the first l chars of string2
        int[][] matrix = new int[length1 + 1][length2 + 1];
        matrix[0][0] = 0;
        int minLength = Math.min(length1, length2);
        int substCost = ((insertCost + deleteCost) / 2);
        // after the square part, |length1 - length2| off-diagonal moves remain
        // to reach the final cell; each of them can lower the distance again
        // (by at most the corresponding edit cost), so an early abort is only
        // safe if the distance exceeds the threshold by more than this slack
        int slack = (length1 > length2)
                ? (length1 - length2) * insertCost
                : (length2 - length1) * deleteCost;
        int cost;           // substitution cost in the current cell
        int distance = 0;   // value of the most recently computed cell
        // fill the matrix top-left to bottom-right in growing squares instead
        // of line by line, so the (monotone) diagonal is extended as early as
        // possible
        int limit = 1;
        while (limit <= minLength) {
            matrix[limit][0] = (limit * insertCost);
            matrix[0][limit] = (limit * deleteCost);
            // compute the partial column above the new diagonal cell
            for (int c = 1; c < limit; c++) {
                cost = getCost(string1.charAt(c - 1), string2.charAt(limit - 1),
                        substCost, caseSensitive);
                distance = min3(matrix[c - 1][limit] + deleteCost,
                        matrix[c][limit - 1] + insertCost,
                        matrix[c - 1][limit - 1] + cost);
                matrix[c][limit] = distance;
            }
            // compute the partial row left of the new diagonal cell
            for (int l = 1; l < limit; l++) {
                cost = getCost(string1.charAt(limit - 1), string2.charAt(l - 1),
                        substCost, caseSensitive);
                distance = min3(matrix[limit - 1][l] + deleteCost,
                        matrix[limit][l - 1] + insertCost,
                        matrix[limit - 1][l - 1] + cost);
                matrix[limit][l] = distance;
            }
            // compute the new diagonal cell
            cost = getCost(string1.charAt(limit - 1), string2.charAt(limit - 1),
                    substCost, caseSensitive);
            distance = min3(matrix[limit - 1][limit] + deleteCost,
                    matrix[limit][limit - 1] + insertCost,
                    matrix[limit - 1][limit - 1] + cost);
            // the diagonal never decreases, so once it exceeds the threshold
            // by more than the remaining slack, the final distance must exceed
            // the threshold as well; the previous check ignored the slack and
            // wrongly aborted e.g. for "xab" vs. "ab" with threshold 1
            if ((distance - slack > threshold) && (threshold > 0))
                return (threshold + 1);
            matrix[limit][limit] = distance;
            limit++;
        }
        // compute the remaining rows (string1 is the longer string)
        while (limit <= length1) {
            matrix[limit][0] = (limit * insertCost);
            for (int l = 1; l <= length2; l++) {
                cost = getCost(string1.charAt(limit - 1), string2.charAt(l - 1),
                        substCost, caseSensitive);
                distance = min3(matrix[limit - 1][l] + deleteCost,
                        matrix[limit][l - 1] + insertCost,
                        matrix[limit - 1][l - 1] + cost);
                matrix[limit][l] = distance;
            }
            // each remaining row can lower the last cell by at most insertCost
            if ((distance - (length1 - limit) * insertCost > threshold)
                    && (threshold > 0))
                return (threshold + 1);
            limit++;
        }
        // compute the remaining columns (string2 is the longer string)
        while (limit <= length2) {
            matrix[0][limit] = (limit * deleteCost);
            for (int c = 1; c <= length1; c++) {
                cost = getCost(string1.charAt(c - 1), string2.charAt(limit - 1),
                        substCost, caseSensitive);
                distance = min3(matrix[c - 1][limit] + deleteCost,
                        matrix[c][limit - 1] + insertCost,
                        matrix[c - 1][limit - 1] + cost);
                matrix[c][limit] = distance;
            }
            // each remaining column can lower the last cell by at most deleteCost
            if ((distance - (length2 - limit) * deleteCost > threshold)
                    && (threshold > 0))
                return (threshold + 1);
            limit++;
        }
        return matrix[length1][length2];
    }

    /**
     * Computes the edit cost for substituting one character with another.
     *
     * @param char1 the first character
     * @param char2 the second character
     * @param substCost the cost for substituting one character with another
     * @param caseSensitive use case sensitive or case insensitive comparison
     * @return 0 if the characters match, <code>substCost</code> otherwise
     */
    public static int getCost(char char1, char char2, int substCost,
            boolean caseSensitive) {
        if (char1 == char2) return 0;
        if (!caseSensitive
                && (Character.toLowerCase(char1) == Character.toLowerCase(char2)))
            return 0;
        return substCost;
    }

    /**
     * Computes the minimum of three integers (helper for the Levenshtein
     * distance).
     *
     * @param x first value
     * @param y second value
     * @param z third value
     * @return the minimum of x, y and z
     */
    public static int min3(int x, int y, int z) {
        return Math.min(x, Math.min(y, z));
    }

    /**
     * Returns an iterator over the dictionary entries.
     *
     * @return iterator over the normalized words
     */
    public Iterator<String> getIterator() {
        return words.iterator();
    }

    /**
     * Returns the maximum number of tokens of a word in the dictionary.
     *
     * @return maximum number of tokens
     */
    public int getMaxTokens() {
        return maxTokens;
    }
}