// HashDictionary.java example
//
// Explorer
// lucida-master
// - lucida
package info.ephyra.util;

import info.ephyra.nlp.NETagger;
import info.ephyra.nlp.SnowballStemmer;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashSet;
import java.util.Iterator;

/**
 * <p>A <code>Dictionary</code> that is based on a hash set and allows lookups
 * in constant time.</p>
 * 
 * <p>All words are converted to lower case, tokenized and stemmed. E.g. there
 * is no distinction between "Internet" and "internets".</p>
 * 
 * <p>This class implements the interface <code>Dictionary</code>.</p>
 * 
 * @author Nico Schlaefer
 * @version 2007-02-06
 */
public class HashDictionary implements Dictionary {
	/** Prefix that marks a line of a word list file as a comment. */
	private static final String COMMENT_PREFIX = "//";
	
	/** <code>HashSet</code> used to store the normalized words. */
	private final HashSet<String> words;
	/** <code>HashSet</code> used to store the tokens of the normalized words. */
	private final HashSet<String> tokens;
	/** Maximum number of tokens of a word in the dictionary (never below 1). */
	private int maxTokens = 1;
	
	/**
	 * Creates an empty <code>HashDictionary</code>.
	 */
	public HashDictionary() {
		this.words = new HashSet<String>();
		this.tokens = new HashSet<String>();
	}
	
	/**
	 * Creates a <code>HashDictionary</code> from a list of words in a file.
	 * Each line of the file is one entry; lines that start with
	 * <code>//</code> (after trimming) are skipped as comments. If
	 * <code>fileName</code> is <code>null</code>, the dictionary stays empty.
	 * 
	 * @param fileName file containing a list of words, may be
	 *            <code>null</code>
	 * @throws IOException if the list could not be read from the file
	 */
	public HashDictionary(String fileName) throws IOException {
		this();
		
		if (fileName == null) {
			return;
		}
		
		BufferedReader in = new BufferedReader(new FileReader(new File(fileName)));
		try {
			String line;
			// readLine() returns null at the end of the file, so there is
			// no need to poll ready() and no risk of dereferencing null
			while ((line = in.readLine()) != null) {
				line = line.trim();
				if (line.startsWith(COMMENT_PREFIX)) {
					continue;  // skip comments
				}
				addEntry(line);
			}
		} finally {
			// release the file handle even if reading or normalizing failed
			in.close();
		}
	}
	
	/**
	 * Adds a word to the dictionary. A <code>null</code> word is ignored.
	 * 
	 * @param word the word to add
	 */
	public void add(String word) {
		if (word != null) {
			addEntry(word);
		}
	}
	
	/**
	 * Looks up a word.
	 * 
	 * @param word the word to look up
	 * @return <code>true</code> iff the word was found; <code>false</code>
	 *         for a <code>null</code> word
	 */
	public boolean contains(String word) {
		if (word == null) {
			return false;
		}
		
		return words.contains(normalizeWord(word));
	}
	
	/**
	 * Looks up a word token.
	 * 
	 * @param token the word token to look up
	 * @return <code>true</code> iff a word in the dictionary contains the
	 *         token; <code>false</code> for a <code>null</code> token
	 */
	public boolean containsToken(String token) {
		if (token == null) {
			return false;
		}
		
		return tokens.contains(normalizeToken(token));
	}
	
	/**
	 * Does a fuzzy lookup for a word. The specified word w is considered as
	 * contained in the dictionary if there is a word W in the dictionary such
	 * that <code>LevenshteinDistance(w, W) &lt;= maxDistance</code>.
	 * 
	 * @param word the word to look up
	 * @param maxDistance the maximum Levenshtein edit distance for fuzzy
	 *            comparison
	 * @return <code>true</code> iff the word was found; <code>false</code>
	 *         for a <code>null</code> word
	 */
	public boolean fuzzyContains(String word, int maxDistance) {
		if (word == null) {
			return false;
		}
		
		return fuzzyLookup(this.words, normalizeWord(word), maxDistance);
	}
	
	/**
	 * Does a fuzzy lookup for a token. The specified token t is considered as
	 * contained in the dictionary if there is a token T in the dictionary such
	 * that <code>LevenshteinDistance(t, T) &lt;= maxDistance</code>.
	 * 
	 * @param token the token to look up
	 * @param maxDistance the maximum Levenshtein edit distance for fuzzy
	 *            comparison
	 * @return <code>true</code> iff a word in the dictionary contains the
	 *         token; <code>false</code> for a <code>null</code> token
	 */
	public boolean fuzzyContainsToken(String token, int maxDistance) {
		if (token == null) {
			return false;
		}
		
		return fuzzyLookup(this.tokens, normalizeToken(token), maxDistance);
	}
	
	/**
	 * Computes the Levenshtein distance of two Strings, i.e. the minimum
	 * total cost of the insertions, deletions and substitutions that turn
	 * <code>string1</code> into <code>string2</code>. A substitution costs
	 * <code>(insertCost + deleteCost) / 2</code> (integer division). A
	 * <code>null</code> String is treated like the empty String.
	 * 
	 * <p>If <code>threshold</code> is positive, the computation stops as soon
	 * as the distance is known to exceed it and <code>threshold + 1</code> is
	 * returned. The cutoff relies on the minimum of a complete row of the
	 * distance table, which is a lower bound of the final distance as long as
	 * the costs are not negative. A threshold of 0 (or less) computes the
	 * entire distance, regardless of its value.</p>
	 * 
	 * @param string1 the first String
	 * @param string2 the second String
	 * @param threshold the maximum distance of interest
	 * @param caseSensitive use case sensitive or case insensitive comparison
	 * @param insertCost the cost for inserting a character (expected to be
	 *            non-negative)
	 * @param deleteCost the cost for deleting a character (expected to be
	 *            non-negative)
	 * @return the Levenshtein distance of the specified Strings, at most
	 *         <code>threshold + 1</code> if <code>threshold</code> is
	 *         positive
	 */
	public static int getLevenshteinDistance(String string1, String string2, int threshold, boolean caseSensitive, int insertCost, int deleteCost) {
		String source = ((string1 == null) ? "" : string1);
		String target = ((string2 == null) ? "" : string2);
		int length1 = source.length();
		int length2 = target.length();
		boolean bounded = (threshold > 0);
		
		// the surplus characters of the longer String have to be deleted
		// (string1 longer) or inserted (string2 longer) in any case
		if (bounded) {
			int unavoidable = ((length1 > length2)
					? ((length1 - length2) * deleteCost)
					: ((length2 - length1) * insertCost));
			if (unavoidable > threshold) {
				return (threshold + 1);
			}
		}
		
		int substitutionCost = ((insertCost + deleteCost) / 2);
		
		// only the previous and the current row of the table are needed
		int[] previous = new int[length2 + 1];
		int[] current = new int[length2 + 1];
		
		// empty prefix of string1: insert the first j characters of string2
		for (int j = 0; j <= length2; j++) {
			previous[j] = (j * insertCost);
		}
		
		for (int i = 1; i <= length1; i++) {
			char sourceChar = source.charAt(i - 1);
			
			// empty prefix of string2: delete the first i characters of string1
			current[0] = (i * deleteCost);
			int rowMinimum = current[0];
			
			for (int j = 1; j <= length2; j++) {
				int cost = getCost(sourceChar, target.charAt(j - 1), substitutionCost, caseSensitive);
				int distance = min3(previous[j] + deleteCost, current[j - 1] + insertCost, previous[j - 1] + cost);
				current[j] = distance;
				if (distance < rowMinimum) {
					rowMinimum = distance;
				}
			}
			
			// every edit sequence passes through this row, so the final
			// distance cannot be smaller than the smallest value in it
			if (bounded && (rowMinimum > threshold)) {
				return (threshold + 1);
			}
			
			int[] swap = previous;
			previous = current;
			current = swap;
		}
		
		int result = previous[length2];
		return ((bounded && (result > threshold)) ? (threshold + 1) : result);
	}
	
	/**
	 * Computes the edit cost for two chars.
	 * 
	 * @param char1 the first char
	 * @param char2 the second char
	 * @param substCost the cost for the substitution of one char with another
	 *            one
	 * @param caseSensitive use case sensitive or case insensitive comparison
	 * @return 0 if the chars are considered equal, <code>substCost</code>
	 *         otherwise
	 */
	public static int getCost(char char1, char char2, int substCost, boolean caseSensitive) {
		if (char1 == char2) {
			return 0;
		}
		if (!caseSensitive && (Character.toLowerCase(char1) == Character.toLowerCase(char2))) {
			return 0;
		}
		return substCost;
	}
	
	/**
	 * Computes the minimum of three int values (helper for Levenshtein).
	 * 
	 * @param x the first value
	 * @param y the second value
	 * @param z the third value
	 * @return the minimum of x, y and z
	 */
	public static int min3(int x, int y, int z) {
		return Math.min(x, Math.min(y, z));
	}
	
	/**
	 * Returns an iterator over the dictionary entries. The iterator is
	 * obtained directly from the internal set of normalized words.
	 * 
	 * @return iterator
	 */
	public Iterator<String> getIterator() {
		return words.iterator();
	}
	
	/**
	 * Returns the maximum number of tokens of a word in the dictionary.
	 * 
	 * @return maximum number of tokens
	 */
	public int getMaxTokens() {
		return maxTokens;
	}
	
	/**
	 * Normalizes a word and stores it together with its tokens. Shared by
	 * the file constructor and {@link #add(String)}; private so that the
	 * constructor does not call an overridable method.
	 * 
	 * @param word the word to store, must not be <code>null</code>
	 */
	private void addEntry(String word) {
		String normalized = normalizeWord(word);
		
		// add whole word
		if (normalized.length() > 0) {
			words.add(normalized);
		}
		
		// add tokens of word
		String[] parts = normalized.split(" ");
		if (parts.length > maxTokens) {
			maxTokens = parts.length;
		}
		for (String part : parts) {
			if (part.length() > 0) {
				tokens.add(part);
			}
		}
	}
	
	/**
	 * Trims a word, converts it to lower case, tokenizes it and stems all of
	 * its tokens.
	 * 
	 * @param word the word to normalize, must not be <code>null</code>
	 * @return the normalized word
	 */
	private static String normalizeWord(String word) {
		String tokenized = NETagger.tokenizeWithSpaces(word.trim().toLowerCase());
		return SnowballStemmer.stemAllTokens(tokenized);
	}
	
	/**
	 * Trims a single token, converts it to lower case and stems it.
	 * 
	 * @param token the token to normalize, must not be <code>null</code>
	 * @return the normalized token
	 */
	private static String normalizeToken(String token) {
		return SnowballStemmer.stem(token.trim().toLowerCase());
	}
	
	/**
	 * Checks whether a set contains an entry within the given Levenshtein
	 * distance of an already normalized key. An exact match is tried first,
	 * so the linear scan only runs if it fails and <code>maxDistance</code>
	 * is not 0.
	 * 
	 * @param entries the normalized entries to search
	 * @param key the normalized word or token to look up
	 * @param maxDistance the maximum Levenshtein edit distance
	 * @return <code>true</code> iff a sufficiently similar entry exists
	 */
	private static boolean fuzzyLookup(HashSet<String> entries, String key, int maxDistance) {
		if (entries.contains(key)) {
			return true;
		}
		if (maxDistance == 0) {
			return false;
		}
		
		for (String entry : entries) {
			if (getLevenshteinDistance(key, entry, maxDistance, true, 1, 1) <= maxDistance) {
				return true;
			}
		}
		
		return false;
	}
}