WordFrequencies.java example

Explorer
lucida-master
- lucida
package info.ephyra.nlp.indices;

import info.ephyra.io.MsgPrinter;
import info.ephyra.nlp.NETagger;
import info.ephyra.util.FileUtils;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.Arrays;
import java.util.Comparator;
import java.util.Hashtable;

/**
 * <p>Counts the frequencies of words in an arbitrary text corpus and represents
 * them in a dictionary.</p>
 * 
 * <p>Internally, a hash table is used to store the index, which allows access
 * to the index in constant time.</p>
 * 
 * @author Nico Schlaefer
 * @version 2008-01-23
 */
public class WordFrequencies {
	/** Maximum number of words to be parsed (0 = no limit). */
	private static final int MAX_WORDS = 0;
	/** Whether words are converted to lower case. */
	private static final boolean LOWER_CASE = true;
	/** Minimum frequency of a word to remain in the index. */
	private static final int MIN_FREQUENCY = 2;
	/** Whether words are saved in the order of their frequencies. */
	private static final boolean SORT_BY_FREQUENCY = true;
	
	/** Total number of words that have been parsed. */
	private static int total;
	/** Number of distinct words in the index. */
	private static int distinct;
	/** <code>Hashtable</code> used to store (word, frequency) pairs. */
	private static Hashtable<String, Integer> index;
	
	/**
	 * Creates an index of word frequencies from an arbitrary text file.
	 * 
	 * @param filename name of the text file to parse
	 * @return true, iff the index was created successfully
	 */
	public static boolean createIndexFromFile(String filename) {
		total = 0;
		distinct = 0;
		index = new Hashtable<String, Integer>(10000);  // initial size 10,000
		
		return updateIndexFromFile(filename);
	}

	/**
	 * Updates the index with the words in an arbitrary text file.
	 * 
	 * @param filename name of the text file to parse
	 * @return true, iff the index was updated successfully
	 */
	public static boolean updateIndexFromFile(String filename) {
		MsgPrinter.printStatusMsg(filename);
		
		File file = new File(filename);
		try {
			BufferedReader in = new BufferedReader(new FileReader(file));
			
			String line;
			String[] words;
			int frequency;
			
			while (in.ready()) {
				line = in.readLine();  // read file line-by-line
				words = NETagger.tokenize(line);
				
				// update index for each word
				for (String word : words) {
					// maximum number of words reached?
					if (MAX_WORDS > 0 && total >= MAX_WORDS) return true;
					
					// convert to lower case
					if (LOWER_CASE) word = word.toLowerCase();
					
					if (index.containsKey(word))
						frequency = index.get(word).intValue();
					else {
						frequency = 0;
						distinct++;  // update number of distinct words
					}
					
					index.put(word, new Integer(++frequency));
					total++;  // update total number of words
				}
			}
			
			in.close();
		} catch (IOException e) {
			return false;
		}
		
		return true;
	}
	
	/**
	 * Creates an index of word frequencies from a folder containing text files.
	 * 
	 * @param dirname name of the folder to parse
	 * @return true, iff the index was created successfully
	 */
	public static boolean createIndexFromDir(String dirname) {
		total = 0;
		distinct = 0;
		index = new Hashtable<String, Integer>(10000);  // initial size 10,000
		
		return updateIndexFromDir(dirname);
	}
	
	/**
	 * Updates the index by adding the words contained in the files in the given
	 * folder.
	 * 
	 * @param dir name of the folder to parse
	 * @return true, iff the index was updated successfully
	 */
	public static boolean updateIndexFromDir(String dir) {
		File[] files = FileUtils.getFiles(dir);
		
		// update index for each file
		for (File file : files) {
			// maximum number of words reached?
			if (MAX_WORDS > 0 && total >= MAX_WORDS) return true;
			
			if (!updateIndexFromFile(file.getPath())) return false;
		}
		
		return true;
	}
	
	/**
	 * Drops rare words from the index.
	 */
	public static void dropRareWords() {
		Hashtable<String, Integer> newIndex = new Hashtable<String, Integer>();
		
		for (String word : index.keySet()) {
			int frequency = index.get(word);
			
			if (frequency < MIN_FREQUENCY) distinct--;
			else newIndex.put(word, frequency);
		}
		
		index = newIndex;
	}
	
	/**
	 * Sorts the words in the index by their frequencies in descending order.
	 * 
	 * @return words sorted by their frequencies
	 */
	public static String[] getSortedWords() {
		String[] sorted = index.keySet().toArray(new String[index.size()]);
		
		Arrays.sort(sorted, new Comparator<String>() {
				public int compare(String s1, String s2) {
					return lookup(s2) - lookup(s1);
				}
			});
		
		return sorted;
	}
	
	/**
	 * Saves index of word frequencies to an ouput file.
	 * 
	 * @param filename name of the output file to write to
	 * @return true, iff the index was saved successfully
	 */
	public static boolean saveIndex(String filename) {
		if (index == null) return false;  // no index loaded
		
		File file = new File(filename);
		
		try {
			PrintWriter out = new PrintWriter(new FileOutputStream(file));
			
			out.println(total);  // write total number of words
			out.println(distinct);  // write number of distinct words
			
			// write (word, frequency) pairs ...
			if (SORT_BY_FREQUENCY) {
				// ... in the order of their frequencies
				String[] sorted = getSortedWords();
				for (String word : sorted) {
					out.println(word);
					out.println(lookup(word));
				}
			} else {
				// ... in an arbitrary order
				for (String word : index.keySet()) {
					out.println(word);
					out.println(lookup(word));
				}
			}
			
			out.close();
		} catch (IOException e) {
			return false;
		}
		
		return true;
	}
	
	/**
	 * Loads an index of word frequencies from an input file.
	 * 
	 * @param filename name of the input file containing the index
	 * @return true, iff the index was loaded successfully
	 */
	public static boolean loadIndex(String filename) {
		File file = new File(filename);
		
		try {
			BufferedReader in = new BufferedReader(new FileReader(file));
			
			// read total number of words
			total = Integer.parseInt(in.readLine());
			// read number of distinct words
			distinct = Integer.parseInt(in.readLine());
			
			// create hash table that will have load factor of 0.5
			index = new Hashtable<String, Integer>(2 * distinct);
			
			String word;
			int frequency;
			
			// read (word, frequency) pairs
			for (int i = 0; i < distinct; i++) {
				word = in.readLine();
				frequency = Integer.parseInt(in.readLine());
				
				index.put(word, new Integer(frequency));
			}
			
			in.close();
		} catch (IOException e) {
			return false;
		}
		
		return true;
	}
	
	/**
	 * Returns the total number of words that have been parsed.
	 * 
	 * @return total number of words
	 */
	public static int getTotal() {
		return total;
	}
	
	/**
	 * Returns the number of distinct words in the index.
	 * 
	 * @return total number of distinct words
	 */
	public static int getDistinct() {
		return distinct;
	}
	
	/**
	 * Looks up a word in the index and returns its frequency. If the word is
	 * not in the index, the frequency is 0.
	 * 
	 * @param word word to look up
	 * @return frequency of the word
	 */
	public static int lookup(String word) {
		if (index == null) return 0;  // no index loaded
		
		// convert to lower case;
		if (LOWER_CASE) word = word.toLowerCase();
		
		if (index.containsKey(word))
			return (index.get(word)).intValue();
		else
			return 0;
	}
	
	/**
	 * Looks up a word in the index and returns its relative frequency. If the
	 * word is not in the index, the relative frequency is 0.
	 * 
	 * @param word word to look up
	 * @return relative frequency of the word
	 */
	public static double lookupRel(String word) {
		double frequency = lookup(word);
		
		if (total > 0) return frequency / total; else return 0; 
	}
	
	/**
	 * Entry point. Creates the index from the text files in a given folder,
	 * drops rare words and saves the index.
	 * 
	 * @param args argument 1: folder containing text files
	 * 			   argument 2: output file
	 */
	public static void main(String[] args) {
		if (args.length < 2) {
			MsgPrinter.printUsage("java WordFrequencies corpus_folder " +
								  "output_file");
			System.exit(1);
		}
		
		MsgPrinter.enableStatusMsgs(true);
		MsgPrinter.printStatusMsg("Building index of word frequencies...");
		
		createIndexFromDir(args[0]);
		dropRareWords();
		saveIndex(args[1]);
		
		MsgPrinter.printStatusMsg("...completed.");
	}
}