package info.ephyra.nlp.indices;
import info.ephyra.io.MsgPrinter;
import info.ephyra.nlp.NETagger;
import info.ephyra.util.FileUtils;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.Arrays;
import java.util.Comparator;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.Locale;
import java.util.Map;
/**
 * <p>Counts the frequencies of words in an arbitrary text corpus and represents
 * them in a dictionary.</p>
 *
 * <p>Internally, a hash table is used to store the index, which allows access
 * to the index in constant time.</p>
 *
 * <p>All state is held in static fields, so there is a single shared index.
 * The methods of this class are not synchronized as a group; callers that
 * build and query the index from several threads must coordinate
 * themselves.</p>
 *
 * <p>Files are read and written with the platform default character
 * encoding.</p>
 *
 * @author Nico Schlaefer
 * @version 2008-01-23
 */
public class WordFrequencies {
    /** Maximum number of words to be parsed (0 = no limit). */
    private static final int MAX_WORDS = 0;
    /** Whether words are converted to lower case. */
    private static final boolean LOWER_CASE = true;
    /** Minimum frequency of a word to remain in the index. */
    private static final int MIN_FREQUENCY = 2;
    /** Whether words are saved in the order of their frequencies. */
    private static final boolean SORT_BY_FREQUENCY = true;
    /** Initial capacity of a freshly created index. */
    private static final int INITIAL_CAPACITY = 10000;

    /** Total number of words that have been parsed. */
    private static int total;
    /** Number of distinct words in the index. */
    private static int distinct;
    /** <code>Hashtable</code> used to store (word, frequency) pairs. */
    private static Hashtable<String, Integer> index;

    /**
     * Discards any existing index and resets the counters.
     */
    private static void resetIndex() {
        total = 0;
        distinct = 0;
        index = new Hashtable<String, Integer>(INITIAL_CAPACITY);
    }

    /**
     * Creates an empty index if none exists yet, so that the update methods
     * can be called without a preceding create/load call.
     */
    private static void ensureIndex() {
        if (index == null) {
            resetIndex();
        }
    }

    /**
     * Converts a word into the form that is used as a key in the index.
     * The conversion is locale-independent so that the keys do not depend on
     * the default locale of the JVM (e.g. the Turkish dotless i).
     *
     * @param word word to normalize, must not be <code>null</code>
     * @return key for the word
     */
    private static String normalize(String word) {
        return LOWER_CASE ? word.toLowerCase(Locale.ROOT) : word;
    }

    /**
     * Returns the frequency stored for the given key without normalizing it.
     *
     * @param table index to read from
     * @param key exact key in the index
     * @return stored frequency or 0, if the key is not in the index
     */
    private static int storedFrequency(Hashtable<String, Integer> table,
            String key) {
        Integer frequency = table.get(key);
        return (frequency == null) ? 0 : frequency.intValue();
    }

    /**
     * Closes a reader, ignoring failures. Only used for input, where a failed
     * close cannot lose data.
     *
     * @param reader reader to close, may be <code>null</code>
     */
    private static void closeQuietly(BufferedReader reader) {
        if (reader == null) {
            return;
        }
        try {
            reader.close();
        } catch (IOException ignored) {
            // nothing sensible can be done about a failed close on input
        }
    }

    /**
     * Creates an index of word frequencies from an arbitrary text file.
     * Any previously built or loaded index is discarded.
     *
     * @param filename name of the text file to parse
     * @return true, iff the index was created successfully
     */
    public static boolean createIndexFromFile(String filename) {
        resetIndex();
        return updateIndexFromFile(filename);
    }

    /**
     * Updates the index with the words in an arbitrary text file. If no index
     * exists yet, an empty one is created first.
     *
     * @param filename name of the text file to parse
     * @return true, iff the index was updated successfully
     */
    public static boolean updateIndexFromFile(String filename) {
        MsgPrinter.printStatusMsg(filename);
        ensureIndex();

        BufferedReader in = null;
        try {
            in = new BufferedReader(new FileReader(new File(filename)));
            String line;
            // readLine() returns null at the end of the file
            while ((line = in.readLine()) != null) {
                for (String word : NETagger.tokenize(line)) {
                    // maximum number of words reached?
                    if (MAX_WORDS > 0 && total >= MAX_WORDS) {
                        return true;
                    }

                    String key = normalize(word);
                    Integer frequency = index.get(key);
                    if (frequency == null) {
                        distinct++;  // first occurrence of this word
                        index.put(key, Integer.valueOf(1));
                    } else {
                        index.put(key,
                                Integer.valueOf(frequency.intValue() + 1));
                    }
                    total++;
                }
            }
            return true;
        } catch (IOException e) {
            return false;
        } finally {
            // also reached on the early return above
            closeQuietly(in);
        }
    }

    /**
     * Creates an index of word frequencies from a folder containing text
     * files. Any previously built or loaded index is discarded.
     *
     * @param dirname name of the folder to parse
     * @return true, iff the index was created successfully
     */
    public static boolean createIndexFromDir(String dirname) {
        resetIndex();
        return updateIndexFromDir(dirname);
    }

    /**
     * Updates the index by adding the words contained in the files in the
     * given folder. Processing stops at the first file that cannot be read;
     * words from the files parsed up to that point remain in the index.
     *
     * @param dir name of the folder to parse
     * @return true, iff the index was updated successfully
     */
    public static boolean updateIndexFromDir(String dir) {
        File[] files = FileUtils.getFiles(dir);
        // NOTE(review): the contract of FileUtils.getFiles() is not visible
        // here; guard against null in case it signals a bad folder that way.
        if (files == null) {
            return false;
        }

        for (File file : files) {
            // maximum number of words reached?
            if (MAX_WORDS > 0 && total >= MAX_WORDS) {
                return true;
            }
            if (!updateIndexFromFile(file.getPath())) {
                return false;
            }
        }
        return true;
    }

    /**
     * Drops words that occur fewer than <code>MIN_FREQUENCY</code> times from
     * the index and updates the number of distinct words. The total number of
     * parsed words is left unchanged. Does nothing if no index is loaded.
     */
    public static void dropRareWords() {
        if (index == null) {
            return;
        }

        Iterator<Map.Entry<String, Integer>> entries =
                index.entrySet().iterator();
        while (entries.hasNext()) {
            if (entries.next().getValue().intValue() < MIN_FREQUENCY) {
                entries.remove();
            }
        }
        distinct = index.size();
    }

    /**
     * Sorts the words in the index by their frequencies in descending order.
     *
     * @return words sorted by their frequencies, an empty array if no index
     *         is loaded
     */
    public static String[] getSortedWords() {
        final Hashtable<String, Integer> table = index;
        if (table == null) {
            return new String[0];
        }

        String[] sorted = table.keySet().toArray(new String[table.size()]);
        Arrays.sort(sorted, new Comparator<String>() {
            public int compare(String s1, String s2) {
                // compare the stored values directly; the keys are already
                // in their normalized form
                int f1 = storedFrequency(table, s1);
                int f2 = storedFrequency(table, s2);
                return (f1 > f2) ? -1 : ((f1 == f2) ? 0 : 1);
            }
        });
        return sorted;
    }

    /**
     * Saves index of word frequencies to an output file. The file starts with
     * the total number of words and the number of distinct words, followed by
     * one line with the word and one line with its frequency for each entry.
     *
     * @param filename name of the output file to write to
     * @return true, iff the index was saved successfully
     */
    public static boolean saveIndex(String filename) {
        Hashtable<String, Integer> table = index;
        if (table == null) {
            return false;  // no index loaded
        }

        PrintWriter out = null;
        try {
            out = new PrintWriter(new FileOutputStream(new File(filename)));

            out.println(total);  // write total number of words
            out.println(distinct);  // write number of distinct words

            // write (word, frequency) pairs ...
            if (SORT_BY_FREQUENCY) {
                // ... in the order of their frequencies
                for (String word : getSortedWords()) {
                    out.println(word);
                    out.println(storedFrequency(table, word));
                }
            } else {
                // ... in an arbitrary order
                for (Map.Entry<String, Integer> entry : table.entrySet()) {
                    out.println(entry.getKey());
                    out.println(entry.getValue().intValue());
                }
            }

            // PrintWriter never throws on write errors; it only records
            // them, so the error state has to be queried explicitly
            out.close();
            return !out.checkError();
        } catch (IOException e) {
            return false;
        } finally {
            if (out != null) {
                out.close();
            }
        }
    }

    /**
     * Loads an index of word frequencies from an input file in the format
     * written by {@link #saveIndex(String)}. The current index is only
     * replaced if the file was read completely; on failure it is left
     * untouched.
     *
     * @param filename name of the input file containing the index
     * @return true, iff the index was loaded successfully
     */
    public static boolean loadIndex(String filename) {
        BufferedReader in = null;
        try {
            in = new BufferedReader(new FileReader(new File(filename)));

            // parseInt() rejects null, so an empty file ends up in the
            // NumberFormatException handler below
            int loadedTotal = Integer.parseInt(in.readLine());
            int declared = Integer.parseInt(in.readLine());
            if (loadedTotal < 0 || declared < 0) {
                return false;
            }

            // twice as many buckets as entries, without int overflow
            int capacity = (int) Math.min(2L * declared, Integer.MAX_VALUE);
            Hashtable<String, Integer> table =
                    new Hashtable<String, Integer>(capacity);

            // read (word, frequency) pairs
            for (int i = 0; i < declared; i++) {
                String word = in.readLine();
                String frequency = in.readLine();
                if (word == null || frequency == null) {
                    return false;  // file is truncated
                }
                table.put(word,
                        Integer.valueOf(Integer.parseInt(frequency.trim())));
            }

            // everything was read, replace the current index
            total = loadedTotal;
            distinct = table.size();
            index = table;
            return true;
        } catch (IOException e) {
            return false;
        } catch (NumberFormatException e) {
            return false;  // file is not a valid index
        } finally {
            closeQuietly(in);
        }
    }

    /**
     * Returns the total number of words that have been parsed.
     *
     * @return total number of words
     */
    public static int getTotal() {
        return total;
    }

    /**
     * Returns the number of distinct words in the index.
     *
     * @return total number of distinct words
     */
    public static int getDistinct() {
        return distinct;
    }

    /**
     * Looks up a word in the index and returns its frequency. If the word is
     * <code>null</code>, not in the index or no index is loaded, the
     * frequency is 0.
     *
     * @param word word to look up
     * @return frequency of the word
     */
    public static int lookup(String word) {
        Hashtable<String, Integer> table = index;
        if (table == null || word == null) {
            return 0;
        }
        return storedFrequency(table, normalize(word));
    }

    /**
     * Looks up a word in the index and returns its relative frequency, i.e.
     * its frequency divided by the total number of parsed words. If the word
     * is not in the index, the relative frequency is 0.
     *
     * @param word word to look up
     * @return relative frequency of the word
     */
    public static double lookupRel(String word) {
        if (total <= 0) {
            return 0;
        }
        return ((double) lookup(word)) / total;
    }

    /**
     * Entry point. Creates the index from the text files in a given folder,
     * drops rare words and saves the index. Exits with status 1 if the
     * arguments are missing or the index could not be saved.
     *
     * @param args argument 1: folder containing text files
     *             argument 2: output file
     */
    public static void main(String[] args) {
        if (args.length < 2) {
            MsgPrinter.printUsage("java WordFrequencies corpus_folder " +
                    "output_file");
            System.exit(1);
        }

        MsgPrinter.enableStatusMsgs(true);
        MsgPrinter.printStatusMsg("Building index of word frequencies...");

        // a partial index is still saved, but the user is told about it
        if (!createIndexFromDir(args[0])) {
            System.err.println("Warning: could not parse all files in "
                    + args[0] + ", the index may be incomplete.");
        }
        dropRareWords();
        if (!saveIndex(args[1])) {
            System.err.println("Error: could not write index to "
                    + args[1] + ".");
            System.exit(1);
        }

        MsgPrinter.printStatusMsg("...completed.");
    }
}