package org.apache.lucene.spelt;
/**
* Copyright 2006-2007 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Acknowledgements:
*
* A significant amount of new and/or modified code in this module
* was made possible by a grant from the Andrew W. Mellon Foundation,
* as part of the Melvyl Recommender Project.
*/
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.Writer;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Set;
import java.util.regex.Pattern;
import org.apache.lucene.util.CountedInputStream;
import org.apache.lucene.util.CountedOutputStream;
import org.apache.lucene.util.FileSorter;
import org.apache.lucene.util.IntList;
import org.apache.lucene.util.ProgressTracker;
/**
* <p>
* Writes spelling dictionaries, which can later be used by {@link SpellReader}
* to obtain spelling suggestions. Provides efficient, high-volume updates
* to a spelling correction dictionary. Typical steps for creating a dictionary:
* </p>
* <ol>
* <li>First, {@linkplain #open(File) open} a new writer.</li>
* <li>Repeatedly {@linkplain #queueWord(String) queue} words to be added
* to the dictionary. This writes the words and pairs to a simple
* disk file.
* </li>
* <li>Optionally {@linkplain #flushQueuedWords() flush} the queued words,
* processing them into a final dictionary.
* </li>
* <li>Finally, {@linkplain #close() close} the writer.</li>
* </ol>
* <p>
* Inspired by and very distantly based on Nicolas Maisonneuve / David Spencer
* code.
* </p>
*
* @author Martin Haye
*/
public class SpellWriter
{
/** Directory to store the spelling dictionary in */
private File spellIndexDir;
/** Set of stop words in use; default is null for no stop set */
private Set stopSet = null;
/** File to queue words into ("newWords.txt" in the dictionary directory) */
private File wordQueueFile;
/** The previous word queued, or null if none (or a break was queued) */
private String prevWord;
/** File to queue word pairs into ("newPairs.txt" in the dictionary directory) */
private File pairQueueFile;
/** File containing compiled word frequencies ("words.dat") */
private File freqFile;
/** File containing frequency sample data ("freqSamples.dat") */
private File sampleFile;
/** File containing edit map data ("edmap.dat") */
private File edmapFile;
/** File containing compiled pair frequency data ("pairs.dat") */
private File pairFreqFile;
/** For writing to the word queue; null while the queue file is not open */
private PrintWriter wordQueueWriter = null;
/** For writing to the pair queue; null while the queue file is not open */
private PrintWriter pairQueueWriter = null;
/** Number of distinct recently added words to cache before flushing them to the word queue */
private static final int MAX_RECENT_WORDS = 20000;
/** For counting word frequencies prior to write (word -> count) */
private HashMap<String, Integer> recentWords = new HashMap<String, Integer>(
MAX_RECENT_WORDS);
/** Max # of distinct pairs to hash before flushing them to the pair queue */
private static final int MAX_RECENT_PAIRS = 200000;
/** For counting pair frequencies prior to write ("word1|word2" -> count) */
private HashMap<String, Integer> recentPairs = new HashMap<String, Integer>(
MAX_RECENT_PAIRS);
/** Default minimum word frequency = 2 */
private static final int DEFAULT_MIN_WORD_FREQ = 2;
/** Minimum frequency for words to retain */
private int minWordFreq = DEFAULT_MIN_WORD_FREQ;
/** Default minimum pair frequency = 2 */
private static final int DEFAULT_MIN_PAIR_FREQ = 2;
/** Minimum frequency for pairs to retain */
private int minPairFreq = DEFAULT_MIN_PAIR_FREQ;
/** Used for calculating double metaphone keys; one instance shared by the whole class */
private static DoubleMetaphone doubleMetaphone = new DoubleMetaphone();
/** Used for splitting lines delimited with bar */
Pattern splitPat = Pattern.compile("\\|");
/**
 * Memory limit handed to each FileSorter. The value is 100 * 1024 * 1024
 * (presumably bytes, i.e. 100 MB -- confirm against FileSorter); the word
 * flush phase starts two sorters with this limit.
 */
private int SORT_MEM_LIMIT = 100 * 1024 * 1024; // 100 * 2^20 per sorter (2 sorters)
/** Scratch character array for forming combo keys; overwritten by every comboKey() call */
private char[] keyChars = new char[4];
/** Scratch string buffer for edmap lines; reset at the start of every addCombo() call */
private StringBuffer edmapBuf = new StringBuffer();
/**
 * Private constructor -- do not construct directly; rather, use the
 * static {@link #open(File)} method, which creates the writer and then
 * establishes the dictionary directory for it.
 */
private SpellWriter() {
}
/**
 * Factory method: builds a SpellWriter bound to the given dictionary
 * directory (the directory is created if it does not exist yet).
 *
 * Stop words are not filtered unless {@link #setStopwords(Set)} is called
 * on the returned writer; doing so is especially worthwhile when the
 * dictionary will be large.
 *
 * Words are retained only when they reach a minimum frequency, which
 * defaults to 2 and can be changed with {@link #setMinWordFreq(int)}.
 *
 * A similar threshold exists for pairs; it also defaults to 2 and can be
 * changed with {@link #setMinPairFreq(int)}.
 *
 * @param spellIndexDir Directory in which to store the spelling dictionary
 * @return a writer ready to have words queued into it
 * @throws IOException if the dictionary directory cannot be created
 */
public static SpellWriter open(File spellIndexDir)
  throws IOException
{
  final SpellWriter opened = new SpellWriter();
  opened.openInternal(spellIndexDir);
  return opened;
}
/**
 * Establishes the directory to store the dictionary in, and works out the
 * paths of all the queue and data files kept inside it. The directory
 * (including any missing parent directories) is created if necessary.
 *
 * @param spellIndexDir Directory in which to store the spelling dictionary
 * @throws IOException if the directory does not exist and cannot be created
 */
private void openInternal(File spellIndexDir)
  throws IOException
{
  this.spellIndexDir = spellIndexDir;

  // Figure out the files we're going to store stuff in
  wordQueueFile = new File(spellIndexDir, "newWords.txt");
  pairQueueFile = new File(spellIndexDir, "newPairs.txt");
  freqFile = new File(spellIndexDir, "words.dat");
  sampleFile = new File(spellIndexDir, "freqSamples.dat");
  edmapFile = new File(spellIndexDir, "edmap.dat");
  pairFreqFile = new File(spellIndexDir, "pairs.dat");

  // If the index directory doesn't exist, make it (parents included).
  if (!spellIndexDir.isDirectory() && !spellIndexDir.mkdirs())
  {
    // mkdirs() also returns false when somebody else created the directory
    // between our check and our attempt, so only fail if it's still missing.
    if (!spellIndexDir.isDirectory())
      throw new IOException("Error creating spelling index directory");
  }
}
/**
 * Installs the set of stop words (e.g. "the", "and", "a", etc.). Words
 * found in this set are ignored when queued, which can significantly
 * decrease the size of the dictionary. Lookups are made with the
 * lower-cased form of each queued word.
 *
 * @param set the set of stop words to use, or null for none
 */
public void setStopwords(Set set) {
  stopSet = set;
}
/**
 * Establish a minimum word frequency. The in-memory cache (flushed every
 * 20,000 distinct words or so) always writes every word and its count to
 * the disk queue; the threshold is applied later, when queued words are
 * compiled into the dictionary: a word whose accumulated frequency is
 * below the threshold is left out, and those at or above it are kept.
 *
 * @param freq the new minimum word frequency
 */
public void setMinWordFreq(int freq) {
  minWordFreq = freq;
}
/**
 * Establish a minimum pair frequency. Each time the in-memory pair cache
 * is flushed to disk (every 200,000 distinct pairs or so), pairs counted
 * fewer times than this threshold are discarded, and pairs at or above
 * the threshold are written to the disk queue.
 *
 * @param freq the new minimum pair frequency
 */
public void setMinPairFreq(int freq) {
  minPairFreq = freq;
}
/**
* Writes any word and pair counts still cached in memory to the on-disk
* queue files, then closes those files. Does NOT compile the queued words
* into the dictionary (they stay queued on disk until
* {@link #flushQueuedWords()} is called).
*
* @throws IOException if the queue files cannot be opened for writing
*/
public synchronized void close()
throws IOException
{
closeQueueWriters();
}
/**
 * Delete all words in the dictionary (including those queued on disk).
 * The queue writers are closed first, then every queue and data file is
 * removed and all in-memory state is reset.
 *
 * @throws IOException if closing the queue writers fails
 */
public synchronized void clearDictionary()
  throws IOException
{
  close();

  // Remove queued data and compiled data alike.
  wordQueueFile.delete();
  pairQueueFile.delete();
  freqFile.delete();
  sampleFile.delete();
  edmapFile.delete();
  pairFreqFile.delete();

  // Forget everything counted so far.
  recentWords.clear();
  recentPairs.clear();

  // Also forget the last word seen, so the first word queued after the
  // clear isn't paired with a word from the discarded dictionary.
  prevWord = null;
}
/**
 * Queue the given word: counts it in the in-memory word cache and, when a
 * previous word is pending, counts the (previous, current) pair as well.
 * Either cache is written to its disk queue once it fills up. The queue
 * can later be compiled by calling flushQueuedWords(); this is typically
 * put off until the end of an indexing run.
 *
 * @param word the word to queue; it is lower-cased before use
 * @throws IOException if a full cache cannot be written to its queue file
 */
public synchronized void queueWord(String word)
  throws IOException
{
  // Everything is stored lower-case. That makes stop word removal simple,
  // and lets case be copied when the dictionary is read.
  word = word.toLowerCase();

  // Stop words are dropped outright, leaving prevWord untouched so the
  // words on either side of a stop word can still form a pair.
  if (stopSet != null && stopSet.contains(word))
    return;

  // Count the pair formed with the previous word, if there is one.
  if (prevWord != null)
  {
    String pairKey = prevWord + "|" + word;
    Integer pairSeen = recentPairs.get(pairKey);
    int pairCount = (pairSeen == null) ? 1 : pairSeen.intValue() + 1;
    recentPairs.put(pairKey, Integer.valueOf(pairCount));

    // Spill the pair cache to disk when it fills up.
    if (recentPairs.size() >= MAX_RECENT_PAIRS)
      flushRecentPairs();
  }

  // The next word queued will pair with this one.
  prevWord = word;

  // Count the word itself.
  Integer wordSeen = recentWords.get(word);
  int wordCount = (wordSeen == null) ? 1 : wordSeen.intValue() + 1;
  recentWords.put(word, Integer.valueOf(wordCount));

  // Spill the word cache to disk when it fills up.
  if (recentWords.size() >= MAX_RECENT_WORDS)
    flushRecentWords();
} // queueWord()
/**
 * Called to signal a break in the text, to inform the spell checker to avoid
 * pairing the previous word with the next one. This should be called at the
 * start or end of a section or field, and at the start or end of each
 * sentence.
 *
 * Synchronized because it writes the same previous-word state that
 * {@link #queueWord(String)} reads and writes while holding the lock.
 */
public synchronized void queueBreak()
{
  // Suppress pairing until another word comes in
  prevWord = null;
}
/**
 * Writes the accumulated pairs and their counts to the pair queue file,
 * in sorted key order, then empties the in-memory pair cache. For
 * efficiency, pairs counted fewer than the minimum pair frequency are
 * skipped. Does nothing when the cache is empty.
 *
 * @throws IOException if the pair queue file cannot be opened
 */
private void flushRecentPairs()
  throws IOException
{
  if (recentPairs.isEmpty())
    return;

  openPairQueueWriter();

  // Emit one "word1|word2|count" line per retained pair, sorted by key.
  ArrayList<String> sortedKeys = new ArrayList<String>(recentPairs.keySet());
  Collections.sort(sortedKeys);
  for (String pairKey : sortedKeys)
  {
    int occurrences = recentPairs.get(pairKey).intValue();
    if (occurrences < minPairFreq)
      continue;
    pairQueueWriter.println(pairKey + "|" + occurrences);
  }

  pairQueueWriter.flush();
  recentPairs.clear();
}
/**
 * Writes every accumulated word and its count to the word queue file, in
 * sorted order, then empties the in-memory word cache. No frequency
 * threshold is applied here. Does nothing when the cache is empty.
 *
 * @throws IOException if the word queue file cannot be opened
 */
private void flushRecentWords()
  throws IOException
{
  if (recentWords.isEmpty())
    return;

  openWordQueueWriter();

  // Emit one "word|count" line per cached word, sorted by word.
  ArrayList<String> sortedWords = new ArrayList<String>(recentWords.keySet());
  Collections.sort(sortedWords);
  for (String cachedWord : sortedWords)
  {
    int occurrences = recentWords.get(cachedWord).intValue();
    wordQueueWriter.println(cachedWord + "|" + occurrences);
  }

  wordQueueWriter.flush();
  recentWords.clear();
}
/**
 * Check if any words are queued for add. Cached words are first written
 * out and the queue writers closed, so that the size of the word queue
 * file on disk is up to date; the answer is true when that file is more
 * than one byte long.
 *
 * @return true if the word queue file holds queued words
 * @throws IOException if the queue files cannot be written
 */
public synchronized boolean anyWordsQueued()
  throws IOException
{
  closeQueueWriters();
  return wordQueueFile.length() > 1;
}
/**
* Ensures that all words in the queue are written to the dictionary on disk.
* Note that this can take quite some time; if you want to print out progress
* messages during the process, use {@link #flushQueuedWords(ProgressTracker)}
* below. This variant passes a null tracker, so no progress is reported.
*
* @throws IOException if reading the queues or writing the dictionary fails
*/
public synchronized void flushQueuedWords()
throws IOException
{
flushQueuedWords(null);
}
/**
 * Ensures that all words in the queue are written to the dictionary on disk.
 * Runs two phases in order: word frequencies (plus frequency samples and
 * edit map), then pair frequencies.
 *
 * @param prog  A tracker that will be called periodically during the
 *              process; generally you'll want to supply one that
 *              prints out progress messages.
 *              If null, no progress will be reported.
 * @throws IOException if reading the queues or writing the dictionary fails
 */
public synchronized void flushQueuedWords(ProgressTracker prog)
  throws IOException
{
  // Get everything cached in memory onto disk first.
  closeQueueWriters();

  // Substitute a do-nothing tracker when the caller doesn't want messages.
  if (prog == null)
  {
    prog = new ProgressTracker() {
      @Override public void report(int pctDone, String descrip) { }
    };
  }

  // Roughly estimate the work in each phase from file sizes, so progress
  // is reported in a rational way.
  long wordWork = (freqFile.length() + wordQueueFile.length()) * 10;
  long pairWork = pairQueueFile.length();
  ProgressTracker[] phaseProgs = prog.split(wordWork, pairWork);

  // Phase 1 accumulates word frequencies; phase 2 accumulates pairs into
  // the pair data file.
  flushPhase1(phaseProgs[0]);
  flushPhase2(phaseProgs[1]);

  // All done.
  prog.progress(100, 100, "Done.", true);
} // flushQueuedWords()
/**
 * Performs the word-adding phase of the flush procedure: merges the
 * existing word frequencies with the queued ones, then writes fresh
 * frequency, frequency sample and edit map files, swaps them in for the
 * old ones, and finally removes the word queue. Skipped entirely when the
 * word queue file is not readable.
 *
 * @param prog tracker to report this phase's progress to
 * @throws IOException if something goes wrong
 */
private void flushPhase1(ProgressTracker prog)
  throws IOException
{
  // Nothing queued? Then there's nothing to do.
  if (!wordQueueFile.canRead())
    return;

  // New versions are written beside the current files, and swapped in last.
  File freshFreqFile = new File(spellIndexDir, "words.dat.new");
  File freshSampleFile = new File(spellIndexDir, "freqSamples.dat.new");
  File freshEdmapFile = new File(spellIndexDir, "edmap.dat.new");

  // Sub-phases: reading word lists, writing frequencies (which also feeds
  // the edit map sorter), writing frequency samples, writing the edit map.
  ProgressTracker[] stepProgs = prog.split(5, 30, 5, 60);
  ProgressTracker[] readProgs = stepProgs[0].split(freqFile.length(),
                                                   wordQueueFile.length());

  // Feed the existing frequency list (if any), then the new frequencies,
  // into one sorter.
  FileSorter wordSorter = FileSorter.start(spellIndexDir, SORT_MEM_LIMIT);
  readFreqs(freqFile, wordSorter, readProgs[0]);
  readFreqs(wordQueueFile, wordSorter, readProgs[1]);

  // Write the accumulated frequencies (culling entries with low frequency
  // along the way) while collecting edit map entries.
  FileSorter comboSorter = FileSorter.start(spellIndexDir, SORT_MEM_LIMIT);
  IntList keptFreqs = new IntList(10000);
  writeFreqs(freshFreqFile, wordSorter, keptFreqs, comboSorter, stepProgs[1]);

  // Frequency samples, for statistical purposes.
  writeFreqSamples(keptFreqs, freshSampleFile, stepProgs[2]);

  // The new edit map.
  writeEdMap(comboSorter, freshEdmapFile, stepProgs[3]);

  // Swap the new data files in and clear the queue.
  replaceFile(freqFile, freshFreqFile);
  replaceFile(sampleFile, freshSampleFile);
  replaceFile(edmapFile, freshEdmapFile);
  deleteFile(wordQueueFile);
}
/**
 * Read an existing frequency file, and add each of its lines to a file
 * sorter. Lines are expected to hold a word and a count separated by "|",
 * but are passed to the sorter unparsed. The file is read as UTF-8. Does
 * nothing if the file cannot be read.
 *
 * @param inFile file to read
 * @param out    sorter that receives every line
 * @param prog   tracker to report reading progress to
 * @throws IOException if reading the file or adding to the sorter fails
 */
private void readFreqs(File inFile, FileSorter out, ProgressTracker prog)
  throws IOException
{
  // Skip if we can't open the file.
  if (!inFile.canRead())
    return;

  // Count the bytes consumed so progress can be measured against file size.
  CountedInputStream countedIn = new CountedInputStream(
    new FileInputStream(inFile));
  BufferedReader freqReader = new BufferedReader(
    new InputStreamReader(countedIn, "UTF-8"));
  try
  {
    int lineCt = 0;
    String line;
    while ((line = freqReader.readLine()) != null)
    {
      out.addLine(line);

      // Report progress every once in a while (every 4096 lines).
      if ((lineCt++ & 0xfff) == 0)
        prog.progress(countedIn.nRead(), inFile.length(), "Reading word files.");
    }
  }
  finally {
    // Release the file handle even if the sorter or tracker throws.
    freqReader.close();
  }
}
/**
 * Write out frequency data, in sorted order. Finishes the frequency sorter
 * and sums the counts of consecutive lines that share a word. Every word
 * whose total reaches the minimum word frequency is:
 * <ul>
 * <li>written to the output file as a "word|count" line,</li>
 * <li>recorded in the list of all frequencies, and</li>
 * <li>expanded into edit map combinations added to the edit map sorter.</li>
 * </ul>
 * Words below the threshold are dropped.
 *
 * @param outFile     file to write the merged frequencies to (as UTF-8)
 * @param freqSorter  sorter holding all "word|count" lines
 * @param allFreqs    receives the frequency of each word written
 * @param edmapSorter receives edit map entries for each word written
 * @param prog        tracker to report progress to
 * @throws IOException if sorting or writing fails
 */
private void writeFreqs(final File outFile, final FileSorter freqSorter,
                        final IntList allFreqs, final FileSorter edmapSorter,
                        final ProgressTracker prog)
  throws IOException
{
  // Encode explicitly as UTF-8: this file becomes the frequency file that
  // readFreqs() later decodes as UTF-8, so the platform default charset
  // must not be used here.
  final BufferedWriter out = new BufferedWriter(
    new OutputStreamWriter(new FileOutputStream(outFile), "UTF-8"));
  try
  {
    freqSorter.finish(new FileSorter.Output()
    {
      String curWord = null;
      int curFreq = 0;
      int nProcessed = 0;

      // For each token/frequency pair...
      public void writeLine(String line)
        throws IOException
      {
        String[] tokens = splitPat.split(line);
        if (tokens.length == 2)
        {
          // If this is a new word, flush the old one.
          if (!tokens[0].equals(curWord)) {
            if (curWord != null)
              flushWord();
            curWord = tokens[0];
          }

          // Accumulate the frequency (a line with an invalid count simply
          // contributes nothing).
          try {
            curFreq += Integer.parseInt(tokens[1]);
          }
          catch (NumberFormatException e) {
            // deliberately skipped
          }

          // Report progress every once in a while.
          if ((nProcessed++ & 0xfff) == 0 && nProcessed > 1)
            prog.progress(nProcessed,
                          freqSorter.nLinesAdded(),
                          "Processed " + nProcessed + " words.");
        }
      }

      private void flushWord()
        throws IOException
      {
        // Only words at or above the threshold make it into the dictionary.
        if (curFreq >= minWordFreq)
        {
          // Add the frequency to our list of all (for statistics later)
          allFreqs.add(curFreq);

          // Write a line to the final frequency file
          out.append(curWord);
          out.append('|');
          out.append(Integer.toString(curFreq));
          out.append('\n');

          // Add combinations to the edit map.
          addCombos(curWord, edmapSorter);
        }

        // Always start the next word from zero, kept or not; otherwise a
        // dropped word's count would be added to the word that follows it.
        curFreq = 0;
      }

      public void close()
        throws IOException
      {
        // The final word has no successor to trigger its flush; do it here.
        if (curWord != null) {
          flushWord();
          curWord = null;
        }
        out.close();
        prog.progress(nProcessed,
                      freqSorter.nLinesAdded(),
                      "Processed " + nProcessed + " words.",
                      true);
      }
    });
  }
  finally {
    // Closing again is harmless, and covers the case where the sort fails
    // before the output is closed.
    out.close();
  }
}
/**
 * Letter-position quadruples used to build edit map keys: every way of
 * picking four of the first six positions of a word, in the order the
 * keys are generated.
 */
private static final int[][] COMBO_POSITIONS = {
  { 0, 1, 2, 3 }, { 0, 1, 2, 4 }, { 0, 1, 2, 5 }, { 0, 1, 3, 4 },
  { 0, 1, 3, 5 }, { 0, 1, 4, 5 }, { 0, 2, 3, 4 }, { 0, 2, 3, 5 },
  { 0, 2, 4, 5 }, { 0, 3, 4, 5 },
  { 1, 2, 3, 4 }, { 1, 2, 3, 5 }, { 1, 2, 4, 5 }, { 1, 3, 4, 5 },
  { 2, 3, 4, 5 }
};

/**
 * Add combinations of the first six letters of the word, capturing all the
 * possibilities that represent an edit distance of 2 or less. Combinations
 * starting at position 0 are always added; those starting at position 1
 * or 2 are added only when the word is long enough to have a letter there.
 *
 * @param word        word to generate keys for
 * @param edMapSorter sorter that receives one "key|word" line per combination
 * @throws IOException if adding to the sorter fails
 */
private void addCombos(String word, FileSorter edMapSorter)
  throws IOException
{
  final int wordLen = word.length();
  for (int[] pos : COMBO_POSITIONS)
  {
    // Combos whose first position lies past the end of the word are skipped
    // (except the position-0 family, which is emitted unconditionally).
    if (pos[0] == 0 || wordLen > pos[0])
      addCombo(word, edMapSorter, pos[0], pos[1], pos[2], pos[3]);
  }
}
/**
 * Add a combination of letters to the edit map: builds the line
 * "key|word", where the key is formed from the characters at the four
 * given positions of the word, and hands it to the sorter.
 *
 * @param word        word the key is derived from
 * @param edmapSorter sorter that receives the line
 * @param p0          first character position
 * @param p1          second character position
 * @param p2          third character position
 * @param p3          fourth character position
 * @throws IOException if adding to the sorter fails
 */
private void addCombo(String word, FileSorter edmapSorter, int p0,
                      int p1, int p2, int p3)
  throws IOException
{
  // Reuse the shared buffer rather than allocating one per line.
  edmapBuf.setLength(0);
  edmapBuf.append(comboKey(word, p0, p1, p2, p3)).append('|').append(word);
  edmapSorter.addLine(edmapBuf.toString());
}
/**
 * Calculate a key from the given characters of the word. Each of the four
 * key slots holds the normalized character found at the corresponding
 * position, or a space when the word is too short to have one.
 *
 * @return the shared key array, which is overwritten by the next call
 */
private char[] comboKey(String word, int p0, int p1, int p2, int p3) {
  keyChars[0] = comboCharAt(word, p0);
  keyChars[1] = comboCharAt(word, p1);
  keyChars[2] = comboCharAt(word, p2);
  keyChars[3] = comboCharAt(word, p3);
  return keyChars;
}

/** Normalized key character for one position, or ' ' past the end of the word. */
private char comboCharAt(String word, int pos) {
  if (word.length() > pos)
    return comboChar(word.charAt(pos));
  return ' ';
}
/**
 * Maps a character to the form used inside edit map keys. Characters in
 * the range 0x20 through 0x7f are returned unchanged. Any other character
 * is folded into that range by keeping its low seven bits and setting bit
 * 0x20; if the folded result is the bar character (the field separator),
 * an asterisk is returned instead.
 */
private char comboChar(char c) {
  boolean alreadyInRange = (c >= 0x20) && (c <= 0x7f);
  if (alreadyInRange)
    return c;

  char folded = (char)((c & 0x7f) | 0x20);
  if (folded == '|')
    return '*';
  return folded;
}
/**
 * Write term frequency samples to the given file. The file holds, one
 * number per line: the total number of frequencies, the number of samples,
 * then the samples themselves. Samples are drawn from the above-average
 * frequencies in sorted order (at most 1000 of them, evenly spaced), with
 * the first sample replaced by the truncated average.
 *
 * @param allFreqs frequencies of all words in the dictionary
 * @param file     file to write the samples to
 * @param prog     tracker to report progress to
 * @throws IOException if the file cannot be written
 */
private void writeFreqSamples(IntList allFreqs, File file,
                              ProgressTracker prog)
  throws IOException
{
  final int nFreqs = allFreqs.size();

  // Mean of the term frequencies.
  prog.progress(0, 100, "Sampling frequencies.");
  long sum = 0L;
  for (int i = 0; i < nFreqs; i++)
    sum += allFreqs.get(i);
  double mean = sum / (double)nFreqs;

  // Keep only the strictly above-average frequencies.
  prog.progress(10, 100, "Sampling frequencies.");
  IntList aboveMean = new IntList(nFreqs / 2);
  for (int i = 0; i < nFreqs; i++) {
    int candidate = allFreqs.get(i);
    if (candidate > mean)
      aboveMean.add(candidate);
  }

  // Order them by frequency.
  prog.progress(20, 100, "Sampling frequencies.");
  aboveMean.sort();

  // With 1000 entries or more, pick 1000 evenly spaced ones; otherwise
  // use the list as it is.
  final int MAX_SAMPLES = 1000;
  IntList samples = aboveMean;
  if (aboveMean.size() >= MAX_SAMPLES)
  {
    samples = new IntList(MAX_SAMPLES);
    for (int i = 0; i < MAX_SAMPLES; i++) {
      long scaled = ((long)i) * aboveMean.size() / MAX_SAMPLES;
      samples.add(aboveMean.get((int)scaled));
    }
  }

  // The very first sample always reflects the average.
  if (samples.size() > 0)
    samples.set(0, (int)mean);

  // Write out the data.
  prog.progress(50, 100, "Sampling frequencies.");
  PrintWriter sampleOut = new PrintWriter(new FileWriter(file));
  sampleOut.println(nFreqs);
  sampleOut.println(samples.size());
  for (int i = 0; i < samples.size(); i++)
    sampleOut.println(samples.get(i));
  sampleOut.close();

  prog.progress(100, 100, "Sampling frequencies.");
} // writeFreqSamples()
/**
 * Write out a prefix-compressed edit-distance map. The output file (UTF-8)
 * is laid out as follows:
 * <ol>
 * <li>one line per edit map key, holding the key and its prefix-compressed
 *     word list;</li>
 * <li>an index: the line "edMap index", the number of keys, then one
 *     "key|size" line per key giving the size in bytes of that key's
 *     line;</li>
 * <li>the byte position of the index, right-aligned in 20 characters.</li>
 * </ol>
 *
 * @param edmapSorter sorter holding all "key|word" lines
 * @param outFile     file to write the edit map to
 * @param prog        tracker to report progress to
 * @throws IOException if sorting or writing fails
 */
private void writeEdMap(final FileSorter edmapSorter, final File outFile,
                        final ProgressTracker prog)
  throws IOException
{
  // Count bytes as they're written, so key sizes and the index position
  // can be recorded.
  final CountedOutputStream outCounted = new CountedOutputStream(
    new BufferedOutputStream(new FileOutputStream(outFile)));
  final Writer out = new OutputStreamWriter(outCounted, "UTF-8");
  try
  {
    prog.progress(0, 100, "Building word map.", true);

    // Finish sorting all the edit map entries, group them, and write out the keys.
    final ArrayList<String> edKeys = new ArrayList<String>();
    final IntList sizes = new IntList();
    edmapSorter.finish(new FileSorter.Output()
    {
      String curKey = null;
      ArrayList<String> curWords = new ArrayList<String>();
      int nWritten = 0;

      public void writeLine(String line)
        throws IOException
      {
        String[] tokens = splitPat.split(line);
        assert tokens.length == 2 : "invalid edmap line";

        // A change of key means the previous group is complete.
        if (!tokens[0].equals(curKey)) {
          if (curKey != null)
            flushKey();
          curKey = tokens[0];
        }
        curWords.add(tokens[1]);

        // Give progress every once in a while.
        if ((nWritten++ & 0xFFF) == 0)
          prog.progress(nWritten,
                        edmapSorter.nLinesAdded(),
                        "Building word map.");
      }

      private void flushKey()
        throws IOException
      {
        // Write out the condensed key, pushing it through to the counter
        // so the size measured below is accurate.
        long prevPos = outCounted.nWritten();
        condenseEdmapKey(curKey, curWords, out);
        out.flush();

        // Record the key and its size on disk
        edKeys.add(curKey);
        sizes.add((int)(outCounted.nWritten() - prevPos));

        // Clear the word list in preparation for the next key
        curWords.clear();
      }

      public void close()
        throws IOException
      {
        // The final key has no successor to trigger its flush; without
        // this, the last group would be missing from the map and its index.
        if (curKey != null) {
          flushKey();
          curKey = null;
        }
      }
    });

    // At the end of the file, write an index of positions.
    long indexPos = outCounted.nWritten();
    out.append("edMap index\n");
    out.append(Integer.toString(edKeys.size()));
    out.append('\n');
    for (int i = 0; i < edKeys.size(); i++) {
      String key = edKeys.get(i);
      out.append(key);
      out.append('|');
      out.append(Integer.toString(sizes.get(i)));
      out.append('\n');
    }

    // And finally, at the very end, write the position of the index,
    // left-padded with spaces to a fixed width of 20 characters.
    String tmp = Long.toString(indexPos);
    while (tmp.length() < 20)
      tmp = " " + tmp;
    out.append(tmp);
  }
  finally {
    // All done (or failed): either way, release the file.
    out.close();
  }
}
/**
 * Perform prefix compression on a list of words for a single edit map
 * key, writing the result as one line. The line starts with the key and
 * the first word in full; each later word that differs from its
 * predecessor is written as a bar, a single character encoding the number
 * of leading characters shared with the predecessor ('0' plus that
 * number), and the remaining suffix. Consecutive duplicates are skipped.
 *
 * @param key   the edit map key
 * @param words the words for the key; must not be empty
 * @param out   where to write the line
 * @throws IOException if writing fails
 */
private void condenseEdmapKey(String key, ArrayList<String> words,
                              Writer out)
  throws IOException
{
  // The key and the first word are written in full.
  String prev = words.get(0);
  out.append(key).append('|').append(prev);

  // Every following word is written relative to the one before it.
  for (String word : words.subList(1, words.size()))
  {
    // Skip duplicates
    if (word.equals(prev))
      continue;

    // Count the leading characters the two words have in common.
    int limit = Math.min(prev.length(), word.length());
    int shared = 0;
    while (shared < limit && word.charAt(shared) == prev.charAt(shared))
      shared++;

    // Write the prefix length and suffix
    out.append('|').append((char)('0' + shared)).append(word.substring(shared));

    prev = word;
  }

  // Terminate the line for this key.
  out.append('\n');
}
/**
 * Attempt to delete (and at least truncate) the given file. If the file
 * cannot be deleted, it is opened for writing and closed again
 * immediately, which leaves it empty.
 *
 * @throws IOException if the file can be neither deleted nor truncated
 */
private void deleteFile(File file)
  throws IOException
{
  if (!file.delete())
  {
    // Couldn't delete it... at least truncate it.
    new FileOutputStream(file).close();
  }
}
/**
 * Replace an old file with a new one: the old file (if present) is
 * deleted, and the new file is renamed to the old file's name.
 *
 * @param oldFile file to be replaced; need not exist
 * @param newFile file that takes its place
 * @throws IOException if the old file cannot be deleted or the new file
 *                     cannot be renamed. Failing loudly matters because
 *                     the caller removes the word queue right after the
 *                     replacement; a silent failure would lose that data.
 */
private void replaceFile(File oldFile, File newFile)
  throws IOException
{
  // First, get the old one out of the way.
  if (oldFile.exists() && !oldFile.delete())
    throw new IOException(
      "Could not delete old file '" + oldFile + "' -- permission problem?");

  // Then rename the new one to the old one's name.
  if (!newFile.renameTo(oldFile))
    throw new IOException(
      "Could not rename '" + newFile + "' to '" + oldFile +
      "' -- permission problem?");
}
/**
 * Performs the pair-adding phase of the flush procedure: loads the
 * existing pair data (if any), adds every "word1|word2|count" line from
 * the pair queue to it, saves the result to a new file that replaces the
 * old pair data file, and finally clears the pair queue. Skipped entirely
 * when the pair queue file is not readable.
 *
 * @param prog tracker to report this phase's progress to
 * @throws IOException if reading the queue or replacing the pair data
 *                     file fails
 */
private void flushPhase2(ProgressTracker prog)
  throws IOException
{
  // Nothing queued? Then there's nothing to do.
  if (!pairQueueFile.canRead())
    return;

  // Start from the existing pair data (if any)
  FreqData pairData = new FreqData();
  if (pairFreqFile.canRead())
    pairData.add(pairFreqFile);

  // Read the queue through a byte counter, so progress messages can be
  // based on how far into the file we are.
  CountedInputStream queueCounted = new CountedInputStream(
    new FileInputStream(pairQueueFile));
  BufferedReader queueReader = new BufferedReader(
    new InputStreamReader(queueCounted, "UTF-8"));

  // Reading gets 90% of the progress range; the final write gets 10%.
  ProgressTracker[] subProgs = prog.split(90, 10);
  ProgressTracker readProg = subProgs[0];
  ProgressTracker writeProg = subProgs[1];

  long fileTotal = pairQueueFile.length();
  int totalAdded = 0;
  try
  {
    for (String line = queueReader.readLine();
         line != null;
         line = queueReader.readLine())
    {
      // Each usable line has exactly three components (separated by |)
      String[] tokens = splitPat.split(line);
      if (tokens.length != 3)
        continue;

      try
      {
        pairData.add(tokens[0], tokens[1], Integer.parseInt(tokens[2]));
        ++totalAdded;

        // Every 4096 pairs, give some status feedback.
        if ((totalAdded & 0xFFF) == 0) {
          long filePos = queueCounted.nRead();
          readProg.progress(filePos + 1,
                            fileTotal + 1,
                            "Read " + totalAdded + " pairs.");
        }
      }
      catch (NumberFormatException e) { /*ignore*/
      }
    } // for

    readProg.progress(100, 100,
                      "Read " + totalAdded + " pairs.",
                      true);
  }
  finally {
    queueReader.close();
    queueCounted.close();
  }

  // Save the merged data beside the current file...
  File newPairFreqFile = new File(spellIndexDir, "pairs.dat.new");
  newPairFreqFile.delete();
  writeProg.progress(50, 100, "Writing pair data.", true);
  pairData.save(newPairFreqFile);

  // ...then swap it in for the old data file, if any.
  if (pairFreqFile.canRead() && !pairFreqFile.delete())
    throw new IOException(
      "Could not delete old pair data file -- permission problem?");
  if (!newPairFreqFile.renameTo(pairFreqFile))
    throw new IOException(
      "Could not rename new pair data file -- permission problem?");

  // Clear out (and try to delete) the queue file.
  new FileOutputStream(pairQueueFile).close();
  pairQueueFile.delete();
}
/**
 * Opens the word queue writer, unless it is already open. The queue file
 * is opened in append mode, so earlier queued words are kept, and is
 * written as UTF-8.
 *
 * @throws IOException if the queue file cannot be opened
 */
private void openWordQueueWriter()
  throws IOException
{
  if (wordQueueWriter == null)
  {
    FileOutputStream appendStream = new FileOutputStream(wordQueueFile, true);
    wordQueueWriter = new PrintWriter(
      new BufferedWriter(new OutputStreamWriter(appendStream, "UTF-8")));
  }
}
/**
 * Opens the pair queue writer, unless it is already open. The queue file
 * is opened in append mode, so earlier queued pairs are kept, and is
 * written as UTF-8.
 *
 * @throws IOException if the queue file cannot be opened
 */
private void openPairQueueWriter()
  throws IOException
{
  if (pairQueueWriter == null)
  {
    FileOutputStream appendStream = new FileOutputStream(pairQueueFile, true);
    pairQueueWriter = new PrintWriter(
      new BufferedWriter(new OutputStreamWriter(appendStream, "UTF-8")));
  }
} // openPairQueueWriter()
/**
 * Writes any cached words and pairs to their queue files, then closes
 * whichever queue writers are open. The writers are closed even if one of
 * the flushes fails, so a failure cannot leave a queue file held open.
 *
 * @throws IOException if a queue file cannot be opened for the flush
 */
private void closeQueueWriters()
  throws IOException
{
  try
  {
    // The two queues are separate files, so the order of these flushes
    // relative to the closes below doesn't matter.
    flushRecentWords();
    flushRecentPairs();
  }
  finally
  {
    if (wordQueueWriter != null) {
      wordQueueWriter.close();
      wordQueueWriter = null;
    }
    if (pairQueueWriter != null) {
      pairQueueWriter.close();
      pairQueueWriter = null;
    }
  }
} // closeQueueWriters()
/**
 * Calculates the double metaphone key for a word, by delegating to the
 * class-wide DoubleMetaphone instance.
 *
 * NOTE(review): every caller shares that single instance and this method
 * is not synchronized -- confirm DoubleMetaphone is safe for concurrent use.
 *
 * @param word the word to calculate a key for
 * @return the value returned by DoubleMetaphone for the word
 */
public static String calcMetaphone(String word) {
return doubleMetaphone.doubleMetaphone(word);
}
/**
 * Safety net for writers that were never closed explicitly: closes the
 * writer when the object is finalized, and then always runs the
 * superclass finalizer, even if closing fails.
 *
 * @throws Throwable whatever closing or the superclass finalizer throws
 */
@Override
protected void finalize()
  throws Throwable
{
  try {
    close();
  }
  finally {
    super.finalize();
  }
}
} // class SpellWriter