package org.apache.lucene.spelt;
/**
* Copyright 2007 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Stack;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
// old: import org.apache.lucene.search.spell.Dictionary;
// old: import org.apache.lucene.search.spell.SpellChecker;
// old: import org.apache.lucene.store.Directory;
// old: import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.ProgressTracker;
import org.apache.lucene.util.StringUtil;
/**
* A command-line driver class to test out the spelling correction
* engine.
*
* @author Martin Haye
*/
public class SpellTestCmdLine
{
/**
 * Command-line entry point. Interprets the arguments as a sequence of
 * "-build" and "-test" commands (each followed by two operands), runs them
 * in order, and finally prints the total elapsed time.
 *
 * @param args command-line arguments
 */
public static void main(String[] args)
{
  final long startedAt = System.currentTimeMillis();
  System.out.println("\n*** Spelt command-line test utility ***\n");

  // The shortest valid invocation is one switch plus its two operands.
  if (args.length < 3) {
    printUsageAndExit();
  }

  try
  {
    int alg = 2; // Default to Spelt (new) algorithm

    // Walk through the arguments one command at a time.
    int pos = 0;
    while (pos < args.length)
    {
      final String command = args[pos];
      final boolean hasOperands = (pos + 2 < args.length);

      // Is the "old" (Lucene) algorithm requested?
      // old: if (command.equals("-old")) {
      // old:   alg = 1;
      // old:   pos++;
      // old:   continue;
      // old: }

      if (hasOperands && "-build".equals(command)) {
        // A "build" command: source directory, then dictionary directory.
        buildDictionary(alg, args[pos + 1], args[pos + 2]);
        pos += 3;
      }
      else if (hasOperands && "-test".equals(command)) {
        // A "test" command: query file, then dictionary directory.
        testDictionary(alg, args[pos + 1], args[pos + 2]);
        pos += 3;
      }
      else {
        // Anything else (or a command missing operands) is a usage error.
        printUsageAndExit();
        pos++;
      }
    }
  }
  catch (IOException e) {
    // Flush normal output first so the error report is not interleaved.
    System.out.flush();
    System.err.println("Unexpected exception during spellcheck: " + e);
    e.printStackTrace(System.err);
  }

  // Tell the user how long the whole run took.
  final long elapsedMillis = System.currentTimeMillis() - startedAt;
  final float elapsedSec = elapsedMillis / 1000.0f;
  System.out.println("Done. Total time: " + elapsedSec + " sec");
}
/**
 * Writes the usage summary to standard error and terminates the JVM with
 * exit status 1.
 */
private static void printUsageAndExit()
{
  StringBuilder usage = new StringBuilder();

  // old: a "{-old} " switch used to precede "-build" and "-test" below.
  usage.append("Usage: java -jar spelt.jar ");
  usage.append("-build <src-files-dir> <dictionary-dir>\n");
  usage.append(" or: java -jar spelt.jar ");
  usage.append("-test <test-queries-file> <dictionary-dir>\n");
  usage.append("\n");
  usage.append(" where <test-queries-file> contains one or more lines like this\n");
  usage.append(" misspelled phrase\n");
  usage.append(" or misspelled phrase -> correct phrase 1|correct phrase 2|...\n");

  System.err.println(usage.toString());
  System.exit(1);
}
/**
 * Rip text from files in a given source directory and add them to a
 * spelling dictionary, using the specified algorithm. Any existing
 * dictionary directory is deleted first.
 *
 * @param alg 0 for null (to test speed of ripper);
 *            1 for the old Lucene algorithm (currently disabled);
 *            2 for the new Spelt algorithm.
 * @param srcDir Directory to grab files to rip
 * @param dictDir Directory to put spelling dictionary in
 * @throws IOException if the old dictionary cannot be removed, or if
 *                     ripping or writing fails
 * @throws IllegalArgumentException if alg is not a supported algorithm
 */
private static void buildDictionary(int alg, String srcDir, String dictDir)
  throws IOException
{
  // Reject unsupported algorithms up front, before anything is deleted.
  // (Previously this surfaced as a NullPointerException after the old
  // dictionary had already been wiped.)
  if (alg != 0 && alg != 2)
    throw new IllegalArgumentException("Unsupported dictionary algorithm: " + alg);

  // Clear all the files in the target dir.
  File dictDirFile = new File(dictDir);
  if (dictDirFile.isDirectory()) {
    // listFiles() returns null on an I/O error, so check before iterating.
    File[] oldFiles = dictDirFile.listFiles();
    if (oldFiles == null)
      throw new IOException("Error listing old dictionary files in '" + dictDir + "'");

    // Individual failures are caught below: the directory delete fails
    // if anything is left inside it.
    for (File f : oldFiles)
      f.delete();
    if (!dictDirFile.delete())
      throw new IOException("Error deleting old dictionary from '" + dictDir + "'");
  }

  // Make a builder... what kind depends on the selected algorithm.
  DictBuilder builder = (alg == 0) ? new DictBuilder() /* does nothing */ :
                        //(alg == 1) ? new LuceneDictBuilder(dictDir) :
                        new SpeltDictBuilder(dictDirFile);

  // Rip all the text.
  System.out.println("Ripping text and adding to dictionary...");
  TextRipper ripper = new TextRipper(srcDir);
  builder.add(ripper);

  // And finish up, reporting progress as "[ NN%] description".
  System.out.println("Finishing dictionary...");
  ProgressTracker prog = new ProgressTracker()
  {
    public void report(int pctDone, String descrip) {
      // Left-pad the percentage to three characters so lines align.
      String pctTxt = Integer.toString(pctDone);
      while (pctTxt.length() < 3)
        pctTxt = " " + pctTxt;
      System.out.println("[" + pctTxt + "%] " + descrip);
    }
  };
  prog.setMinInterval(3000);
  builder.finish(prog);
}
/**
 * Test the spelling index: runs every query in a test file through the
 * suggester, printing each suggestion and, where the file supplies expected
 * corrections, whether the suggestion matched one of them. A summary line
 * is printed at the end.
 *
 * Each line has the form "phrase" or "phrase -> correct 1|correct 2|...".
 * Text from a ';' to the end of the line is treated as a comment, and
 * blank lines are skipped.
 *
 * @param alg 2 for the new Spelt algorithm (the only one currently enabled)
 * @param testFile File of test queries
 * @param dictDir Directory containing the spelling dictionary
 * @throws IOException if the test file or the dictionary cannot be read
 * @throws IllegalArgumentException if alg is not a supported algorithm
 */
private static void testDictionary(int alg, String testFile, String dictDir)
  throws IOException
{
  // Make a specific tester for the specified algorithm. Fail clearly on an
  // unsupported one instead of hitting a NullPointerException later.
  //(alg == 1) ? new LuceneSuggTester(dictDir) :
  if (alg != 2)
    throw new IllegalArgumentException("Unsupported suggestion algorithm: " + alg);
  SuggTester suggTester = new SpeltSuggTester(dictDir);

  try
  {
    int nTried = 0;
    int nCorrect = 0;

    // Open the test query file and read each line.
    // NOTE(review): FileReader uses the platform default charset, whereas
    // the text ripper reads source files as UTF-8 -- confirm this is intended.
    BufferedReader lineReader = new BufferedReader(new FileReader(testFile));
    try
    {
      String line;
      while ((line = lineReader.readLine()) != null)
      {
        // Strip off any comment, and skip blank lines. (This used to
        // 'break', which silently ended the run at the first blank or
        // comment-only line.)
        line = line.replaceFirst(";.*", "").trim();
        if (line.length() == 0)
          continue;

        // Split into the incorrect phrase, and the correction(s)
        String[] parts = line.split("->");
        if (parts.length != 1 && parts.length != 2) {
          System.out.println("Unrecognized test line: " + line);
          continue;
        }
        String origPhrase = parts[0].trim();
        String[] correctPhrases = (parts.length == 1) ? null :
                                  parts[1].trim().split("\\|");

        // Give feedback
        System.out.println("Incorrect phrase: " + origPhrase);

        // Break the original phrase into words, and get a suggestion.
        // NOTE(review): the disabled Lucene tester returned null when it had
        // no change to suggest; verify how StringUtil.join treats null.
        String[] origWords = StringUtil.splitWords(origPhrase);
        String[] suggWords = suggTester.suggest(origWords);
        String suggPhrase = StringUtil.join(suggWords);
        System.out.println("Suggested phrase: " + suggPhrase);
        nTried++;

        // If no correct versions available, we're done with this phrase.
        if (correctPhrases == null)
          continue;

        // Check against the correct versions.
        boolean found = false;
        for (String s : correctPhrases)
          found |= (s.trim().equalsIgnoreCase(suggPhrase));
        if (found) {
          System.out.println("--> CORRECT");
          nCorrect++;
          continue;
        }

        System.out.println("--> INCORRECT. Answer should have been:");
        for (String s : correctPhrases)
          System.out.println(" " + s);
      } // while
    }
    finally {
      // Always release the query file, even if a suggestion fails.
      lineReader.close();
    }

    // Avoid printing "NaN%" when the file contained no test phrases.
    double pctCorrect = (nTried == 0) ? 0.0 : nCorrect * 100.0 / nTried;
    System.out.println();
    System.out.printf("TOTAL: %d correct out of %d tried = %.1f%%\n", nCorrect, nTried, pctCorrect);
    System.out.println();
  }
  finally {
    // Always release the dictionary, on success or failure.
    suggTester.close();
  }
}
/**
 * Builds the default set of stop words.
 *
 * @return a new set containing each word of the built-in stop-word list
 */
private static HashSet makeStopSet()
{
  final String stopWordList =
    "a an and are as at be but by for if in into is it no not of on or s such t that the their then there these they this to was will with";

  HashSet result = new HashSet();
  for (String word : StringUtil.splitWords(stopWordList))
    result.add(word);
  return result;
}
/**
 * Base class shared by the dictionary-building algorithms. On its own it
 * builds nothing: it just consumes the words it is given.
 */
private static class DictBuilder
{
  /** Drains the iterator, discarding every word. */
  void add(Iterator words) throws IOException
  {
    for (; words.hasNext(); ) {
      words.next();
    }
  }

  /** Nothing to finish in the base implementation. */
  void finish(ProgressTracker prog) throws IOException
  {
  }
}
/** Builds an old-style Lucene spelling dictionary */
// old: private static class LuceneDictBuilder extends DictBuilder
// old: {
// old: SpellChecker spellChecker;
// old:
// old: LuceneDictBuilder(String dictDir) throws IOException
// old: {
// old: Directory spellIndex = FSDirectory.getDirectory(new File(dictDir));
// old: spellChecker = new SpellChecker(spellIndex);
// old: }
// old:
// old: void add(final Iterator words) throws IOException
// old: {
// old: spellChecker.indexDictionary(new Dictionary() {
// old: public Iterator getWordsIterator() {
// old: return words;
// old: }
// old: });
// old: }
// old: }
/**
 * Dictionary builder for the new-style Spelt algorithm; forwards words to
 * a SpellWriter.
 */
private static class SpeltDictBuilder extends DictBuilder
{
  SpellWriter spellWriter;

  /** Opens a writer on the given directory and installs the default stop words. */
  SpeltDictBuilder(File dictDir) throws IOException
  {
    this.spellWriter = SpellWriter.open(dictDir);
    this.spellWriter.setStopwords(makeStopSet());
  }

  /** Queues every word from the iterator on the writer. */
  void add(Iterator words) throws IOException
  {
    for (; words.hasNext(); ) {
      String word = (String) words.next();
      this.spellWriter.queueWord(word);
    }
  }

  /** Flushes the queued words, reporting progress, then closes the writer. */
  void finish(ProgressTracker prog) throws IOException
  {
    this.spellWriter.flushQueuedWords(prog);
    this.spellWriter.close();
  }
}
/**
 * Scans a directory for files, and rips text from all of them. The words
 * are accessible in the form of an Iterator.
 *
 * Only files whose canonical path ends in .xml, .txt, .text, .html, .htm or
 * .xhtml are read (as UTF-8). Markup that looks like XML/HTML elements is
 * stripped from each line before words are extracted.
 */
private static class TextRipper implements Iterator
{
  /** Pattern selecting which files (by canonical path) get ripped */
  static final Pattern filePat =
    Pattern.compile(".*\\.(xml|txt|text|html|htm|xhtml)");

  /** Pattern for matching words */
  static final Pattern wordPat = Pattern.compile("\\w+('\\w+)?");

  /** Pattern for matching XML and HTML elements */
  static final Pattern xmlPat = Pattern.compile("<[^<]*>");

  /** Files and directories still waiting to be visited */
  Stack fileStack = new Stack();

  /** Reader on the file currently being ripped, or null if none is open */
  BufferedReader reader;

  /** True while a word is available to be returned by next() */
  boolean more = true;

  /** Current (markup-stripped) line of the current file */
  String line;

  /** Matcher positioned on the current word within 'line' */
  Matcher words = null;

  /**
   * Start ripping at the given file or directory, positioning on the first
   * word (if any).
   */
  TextRipper(String dir) throws IOException
  {
    fileStack.push(new File(dir));
    advance();
  }

  /**
   * Scan to the next file in the sequence, and open it.
   *
   * @return true if there was a file to open
   * @throws IOException if a file cannot be opened or a directory cannot
   *                     be listed
   */
  boolean nextFile() throws IOException
  {
    // Close the existing file, if any
    if (reader != null) {
      reader.close();
      reader = null;
    }

    // Scan until we find a file we can read.
    while (reader == null && !fileStack.isEmpty())
    {
      File file = (File) fileStack.pop();
      if (file.isFile()) {
        String path = file.getCanonicalPath();
        if (filePat.matcher(path).matches())
        {
          System.out.println(file);
          reader = new BufferedReader(
            new InputStreamReader(new FileInputStream(file), "UTF-8"));
        }
      }
      else if (file.isDirectory()) {
        // listFiles() returns null on an I/O error; report it rather than
        // failing with a NullPointerException.
        File[] subFiles = file.listFiles();
        if (subFiles == null)
          throw new IOException("Unable to list directory '" + file + "'");
        for (int i=0; i<subFiles.length; i++)
          fileStack.push(subFiles[i]);
      }
    }
    return reader != null;
  }

  /**
   * Advance to the next word in the current file, or the next file if at the
   * end of the current one. Clears 'more' when everything is exhausted.
   */
  void advance() throws IOException
  {
    while (more)
    {
      // Get the next file if necessary
      if (reader == null) {
        if (!nextFile()) {
          more = false;
          break;
        }
      }

      // Get another line if necessary
      if (words == null || !words.find())
      {
        line = reader.readLine();
        if (line == null) {
          reader.close();
          reader = null;
          // Drop the matcher for the finished file so it is never
          // re-queried against a line that no longer exists.
          words = null;
          continue;
        }

        // Make a half-hearted attempt to strip out XML stuff
        line = stripXML(line);

        // And save the words to iterate.
        words = wordPat.matcher(line);
        continue;
      }

      // Got one!
      break;
    }
  }

  /** Try to strip XML and HTML elements from a line */
  String stripXML(String line)
  {
    // Fast path: no '<' means there is nothing to strip.
    if (line.indexOf('<') < 0)
      return line;
    return xmlPat.matcher(line).replaceAll("");
  }

  /** Check if there's another word to get */
  public boolean hasNext() {
    return more;
  }

  /**
   * Get the next word in the sequence
   *
   * @throws java.util.NoSuchElementException if there are no more words
   * @throws RuntimeException wrapping any IOException hit while advancing
   */
  public Object next() {
    // Honor the Iterator contract instead of failing with a
    // NullPointerException once the input is exhausted.
    if (!more)
      throw new java.util.NoSuchElementException();

    String ret = line.substring(words.start(), words.end());
    try {
      advance();
    }
    catch (IOException e) {
      throw new RuntimeException(e);
    }
    return ret;
  }

  /** Not implemented */
  public void remove() {
    throw new UnsupportedOperationException();
  }
}
/**
 * Generic strategy for testing spelling suggestion algorithms. Each
 * implementation wraps one suggestion engine so that the test driver can
 * query it without knowing which algorithm is in use.
 */
private interface SuggTester
{
/**
 * Ask the engine for a suggested replacement for a phrase.
 *
 * @param origPhrase the words of the original (possibly misspelled) phrase
 * @return the words of the suggested phrase
 *         (NOTE(review): the disabled Lucene implementation returned null
 *         when it had no change to suggest -- confirm whether callers
 *         must expect null)
 * @throws IOException if the underlying dictionary cannot be read
 */
String[] suggest(String[] origPhrase) throws IOException;
/**
 * Release any resources held by the tester.
 *
 * @throws IOException if closing the underlying dictionary fails
 */
void close() throws IOException;
}
/** Get spelling suggestions using Lucene (old) algorithm */
// old: private static class LuceneSuggTester implements SuggTester
// old: {
// old: SpellChecker spellChecker;
// old:
// old: LuceneSuggTester(String dictDir) throws IOException
// old: {
// old: Directory spellIndex = FSDirectory.getDirectory(new File(dictDir));
// old: spellChecker = new SpellChecker(spellIndex);
// old: }
// old:
// old: public String[] suggest(String[] origPhrase) throws IOException
// old: {
// old: String[] ret = new String[origPhrase.length];
// old: boolean anyChange = false;
// old: for (int i=0; i<ret.length; i++) {
// old: String[] suggs = spellChecker.suggestSimilar(origPhrase[i], 1);
// old: if (suggs.length == 0)
// old: ret[i] = origPhrase[i];
// old: else {
// old: ret[i] = suggs[0];
// old: anyChange |= !WordEquiv.DEFAULT.isEquivalent(ret[i], origPhrase[i]);
// old: }
// old: }
// old: if (!anyChange)
// old: return null;
// old: return ret;
// old: }
// old:
// old: public void close() { }
// old: }
/**
 * Suggestion tester backed by the Spelt (new) algorithm; delegates to a
 * SpellReader opened on the dictionary directory.
 */
private static class SpeltSuggTester implements SuggTester
{
  SpellReader spellReader;

  /** Opens the dictionary in the given directory and installs the default stop words. */
  SpeltSuggTester(String dictDir) throws IOException
  {
    File dictDirFile = new File(dictDir);
    this.spellReader = SpellReader.open(dictDirFile);
    this.spellReader.setStopwords(makeStopSet());
  }

  /** Returns the reader's keyword suggestion for the given phrase. */
  public String[] suggest(String[] origPhrase) throws IOException
  {
    String[] suggestion = this.spellReader.suggestKeywords(origPhrase);
    return suggestion;
  }

  /** Closes the underlying reader. */
  public void close() throws IOException
  {
    this.spellReader.close();
  }
}
}