package org.apache.lucene.spelt; /* * Copyright 2007 The Apache Software Foundation. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.File; import java.io.FileWriter; import java.io.IOException; import java.io.PrintWriter; import java.util.HashSet; import org.apache.lucene.util.StringUtil; import junit.framework.TestCase; /** * Test the {@link SpellReader} and {@link SpellWriter} classes * * @author Martin Haye */ public class SpellReadWriteTest extends TestCase { protected File dictDir; protected SpellReader reader; protected PrintWriter debugWriter; static final HashSet STOP_SET = new HashSet(); static { for (String word : StringUtil.splitWords( "a an and are as at be but by for if in into is it no not of on or s " + "such t that the their then there these they this to was will with")) STOP_SET.add(word); } /** Create the temporary spelling dictionary */ protected @Override void setUp() throws Exception { super.setUp(); createDictDir("SpellReadWriteTest"); SpellWriter writer = SpellWriter.open(dictDir); writer.setStopwords(STOP_SET); writer.setMinWordFreq(1); try { // Simplest possible tokenization for (String word : CALL_OF_THE_WILD.split("\\W+")) writer.queueWord(word); // Add in some accented chars to test that. writer.queueWord("europ\u00e4ische"); writer.queueWord("europ\u00e4ische"); writer.queueWord("europ\u00e4ische"); // Finish the dictionary writer.flushQueuedWords(); // Open a reader for the tests to use. reader = SpellReader.open(dictDir); reader.setStopwords(STOP_SET); // For debugging purposes, this gives a view into the guts. debugWriter = new PrintWriter(new FileWriter( new File(dictDir, "spellDebug.log"))); reader.setDebugWriter(debugWriter); } finally { writer.close(); } } /** * Creates a temporary directory for the spelling dictionary, and ensures * that it's empty. */ protected void createDictDir(String name) throws IOException { // Create a spelling dictionary to test with dictDir = File.createTempFile(name, null); dictDir.delete(); // Get rid of normal file, so we can make directory if (dictDir.isDirectory()) for (File f : dictDir.listFiles()) f.delete(); dictDir.delete(); } /** Blow away the temporary spelling dictionary */ protected @Override void tearDown() throws IOException { if (reader != null) reader.close(); if (debugWriter != null) debugWriter.close(); if (dictDir.isDirectory()) { for (File f : dictDir.listFiles()) f.delete(); dictDir.delete(); } } /** Test out single-word replacements */ public void testSingleWords() throws IOException { // First, test some words that shouldn't get corrected checkSuggestion("London", null); checkSuggestion("newspapers", null); checkSuggestion("asdfkjlh", null); // Also make sure stop words don't result in suggestions checkSuggestion("the", null); checkSuggestion("and", null); // Okay, let's try some things that should get a suggestion. checkSuggestion("newpapers", "newspapers"); checkSuggestion("newspaper", "newspapers"); checkSuggestion("bck", "buck"); checkSuggestion("bcuk", "buck"); // Check that accents come back correctly checkSuggestion("europbische", "europ\u00e4ische"); // Check the case copying facility checkSuggestion("Newpapers", "Newspapers"); checkSuggestion("NEWPAPERS", "NEWSPAPERS"); checkSuggestion("Bck", "Buck"); } /** Test out multi-word replacements */ public void testMultiWords() throws IOException { checkSuggestion("news papers", "newspapers"); checkSuggestion("readnewspapers", "read newspapers"); checkSuggestion("readn ewspapers", "read newspapers"); checkSuggestion("orchards and bery patches", "orchards and berry patches"); } /** Check that the given series of input words gets the right suggestion */ private void checkSuggestion(String inWords, String outWords) throws IOException { String[] ret = reader.suggestKeywords(inWords.split("\\W+")); assertEquals(outWords, StringUtil.join(ret)); } /** Some test data for us to play with (thanks Project Gutenberg!) */ public final String CALL_OF_THE_WILD = "The Call of the Wild\n" + "by Jack London\n" + "\n" + "Buck did not read the newspapers, or he would have known that " + "trouble was brewing, not alone for himself, but for every tide" + "water dog, strong of muscle and with warm, long hair, from Puget " + "Sound to San Diego. Because men, groping in the Arctic darkness, " + "had found a yellow metal, and because steamship and transportation " + "companies were booming the find, thousands of men were rushing " + "into the Northland. These men wanted dogs, and the dogs they " + "wanted were heavy dogs, with strong muscles by which to toil, and " + "furry coats to protect them from the frost.\n" + "\n"+ "Buck lived at a big house in the sun-kissed Santa Clara Valley. " + "Judge Miller's place, it was called. It stood back from the road, " + "half hidden among the trees, through which glimpses could be " + "caught of the wide cool veranda that ran around its four sides. " + "The house was approached by gravelled driveways which wound about " + "through wide-spreading lawns and under the interlacing boughs of " + "tall poplars. At the rear things were on even a more spacious " + "scale than at the front. There were great stables, where a dozen " + "grooms and boys held forth, rows of vine-clad servants' cottages, " + "an endless and orderly array of outhouses, long grape arbors, " + "green pastures, orchards, and berry patches. Then there was the " + "pumping plant for the artesian well, and the big cement tank where " + "Judge Miller's boys took their morning plunge and kept cool in the " + "hot afternoon.\n" + "\n"+ "And over this great demesne Buck ruled. Here he was born, and " + "here he had lived the four years of his life. It was true, there " + "were other dogs, There could not but be other dogs on so vast a " + "place, but they did not count. They came and went, resided in the " + "populous kennels, or lived obscurely in the recesses of the house " + "after the fashion of Toots, the Japanese pug, or Ysabel, the " + "Mexican hairless,--strange creatures that rarely put nose out of " + "doors or set foot to ground. On the other hand, there were the fox " + "terriers, a score of them at least, who yelped fearful promises at " + "Toots and Ysabel looking out of the windows at them and protected " + "by a legion of housemaids armed with brooms and mops.\n" + "\n"+ "But Buck was neither house-dog nor kennel-dog. The whole realm " + "was his. He plunged into the swimming tank or went hunting with " + "the Judge's sons; he escorted Mollie and Alice, the Judge's " + "daughters, on long twilight or early morning rambles; on wintry " + "nights he lay at the Judge's feet before the roaring library fire; " + "he carried the Judge's grandsons on his back, or rolled them in " + "the grass, and guarded their footsteps through wild adventures " + "down to the fountain in the stable yard, and even beyond, where " + "the paddocks were, and the berry patches. Among the terriers he " + "stalked imperiously, and Toots and Ysabel he utterly ignored, for " + "he was king,--king over all creeping, crawling, flying things of " + "Judge Miller's place, humans included.\n" + "\n"+ "His father, Elmo, a huge St. Bernard, had been the Judge's " + "inseparable companion, and Buck bid fair to follow in the way of " + "his father. He was not so large,--he weighed only one hundred and " + "forty pounds,--for his mother, Shep, had been a Scotch shepherd " + "dog. Nevertheless, one hundred and forty pounds, to which was " + "added the dignity that comes of good living and universal respect, " + "enabled him to carry himself in right royal fashion. During the " + "four years since his puppyhood he had lived the life of a sated " + "aristocrat; he had a fine pride in himself, was even a trifle " + "egotistical, as country gentlemen sometimes become because of " + "their insular situation. But he had saved himself by not becoming " + "a mere pampered house-dog. Hunting and kindred outdoor delights " + "had kept down the fat and hardened his muscles; and to him, as to " + "the cold-tubbing races, the love of water had been a tonic and a " + "health preserver.\n" + "\n"+ "And this was the manner of dog Buck was in the fall of 1897, when " + "the Klondike strike dragged men from all the world into the frozen " + "North. But Buck did not read the newspapers, and he did not know " + "that Manuel, one of the gardener's helpers, was an undesirable " + "acquaintance. Manuel had one besetting sin. He loved to play " + "Chinese lottery. Also, in his gambling, he had one besetting " + "weakness--faith in a system; and this made his damnation certain. " + "For to play a system requires money, while the wages of a " + "gardener's helper do not lap over the needs of a wife and numerous " + "progeny.\n"; }