import org.apache.lucene.util.Version; import org.junit.Test; import org.wikibrain.core.lang.Language; import org.wikibrain.core.lang.StringNormalizer; import org.wikibrain.lucene.LuceneStringNormalizer; import org.wikibrain.lucene.TokenizerOptions; import static org.junit.Assert.*; /** * @author Shilad Sen */ public class TestLuceneNormalizer { private static Language EN = Language.getByLangCode("en"); private static Language SIMPLE = Language.getByLangCode("simple"); @Test public void testSimple() { StringNormalizer n = new LuceneStringNormalizer( new TokenizerOptions(false, false, false), Version.LUCENE_43); long before = System.currentTimeMillis(); for (int i = 0; i < 100000; i++) { assertEquals("Hello world", n.normalize(EN, "Hello, world!")); } long after = System.currentTimeMillis(); System.err.println("average normalize time is " + 1.0 * (after-before) / 100000 + " millis"); assertEquals("hello World", n.normalize(SIMPLE, "hello-World")); } @Test public void testCaseinsensitive() { StringNormalizer n = new LuceneStringNormalizer( new TokenizerOptions(true, true, false), Version.LUCENE_43); assertEquals("hello world", n.normalize(EN, "Hello, world!")); assertEquals("hello world", n.normalize(SIMPLE, "hello-World")); assertEquals("hello worldy", n.normalize(EN, "hello-Worldy")); } @Test public void testCaseinsensitivePorter() { StringNormalizer n = new LuceneStringNormalizer( new TokenizerOptions(true, true, true), Version.LUCENE_43); assertEquals("hello world", n.normalize(EN, "Hello, world!")); assertEquals("hello worldi", n.normalize(SIMPLE, "hello-Worldy")); } }