/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.lucene.analysis.standard; import java.io.Reader; import java.io.StringReader; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase; /** * A few tests based on org.apache.lucene.analysis.TestUAX29URLEmailTokenizer */ public class TestUAX29URLEmailTokenizerFactory extends BaseTokenStreamFactoryTestCase { public void testUAX29URLEmailTokenizer() throws Exception { Reader reader = new StringReader("Wha\u0301t's this thing do?"); Tokenizer stream = tokenizerFactory("UAX29URLEmail").create(newAttributeFactory()); stream.setReader(reader); assertTokenStreamContents(stream, new String[] { "Wha\u0301t's", "this", "thing", "do" }); } public void testArabic() throws Exception { Reader reader = new StringReader("الفيلم الوثائقي الأول عن ويكيبيديا يسمى \"الحقيقة بالأرقام: قصة ويكيبيديا\" (بالإنجليزية: Truth in Numbers: The Wikipedia Story)، سيتم إطلاقه في 2008."); Tokenizer stream = tokenizerFactory("UAX29URLEmail").create(newAttributeFactory()); stream.setReader(reader); assertTokenStreamContents(stream, new String[] { "الفيلم", "الوثائقي", "الأول", "عن", "ويكيبيديا", "يسمى", "الحقيقة", "بالأرقام", "قصة", "ويكيبيديا", "بالإنجليزية", "Truth", "in", "Numbers", "The", "Wikipedia", "Story", "سيتم", "إطلاقه", "في", "2008" }); } public void testChinese() throws Exception { Reader reader = new StringReader("我是中国人。 1234 Tests "); Tokenizer stream = tokenizerFactory("UAX29URLEmail").create(newAttributeFactory()); stream.setReader(reader); assertTokenStreamContents(stream, new String[] { "我", "是", "中", "国", "人", "1234", "Tests" }); } public void testKorean() throws Exception { Reader reader = new StringReader("안녕하세요 한글입니다"); Tokenizer stream = tokenizerFactory("UAX29URLEmail").create(newAttributeFactory()); stream.setReader(reader); assertTokenStreamContents(stream, new String[] { "안녕하세요", "한글입니다" }); } public void testHyphen() throws Exception { Reader reader = new StringReader("some-dashed-phrase"); Tokenizer stream = tokenizerFactory("UAX29URLEmail").create(newAttributeFactory()); stream.setReader(reader); assertTokenStreamContents(stream, new String[] { "some", "dashed", "phrase" }); } // Test with some URLs from TestUAX29URLEmailTokenizer's // urls.from.random.text.with.urls.txt public void testURLs() throws Exception { String textWithURLs = "http://johno.jsmf.net/knowhow/ngrams/index.php?table=en-dickens-word-2gram¶graphs=50&length=200&no-ads=on\n" + " some extra\nWords thrown in here. " + "http://c5-3486.bisynxu.FR/aI.YnNms/" + " samba Halta gamba " + "ftp://119.220.152.185/JgJgdZ/31aW5c/viWlfQSTs5/1c8U5T/ih5rXx/YfUJ/xBW1uHrQo6.R\n" + "M19nq.0URV4A.Me.CC/mj0kgt6hue/dRXv8YVLOw9v/CIOqb\n" + "Https://yu7v33rbt.vC6U3.XN--KPRW13D/y%4fMSzkGFlm/wbDF4m" + " inter Locutio " + "[c2d4::]/%471j5l/j3KFN%AAAn/Fip-NisKH/\n" + "file:///aXvSZS34is/eIgM8s~U5dU4Ifd%c7" + " blah Sirrah woof " + "http://[a42:a7b6::]/qSmxSUU4z/%52qVl4\n"; Reader reader = new StringReader(textWithURLs); Tokenizer stream = tokenizerFactory("UAX29URLEmail").create(newAttributeFactory()); stream.setReader(reader); assertTokenStreamContents(stream, new String[] { "http://johno.jsmf.net/knowhow/ngrams/index.php?table=en-dickens-word-2gram¶graphs=50&length=200&no-ads=on", "some", "extra", "Words", "thrown", "in", "here", "http://c5-3486.bisynxu.FR/aI.YnNms/", "samba", "Halta", "gamba", "ftp://119.220.152.185/JgJgdZ/31aW5c/viWlfQSTs5/1c8U5T/ih5rXx/YfUJ/xBW1uHrQo6.R", "M19nq.0URV4A.Me.CC/mj0kgt6hue/dRXv8YVLOw9v/CIOqb", "Https://yu7v33rbt.vC6U3.XN--KPRW13D/y%4fMSzkGFlm/wbDF4m", "inter", "Locutio", "[c2d4::]/%471j5l/j3KFN%AAAn/Fip-NisKH/", "file:///aXvSZS34is/eIgM8s~U5dU4Ifd%c7", "blah", "Sirrah", "woof", "http://[a42:a7b6::]/qSmxSUU4z/%52qVl4" } ); } // Test with some emails from TestUAX29URLEmailTokenizer's // email.addresses.from.random.text.with.email.addresses.txt public void testEmails() throws Exception { String textWithEmails = " some extra\nWords thrown in here. " + "dJ8ngFi@avz13m.CC\n" + "kU-l6DS@[082.015.228.189]\n" + "\"%U\u0012@?\\B\"@Fl2d.md" + " samba Halta gamba " + "Bvd#@tupjv.sn\n" + "SBMm0Nm.oyk70.rMNdd8k.#ru3LI.gMMLBI.0dZRD4d.RVK2nY@au58t.B13albgy4u.mt\n" + "~+Kdz@3mousnl.SE\n" + " inter Locutio " + "C'ts`@Vh4zk.uoafcft-dr753x4odt04q.UY\n" + "}0tzWYDBuy@cSRQAABB9B.7c8xawf75-cyo.PM" + " blah Sirrah woof " + "lMahAA.j/5.RqUjS745.DtkcYdi@d2-4gb-l6.ae\n" + "lv'p@tqk.vj5s0tgl.0dlu7su3iyiaz.dqso.494.3hb76.XN--MGBAAM7A8H\n"; Reader reader = new StringReader(textWithEmails); Tokenizer stream = tokenizerFactory("UAX29URLEmail").create(newAttributeFactory()); stream.setReader(reader); assertTokenStreamContents(stream, new String[] { "some", "extra", "Words", "thrown", "in", "here", "dJ8ngFi@avz13m.CC", "kU-l6DS@[082.015.228.189]", "\"%U\u0012@?\\B\"@Fl2d.md", "samba", "Halta", "gamba", "Bvd#@tupjv.sn", "SBMm0Nm.oyk70.rMNdd8k.#ru3LI.gMMLBI.0dZRD4d.RVK2nY@au58t.B13albgy4u.mt", "~+Kdz@3mousnl.SE", "inter", "Locutio", "C'ts`@Vh4zk.uoafcft-dr753x4odt04q.UY", "}0tzWYDBuy@cSRQAABB9B.7c8xawf75-cyo.PM", "blah", "Sirrah", "woof", "lMahAA.j/5.RqUjS745.DtkcYdi@d2-4gb-l6.ae", "lv'p@tqk.vj5s0tgl.0dlu7su3iyiaz.dqso.494.3hb76.XN--MGBAAM7A8H" } ); } public void testMaxTokenLength() throws Exception { StringBuilder builder = new StringBuilder(); for (int i = 0 ; i < 100 ; ++i) { builder.append("abcdefg"); // 7 * 100 = 700 char "word" } String longWord = builder.toString(); String content = "one two three " + longWord + " four five six"; Reader reader = new StringReader(content); Tokenizer stream = tokenizerFactory("UAX29URLEmail", "maxTokenLength", "1000").create(newAttributeFactory()); stream.setReader(reader); assertTokenStreamContents(stream, new String[] {"one", "two", "three", longWord, "four", "five", "six" }); } /** Test that bogus arguments result in exception */ public void testBogusArguments() throws Exception { IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> { tokenizerFactory("UAX29URLEmail", "bogusArg", "bogusValue"); }); assertTrue(expected.getMessage().contains("Unknown parameters")); } public void testIllegalArguments() throws Exception { IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> { tokenizerFactory("UAX29URLEmail", "maxTokenLength", "-1").create(); }); assertTrue(expected.getMessage().contains("maxTokenLength must be greater than zero")); } }