package org.xbib.elasticsearch.index.analysis.icu.segmentation;
import org.apache.lucene.analysis.Tokenizer;
import org.elasticsearch.Version;
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.IndexSettings;
import org.junit.Test;
import org.xbib.elasticsearch.index.analysis.BaseTokenStreamTest;
import java.io.Reader;
import java.io.StringReader;
/**
 * Tests for {@link IcuTokenizerFactory}.
 *
 * Each test builds a factory from a set of {@link Settings}, tokenizes a sample
 * text and compares the emitted tokens (and, in one case, the values passed as a
 * second expectation array) against the expected sequence.
 */
public class IcuTokenizerFactoryTests extends BaseTokenStreamTest {

    /**
     * Factory under test, wired to the minimal index settings from
     * {@link #indexSettings()}, a {@code null} environment argument and the
     * fixed name {@code "test"}.
     *
     * Declared {@code static} because it only uses static members of the
     * enclosing class, so it does not need a reference to a test instance.
     */
    static class TestIcuTokenizerFactory extends IcuTokenizerFactory {

        /**
         * @param settings the tokenizer settings handed to {@link IcuTokenizerFactory}
         */
        public TestIcuTokenizerFactory(Settings settings) {
            super(indexSettings(), null, "test", settings);
        }
    }

    /**
     * Builds index settings for an index named {@code "test"} with one shard and
     * one replica, marked as created with {@link Version#CURRENT}.
     *
     * @return the index settings used by every factory in this test class
     */
    private static IndexSettings indexSettings() {
        Settings settings = Settings.builder()
                .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
                .build();
        IndexMetaData indexMetaData = IndexMetaData.builder("test")
                .settings(settings)
                .numberOfShards(1)
                .numberOfReplicas(1)
                .build();
        return new IndexSettings(indexMetaData, settings);
    }

    /**
     * Creates a tokenizer from a {@link TestIcuTokenizerFactory} configured with
     * the given settings and attaches {@code text} as its input.
     *
     * @param settings the tokenizer settings, e.g. {@link Settings#EMPTY} or a
     *                 {@code "rulefiles"} mapping
     * @param text     the text to tokenize
     * @return a tokenizer whose reader has been set to {@code text}
     */
    private static Tokenizer createIcuTokenizer(Settings settings, String text) {
        IcuTokenizerFactory factory = new TestIcuTokenizerFactory(settings);
        Tokenizer tokenizer = factory.create();
        Reader input = new StringReader(text);
        tokenizer.setReader(input);
        return tokenizer;
    }

    /**
     * With empty settings, text mixing Thai, English and Lao is split into the
     * individual words listed below.
     */
    @Test
    public void testMixedText() throws Exception {
        Tokenizer stream = createIcuTokenizer(Settings.EMPTY,
                "การที่ได้ต้องแสดงว่างานดี This is a test ກວ່າດອກ");
        assertTokenStreamContents(stream,
                new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี",
                        "This", "is", "a", "test", "ກວ່າ", "ດອກ"});
    }

    /**
     * With the {@code Latin-break-only-on-whitespace.rbbi} rule file configured
     * for {@code Latn}, punctuation stays attached to the surrounding token and
     * only whitespace separates tokens.
     */
    @Test
    public void testTokenizeLatinOnWhitespaceOnly() throws Exception {
        Settings settings = Settings.builder()
                .put("rulefiles", "Latn:icu/Latin-break-only-on-whitespace.rbbi")
                .build();
        Tokenizer stream = createIcuTokenizer(settings,
                " Don't,break.at?/(punct)! \u201Cnice\u201D\r\n\r\n85_At:all; `really\" +2=3$5,&813 !@#%$^)(*@#$ ");
        // NOTE(review): the second array is presumably the expected token type per
        // token — confirm against BaseTokenStreamTest.assertTokenStreamContents.
        assertTokenStreamContents(stream,
                new String[] { "Don't,break.at?/(punct)!", "\u201Cnice\u201D", "85_At:all;", "`really\"", "+2=3$5,&813", "!@#%$^)(*@#$" },
                new String[] { "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<NUM>", "<OTHER>" });
    }

    /**
     * With the {@code Latin-dont-break-on-hyphens.rbbi} rule file configured for
     * {@code Latn}, a single hyphen between letters keeps the word together
     * ({@code "One-two"}, {@code "brung-it"}), while trailing, leading and
     * doubled hyphens do not appear in the expected tokens.
     */
    @Test
    public void testTokenizeLatinDontBreakOnHyphens() throws Exception {
        Settings settings = Settings.builder()
                .put("rulefiles", "Latn:icu/Latin-dont-break-on-hyphens.rbbi")
                .build();
        Tokenizer stream = createIcuTokenizer(settings,
                "One-two punch. Brang-, not brung-it. This one--not that one--is the right one, -ish.");
        assertTokenStreamContents(stream,
                new String[] { "One-two", "punch",
                        "Brang", "not", "brung-it",
                        "This", "one", "not", "that", "one", "is", "the", "right", "one", "ish" });
    }

    /**
     * With {@code KeywordTokenizer.rbbi} configured for both {@code Cyrl} and
     * {@code Thai}, each Cyrillic or Thai run is expected as one token (including
     * its trailing space), while the English text is still split into words.
     */
    @Test
    public void testKeywordTokenizeCyrillicAndThai() throws Exception {
        Settings settings = Settings.builder()
                .put("rulefiles", "Cyrl:icu/KeywordTokenizer.rbbi,Thai:icu/KeywordTokenizer.rbbi")
                .build();
        Tokenizer stream = createIcuTokenizer(settings,
                "Some English. Немного русский. ข้อความภาษาไทยเล็ก ๆ น้อย ๆ More English.");
        assertTokenStreamContents(stream, new String[] { "Some", "English",
                "Немного русский. ",
                "ข้อความภาษาไทยเล็ก ๆ น้อย ๆ ",
                "More", "English" });
    }
}