package org.xbib.elasticsearch.index.analysis.icu;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.junit.Ignore;
import org.junit.Test;

import java.io.IOException;
import java.io.StringReader;

import static org.junit.Assert.*;
import static org.xbib.elasticsearch.MapperTestUtils.tokenizerFactory;

/**
 * Tests for the "my_hyphen_icu_tokenizer" ICU tokenizer, which is configured to keep
 * hyphenated words and hyphenated identifiers (such as ISBNs) as single tokens.
 */
public class IcuTokenizerTests {

    @Test
    public void testLetterNonBreak() throws IOException {
        String source = "Das ist ein Bindestrich-Wort, oder etwa nicht? Jetzt kommen wir zum Ende.";
        String[] expected = {
                "Das", "ist", "ein", "Bindestrich-Wort", "oder", "etwa", "nicht",
                "Jetzt", "kommen", "wir", "zum", "Ende"
        };
        String resource = "org/xbib/elasticsearch/index/analysis/icu/icu_tokenizer.json";
        Tokenizer tokenizer = tokenizerFactory(resource, "my_hyphen_icu_tokenizer").create();
        tokenizer.setReader(new StringReader(source));
        assertSimpleTSOutput(tokenizer, expected);
    }

    @Test
    public void testIdentifierNonBreak() throws IOException {
        String source = "ISBN 3-428-84350-9";
        String[] expected = {"ISBN", "3-428-84350-9"};
        String resource = "org/xbib/elasticsearch/index/analysis/icu/icu_tokenizer.json";
        Tokenizer tokenizer = tokenizerFactory(resource, "my_hyphen_icu_tokenizer").create();
        tokenizer.setReader(new StringReader(source));
        assertSimpleTSOutput(tokenizer, expected);
    }

    @Test
    @Ignore
    public void testIdentifierNonBreakSingleToken() throws IOException {
        String source = "3-428-84350-9";
        String[] expected = {"3-428-84350-9"};
        String resource = "org/xbib/elasticsearch/index/analysis/icu/icu_tokenizer.json";
        Tokenizer tokenizer = tokenizerFactory(resource, "my_hyphen_icu_tokenizer").create();
        tokenizer.setReader(new StringReader(source));
        // Known failure, hence @Ignore: when the hyphenated identifier is the only token
        // in the input, only the leading digit is emitted. The cause is not yet understood.
        // expected:<3[-428-84350-9]> but was:<3[]>
        assertSimpleTSOutput(tokenizer, expected);
    }

    @Test
    public void testIdentifierNonBreakSpaceTwoTokens() throws IOException {
        String source = "Binde-strich-wort 3-428-84350-9";
        String[] expected = {"Binde-strich-wort", "3-428-84350-9"};
        String resource = "org/xbib/elasticsearch/index/analysis/icu/icu_tokenizer.json";
        Tokenizer tokenizer = tokenizerFactory(resource, "my_hyphen_icu_tokenizer").create();
        tokenizer.setReader(new StringReader(source));
        assertSimpleTSOutput(tokenizer, expected);
    }

    /**
     * Consumes the token stream and asserts that the emitted terms match the
     * expected tokens in both order and number.
     */
    private void assertSimpleTSOutput(TokenStream stream, String[] expected) throws IOException {
        stream.reset();
        CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
        assertNotNull(termAttr);
        int i = 0;
        while (stream.incrementToken()) {
            assertTrue(i < expected.length);
            assertEquals(expected[i], termAttr.toString());
            i++;
        }
        // expected value first, actual value second
        assertEquals(expected.length, i);
        stream.end();
        stream.close();
    }
}
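
/*
 * For reference, a minimal sketch of what the icu_tokenizer.json test resource loaded
 * above could look like. This is an assumption for illustration, not the actual resource
 * shipped with this repository: the tokenizer name "my_hyphen_icu_tokenizer" comes from
 * the tests, while the "icu_tokenizer" type and "rule_files" setting follow the standard
 * ICU analysis plugin conventions, and the RBBI rules file name (a Latin-script break
 * rules file that treats '-' as word-internal) is hypothetical.
 *
 * {
 *   "index": {
 *     "analysis": {
 *       "tokenizer": {
 *         "my_hyphen_icu_tokenizer": {
 *           "type": "icu_tokenizer",
 *           "rule_files": "Latn:icu/Latin-dont-break-on-hyphens.rbbi"
 *         }
 *       }
 *     }
 *   }
 * }
 *
 * With such rules in place, "Bindestrich-Wort" and "3-428-84350-9" stay single tokens,
 * which is exactly what the tests above assert.
 */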