package org.xbib.elasticsearch.index.analysis.hyphen;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.junit.Assert;
import org.junit.Test;

import java.io.IOException;
import java.io.StringReader;

import static org.xbib.elasticsearch.MapperTestUtils.analyzer;
import static org.xbib.elasticsearch.MapperTestUtils.tokenFilterFactory;
import static org.xbib.elasticsearch.MapperTestUtils.tokenizerFactory;

/**
 * Tests for the hyphen tokenizer, token filter, and analyzer.
 */
public class HyphenTokenizerTests extends Assert {

    @Test
    public void testOne() throws IOException {
        // a simple hyphenated compound is kept and decompounded
        String source = "Das ist ein Bindestrich-Wort.";
        String[] expected = {
            "Das", "ist", "ein", "Bindestrich-Wort", "BindestrichWort", "Wort", "Bindestrich"
        };
        String resource = "org/xbib/elasticsearch/index/analysis/hyphen/hyphen_tokenizer.json";
        Tokenizer tokenizer = tokenizerFactory(resource, "my_hyphen_tokenizer").create();
        tokenizer.setReader(new StringReader(source));
        TokenFilterFactory tokenFilter = tokenFilterFactory(resource, "hyphen");
        TokenStream tokenStream = tokenFilter.create(tokenizer);
        assertSimpleTSOutput(tokenStream, expected);
    }

    @Test
    public void testTwo() throws IOException {
        // single-letter prefix compounds like "E-Book",
        // using the my_icu_tokenizer definition from the same settings resource
        String source = "Das E-Book muss dringend zum Buchbinder.";
        String[] expected = {
            "Das", "E-Book", "EBook", "Book", "muss", "dringend", "zum", "Buchbinder"
        };
        String resource = "org/xbib/elasticsearch/index/analysis/hyphen/hyphen_tokenizer.json";
        Tokenizer tokenizer = tokenizerFactory(resource, "my_icu_tokenizer").create();
        tokenizer.setReader(new StringReader(source));
        TokenFilterFactory tokenFilter = tokenFilterFactory(resource, "hyphen");
        assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
    }

    @Test
    public void testThree() throws IOException {
        // multi-part compounds, hyphenated numbers, and dotted abbreviations
        String source = "Ich will nicht als Service-Center-Mitarbeiterin, sondern 100-prozentig als Dipl.-Ing. arbeiten!";
        String[] expected = {
            "Ich", "will", "nicht", "als",
            "Service-Center-Mitarbeiterin", "ServiceCenterMitarbeiterin", "Mitarbeiterin",
            "ServiceCenter", "ServiceCenter-Mitarbeiterin", "Center-Mitarbeiterin", "Service",
            "sondern", "100-prozentig", "als", "Dipl", "Ing", "arbeiten"
        };
        String resource = "org/xbib/elasticsearch/index/analysis/hyphen/hyphen_tokenizer.json";
        Tokenizer tokenizer = tokenizerFactory(resource, "my_hyphen_tokenizer").create();
        tokenizer.setReader(new StringReader(source));
        TokenFilterFactory tokenFilter = tokenFilterFactory(resource, "hyphen");
        assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
    }

    @Test
    public void testFour() throws IOException {
        // apostrophes are preserved, both inside words and in names
        String source = "So wird's was: das Elasticsearch-Buch erscheint beim O'Reilly-Verlag.";
        String[] expected = {
            "So", "wird's", "was", "das",
            "Elasticsearch-Buch", "ElasticsearchBuch", "Buch", "Elasticsearch",
            "erscheint", "beim", "O'Reilly-Verlag"
        };
        String resource = "org/xbib/elasticsearch/index/analysis/hyphen/hyphen_tokenizer.json";
        Tokenizer tokenizer = tokenizerFactory(resource, "my_hyphen_tokenizer").create();
        tokenizer.setReader(new StringReader(source));
        TokenFilterFactory tokenFilter = tokenFilterFactory(resource, "hyphen");
        assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
    }

    @Test
    public void testFive() throws IOException {
        // a hyphenated ISBN stays a single token
        String source = "978-1-4493-5854-9";
        String[] expected = {
            "978-1-4493-5854-9"
        };
        String resource = "org/xbib/elasticsearch/index/analysis/hyphen/hyphen_tokenizer.json";
        Tokenizer tokenizer = tokenizerFactory(resource, "my_hyphen_tokenizer").create();
        tokenizer.setReader(new StringReader(source));
        TokenFilterFactory tokenFilter = tokenFilterFactory(resource, "hyphen");
        assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
    }

    @Test
    public void testSix() throws IOException {
        // a single hyphenated word on its own
        String source = "E-Book";
        String[] expected = {
            "E-Book", "EBook", "Book"
        };
        String resource = "org/xbib/elasticsearch/index/analysis/hyphen/hyphen_tokenizer.json";
        Tokenizer tokenizer = tokenizerFactory(resource, "my_hyphen_tokenizer").create();
        tokenizer.setReader(new StringReader(source));
        TokenFilterFactory tokenFilter = tokenFilterFactory(resource, "hyphen");
        assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
    }

    @Test
    public void testSeven() throws IOException {
        // ampersands and free-standing hyphens
        String source = "Procter & Gamble ist Procter&Gamble. Schwarz - weiss ist schwarz-weiss";
        String[] expected = {
            "Procter", "Gamble", "ist", "Procter&Gamble",
            "Schwarz", "weiss", "ist",
            "schwarz-weiss", "schwarzweiss", "weiss", "schwarz"
        };
        String resource = "org/xbib/elasticsearch/index/analysis/hyphen/hyphen_tokenizer.json";
        Tokenizer tokenizer = tokenizerFactory(resource, "my_hyphen_tokenizer").create();
        tokenizer.setReader(new StringReader(source));
        TokenFilterFactory tokenFilter = tokenFilterFactory(resource, "hyphen");
        assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
    }

    @Test
    public void testEight() throws IOException {
        // configuration without subwords: combined forms are emitted,
        // but the compounds are not split into their parts
        String source = "Ich will nicht als Service-Center-Mitarbeiterin mit C++, sondern 100-prozentig als Dipl.-Ing. arbeiten!";
        String[] expected = {
            "Ich", "will", "nicht", "als",
            "Service-Center-Mitarbeiterin", "ServiceCenterMitarbeiterin",
            "mit", "C++", "sondern", "100-prozentig", "100prozentig",
            "als", "Dipl", "Ing", "arbeiten"
        };
        String resource = "org/xbib/elasticsearch/index/analysis/hyphen/hyphen_tokenizer_without_subwords.json";
        Tokenizer tokenizer = tokenizerFactory(resource, "my_hyphen_tokenizer").create();
        tokenizer.setReader(new StringReader(source));
        TokenFilterFactory tokenFilter = tokenFilterFactory(resource, "my_hyphen_tokenfilter");
        assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
    }

    @Test
    public void testNine() throws IOException {
        // sentence punctuation and dotted abbreviations like U.S.A.
        String source = "Das ist ein Punkt. Und noch ein Punkt für U.S.A. Oder? Nicht doch.";
        String[] expected = {
            "Das", "ist", "ein", "Punkt", "Und", "noch", "ein", "Punkt",
            "für", "U.S.A", "Oder", "Nicht", "doch"
        };
        String resource = "org/xbib/elasticsearch/index/analysis/hyphen/hyphen_tokenizer_without_subwords.json";
        Tokenizer tokenizer = tokenizerFactory(resource, "my_hyphen_tokenizer").create();
        tokenizer.setReader(new StringReader(source));
        TokenFilterFactory tokenFilter = tokenFilterFactory(resource, "my_hyphen_tokenfilter");
        assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
    }

    @Test
    public void testTen() throws IOException {
        // same input as testNine, driven through the complete analyzer
        // instead of a hand-wired tokenizer/filter chain
        String source = "Das ist ein Punkt. Und noch ein Punkt für U.S.A. Oder? Nicht doch.";
        String[] expected = {
            "Das", "ist", "ein", "Punkt", "Und", "noch", "ein", "Punkt",
            "für", "U.S.A", "Oder", "Nicht", "doch"
        };
        String resource = "org/xbib/elasticsearch/index/analysis/hyphen/hyphen_analyzer.json";
        Analyzer analyzer = analyzer(resource, "my_hyphen_analyzer");
        assertSimpleTSOutput(analyzer.tokenStream("text", new StringReader(source)), expected);
    }

    /**
     * Consumes the token stream and asserts that the emitted terms match
     * {@code expected}, in order and in number.
     */
    private void assertSimpleTSOutput(TokenStream stream, String[] expected) throws IOException {
        stream.reset();
        CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
        assertNotNull(termAttr);
        int i = 0;
        while (stream.incrementToken()) {
            assertTrue(i < expected.length);
            assertEquals(expected[i], termAttr.toString());
            i++;
        }
        assertEquals(expected.length, i);
        stream.close();
    }
}