package org.xbib.elasticsearch.index.analysis.decompound.patricia;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.junit.Assert;
import org.junit.Test;
import org.xbib.elasticsearch.MapperTestUtils;

import java.io.IOException;
import java.io.StringReader;

import static org.xbib.elasticsearch.MapperTestUtils.tokenFilterFactory;
import static org.xbib.elasticsearch.MapperTestUtils.tokenizerFactory;

/**
 * Tests for the patricia-trie based German decompounding token filter.
 */
public class DecompoundTokenFilterTests extends Assert {

    @Test
    public void test() throws IOException {
        String source = "Die Jahresfeier der Rechtsanwaltskanzleien auf dem Donaudampfschiff hat viel Ökosteuer gekostet";
        // By default, the filter emits each original token followed by its decompounded subwords.
        String[] expected = {
                "Die", "Die",
                "Jahresfeier", "Jahr", "feier",
                "der", "der",
                "Rechtsanwaltskanzleien", "Recht", "anwalt", "kanzlei",
                "auf", "auf",
                "dem", "dem",
                "Donaudampfschiff", "Donau", "dampf", "schiff",
                "hat", "hat",
                "viel", "viel",
                "Ökosteuer", "Ökosteuer",
                "gekostet", "gekosten"
        };
        String resource = "org/xbib/elasticsearch/index/analysis/decompound/patricia/decompound_analysis.json";
        TokenFilterFactory tokenFilter = tokenFilterFactory(resource, "decomp");
        Tokenizer tokenizer = tokenizerFactory(resource, "standard").create();
        tokenizer.setReader(new StringReader(source));
        assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
    }

    @Test
    public void testWithSubwordsOnly() throws IOException {
        String source = "Das ist ein Schlüsselwort, ein Bindestrichwort";
        // With the "with_subwords_only" analyzer, the original compound tokens are not emitted.
        String[] expected = {
                "Da",
                "ist",
                "ein",
                "Schlüssel",
                "wort",
                "ein",
                "Bindestrich",
                "wort"
        };
        String resource = "org/xbib/elasticsearch/index/analysis/decompound/patricia/keywords_analysis.json";
        Analyzer analyzer = MapperTestUtils.analyzer(resource, "with_subwords_only");
        assertNotNull(analyzer);
        assertSimpleTSOutput(analyzer.tokenStream("test-field", source), expected);
    }

    /**
     * Consumes the token stream and asserts that the emitted terms match the expected sequence exactly.
     */
    private void assertSimpleTSOutput(TokenStream stream, String[] expected) throws IOException {
        stream.reset();
        CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
        assertNotNull(termAttr);
        int i = 0;
        while (stream.incrementToken()) {
            assertTrue(i < expected.length);
            assertEquals(expected[i], termAttr.toString());
            i++;
        }
        assertEquals(expected.length, i);
        stream.close();
    }
}