package org.xbib.elasticsearch.index.analysis.german;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.junit.Assert;
import org.junit.Test;

import java.io.IOException;
import java.io.StringReader;

import static org.xbib.elasticsearch.MapperTestUtils.tokenFilterFactory;
import static org.xbib.elasticsearch.MapperTestUtils.tokenizerFactory;

/**
 * Tests for the German normalization ("umlaut") token filter:
 * umlauts are folded (ö to o, etc.) and ß is expanded to ss,
 * while other accented characters such as é are left untouched.
 */
public class GermanNormalizationTests extends Assert {

    @Test
    public void testGerman1() throws IOException {
        String source = "Ein schöner Tag in Köln im Café an der Straßenecke";
        String[] expected = {
                "Ein",
                "schoner",
                "Tag",
                "in",
                "Koln",
                "im",
                "Café",
                "an",
                "der",
                "Strassenecke"
        };
        String resource = "org/xbib/elasticsearch/index/analysis/german/german_normalization_analysis.json";
        TokenFilterFactory tokenFilter = tokenFilterFactory(resource, "umlaut");
        Tokenizer tokenizer = tokenizerFactory(resource, "standard").create();
        tokenizer.setReader(new StringReader(source));
        assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
    }

    /**
     * Consumes the token stream and checks that it emits exactly the
     * expected terms, in order.
     */
    private void assertSimpleTSOutput(TokenStream stream, String[] expected) throws IOException {
        stream.reset();
        CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
        assertNotNull(termAttr);
        int i = 0;
        while (stream.incrementToken()) {
            assertTrue(i < expected.length);
            assertEquals(expected[i], termAttr.toString());
            i++;
        }
        // JUnit convention: expected value first, actual value second
        assertEquals(expected.length, i);
        // per the TokenStream contract, end() is called once the stream is exhausted
        stream.end();
        stream.close();
    }
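    /**
     * A minimal additional sketch of a test, assuming the "umlaut" filter
     * configured in german_normalization_analysis.json applies Lucene-style
     * German normalization (ü to u, ß to ss), consistent with the
     * expectations in testGerman1 above. The input phrase is illustrative
     * only and not taken from the original suite.
     */
    @Test
    public void testGerman2() throws IOException {
        String source = "Grüße aus Münster";
        String[] expected = {
                "Grusse",
                "aus",
                "Munster"
        };
        String resource = "org/xbib/elasticsearch/index/analysis/german/german_normalization_analysis.json";
        TokenFilterFactory tokenFilter = tokenFilterFactory(resource, "umlaut");
        Tokenizer tokenizer = tokenizerFactory(resource, "standard").create();
        tokenizer.setReader(new StringReader(source));
        assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
    }
}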