package org.xbib.elasticsearch.index.analysis.icu; import com.ibm.icu.text.FilteredNormalizer2; import com.ibm.icu.text.Normalizer2; import com.ibm.icu.text.UnicodeSet; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; import org.elasticsearch.index.analysis.AbstractCharFilterFactory; import org.elasticsearch.index.analysis.MultiTermAwareComponent; import java.io.InputStream; import java.io.Reader; /** * */ public class IcuNormalizerCharFilterFactory extends AbstractCharFilterFactory implements MultiTermAwareComponent { private final Normalizer2 normalizer; public IcuNormalizerCharFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { super(indexSettings, name); Normalizer2 base = Normalizer2.getInstance(getNormalizationResource(settings), getNormalizationName(settings), getNormalizationMode(settings)); String unicodeSetFilter = settings.get("unicodeSetFilter"); this.normalizer = unicodeSetFilter != null ? new FilteredNormalizer2(base, new UnicodeSet(unicodeSetFilter).freeze()) : base; } @Override public Reader create(Reader reader) { return new IcuNormalizerCharFilter(reader, normalizer); } @Override public Object getMultiTermComponent() { return this; } protected InputStream getNormalizationResource(Settings settings) { return null; } protected String getNormalizationName(Settings settings) { return settings.get("name", "nfkc_cf"); } protected Normalizer2.Mode getNormalizationMode(Settings settings) { Normalizer2.Mode normalizationMode; switch (settings.get("mode", "compose")) { case "compose_contiguous": normalizationMode = Normalizer2.Mode.COMPOSE_CONTIGUOUS; break; case "decompose": normalizationMode = Normalizer2.Mode.DECOMPOSE; break; case "fcd": normalizationMode = Normalizer2.Mode.FCD; break; default: normalizationMode = Normalizer2.Mode.COMPOSE; break; } return normalizationMode; } }