package org.xbib.elasticsearch.index.analysis.icu; import com.ibm.icu.text.NumberFormat; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.TypeAttribute; import java.io.IOException; import java.text.ParsePosition; /** * */ public final class IcuNumberFormatTokenFilter extends TokenFilter { private final NumberFormat numberFormat; private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class); public IcuNumberFormatTokenFilter(TokenStream input, NumberFormat numberFormat) { super(input); this.numberFormat = numberFormat; } @Override public final boolean incrementToken() throws IOException { if (!input.incrementToken()) { return false; } else { String s = termAtt.toString(); ParsePosition parsePosition = new ParsePosition(0); Number result = numberFormat.parse(s, parsePosition); if (parsePosition.getIndex() > 0) { // zehn-tausend -> zehntausend // one hundred thousand -> onehundredthousand s = numberFormat.format(result).replaceAll("[\u00AD\u0020]", ""); } termAtt.setEmpty().append(s); typeAtt.setType("<ALPHANUM>"); return true; } } @Override public boolean equals(Object object) { return object instanceof IcuNumberFormatTokenFilter && numberFormat.equals(((IcuNumberFormatTokenFilter) object).numberFormat); } @Override public int hashCode() { return numberFormat.hashCode(); } }