package org.xbib.elasticsearch.index.analysis.icu;
import com.ibm.icu.text.Collator;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.collation.CollationKeyAnalyzer;
/**
* Configures a {@link KeywordTokenizer} with an {@link IcuCollationAttributeFactory}.
* <p>
* Converts the token into its {@link com.ibm.icu.text.CollationKey} and
* then encodes the CollationKey directly.
* </p>
* <p>
* <strong>WARNING:</strong> Make sure you use exactly the same Collator at
* index and query time -- CollationKeys are only comparable when produced by
* the same Collator. {@link com.ibm.icu.text.RuleBasedCollator}s are
* independently versioned, so it is safe to search against stored
* CollationKeys if the following are exactly the same (best practice is
* to store this information with the index and check that they remain the
* same at query time):
* </p>
* <ol>
* <li>
* Collator version - see {@link Collator#getVersion()}
* </li>
* <li>
* The collation strength used - see {@link Collator#setStrength(int)}
* </li>
* </ol>
* <p>
* CollationKeys generated by ICU Collators are not compatible with those
* generated by java.text.Collators. Specifically, if you use
* IcuCollationKeyAnalyzer to generate index terms, do not use
* {@link CollationKeyAnalyzer} on the query side, or vice versa.
* </p>
* <p>
* IcuCollationKeyAnalyzer is significantly faster and generates significantly
* shorter keys than CollationKeyAnalyzer. See
* <a href="http://site.icu-project.org/charts/collation-icu4j-sun"
* >http://site.icu-project.org/charts/collation-icu4j-sun</a> for key
* generation timing and key length comparisons between ICU4J and
* java.text.Collator over several languages.
* </p>
*/
public final class IcuCollationKeyAnalyzer extends Analyzer {

    /** Attribute factory built from the collator handed to the constructor. */
    private final IcuCollationAttributeFactory attributeFactory;

    /**
     * Creates an analyzer backed by the given collator.
     *
     * @param collator the ICU collator passed to a new {@link IcuCollationAttributeFactory}
     */
    public IcuCollationKeyAnalyzer(Collator collator) {
        attributeFactory = new IcuCollationAttributeFactory(collator);
    }

    /**
     * Builds the analysis chain: a single {@link KeywordTokenizer} created with this
     * analyzer's attribute factory and the tokenizer's default buffer size.
     *
     * @param fieldName the field being analyzed; not consulted by this implementation
     * @return components in which the keyword tokenizer is passed as both constructor arguments
     */
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        final KeywordTokenizer keywordTokenizer =
                new KeywordTokenizer(attributeFactory, KeywordTokenizer.DEFAULT_BUFFER_SIZE);
        return new TokenStreamComponents(keywordTokenizer, keywordTokenizer);
    }
}