/*
* Copyright (C) 2014 Jörg Prante
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published
* by the Free Software Foundation; either version 3 of the License, or
* (at your option) any later version.
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program; if not, see http://www.gnu.org/licenses
* or write to the Free Software Foundation, Inc., 51 Franklin Street,
* Fifth Floor, Boston, MA 02110-1301 USA.
*
* The interactive user interfaces in modified source and object code
* versions of this program must display Appropriate Legal Notices,
* as required under Section 5 of the GNU Affero General Public License.
*
*/
package org.xbib.elasticsearch.index.analysis.icu;
import com.ibm.icu.text.Collator;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.collation.CollationKeyAnalyzer;
import org.apache.lucene.collation.ICUCollationAttributeFactory;
import java.io.Reader;
/**
* <p>
* Configures {@link KeywordTokenizer} with {@link ICUCollationAttributeFactory}.
* </p>
* <p>
* Converts the token into its {@link com.ibm.icu.text.CollationKey}, and
* then encodes the CollationKey directly.
* </p>
* <p>
* <strong>WARNING:</strong> Make sure you use exactly the same Collator at
* index and query time -- CollationKeys are only comparable when produced by
* the same Collator. {@link com.ibm.icu.text.RuleBasedCollator}s are
* independently versioned, so it is safe to search against stored
* CollationKeys if the following are exactly the same (best practice is
* to store this information with the index and check that they remain the
* same at query time):
* </p>
* <ol>
* <li>
* Collator version - see {@link Collator#getVersion()}
* </li>
* <li>
* The collation strength used - see {@link Collator#setStrength(int)}
* </li>
* </ol>
* <p>
* CollationKeys generated by ICU Collators are not compatible with those
* generated by java.text.Collators. Specifically, if you use
* IcuCollationKeyAnalyzer to generate index terms, do not use
* {@link CollationKeyAnalyzer} on the query side, or vice versa.
* </p>
* <p>
* IcuCollationKeyAnalyzer is significantly faster and generates significantly
* shorter keys than CollationKeyAnalyzer. See
* <a href="http://site.icu-project.org/charts/collation-icu4j-sun"
* >http://site.icu-project.org/charts/collation-icu4j-sun</a> for key
* generation timing and key length comparisons between ICU4J and
* java.text.Collator over several languages.
* </p>
*/
public final class IcuCollationKeyAnalyzer extends Analyzer {

    /** Attribute factory built from the collator; handed to every tokenizer this analyzer creates. */
    private final ICUCollationAttributeFactory collationFactory;

    /**
     * Creates an analyzer backed by the given collator.
     *
     * @param collator the ICU collator passed to the {@link ICUCollationAttributeFactory}
     */
    public IcuCollationKeyAnalyzer(Collator collator) {
        collationFactory = new ICUCollationAttributeFactory(collator);
    }

    /**
     * Builds the analysis components for a field: a single {@link KeywordTokenizer}
     * that is passed as both arguments of the returned {@link TokenStreamComponents}.
     *
     * @param fieldName name of the field being analyzed (not consulted here)
     * @param reader the input the tokenizer reads from
     * @return components wrapping the newly created keyword tokenizer
     */
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        final KeywordTokenizer wholeInput = newKeywordTokenizer(reader);
        // The same instance is used for both constructor arguments; no filters are added.
        return new TokenStreamComponents(wholeInput, wholeInput);
    }

    /** Creates a keyword tokenizer over {@code reader} using the collation attribute factory and the default buffer size. */
    private KeywordTokenizer newKeywordTokenizer(Reader reader) {
        return new KeywordTokenizer(collationFactory, reader, KeywordTokenizer.DEFAULT_BUFFER_SIZE);
    }
}