package org.xbib.elasticsearch.index.analysis.icu;

import com.ibm.icu.lang.UScript;
import com.ibm.icu.text.Collator;
import com.ibm.icu.text.RuleBasedCollator;
import com.ibm.icu.util.ULocale;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractIndexAnalyzerProvider;

import java.util.ArrayList;
import java.util.List;
import java.util.Locale;

/**
 * An ICU collation analyzer provider.
 *
 * <p>The collator is selected from the analyzer settings, in this order of precedence:
 * <ol>
 * <li>{@code rules}: the collation rules themselves, embedded in the settings, as defined in the
 * <a href="http://www.icu-project.org/userguide/Collate_Customization.html">Collation customization</a>
 * chapter of the ICU docs. The value is handed to {@link RuleBasedCollator} as-is; this class does
 * not resolve it against a file or any other external location.</li>
 * <li>{@code locale}: a complete ICU locale identifier.</li>
 * <li>{@code language}, optionally refined by {@code country} and, if a country is present,
 * {@code variant}. The {@code language} parameter is the lowercase two-letter ISO-639 code.</li>
 * <li>None of the above: {@link Collator#getInstance()} is used.</li>
 * </ol>
 *
 * <p>The optional settings {@code strength} and {@code decomposition} are then applied to any
 * collator. The settings {@code alternate}, {@code caseLevel}, {@code caseFirst}, {@code numeric},
 * {@code variableTop} and {@code reorder} are applied only if the collator is a
 * {@link RuleBasedCollator}. Keyword values are matched case-insensitively and independently of
 * the JVM default locale.
 */
public class IcuCollationKeyAnalyzerProvider extends AbstractIndexAnalyzerProvider<IcuCollationKeyAnalyzer> {

    private final Collator collator;

    /**
     * Creates the provider and builds its collator from {@code settings}.
     *
     * @param indexSettings the index settings, passed to the superclass
     * @param environment   the node environment; not read by this provider
     * @param name          the analyzer name, passed to the superclass
     * @param settings      the analyzer settings the collator is configured from
     * @throws ElasticsearchException if a collation setting has an unsupported value
     */
    public IcuCollationKeyAnalyzerProvider(IndexSettings indexSettings, Environment environment, String name,
                                           Settings settings) {
        super(indexSettings, name, settings);
        this.collator = createCollator(settings);
    }

    /**
     * Builds a collator from the given settings and returns the result of freezing it.
     *
     * @param settings the settings to read the collation options from
     * @return the value returned by {@code freeze()} on the configured collator
     * @throws ElasticsearchException if the rules cannot be parsed or if {@code strength},
     *                                {@code decomposition}, {@code alternate}, {@code caseFirst}
     *                                or a {@code reorder} entry has an unsupported value
     */
    public static Collator createCollator(Settings settings) {
        Collator collator = createBaseCollator(settings);
        applyStrength(collator, settings);
        applyDecomposition(collator, settings);
        if (!(collator instanceof RuleBasedCollator)) {
            // The remaining options only exist on RuleBasedCollator.
            return collator.freeze();
        }
        RuleBasedCollator rbc = (RuleBasedCollator) collator;
        applyAlternate(rbc, settings);
        applyCaseOptions(rbc, settings);
        Boolean numeric = settings.getAsBoolean("numeric", null);
        if (numeric != null) {
            rbc.setNumericCollation(numeric);
        }
        // Always invoked; falls back to ReorderCodes.DEFAULT when "variableTop" is not configured.
        int maxVariable = settings.getAsInt("variableTop", Collator.ReorderCodes.DEFAULT);
        rbc.setMaxVariable(maxVariable);
        applyReorderCodes(rbc, settings);
        return rbc.freeze();
    }

    /** Selects the initial collator from "rules", "locale", "language"/"country"/"variant" or the default. */
    private static Collator createBaseCollator(Settings settings) {
        String rules = settings.get("rules");
        if (rules != null) {
            try {
                return new RuleBasedCollator(rules);
            } catch (Exception e) {
                throw new ElasticsearchException("Failed to parse collation rules", e);
            }
        }
        String localeStr = settings.get("locale");
        if (localeStr != null) {
            return Collator.getInstance(new ULocale(localeStr));
        }
        String language = settings.get("language");
        if (language == null) {
            return Collator.getInstance();
        }
        String country = settings.get("country");
        if (country == null) {
            return Collator.getInstance(new ULocale(language));
        }
        // A variant is only honoured together with a country.
        String variant = settings.get("variant");
        ULocale locale = variant != null
                ? new ULocale(language, country, variant)
                : new ULocale(language, country);
        return Collator.getInstance(locale);
    }

    /** Applies the "strength" setting, if present; otherwise the collator keeps its own strength. */
    private static void applyStrength(Collator collator, Settings settings) {
        String strength = settings.get("strength");
        if (strength == null) {
            return;
        }
        int value;
        // Locale.ROOT: with the default locale, e.g. Turkish, "IDENTICAL" would lowercase
        // to a dotless i and be rejected.
        switch (strength.toLowerCase(Locale.ROOT)) {
            case "primary":
                value = Collator.PRIMARY;
                break;
            case "secondary":
                value = Collator.SECONDARY;
                break;
            case "tertiary":
                value = Collator.TERTIARY;
                break;
            case "quaternary":
                value = Collator.QUATERNARY;
                break;
            case "identical":
                value = Collator.IDENTICAL;
                break;
            default:
                throw new ElasticsearchException("Invalid strength: " + strength);
        }
        collator.setStrength(value);
    }

    /** Applies the "decomposition" setting ("no" or "canonical"), if present. */
    private static void applyDecomposition(Collator collator, Settings settings) {
        String decomposition = settings.get("decomposition");
        if (decomposition == null) {
            return;
        }
        if ("no".equalsIgnoreCase(decomposition)) {
            collator.setDecomposition(Collator.NO_DECOMPOSITION);
        } else if ("canonical".equalsIgnoreCase(decomposition)) {
            collator.setDecomposition(Collator.CANONICAL_DECOMPOSITION);
        } else {
            throw new ElasticsearchException("Invalid decomposition: " + decomposition);
        }
    }

    /** Applies the "alternate" setting ("shifted" or "non-ignorable"), if present. */
    private static void applyAlternate(RuleBasedCollator rbc, Settings settings) {
        String alternate = settings.get("alternate");
        if (alternate == null) {
            return;
        }
        if ("shifted".equalsIgnoreCase(alternate)) {
            rbc.setAlternateHandlingShifted(true);
        } else if ("non-ignorable".equalsIgnoreCase(alternate)) {
            rbc.setAlternateHandlingShifted(false);
        } else {
            throw new ElasticsearchException("Invalid alternate: " + alternate);
        }
    }

    /** Applies the "caseLevel" (boolean) and "caseFirst" ("lower" or "upper") settings, if present. */
    private static void applyCaseOptions(RuleBasedCollator rbc, Settings settings) {
        Boolean caseLevel = settings.getAsBoolean("caseLevel", null);
        if (caseLevel != null) {
            rbc.setCaseLevel(caseLevel);
        }
        String caseFirst = settings.get("caseFirst");
        if (caseFirst == null) {
            return;
        }
        if ("lower".equalsIgnoreCase(caseFirst)) {
            rbc.setLowerCaseFirst(true);
        } else if ("upper".equalsIgnoreCase(caseFirst)) {
            rbc.setUpperCaseFirst(true);
        } else {
            throw new ElasticsearchException("invalid caseFirst: " + caseFirst);
        }
    }

    /** Applies the "reorder" array setting, if it has at least one entry. */
    private static void applyReorderCodes(RuleBasedCollator rbc, Settings settings) {
        String[] reorderStrings = settings.getAsArray("reorder");
        if (reorderStrings.length == 0) {
            return;
        }
        List<Integer> codes = new ArrayList<>(reorderStrings.length);
        for (String name : reorderStrings) {
            codes.add(parseReorderCode(name));
        }
        rbc.setReorderCodes(codes.stream().mapToInt(Integer::intValue).toArray());
    }

    /** Maps one "reorder" entry to a {@code Collator.ReorderCodes} constant or, failing that, a script code. */
    private static int parseReorderCode(String name) {
        // Locale.ROOT keeps keyword matching independent of the JVM default locale.
        switch (name.toLowerCase(Locale.ROOT)) {
            case "currency":
                return Collator.ReorderCodes.CURRENCY;
            case "default":
                return Collator.ReorderCodes.DEFAULT;
            case "digit":
                return Collator.ReorderCodes.DIGIT;
            case "first":
                return Collator.ReorderCodes.FIRST;
            case "none":
                return Collator.ReorderCodes.NONE;
            case "others":
                return Collator.ReorderCodes.OTHERS;
            case "punctuation":
                return Collator.ReorderCodes.PUNCTUATION;
            case "space":
                return Collator.ReorderCodes.SPACE;
            case "symbol":
                return Collator.ReorderCodes.SYMBOL;
            default:
                // Not a keyword: look the entry up, as written, as a script name.
                int code = UScript.getCodeFromName(name);
                if (code == UScript.INVALID_CODE) {
                    throw new ElasticsearchException("invalid reorder code: " + name);
                }
                return code;
        }
    }

    /**
     * Creates an analyzer backed by this provider's collator.
     *
     * @return a new {@link IcuCollationKeyAnalyzer} on every call
     */
    @Override
    public IcuCollationKeyAnalyzer get() {
        return new IcuCollationKeyAnalyzer(collator);
    }
}