/* * Licensed to Elasticsearch under one or more contributor * license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright * ownership. Elasticsearch licenses this file to you under * the Apache License, Version 2.0 (the "License"); you may * not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ package org.elasticsearch.index.mapper; import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder; import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked; import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertHitCount; import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertNoFailures; import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertOrderedSearchHits; import com.ibm.icu.text.Collator; import com.ibm.icu.text.RuleBasedCollator; import com.ibm.icu.util.ULocale; import org.elasticsearch.action.search.SearchRequest; import org.elasticsearch.action.search.SearchResponse; import org.elasticsearch.common.xcontent.XContentBuilder; import org.elasticsearch.common.xcontent.XContentType; import org.elasticsearch.index.query.QueryBuilders; import org.elasticsearch.plugin.analysis.icu.AnalysisICUPlugin; import org.elasticsearch.plugins.Plugin; import org.elasticsearch.search.builder.SearchSourceBuilder; import org.elasticsearch.search.sort.SortOrder; import org.elasticsearch.test.ESIntegTestCase; import java.util.Collection; import java.util.Collections; public class ICUCollationKeywordFieldMapperIT extends ESIntegTestCase { @Override protected Collection<Class<? extends Plugin>> nodePlugins() { return Collections.singletonList(AnalysisICUPlugin.class); } /* * Turkish has some funny casing. * This test shows how you can solve this kind of thing easily with collation. * Instead of using LowerCaseFilter, use a turkish collator with primary strength. * Then things will sort and match correctly. */ public void testBasicUsage() throws Exception { String index = "foo"; String type = "mytype"; String[] equilavent = {"I WİLL USE TURKİSH CASING", "ı will use turkish casıng"}; XContentBuilder builder = jsonBuilder() .startObject().startObject("properties") .startObject("collate") .field("type", "icu_collation_keyword") .field("language", "tr") .field("strength", "primary") .endObject() .endObject().endObject(); assertAcked(client().admin().indices().prepareCreate(index).addMapping(type, builder)); // both values should collate to same value indexRandom(true, client().prepareIndex(index, type, "1").setSource("{\"collate\":\"" + equilavent[0] + "\"}", XContentType.JSON), client().prepareIndex(index, type, "2").setSource("{\"collate\":\"" + equilavent[1] + "\"}", XContentType.JSON) ); // searching for either of the terms should return both results since they collate to the same value SearchRequest request = new SearchRequest() .indices(index) .types(type) .source(new SearchSourceBuilder() .fetchSource(false) .query(QueryBuilders.termQuery("collate", randomBoolean() ? equilavent[0] : equilavent[1])) .sort("collate") .sort("_uid", SortOrder.DESC) // secondary sort should kick in because both will collate to same value ); SearchResponse response = client().search(request).actionGet(); assertNoFailures(response); assertHitCount(response, 2L); assertOrderedSearchHits(response, "2", "1"); } /* * Test usage of the decomposition option for unicode normalization. */ public void testNormalization() throws Exception { String index = "foo"; String type = "mytype"; String[] equilavent = {"I W\u0049\u0307LL USE TURKİSH CASING", "ı will use turkish casıng"}; XContentBuilder builder = jsonBuilder() .startObject().startObject("properties") .startObject("collate") .field("type", "icu_collation_keyword") .field("language", "tr") .field("strength", "primary") .field("decomposition", "canonical") .endObject() .endObject().endObject(); assertAcked(client().admin().indices().prepareCreate(index).addMapping(type, builder)); indexRandom(true, client().prepareIndex(index, type, "1").setSource("{\"collate\":\"" + equilavent[0] + "\"}", XContentType.JSON), client().prepareIndex(index, type, "2").setSource("{\"collate\":\"" + equilavent[1] + "\"}", XContentType.JSON) ); // searching for either of the terms should return both results since they collate to the same value SearchRequest request = new SearchRequest() .indices(index) .types(type) .source(new SearchSourceBuilder() .fetchSource(false) .query(QueryBuilders.termQuery("collate", randomBoolean() ? equilavent[0] : equilavent[1])) .sort("collate") .sort("_uid", SortOrder.DESC) // secondary sort should kick in because both will collate to same value ); SearchResponse response = client().search(request).actionGet(); assertNoFailures(response); assertHitCount(response, 2L); assertOrderedSearchHits(response, "2", "1"); } /* * Test secondary strength, for english case is not significant. */ public void testSecondaryStrength() throws Exception { String index = "foo"; String type = "mytype"; String[] equilavent = {"TESTING", "testing"}; XContentBuilder builder = jsonBuilder() .startObject().startObject("properties") .startObject("collate") .field("type", "icu_collation_keyword") .field("language", "en") .field("strength", "secondary") .field("decomposition", "no") .endObject() .endObject().endObject(); assertAcked(client().admin().indices().prepareCreate(index).addMapping(type, builder)); indexRandom(true, client().prepareIndex(index, type, "1").setSource("{\"collate\":\"" + equilavent[0] + "\"}", XContentType.JSON), client().prepareIndex(index, type, "2").setSource("{\"collate\":\"" + equilavent[1] + "\"}", XContentType.JSON) ); SearchRequest request = new SearchRequest() .indices(index) .types(type) .source(new SearchSourceBuilder() .fetchSource(false) .query(QueryBuilders.termQuery("collate", randomBoolean() ? equilavent[0] : equilavent[1])) .sort("collate") .sort("_uid", SortOrder.DESC) // secondary sort should kick in because both will collate to same value ); SearchResponse response = client().search(request).actionGet(); assertNoFailures(response); assertHitCount(response, 2L); assertOrderedSearchHits(response, "2", "1"); } /* * Setting alternate=shifted to shift whitespace, punctuation and symbols * to quaternary level */ public void testIgnorePunctuation() throws Exception { String index = "foo"; String type = "mytype"; String[] equilavent = {"foo-bar", "foo bar"}; XContentBuilder builder = jsonBuilder() .startObject().startObject("properties") .startObject("collate") .field("type", "icu_collation_keyword") .field("language", "en") .field("strength", "primary") .field("alternate", "shifted") .endObject() .endObject().endObject(); assertAcked(client().admin().indices().prepareCreate(index).addMapping(type, builder)); indexRandom(true, client().prepareIndex(index, type, "1").setSource("{\"collate\":\"" + equilavent[0] + "\"}", XContentType.JSON), client().prepareIndex(index, type, "2").setSource("{\"collate\":\"" + equilavent[1] + "\"}", XContentType.JSON) ); SearchRequest request = new SearchRequest() .indices(index) .types(type) .source(new SearchSourceBuilder() .fetchSource(false) .query(QueryBuilders.termQuery("collate", randomBoolean() ? equilavent[0] : equilavent[1])) .sort("collate") .sort("_uid", SortOrder.DESC) // secondary sort should kick in because both will collate to same value ); SearchResponse response = client().search(request).actionGet(); assertNoFailures(response); assertHitCount(response, 2L); assertOrderedSearchHits(response, "2", "1"); } /* * Setting alternate=shifted and variableTop to shift whitespace, but not * punctuation or symbols, to quaternary level */ public void testIgnoreWhitespace() throws Exception { String index = "foo"; String type = "mytype"; XContentBuilder builder = jsonBuilder() .startObject().startObject("properties") .startObject("collate") .field("type", "icu_collation_keyword") .field("language", "en") .field("strength", "primary") .field("alternate", "shifted") .field("variable_top", " ") .field("index", false) .endObject() .endObject().endObject(); assertAcked(client().admin().indices().prepareCreate(index).addMapping(type, builder)); indexRandom(true, client().prepareIndex(index, type, "1").setSource("{\"collate\":\"foo bar\"}", XContentType.JSON), client().prepareIndex(index, type, "2").setSource("{\"collate\":\"foobar\"}", XContentType.JSON), client().prepareIndex(index, type, "3").setSource("{\"collate\":\"foo-bar\"}", XContentType.JSON) ); SearchRequest request = new SearchRequest() .indices(index) .types(type) .source(new SearchSourceBuilder() .fetchSource(false) .sort("collate", SortOrder.ASC) .sort("_uid", SortOrder.ASC) // secondary sort should kick in on docs 1 and 3 because same value collate value ); SearchResponse response = client().search(request).actionGet(); assertNoFailures(response); assertHitCount(response, 3L); assertOrderedSearchHits(response, "3", "1", "2"); } /* * Setting numeric to encode digits with numeric value, so that * foobar-9 sorts before foobar-10 */ public void testNumerics() throws Exception { String index = "foo"; String type = "mytype"; XContentBuilder builder = jsonBuilder() .startObject().startObject("properties") .startObject("collate") .field("type", "icu_collation_keyword") .field("language", "en") .field("numeric", true) .field("index", false) .endObject() .endObject().endObject(); assertAcked(client().admin().indices().prepareCreate(index).addMapping(type, builder)); indexRandom(true, client().prepareIndex(index, type, "1").setSource("{\"collate\":\"foobar-10\"}", XContentType.JSON), client().prepareIndex(index, type, "2").setSource("{\"collate\":\"foobar-9\"}", XContentType.JSON) ); SearchRequest request = new SearchRequest() .indices(index) .types(type) .source(new SearchSourceBuilder() .fetchSource(false) .sort("collate", SortOrder.ASC) ); SearchResponse response = client().search(request).actionGet(); assertNoFailures(response); assertHitCount(response, 2L); assertOrderedSearchHits(response, "2", "1"); } /* * Setting caseLevel=true to create an additional case level between * secondary and tertiary */ public void testIgnoreAccentsButNotCase() throws Exception { String index = "foo"; String type = "mytype"; XContentBuilder builder = jsonBuilder() .startObject().startObject("properties") .startObject("collate") .field("type", "icu_collation_keyword") .field("language", "en") .field("strength", "primary") .field("case_level", true) .field("index", false) .endObject() .endObject().endObject(); assertAcked(client().admin().indices().prepareCreate(index).addMapping(type, builder)); indexRandom(true, client().prepareIndex(index, type, "1").setSource("{\"collate\":\"résumé\"}", XContentType.JSON), client().prepareIndex(index, type, "2").setSource("{\"collate\":\"Resume\"}", XContentType.JSON), client().prepareIndex(index, type, "3").setSource("{\"collate\":\"resume\"}", XContentType.JSON), client().prepareIndex(index, type, "4").setSource("{\"collate\":\"Résumé\"}", XContentType.JSON) ); SearchRequest request = new SearchRequest() .indices(index) .types(type) .source(new SearchSourceBuilder() .fetchSource(false) .sort("collate", SortOrder.ASC) .sort("_uid", SortOrder.DESC) ); SearchResponse response = client().search(request).actionGet(); assertNoFailures(response); assertHitCount(response, 4L); assertOrderedSearchHits(response, "3", "1", "4", "2"); } /* * Setting caseFirst=upper to cause uppercase strings to sort * before lowercase ones. */ public void testUpperCaseFirst() throws Exception { String index = "foo"; String type = "mytype"; XContentBuilder builder = jsonBuilder() .startObject().startObject("properties") .startObject("collate") .field("type", "icu_collation_keyword") .field("language", "en") .field("strength", "tertiary") .field("case_first", "upper") .field("index", false) .endObject() .endObject().endObject(); assertAcked(client().admin().indices().prepareCreate(index).addMapping(type, builder)); indexRandom(true, client().prepareIndex(index, type, "1").setSource("{\"collate\":\"resume\"}", XContentType.JSON), client().prepareIndex(index, type, "2").setSource("{\"collate\":\"Resume\"}", XContentType.JSON) ); SearchRequest request = new SearchRequest() .indices(index) .types(type) .source(new SearchSourceBuilder() .fetchSource(false) .sort("collate", SortOrder.ASC) ); SearchResponse response = client().search(request).actionGet(); assertNoFailures(response); assertHitCount(response, 2L); assertOrderedSearchHits(response, "2", "1"); } /* * For german, you might want oe to sort and match with o umlaut. * This is not the default, but you can make a customized ruleset to do this. * * The default is DIN 5007-1, this shows how to tailor a collator to get DIN 5007-2 behavior. * http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=4423383 */ public void testCustomRules() throws Exception { String index = "foo"; String type = "mytype"; RuleBasedCollator baseCollator = (RuleBasedCollator) Collator.getInstance(new ULocale("de_DE")); String DIN5007_2_tailorings = "& ae , a\u0308 & AE , A\u0308" + "& oe , o\u0308 & OE , O\u0308" + "& ue , u\u0308 & UE , u\u0308"; RuleBasedCollator tailoredCollator = new RuleBasedCollator(baseCollator.getRules() + DIN5007_2_tailorings); String tailoredRules = tailoredCollator.getRules(); String[] equilavent = {"Töne", "Toene"}; XContentBuilder builder = jsonBuilder() .startObject().startObject("properties") .startObject("collate") .field("type", "icu_collation_keyword") .field("rules", tailoredRules) .field("strength", "primary") .endObject() .endObject().endObject(); assertAcked(client().admin().indices().prepareCreate(index).addMapping(type, builder)); indexRandom(true, client().prepareIndex(index, type, "1").setSource("{\"collate\":\"" + equilavent[0] + "\"}", XContentType.JSON), client().prepareIndex(index, type, "2").setSource("{\"collate\":\"" + equilavent[1] + "\"}", XContentType.JSON) ); SearchRequest request = new SearchRequest() .indices(index) .types(type) .source(new SearchSourceBuilder() .fetchSource(false) .query(QueryBuilders.termQuery("collate", randomBoolean() ? equilavent[0] : equilavent[1])) .sort("collate", SortOrder.ASC) .sort("_uid", SortOrder.DESC) // secondary sort should kick in because both will collate to same value ); SearchResponse response = client().search(request).actionGet(); assertNoFailures(response); assertHitCount(response, 2L); assertOrderedSearchHits(response, "2", "1"); } }