/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.solr.search; import org.apache.solr.SolrTestCaseJ4; import org.junit.BeforeClass; import org.junit.Test; public class TestFoldingMultitermQuery extends SolrTestCaseJ4 { public String getCoreName() { return "basic"; } @BeforeClass public static void beforeTests() throws Exception { initCore("solrconfig-basic.xml", "schema-folding.xml"); String docs[] = { "abcdefg1 finger", "gangs hijklmn1", "opqrstu1 zilly", }; // prepare the index for (int i = 0; i < docs.length; i++) { String num = Integer.toString(i); String boolVal = ((i % 2) == 0) ? "true" : "false"; assertU(adoc("id", num, "int_f", num, "float_f", num, "long_f", num, "double_f", num, "bool_f", boolVal, "date_f", "200" + Integer.toString(i % 10) + "-01-01T00:00:00Z", "content", docs[i], "content_ws", docs[i], "content_rev", docs[i], "content_multi", docs[i], "content_lower_token", docs[i], "content_oldstyle", docs[i], "content_charfilter", docs[i], "content_multi_bad", docs[i], "content_straight", docs[i], "content_lower", docs[i], "content_folding", docs[i], "content_stemming", docs[i], "content_keyword", docs[i] )); } // Mixing and matching amongst various languages is probalby a bad thing, so add some tests for various // special filters int idx = docs.length; // Greek assertU(adoc("id", Integer.toString(idx++), "content_greek", "Μάϊος")); assertU(adoc("id", Integer.toString(idx++), "content_greek", "ΜΆΪΟΣ")); // Turkish assertU(adoc("id", Integer.toString(idx++), "content_turkish", "\u0130STANBUL")); assertU(adoc("id", Integer.toString(idx++), "content_turkish", "ISPARTA")); assertU(adoc("id", Integer.toString(idx++), "content_turkish", "izmir")); // Russian normalization assertU(adoc("id", Integer.toString(idx++), "content_russian", "электромагнитной")); assertU(adoc("id", Integer.toString(idx++), "content_russian", "Вместе")); assertU(adoc("id", Integer.toString(idx++), "content_russian", "силе")); // persian normalization assertU(adoc("id", Integer.toString(idx++), "content_persian", "هاي")); // arabic normalization assertU(adoc("id", Integer.toString(idx++), "content_arabic", "روبرت")); // hindi normalization assertU(adoc("id", Integer.toString(idx++), "content_hindi", "हिंदी")); assertU(adoc("id", Integer.toString(idx++), "content_hindi", "अाअा")); // german normalization assertU(adoc("id", Integer.toString(idx++), "content_german", "weissbier")); // cjk width normalization assertU(adoc("id", Integer.toString(idx++), "content_width", "ヴィッツ")); assertU(commit()); } @Test public void testPrefixCaseAccentFolding() throws Exception { String matchOneDocPrefixUpper[][] = { {"A*", "ÁB*", "ABÇ*"}, // these should find only doc 0 {"H*", "HÏ*", "HìJ*"}, // these should find only doc 1 {"O*", "ÖP*", "OPQ*"}, // these should find only doc 2 }; String matchRevPrefixUpper[][] = { {"*Ğ1", "*DEfG1", "*EfG1"}, {"*N1", "*LmŊ1", "*MÑ1"}, {"*Ǖ1", "*sTu1", "*RŠTU1"} }; // test the prefix queries find only one doc where the query is uppercased. Must go through query parser here! for (int idx = 0; idx < matchOneDocPrefixUpper.length; idx++) { for (int jdx = 0; jdx < matchOneDocPrefixUpper[idx].length; jdx++) { String me = matchOneDocPrefixUpper[idx][jdx]; assertQ(req("q", "content:" + me), "//*[@numFound='1']", "//*[@name='id'][.='" + Integer.toString(idx) + "']"); assertQ(req("q", "content_ws:" + me), "//*[@numFound='1']", "//*[@name='id'][.='" + Integer.toString(idx) + "']"); assertQ(req("q", "content_multi:" + me), "//*[@numFound='1']", "//*[@name='id'][.='" + Integer.toString(idx) + "']"); assertQ(req("q", "content_lower_token:" + me), "//result[@numFound='1']", "//*[@name='id'][.='" + Integer.toString(idx) + "']"); assertQ(req("q", "content_oldstyle:" + me), "//result[@numFound='0']"); } } for (int idx = 0; idx < matchRevPrefixUpper.length; idx++) { for (int jdx = 0; jdx < matchRevPrefixUpper[idx].length; jdx++) { String me = matchRevPrefixUpper[idx][jdx]; assertQ(req("q", "content_rev:" + me), "//*[@numFound='1']", "//*[@name='id'][.='" + Integer.toString(idx) + "']"); } } } // test the wildcard queries find only one doc where the query is uppercased and/or accented. @Test public void testWildcardCaseAccentFolding() throws Exception { String matchOneDocWildUpper[][] = { {"Á*C*", "ÁB*1", "ABÇ*g1", "Á*FG1"}, // these should find only doc 0 {"H*k*", "HÏ*l?*", "HìJ*n*", "HìJ*m*"}, // these should find only doc 1 {"O*ř*", "ÖP*ş???", "OPQ*S?Ů*", "ÖP*1"}, // these should find only doc 2 }; for (int idx = 0; idx < matchOneDocWildUpper.length; idx++) { for (int jdx = 0; jdx < matchOneDocWildUpper[idx].length; jdx++) { String me = matchOneDocWildUpper[idx][jdx]; assertQ("Error with " + me, req("q", "content:" + me), "//result[@numFound='1']", "//*[@name='id'][.='" + Integer.toString(idx) + "']"); assertQ(req("q", "content_ws:" + me), "//result[@numFound='1']", "//*[@name='id'][.='" + Integer.toString(idx) + "']"); assertQ(req("q", "content_multi:" + me), "//result[@numFound='1']", "//*[@name='id'][.='" + Integer.toString(idx) + "']"); assertQ(req("q", "content_oldstyle:" + me), "//result[@numFound='0']"); } } } @Test public void testLowerTokenizer() { // The lowercasetokenizer will remove the '1' from the index, but not from the query, thus the special test. assertQ(req("q", "content_lower_token:Á*C*"), "//result[@numFound='1']"); assertQ(req("q", "content_lower_token:Á*C*1"), "//result[@numFound='0']"); assertQ(req("q", "content_lower_token:h*1"), "//result[@numFound='0']"); assertQ(req("q", "content_lower_token:H*1"), "//result[@numFound='0']"); assertQ(req("q", "content_lower_token:*1"), "//result[@numFound='0']"); assertQ(req("q", "content_lower_token:HÏ*l?*"), "//result[@numFound='1']"); assertQ(req("q", "content_lower_token:hȉ*l?*"), "//result[@numFound='1']"); } @Test public void testFuzzy() throws Exception { assertQ(req("q", "content:ZiLLx~1"), "//result[@numFound='1']"); assertQ(req("q", "content_straight:ZiLLx~1"), // case preserving field shouldn't match "//result[@numFound='0']"); assertQ(req("q", "content_folding:ZiLLx~1"), // case preserving field shouldn't match "//result[@numFound='0']"); } @Test public void testRegex() throws Exception { assertQ(req("q", "content:/Zill[a-z]/"), "//result[@numFound='1']"); assertQ(req("q", "content:/Zill[A-Z]/"), // everything in the regex gets lowercased? "//result[@numFound='1']"); assertQ(req("q", "content_keyword:/.*Zill[A-Z]/"), "//result[@numFound='1']"); assertQ(req("q", "content_straight:/Zill[a-z]/"), // case preserving field shouldn't match "//result[@numFound='0']"); assertQ(req("q", "content_folding:/Zill[a-z]/"), // case preserving field shouldn't match "//result[@numFound='0']"); assertQ(req("q", "content_keyword:/Abcdefg1 Finger/"), // test spaces "//result[@numFound='1']"); } @Test public void testGeneral() throws Exception { assertQ(req("q", "content_stemming:fings*"), "//result[@numFound='0']"); // should not match (but would if fings* was stemmed to fing* assertQ(req("q", "content_stemming:fing*"), "//result[@numFound='1']"); } // Phrases should fail. This test is mainly a marker so if phrases ever do start working with wildcards we go // and update the documentation @Test public void testPhrase() { assertQ(req("q", "content:\"silly ABCD*\""), "//result[@numFound='0']"); } @Test public void testWildcardRange() { assertQ(req("q", "content:[* TO *]"), "//result[@numFound='3']"); assertQ(req("q", "content:[AB* TO Z*]"), "//result[@numFound='3']"); assertQ(req("q", "content:[AB*E?G* TO TU*W]"), "//result[@numFound='3']"); } // Does the char filter get correctly handled? @Test public void testCharFilter() { assertQ(req("q", "content_charfilter:" + "Á*C*"), "//result[@numFound='1']", "//*[@name='id'][.='0']"); assertQ(req("q", "content_charfilter:" + "ABÇ*g1"), "//result[@numFound='1']", "//*[@name='id'][.='0']"); assertQ(req("q", "content_charfilter:" + "HÏ*l?*"), "//result[@numFound='1']", "//*[@name='id'][.='1']"); } @Test public void testRangeQuery() { assertQ(req("q", "content:" + "{Ȫp*1 TO QŮ*}"), "//result[@numFound='1']", "//*[@name='id'][.='2']"); assertQ(req("q", "content:" + "[Áb* TO f?Ñg?r]"), "//result[@numFound='1']", "//*[@name='id'][.='0']"); } @Test public void testNonTextTypes() { String[] intTypes = {"int_f", "float_f", "long_f", "double_f"}; for (String str : intTypes) { assertQ(req("q", str + ":" + "0"), "//result[@numFound='1']", "//*[@name='id'][.='0']"); assertQ(req("q", str + ":" + "[0 TO 2]"), "//result[@numFound='3']", "//*[@name='id'][.='0']", "//*[@name='id'][.='1']", "//*[@name='id'][.='2']"); } assertQ(req("q", "bool_f:true"), "//result[@numFound='2']", "//*[@name='id'][.='0']", "//*[@name='id'][.='2']"); assertQ(req("q", "bool_f:[false TO true]"), "//result[@numFound='3']", "//*[@name='id'][.='0']", "//*[@name='id'][.='1']", "//*[@name='id'][.='2']"); assertQ(req("q", "date_f:2000-01-01T00\\:00\\:00Z"), "//result[@numFound='1']", "//*[@name='id'][.='0']"); assertQ(req("q", "date_f:[2000-12-31T23:59:59.999Z TO 2002-01-02T00:00:01Z]"), "//result[@numFound='2']", "//*[@name='id'][.='1']", "//*[@name='id'][.='2']"); } @Test public void testMultiBad() { try { ignoreException("analyzer returned too many terms"); assertQ(req("q", "content_multi_bad:" + "abCD*")); fail("Should throw exception when token evaluates to more than one term"); } catch (Exception expected) { assertTrue(expected.getCause() instanceof org.apache.solr.common.SolrException); } finally { resetExceptionIgnores(); } } @Test public void testGreek() { assertQ(req("q", "content_greek:μαιο*"), "//result[@numFound='2']"); assertQ(req("q", "content_greek:ΜΆΪΟ*"), "//result[@numFound='2']"); assertQ(req("q", "content_greek:Μάϊο*"), "//result[@numFound='2']"); } @Test public void testRussian() { assertQ(req("q", "content_russian:элЕктРомагн*тной"), "//result[@numFound='1']"); assertQ(req("q", "content_russian:Вме*те"), "//result[@numFound='1']"); assertQ(req("q", "content_russian:Си*е"), "//result[@numFound='1']"); assertQ(req("q", "content_russian:эЛектромагнИт*"), "//result[@numFound='1']"); } public void testPersian() { assertQ(req("q", "content_persian:های*"), "//result[@numFound='1']"); } public void testArabic() { assertQ(req("q", "content_arabic:روبرـــــــــــــــــــــــــــــــــت*"), "//result[@numFound='1']"); } public void testHindi() { assertQ(req("q", "content_hindi:हिन्दी*"), "//result[@numFound='1']"); assertQ(req("q", "content_hindi:आआ*"), "//result[@numFound='1']"); } public void testGerman() { assertQ(req("q", "content_german:weiß*"), "//result[@numFound='1']"); } public void testCJKWidth() { assertQ(req("q", "content_width:ヴィ*"), "//result[@numFound='1']"); } }