/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.lucene.analysis.miscellaneous; import java.io.IOException; import java.io.StringReader; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map.Entry; import java.util.Map; import java.util.Set; import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.CharacterUtils; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.core.KeywordTokenizer; import org.apache.lucene.analysis.core.WhitespaceTokenizer; import org.apache.lucene.analysis.en.PorterStemFilter; import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter.StemmerOverrideMap; import org.apache.lucene.util.TestUtil; /** * */ public class TestStemmerOverrideFilter extends BaseTokenStreamTestCase { private KeywordTokenizer keywordTokenizer(String data) throws IOException { KeywordTokenizer tokenizer = new KeywordTokenizer(); tokenizer.setReader(new StringReader(data)); return tokenizer; } public void testOverride() throws IOException { // lets make booked stem to books // the override filter will convert "booked" to "books", // but also mark it with KeywordAttribute so Porter will not change it. StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(); builder.add("booked", "books"); Tokenizer tokenizer = keywordTokenizer("booked"); TokenStream stream = new PorterStemFilter(new StemmerOverrideFilter( tokenizer, builder.build())); assertTokenStreamContents(stream, new String[] {"books"}); } public void testIgnoreCase() throws IOException { // lets make booked stem to books // the override filter will convert "booked" to "books", // but also mark it with KeywordAttribute so Porter will not change it. StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(true); builder.add("boOkEd", "books"); Tokenizer tokenizer = keywordTokenizer("BooKeD"); TokenStream stream = new PorterStemFilter(new StemmerOverrideFilter( tokenizer, builder.build())); assertTokenStreamContents(stream, new String[] {"books"}); } public void testNoOverrides() throws IOException { StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(true); Tokenizer tokenizer = keywordTokenizer("book"); TokenStream stream = new PorterStemFilter(new StemmerOverrideFilter( tokenizer, builder.build())); assertTokenStreamContents(stream, new String[] {"book"}); } public void testRandomRealisticWhiteSpace() throws IOException { Map<String,String> map = new HashMap<>(); Set<String> seen = new HashSet<>(); int numTerms = atLeast(50); boolean ignoreCase = random().nextBoolean(); for (int i = 0; i < numTerms; i++) { String randomRealisticUnicodeString = TestUtil .randomRealisticUnicodeString(random()); char[] charArray = randomRealisticUnicodeString.toCharArray(); StringBuilder builder = new StringBuilder(); for (int j = 0; j < charArray.length;) { int cp = Character.codePointAt(charArray, j, charArray.length); if (!Character.isWhitespace(cp)) { builder.appendCodePoint(cp); } j += Character.charCount(cp); } if (builder.length() > 0) { String inputValue = builder.toString(); // Make sure we don't try to add two inputs that vary only by case: String seenInputValue; if (ignoreCase) { // TODO: can we simply use inputValue.toLowerCase(Locale.ROOT)??? char[] buffer = inputValue.toCharArray(); CharacterUtils.toLowerCase(buffer, 0, buffer.length); seenInputValue = buffer.toString(); } else { seenInputValue = inputValue; } if (seen.contains(seenInputValue) == false) { seen.add(seenInputValue); String value = TestUtil.randomSimpleString(random()); map.put(inputValue, value.isEmpty() ? "a" : value); } } } if (map.isEmpty()) { map.put("booked", "books"); } StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(ignoreCase); Set<Entry<String,String>> entrySet = map.entrySet(); StringBuilder input = new StringBuilder(); List<String> output = new ArrayList<>(); for (Entry<String,String> entry : entrySet) { builder.add(entry.getKey(), entry.getValue()); if (random().nextBoolean() || output.isEmpty()) { input.append(entry.getKey()).append(" "); output.add(entry.getValue()); } } Tokenizer tokenizer = new WhitespaceTokenizer(); tokenizer.setReader(new StringReader(input.toString())); TokenStream stream = new PorterStemFilter(new StemmerOverrideFilter( tokenizer, builder.build())); assertTokenStreamContents(stream, output.toArray(new String[0])); } public void testRandomRealisticKeyword() throws IOException { Map<String,String> map = new HashMap<>(); int numTerms = atLeast(50); for (int i = 0; i < numTerms; i++) { String randomRealisticUnicodeString = TestUtil .randomRealisticUnicodeString(random()); if (randomRealisticUnicodeString.length() > 0) { String value = TestUtil.randomSimpleString(random()); map.put(randomRealisticUnicodeString, value.isEmpty() ? "a" : value); } } if (map.isEmpty()) { map.put("booked", "books"); } // This test might fail if ignoreCase is true since the map might have twice the same key, once // lowercased and once uppercased StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(false); Set<Entry<String,String>> entrySet = map.entrySet(); for (Entry<String,String> entry : entrySet) { builder.add(entry.getKey(), entry.getValue()); } StemmerOverrideMap build = builder.build(); for (Entry<String,String> entry : entrySet) { if (random().nextBoolean()) { Tokenizer tokenizer = new KeywordTokenizer(); tokenizer.setReader(new StringReader(entry.getKey())); TokenStream stream = new PorterStemFilter(new StemmerOverrideFilter( tokenizer, build)); assertTokenStreamContents(stream, new String[] {entry.getValue()}); } } } }