/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.miscellaneous;

import java.io.IOException;
import java.util.ArrayList;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.BytesRefHash;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FST.Arc;
import org.apache.lucene.util.fst.FST.BytesReader;

/**
 * Provides the ability to override any {@link KeywordAttribute} aware stemmer
 * with custom dictionary-based stemming.
 */
public final class StemmerOverrideFilter extends TokenFilter {
  private final StemmerOverrideMap stemmerOverrideMap;

  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);
  // null when the map has no FST; incrementToken() then passes tokens through untouched
  private final BytesReader fstReader;
  private final Arc<BytesRef> scratchArc = new FST.Arc<>();
  private char[] spare = new char[0];

  /**
   * Create a new StemmerOverrideFilter, performing dictionary-based stemming
   * with the provided <code>stemmerOverrideMap</code>.
   * <p>
   * Any dictionary-stemmed terms will be marked with {@link KeywordAttribute}
   * so that they will not be stemmed with stemmers down the chain.
   * </p>
   *
   * @param input the token stream to filter
   * @param stemmerOverrideMap the override dictionary to look terms up in
   */
  public StemmerOverrideFilter(final TokenStream input, final StemmerOverrideMap stemmerOverrideMap) {
    super(input);
    this.stemmerOverrideMap = stemmerOverrideMap;
    fstReader = stemmerOverrideMap.getBytesReader();
  }

  /**
   * Advances the wrapped stream and, if the current term is not already a
   * keyword and has an entry in the override map, replaces the term text with
   * the mapped value and flags it as a keyword.
   *
   * @return <code>false</code> once the wrapped stream is exhausted, otherwise <code>true</code>
   */
  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
      return false;
    }
    if (fstReader == null) {
      // No overrides
      return true;
    }
    if (!keywordAtt.isKeyword()) { // don't muck with already-keyworded terms
      final BytesRef stem = stemmerOverrideMap.get(termAtt.buffer(), termAtt.length(), scratchArc, fstReader);
      if (stem != null) {
        // Size the target by the UTF-8 byte length of the override before decoding into it.
        spare = ArrayUtil.grow(termAtt.buffer(), stem.length);
        final int length = UnicodeUtil.UTF8toUTF16(stem, spare);
        if (spare != termAtt.buffer()) {
          // grow() handed back a different array: copy the decoded chars into the attribute
          termAtt.copyBuffer(spare, 0, length);
        } else {
          // decoded in place inside the attribute's own buffer: only the length changes
          termAtt.setLength(length);
        }
        keywordAtt.setKeyword(true);
      }
    }
    return true;
  }

  /**
   * A read-only 4-byte FST backed map that allows fast, optionally
   * case-insensitive, key value lookups for {@link StemmerOverrideFilter}
   */
  // TODO maybe we can generalize this and reuse this map somehow?
  public static final class StemmerOverrideMap {
    private final FST<BytesRef> fst;
    private final boolean ignoreCase;

    /**
     * Creates a new {@link StemmerOverrideMap}
     * @param fst the fst to lookup the overrides
     * @param ignoreCase if the keys case should be ignored
     */
    public StemmerOverrideMap(FST<BytesRef> fst, boolean ignoreCase) {
      this.fst = fst;
      this.ignoreCase = ignoreCase;
    }

    /**
     * Returns a {@link BytesReader} to pass to the {@link #get(char[], int, FST.Arc, FST.BytesReader)} method,
     * or <code>null</code> if this map has no FST.
     */
    public BytesReader getBytesReader() {
      if (fst == null) {
        return null;
      } else {
        return fst.getBytesReader();
      }
    }

    /**
     * Returns the value mapped to the given key or <code>null</code> if the key is not in the FST dictionary.
     *
     * @param buffer the chars holding the key
     * @param bufferLen the number of chars of <code>buffer</code> that make up the key
     * @param scratchArc a reusable arc; it is overwritten during the lookup
     * @param fstReader a reader obtained from {@link #getBytesReader()}
     */
    public BytesRef get(char[] buffer, int bufferLen, Arc<BytesRef> scratchArc, BytesReader fstReader) throws IOException {
      if (fst == null) {
        // getBytesReader() already allows for a missing FST (presumably a map built
        // without any entries); such a map contains no keys, so report "not found"
        // instead of failing with a NullPointerException.
        return null;
      }
      BytesRef pendingOutput = fst.outputs.getNoOutput();
      BytesRef matchOutput = null;
      int bufUpto = 0;
      fst.getFirstArc(scratchArc);
      // Walk the FST one code point at a time, accumulating the output along the path.
      while (bufUpto < bufferLen) {
        final int codePoint = Character.codePointAt(buffer, bufUpto, bufferLen);
        final int label = ignoreCase ? Character.toLowerCase(codePoint) : codePoint;
        if (fst.findTargetArc(label, scratchArc, scratchArc, fstReader) == null) {
          return null;
        }
        pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output);
        bufUpto += Character.charCount(codePoint);
      }
      // Only a path that ends on a final arc is a complete key.
      if (scratchArc.isFinal()) {
        matchOutput = fst.outputs.add(pendingOutput, scratchArc.nextFinalOutput);
      }
      return matchOutput;
    }
  }

  /**
   * This builder builds an {@link FST} for the {@link StemmerOverrideFilter}
   */
  public static class Builder {
    private final BytesRefHash hash = new BytesRefHash();
    private final BytesRefBuilder spare = new BytesRefBuilder();
    // indexed by the id that hash.add() returned for the corresponding input
    private final ArrayList<CharSequence> outputValues = new ArrayList<>();
    private final boolean ignoreCase;
    private final CharsRefBuilder charsSpare = new CharsRefBuilder();

    /**
     * Creates a new {@link Builder} with ignoreCase set to <code>false</code>
     */
    public Builder() {
      this(false);
    }

    /**
     * Creates a new {@link Builder}
     * @param ignoreCase if the input case should be ignored.
     */
    public Builder(boolean ignoreCase) {
      this.ignoreCase = ignoreCase;
    }

    /**
     * Adds an input string and its stemmer override output to this builder.
     *
     * @param input the input char sequence
     * @param output the stemmer override output char sequence
     * @return <code>false</code> iff the input has already been added to this builder otherwise <code>true</code>.
     */
    public boolean add(CharSequence input, CharSequence output) {
      final int length = input.length();
      if (ignoreCase) {
        // convert on the fly to lowercase
        charsSpare.grow(length);
        final char[] buffer = charsSpare.chars();
        for (int i = 0; i < length; ) {
          i += Character.toChars(
                  Character.toLowerCase(
                      Character.codePointAt(input, i)), buffer, i);
        }
        spare.copyChars(buffer, 0, length);
      } else {
        spare.copyChars(input, 0, length);
      }
      // A non-negative id means the key was newly added to the hash.
      if (hash.add(spare.get()) >= 0) {
        outputValues.add(output);
        return true;
      }
      return false;
    }

    /**
     * Returns an {@link StemmerOverrideMap} to be used with the {@link StemmerOverrideFilter}
     * @return an {@link StemmerOverrideMap} to be used with the {@link StemmerOverrideFilter}
     * @throws IOException if an {@link IOException} occurs;
     */
    public StemmerOverrideMap build() throws IOException {
      ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
      org.apache.lucene.util.fst.Builder<BytesRef> builder = new org.apache.lucene.util.fst.Builder<>(
          FST.INPUT_TYPE.BYTE4, outputs);
      // Entries are fed to the FST builder in the hash's sorted order.
      final int[] sort = hash.sort();
      IntsRefBuilder intsSpare = new IntsRefBuilder();
      final int size = hash.size();
      // Named "scratch" so it does not shadow the BytesRefBuilder field "spare".
      BytesRef scratch = new BytesRef();
      for (int i = 0; i < size; i++) {
        int id = sort[i];
        BytesRef bytesRef = hash.get(id, scratch);
        intsSpare.copyUTF8Bytes(bytesRef);
        builder.add(intsSpare.get(), new BytesRef(outputValues.get(id)));
      }
      return new StemmerOverrideMap(builder.finish(), ignoreCase);
    }
  }
}