/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.lucene.analysis.miscellaneous; import java.io.IOException; import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.core.WhitespaceTokenizer; import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.InPlaceMergeSorter; import org.apache.lucene.util.RamUsageEstimator; /** * Splits words into subwords and performs optional transformations on subword * groups, producing a correct token graph so that e.g. {@link PhraseQuery} can * work correctly when this filter is used in the search-time analyzer. Unlike * the deprecated {@link WordDelimiterFilter}, this token filter produces a * correct token graph as output. However, it cannot consume an input token * graph correctly. * * <p> * Words are split into subwords with the following rules: * <ul> * <li>split on intra-word delimiters (by default, all non alpha-numeric * characters): <code>"Wi-Fi"</code> → <code>"Wi", "Fi"</code></li> * <li>split on case transitions: <code>"PowerShot"</code> → * <code>"Power", "Shot"</code></li> * <li>split on letter-number transitions: <code>"SD500"</code> → * <code>"SD", "500"</code></li> * <li>leading and trailing intra-word delimiters on each subword are ignored: * <code>"//hello---there, 'dude'"</code> → * <code>"hello", "there", "dude"</code></li> * <li>trailing "'s" are removed for each subword: <code>"O'Neil's"</code> * → <code>"O", "Neil"</code> * <ul> * <li>Note: this step isn't performed in a separate filter because of possible * subword combinations.</li> * </ul> * </li> * </ul> * * The <b>combinations</b> parameter affects how subwords are combined: * <ul> * <li>combinations="0" causes no subword combinations: <code>"PowerShot"</code> * → <code>0:"Power", 1:"Shot"</code> (0 and 1 are the token positions)</li> * <li>combinations="1" means that in addition to the subwords, maximum runs of * non-numeric subwords are catenated and produced at the same position of the * last subword in the run: * <ul> * <li><code>"PowerShot"</code> → * <code>0:"Power", 1:"Shot" 1:"PowerShot"</code></li> * <li><code>"A's+B's&C's"</code> > <code>0:"A", 1:"B", 2:"C", 2:"ABC"</code> * </li> * <li><code>"Super-Duper-XL500-42-AutoCoder!"</code> → * <code>0:"Super", 1:"Duper", 2:"XL", 2:"SuperDuperXL", 3:"500" 4:"42", 5:"Auto", 6:"Coder", 6:"AutoCoder"</code> * </li> * </ul> * </li> * </ul> * One use for {@link WordDelimiterGraphFilter} is to help match words with different * subword delimiters. For example, if the source text contained "wi-fi" one may * want "wifi" "WiFi" "wi-fi" "wi+fi" queries to all match. One way of doing so * is to specify combinations="1" in the analyzer used for indexing, and * combinations="0" (the default) in the analyzer used for querying. Given that * the current {@link StandardTokenizer} immediately removes many intra-word * delimiters, it is recommended that this filter be used after a tokenizer that * does not do this (such as {@link WhitespaceTokenizer}). */ public final class WordDelimiterGraphFilter extends TokenFilter { /** * Causes parts of words to be generated: * <p> * "PowerShot" => "Power" "Shot" */ public static final int GENERATE_WORD_PARTS = 1; /** * Causes number subwords to be generated: * <p> * "500-42" => "500" "42" */ public static final int GENERATE_NUMBER_PARTS = 2; /** * Causes maximum runs of word parts to be catenated: * <p> * "wi-fi" => "wifi" */ public static final int CATENATE_WORDS = 4; /** * Causes maximum runs of number parts to be catenated: * <p> * "500-42" => "50042" */ public static final int CATENATE_NUMBERS = 8; /** * Causes all subword parts to be catenated: * <p> * "wi-fi-4000" => "wifi4000" */ public static final int CATENATE_ALL = 16; /** * Causes original words are preserved and added to the subword list (Defaults to false) * <p> * "500-42" => "500" "42" "500-42" */ public static final int PRESERVE_ORIGINAL = 32; /** * Causes lowercase -> uppercase transition to start a new subword. */ public static final int SPLIT_ON_CASE_CHANGE = 64; /** * If not set, causes numeric changes to be ignored (subwords will only be generated * given SUBWORD_DELIM tokens). */ public static final int SPLIT_ON_NUMERICS = 128; /** * Causes trailing "'s" to be removed for each subword * <p> * "O'Neil's" => "O", "Neil" */ public static final int STEM_ENGLISH_POSSESSIVE = 256; /** * If not null is the set of tokens to protect from being delimited * */ final CharArraySet protWords; private final int flags; // packs start pos, end pos, start part, end part (= slice of the term text) for each buffered part: private int[] bufferedParts = new int[16]; private int bufferedLen; private int bufferedPos; // holds text for each buffered part, or null if it's a simple slice of the original term private char[][] bufferedTermParts = new char[4][]; private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class); private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class); private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class); private final PositionLengthAttribute posLenAttribute = addAttribute(PositionLengthAttribute.class); // used for iterating word delimiter breaks private final WordDelimiterIterator iterator; // used for concatenating runs of similar typed subwords (word,number) private final WordDelimiterConcatenation concat = new WordDelimiterConcatenation(); // number of subwords last output by concat. private int lastConcatCount; // used for catenate all private final WordDelimiterConcatenation concatAll = new WordDelimiterConcatenation(); // used for accumulating position increment gaps so that we preserve incoming holes: private int accumPosInc; private char[] savedTermBuffer = new char[16]; private int savedTermLength; private int savedStartOffset; private int savedEndOffset; private AttributeSource.State savedState; private int lastStartOffset; // if length by start + end offsets doesn't match the term text then assume // this is a synonym and don't adjust the offsets. private boolean hasIllegalOffsets; private int wordPos; /** * Creates a new WordDelimiterGraphFilter * * @param in TokenStream to be filtered * @param charTypeTable table containing character types * @param configurationFlags Flags configuring the filter * @param protWords If not null is the set of tokens to protect from being delimited */ public WordDelimiterGraphFilter(TokenStream in, byte[] charTypeTable, int configurationFlags, CharArraySet protWords) { super(in); if ((configurationFlags & ~(GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_WORDS | CATENATE_NUMBERS | CATENATE_ALL | PRESERVE_ORIGINAL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE)) != 0) { throw new IllegalArgumentException("flags contains unrecognized flag: " + configurationFlags); } this.flags = configurationFlags; this.protWords = protWords; this.iterator = new WordDelimiterIterator( charTypeTable, has(SPLIT_ON_CASE_CHANGE), has(SPLIT_ON_NUMERICS), has(STEM_ENGLISH_POSSESSIVE)); } /** * Creates a new WordDelimiterGraphFilter using {@link WordDelimiterIterator#DEFAULT_WORD_DELIM_TABLE} * as its charTypeTable * * @param in TokenStream to be filtered * @param configurationFlags Flags configuring the filter * @param protWords If not null is the set of tokens to protect from being delimited */ public WordDelimiterGraphFilter(TokenStream in, int configurationFlags, CharArraySet protWords) { this(in, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, configurationFlags, protWords); } /** Iterates all words parts and concatenations, buffering up the term parts we should return. */ private void bufferWordParts() throws IOException { saveState(); // if length by start + end offsets doesn't match the term's text then set offsets for all our word parts/concats to the incoming // offsets. this can happen if WDGF is applied to an injected synonym, or to a stem'd form, etc: hasIllegalOffsets = (savedEndOffset - savedStartOffset != savedTermLength); bufferedLen = 0; lastConcatCount = 0; wordPos = 0; if (iterator.isSingleWord()) { buffer(wordPos, wordPos+1, iterator.current, iterator.end); wordPos++; iterator.next(); } else { // iterate all words parts, possibly buffering them, building up concatenations and possibly buffering them too: while (iterator.end != WordDelimiterIterator.DONE) { int wordType = iterator.type(); // do we already have queued up incompatible concatenations? if (concat.isNotEmpty() && (concat.type & wordType) == 0) { flushConcatenation(concat); } // add subwords depending upon options if (shouldConcatenate(wordType)) { concatenate(concat); } // add all subwords (catenateAll) if (has(CATENATE_ALL)) { concatenate(concatAll); } // if we should output the word or number part if (shouldGenerateParts(wordType)) { buffer(wordPos, wordPos+1, iterator.current, iterator.end); wordPos++; } iterator.next(); } if (concat.isNotEmpty()) { // flush final concatenation flushConcatenation(concat); } if (concatAll.isNotEmpty()) { // only if we haven't output this same combo above, e.g. PowerShot with CATENATE_WORDS: if (concatAll.subwordCount > lastConcatCount) { if (wordPos == concatAll.startPos) { // we are not generating parts, so we must advance wordPos now wordPos++; } concatAll.write(); } concatAll.clear(); } } if (has(PRESERVE_ORIGINAL)) { if (wordPos == 0) { // can happen w/ strange flag combos and inputs :) wordPos++; } // add the original token now so that we can set the correct end position buffer(0, wordPos, 0, savedTermLength); } sorter.sort(0, bufferedLen); wordPos = 0; // set back to 0 for iterating from the buffer bufferedPos = 0; } @Override public boolean incrementToken() throws IOException { while (true) { if (savedState == null) { // process a new input token if (input.incrementToken() == false) { return false; } int termLength = termAttribute.length(); char[] termBuffer = termAttribute.buffer(); accumPosInc += posIncAttribute.getPositionIncrement(); // iterate & cache all word parts up front: iterator.setText(termBuffer, termLength); iterator.next(); // word of no delimiters, or protected word: just return it if ((iterator.current == 0 && iterator.end == termLength) || (protWords != null && protWords.contains(termBuffer, 0, termLength))) { posIncAttribute.setPositionIncrement(accumPosInc); accumPosInc = 0; return true; } // word of simply delimiters: swallow this token, creating a hole, and move on to next token if (iterator.end == WordDelimiterIterator.DONE) { if (has(PRESERVE_ORIGINAL) == false) { continue; } else { return true; } } // otherwise, we have delimiters, process & buffer all parts: bufferWordParts(); } if (bufferedPos < bufferedLen) { clearAttributes(); restoreState(savedState); char[] termPart = bufferedTermParts[bufferedPos]; int startPos = bufferedParts[4*bufferedPos]; int endPos = bufferedParts[4*bufferedPos+1]; int startPart = bufferedParts[4*bufferedPos+2]; int endPart = bufferedParts[4*bufferedPos+3]; bufferedPos++; int startOffset; int endOffset; if (hasIllegalOffsets) { startOffset = savedStartOffset; endOffset = savedEndOffset; } else { startOffset = savedStartOffset + startPart; endOffset = savedStartOffset + endPart; } // never let offsets go backwards: startOffset = Math.max(startOffset, lastStartOffset); endOffset = Math.max(endOffset, lastStartOffset); offsetAttribute.setOffset(startOffset, endOffset); lastStartOffset = startOffset; if (termPart == null) { termAttribute.copyBuffer(savedTermBuffer, startPart, endPart - startPart); } else { termAttribute.copyBuffer(termPart, 0, termPart.length); } posIncAttribute.setPositionIncrement(accumPosInc + startPos - wordPos); accumPosInc = 0; posLenAttribute.setPositionLength(endPos - startPos); wordPos = startPos; return true; } // no saved concatenations, on to the next input word savedState = null; } } @Override public void reset() throws IOException { super.reset(); accumPosInc = 0; savedState = null; lastStartOffset = 0; concat.clear(); concatAll.clear(); } // ================================================= Helper Methods ================================================ private class PositionSorter extends InPlaceMergeSorter { @Override protected int compare(int i, int j) { // sort by smaller start position int iPosStart = bufferedParts[4*i]; int jPosStart = bufferedParts[4*j]; int cmp = Integer.compare(iPosStart, jPosStart); if (cmp != 0) { return cmp; } // tie break by longest pos length: int iPosEnd = bufferedParts[4*i+1]; int jPosEnd = bufferedParts[4*j+1]; return Integer.compare(jPosEnd, iPosEnd); } @Override protected void swap(int i, int j) { int iOffset = 4*i; int jOffset = 4*j; for(int x=0;x<4;x++) { int tmp = bufferedParts[iOffset+x]; bufferedParts[iOffset+x] = bufferedParts[jOffset+x]; bufferedParts[jOffset+x] = tmp; } char[] tmp2 = bufferedTermParts[i]; bufferedTermParts[i] = bufferedTermParts[j]; bufferedTermParts[j] = tmp2; } } final PositionSorter sorter = new PositionSorter(); /** * startPos, endPos -> graph start/end position * startPart, endPart -> slice of the original term for this part */ void buffer(int startPos, int endPos, int startPart, int endPart) { buffer(null, startPos, endPos, startPart, endPart); } /** * a null termPart means it's a simple slice of the original term */ void buffer(char[] termPart, int startPos, int endPos, int startPart, int endPart) { /* System.out.println("buffer: pos=" + startPos + "-" + endPos + " part=" + startPart + "-" + endPart); if (termPart != null) { System.out.println(" termIn=" + new String(termPart)); } else { System.out.println(" term=" + new String(savedTermBuffer, startPart, endPart-startPart)); } */ assert endPos > startPos: "startPos=" + startPos + " endPos=" + endPos; assert endPart > startPart || (endPart == 0 && startPart == 0 && savedTermLength == 0): "startPart=" + startPart + " endPart=" + endPart; if ((bufferedLen+1)*4 > bufferedParts.length) { bufferedParts = ArrayUtil.grow(bufferedParts, (bufferedLen+1)*4); } if (bufferedTermParts.length == bufferedLen) { int newSize = ArrayUtil.oversize(bufferedLen+1, RamUsageEstimator.NUM_BYTES_OBJECT_REF); char[][] newArray = new char[newSize][]; System.arraycopy(bufferedTermParts, 0, newArray, 0, bufferedTermParts.length); bufferedTermParts = newArray; } bufferedTermParts[bufferedLen] = termPart; bufferedParts[bufferedLen*4] = startPos; bufferedParts[bufferedLen*4+1] = endPos; bufferedParts[bufferedLen*4+2] = startPart; bufferedParts[bufferedLen*4+3] = endPart; bufferedLen++; } /** * Saves the existing attribute states */ private void saveState() { savedTermLength = termAttribute.length(); savedStartOffset = offsetAttribute.startOffset(); savedEndOffset = offsetAttribute.endOffset(); savedState = captureState(); if (savedTermBuffer.length < savedTermLength) { savedTermBuffer = new char[ArrayUtil.oversize(savedTermLength, Character.BYTES)]; } System.arraycopy(termAttribute.buffer(), 0, savedTermBuffer, 0, savedTermLength); } /** * Flushes the given WordDelimiterConcatenation by either writing its concat and then clearing, or just clearing. * * @param concat WordDelimiterConcatenation that will be flushed */ private void flushConcatenation(WordDelimiterConcatenation concat) { if (wordPos == concat.startPos) { // we are not generating parts, so we must advance wordPos now wordPos++; } lastConcatCount = concat.subwordCount; if (concat.subwordCount != 1 || shouldGenerateParts(concat.type) == false) { concat.write(); } concat.clear(); } /** * Determines whether to concatenate a word or number if the current word is the given type * * @param wordType Type of the current word used to determine if it should be concatenated * @return {@code true} if concatenation should occur, {@code false} otherwise */ private boolean shouldConcatenate(int wordType) { return (has(CATENATE_WORDS) && WordDelimiterIterator.isAlpha(wordType)) || (has(CATENATE_NUMBERS) && WordDelimiterIterator.isDigit(wordType)); } /** * Determines whether a word/number part should be generated for a word of the given type * * @param wordType Type of the word used to determine if a word/number part should be generated * @return {@code true} if a word/number part should be generated, {@code false} otherwise */ private boolean shouldGenerateParts(int wordType) { return (has(GENERATE_WORD_PARTS) && WordDelimiterIterator.isAlpha(wordType)) || (has(GENERATE_NUMBER_PARTS) && WordDelimiterIterator.isDigit(wordType)); } /** * Concatenates the saved buffer to the given WordDelimiterConcatenation * * @param concatenation WordDelimiterConcatenation to concatenate the buffer to */ private void concatenate(WordDelimiterConcatenation concatenation) { if (concatenation.isEmpty()) { concatenation.type = iterator.type(); concatenation.startPart = iterator.current; concatenation.startPos = wordPos; } concatenation.append(savedTermBuffer, iterator.current, iterator.end - iterator.current); concatenation.endPart = iterator.end; } /** * Determines whether the given flag is set * * @param flag Flag to see if set * @return {@code true} if flag is set */ private boolean has(int flag) { return (flags & flag) != 0; } // ================================================= Inner Classes ================================================= /** * A WDF concatenated 'run' */ final class WordDelimiterConcatenation { final StringBuilder buffer = new StringBuilder(); int startPart; int endPart; int startPos; int type; int subwordCount; /** * Appends the given text of the given length, to the concetenation at the given offset * * @param text Text to append * @param offset Offset in the concetenation to add the text * @param length Length of the text to append */ void append(char text[], int offset, int length) { buffer.append(text, offset, length); subwordCount++; } /** * Writes the concatenation to part buffer */ void write() { char[] termPart = new char[buffer.length()]; buffer.getChars(0, buffer.length(), termPart, 0); buffer(termPart, startPos, wordPos, startPart, endPart); } /** * Determines if the concatenation is empty * * @return {@code true} if the concatenation is empty, {@code false} otherwise */ boolean isEmpty() { return buffer.length() == 0; } boolean isNotEmpty() { return isEmpty() == false; } /** * Clears the concatenation and resets its state */ void clear() { buffer.setLength(0); startPart = endPart = type = subwordCount = 0; } } /** Returns string representation of configuration flags */ public static String flagsToString(int flags) { StringBuilder b = new StringBuilder(); if ((flags & GENERATE_WORD_PARTS) != 0) { b.append("GENERATE_WORD_PARTS"); } if ((flags & GENERATE_NUMBER_PARTS) != 0) { if (b.length() > 0) { b.append(" | "); } b.append("GENERATE_NUMBER_PARTS"); } if ((flags & CATENATE_WORDS) != 0) { if (b.length() > 0) { b.append(" | "); } b.append("CATENATE_WORDS"); } if ((flags & CATENATE_NUMBERS) != 0) { if (b.length() > 0) { b.append(" | "); } b.append("CATENATE_NUMBERS"); } if ((flags & CATENATE_ALL) != 0) { if (b.length() > 0) { b.append(" | "); } b.append("CATENATE_ALL"); } if ((flags & PRESERVE_ORIGINAL) != 0) { if (b.length() > 0) { b.append(" | "); } b.append("PRESERVE_ORIGINAL"); } if ((flags & SPLIT_ON_CASE_CHANGE) != 0) { if (b.length() > 0) { b.append(" | "); } b.append("SPLIT_ON_CASE_CHANGE"); } if ((flags & SPLIT_ON_NUMERICS) != 0) { if (b.length() > 0) { b.append(" | "); } b.append("SPLIT_ON_NUMERICS"); } if ((flags & STEM_ENGLISH_POSSESSIVE) != 0) { if (b.length() > 0) { b.append(" | "); } b.append("STEM_ENGLISH_POSSESSIVE"); } return b.toString(); } @Override public String toString() { StringBuilder b = new StringBuilder(); b.append("WordDelimiterGraphFilter(flags="); b.append(flagsToString(flags)); b.append(')'); return b.toString(); } // questions: // negative numbers? -42 indexed as just 42? // dollar sign? $42 // percent sign? 33% // downsides: if source text is "powershot" then a query of "PowerShot" won't match! }