/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.ngram;

import java.io.IOException;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.CharacterUtils;
import org.apache.lucene.util.AttributeFactory;

/**
 * Tokenizes the input into n-grams of the given size(s).
 * <p>Unlike {@link NGramTokenFilter}, this class sets offsets so
 * that characters between startOffset and endOffset in the original stream are
 * the same as the term chars.
 * <p>For example, "abcde" would be tokenized as (minGram=2, maxGram=3):
 * <table summary="ngram tokens example">
 * <tr><th>Term</th><td>ab</td><td>abc</td><td>bc</td><td>bcd</td><td>cd</td><td>cde</td><td>de</td></tr>
 * <tr><th>Position increment</th><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td></tr>
 * <tr><th>Position length</th><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td></tr>
 * <tr><th>Offsets</th><td>[0,2[</td><td>[0,3[</td><td>[1,3[</td><td>[1,4[</td><td>[2,4[</td><td>[2,5[</td><td>[3,5[</td></tr>
 * </table>
 * <a name="version"></a>
 * <p>This tokenizer changed a lot in Lucene 4.4 in order to:<ul>
 * <li>tokenize in a streaming fashion to support streams which are larger
 * than 1024 chars (limit of the previous version),
 * <li>count grams based on unicode code points instead of java chars (and
 * never split in the middle of surrogate pairs),
 * <li>give the ability to {@link #isTokenChar(int) pre-tokenize} the stream
 * before computing n-grams.</ul>
 * <p>Additionally, this class doesn't trim trailing whitespace and emits
 * tokens in a different order: tokens are now emitted by increasing start
 * offsets, while they used to be emitted by increasing lengths (which prevented
 * supporting large input streams).
 */
// non-final to allow for overriding isTokenChar, but all other methods should be final
public class NGramTokenizer extends Tokenizer {

  public static final int DEFAULT_MIN_NGRAM_SIZE = 1;
  public static final int DEFAULT_MAX_NGRAM_SIZE = 2;

  // Raw chars pulled from the input Reader before code-point conversion.
  private CharacterUtils.CharacterBuffer charBuffer;
  private int[] buffer; // like charBuffer, but converted to code points
  private int bufferStart, bufferEnd; // remaining slice in buffer
  private int offset; // char offset in the original stream of buffer[bufferStart]
  private int gramSize; // size (in code points) of the next gram to emit
  private int minGram, maxGram;
  private boolean exhausted; // true once the Reader could not fill charBuffer anymore
  private int lastCheckedChar; // last offset in the buffer that we checked
  private int lastNonTokenChar; // last offset that we found to not be a token char
  private boolean edgesOnly; // leading edges n-grams only

  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
  private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

  /**
   * Package-private constructor.
   * @param minGram the smallest n-gram to generate
   * @param maxGram the largest n-gram to generate
   * @param edgesOnly when true, only emit grams anchored at the leading edge
   *                  of a run of token chars (see incrementToken)
   */
  NGramTokenizer(int minGram, int maxGram, boolean edgesOnly) {
    init(minGram, maxGram, edgesOnly);
  }

  /**
   * Creates NGramTokenizer with given min and max n-grams.
   * @param minGram the smallest n-gram to generate
   * @param maxGram the largest n-gram to generate
   */
  public NGramTokenizer(int minGram, int maxGram) {
    this(minGram, maxGram, false);
  }

  /**
   * Package-private constructor taking an {@link AttributeFactory}.
   * @param factory {@link org.apache.lucene.util.AttributeFactory} to use
   * @param minGram the smallest n-gram to generate
   * @param maxGram the largest n-gram to generate
   * @param edgesOnly when true, only emit grams anchored at the leading edge
   */
  NGramTokenizer(AttributeFactory factory, int minGram, int maxGram, boolean edgesOnly) {
    super(factory);
    init(minGram, maxGram, edgesOnly);
  }

  /**
   * Creates NGramTokenizer with given min and max n-grams.
   * @param factory {@link org.apache.lucene.util.AttributeFactory} to use
   * @param minGram the smallest n-gram to generate
   * @param maxGram the largest n-gram to generate
   */
  public NGramTokenizer(AttributeFactory factory, int minGram, int maxGram) {
    this(factory, minGram, maxGram, false);
  }

  /**
   * Creates NGramTokenizer with default min and max n-grams.
   */
  public NGramTokenizer() {
    this(DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE);
  }

  /**
   * Shared constructor logic: validates the gram size range and allocates
   * the char/code-point buffers.
   * @throws IllegalArgumentException if minGram < 1 or minGram > maxGram
   */
  private void init(int minGram, int maxGram, boolean edgesOnly) {
    if (minGram < 1) {
      throw new IllegalArgumentException("minGram must be greater than zero");
    }
    if (minGram > maxGram) {
      throw new IllegalArgumentException("minGram must not be greater than maxGram");
    }
    this.minGram = minGram;
    this.maxGram = maxGram;
    this.edgesOnly = edgesOnly;
    // 2 * maxGram in case all code points require 2 chars and + 1024 for
    // buffering to not keep polling the Reader
    charBuffer = CharacterUtils.newCharacterBuffer(2 * maxGram + 1024);
    // one int per code point; sized to charBuffer so a full refill always fits
    buffer = new int[charBuffer.getBuffer().length];

    // Make the term att large enough
    termAtt.resizeBuffer(2 * maxGram);
  }

  @Override
  public final boolean incrementToken() throws IOException {
    clearAttributes();

    // termination of this loop is guaranteed by the fact that every iteration
    // either advances the buffer (calls consume()) or increases gramSize
    while (true) {
      // compact: slide the unconsumed slice to the front of the buffer and
      // refill from the Reader, so a full maxGram-sized window is available
      if (bufferStart >= bufferEnd - maxGram - 1 && !exhausted) {
        System.arraycopy(buffer, bufferStart, buffer, 0, bufferEnd - bufferStart);
        bufferEnd -= bufferStart;
        // keep the token-char bookkeeping aligned with the shifted buffer
        lastCheckedChar -= bufferStart;
        lastNonTokenChar -= bufferStart;
        bufferStart = 0;

        // fill in remaining space
        exhausted = !CharacterUtils.fill(charBuffer, input, buffer.length - bufferEnd);
        // convert to code points
        bufferEnd += CharacterUtils.toCodePoints(charBuffer.getBuffer(), 0, charBuffer.getLength(), buffer, bufferEnd);
      }

      // should we go to the next offset?
      if (gramSize > maxGram || (bufferStart + gramSize) > bufferEnd) {
        if (bufferStart + 1 + minGram > bufferEnd) {
          // not even a minGram-sized gram left after advancing: end of stream
          assert exhausted;
          return false;
        }
        consume();
        gramSize = minGram;
      }

      updateLastNonTokenChar();

      // retry if the term to be emitted would contain a non-token char, or if
      // we only want leading-edge grams and this gram is not anchored right
      // after a non-token char
      final boolean termContainsNonTokenChar = lastNonTokenChar >= bufferStart && lastNonTokenChar < (bufferStart + gramSize);
      final boolean isEdgeAndPreviousCharIsTokenChar = edgesOnly && lastNonTokenChar != bufferStart - 1;
      if (termContainsNonTokenChar || isEdgeAndPreviousCharIsTokenChar) {
        consume();
        gramSize = minGram;
        continue;
      }

      // emit the gram: copy gramSize code points back into chars for the term
      final int length = CharacterUtils.toChars(buffer, bufferStart, gramSize, termAtt.buffer(), 0);
      termAtt.setLength(length);
      posIncAtt.setPositionIncrement(1);
      posLenAtt.setPositionLength(1);
      offsetAtt.setOffset(correctOffset(offset), correctOffset(offset + length));
      ++gramSize;
      return true;
    }
  }

  /**
   * Scans the not-yet-checked tail of the current gram (up to its last code
   * point) and records the most recent non-token char seen, so that grams
   * overlapping it can be skipped.
   */
  private void updateLastNonTokenChar() {
    final int termEnd = bufferStart + gramSize - 1;
    if (termEnd > lastCheckedChar) {
      // scan backwards so the first hit is the latest non-token char
      for (int i = termEnd; i > lastCheckedChar; --i) {
        if (!isTokenChar(buffer[i])) {
          lastNonTokenChar = i;
          break;
        }
      }
      lastCheckedChar = termEnd;
    }
  }

  /** Consume one code point. */
  private void consume() {
    // advance by 1 or 2 chars depending on whether it is a surrogate pair
    offset += Character.charCount(buffer[bufferStart++]);
  }

  /** Only collect characters which satisfy this condition. */
  protected boolean isTokenChar(int chr) {
    return true;
  }

  @Override
  public final void end() throws IOException {
    super.end();
    assert bufferStart <= bufferEnd;
    // the final offset lies past whatever unconsumed code points remain
    int endOffset = offset;
    for (int i = bufferStart; i < bufferEnd; ++i) {
      endOffset += Character.charCount(buffer[i]);
    }
    endOffset = correctOffset(endOffset);
    // set final offset
    offsetAtt.setOffset(endOffset, endOffset);
  }

  @Override
  public final void reset() throws IOException {
    super.reset();
    // start with an "empty" buffer so the first incrementToken() triggers a fill
    bufferStart = bufferEnd = buffer.length;
    lastNonTokenChar = lastCheckedChar = bufferStart - 1;
    offset = 0;
    gramSize = minGram;
    exhausted = false;
    charBuffer.reset();
  }
}