/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.lucene.analysis.shingle; import java.io.IOException; import java.util.Iterator; import java.util.LinkedList; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; import org.apache.lucene.analysis.tokenattributes.TypeAttribute; import org.apache.lucene.util.AttributeSource; /** * <p>A ShingleFilter constructs shingles (token n-grams) from a token stream. * In other words, it creates combinations of tokens as a single token. * * <p>For example, the sentence "please divide this sentence into shingles" * might be tokenized into shingles "please divide", "divide this", * "this sentence", "sentence into", and "into shingles". * * <p>This filter handles position increments > 1 by inserting filler tokens * (tokens with termtext "_"). It does not handle a position increment of 0. */ public final class ShingleFilter extends TokenFilter { /** * filler token for when positionIncrement is more than 1 */ public static final String DEFAULT_FILLER_TOKEN = "_"; /** * default maximum shingle size is 2. */ public static final int DEFAULT_MAX_SHINGLE_SIZE = 2; /** * default minimum shingle size is 2. */ public static final int DEFAULT_MIN_SHINGLE_SIZE = 2; /** * default token type attribute value is "shingle" */ public static final String DEFAULT_TOKEN_TYPE = "shingle"; /** * The default string to use when joining adjacent tokens to form a shingle */ public static final String DEFAULT_TOKEN_SEPARATOR = " "; /** * The sequence of input stream tokens (or filler tokens, if necessary) * that will be composed to form output shingles. */ private LinkedList<InputWindowToken> inputWindow = new LinkedList<>(); /** * The number of input tokens in the next output token. This is the "n" in * "token n-grams". */ private CircularSequence gramSize; /** * Shingle and unigram text is composed here. */ private StringBuilder gramBuilder = new StringBuilder(); /** * The token type attribute value to use - default is "shingle" */ private String tokenType = DEFAULT_TOKEN_TYPE; /** * The string to use when joining adjacent tokens to form a shingle */ private String tokenSeparator = DEFAULT_TOKEN_SEPARATOR; /** * The string to insert for each position at which there is no token * (i.e., when position increment is greater than one). */ private char[] fillerToken = DEFAULT_FILLER_TOKEN.toCharArray(); /** * By default, we output unigrams (individual tokens) as well as shingles * (token n-grams). */ private boolean outputUnigrams = true; /** * By default, we don't override behavior of outputUnigrams. */ private boolean outputUnigramsIfNoShingles = false; /** * maximum shingle size (number of tokens) */ private int maxShingleSize; /** * minimum shingle size (number of tokens) */ private int minShingleSize; /** * The remaining number of filler tokens to be inserted into the input stream * from which shingles are composed, to handle position increments greater * than one. */ private int numFillerTokensToInsert; /** * When the next input stream token has a position increment greater than * one, it is stored in this field until sufficient filler tokens have been * inserted to account for the position increment. */ private AttributeSource nextInputStreamToken; /** * Whether or not there is a next input stream token. */ private boolean isNextInputStreamToken = false; /** * Whether at least one unigram or shingle has been output at the current * position. */ private boolean isOutputHere = false; /** * true if no shingles have been output yet (for outputUnigramsIfNoShingles). */ boolean noShingleOutput = true; /** * Holds the State after input.end() was called, so we can * restore it in our end() impl. */ private State endState; private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class); private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class); /** * Constructs a ShingleFilter with the specified shingle size from the * {@link TokenStream} <code>input</code> * * @param input input stream * @param minShingleSize minimum shingle size produced by the filter. * @param maxShingleSize maximum shingle size produced by the filter. */ public ShingleFilter(TokenStream input, int minShingleSize, int maxShingleSize) { super(input); setMaxShingleSize(maxShingleSize); setMinShingleSize(minShingleSize); } /** * Constructs a ShingleFilter with the specified shingle size from the * {@link TokenStream} <code>input</code> * * @param input input stream * @param maxShingleSize maximum shingle size produced by the filter. */ public ShingleFilter(TokenStream input, int maxShingleSize) { this(input, DEFAULT_MIN_SHINGLE_SIZE, maxShingleSize); } /** * Construct a ShingleFilter with default shingle size: 2. * * @param input input stream */ public ShingleFilter(TokenStream input) { this(input, DEFAULT_MIN_SHINGLE_SIZE, DEFAULT_MAX_SHINGLE_SIZE); } /** * Construct a ShingleFilter with the specified token type for shingle tokens * and the default shingle size: 2 * * @param input input stream * @param tokenType token type for shingle tokens */ public ShingleFilter(TokenStream input, String tokenType) { this(input, DEFAULT_MIN_SHINGLE_SIZE, DEFAULT_MAX_SHINGLE_SIZE); setTokenType(tokenType); } /** * Set the type of the shingle tokens produced by this filter. * (default: "shingle") * * @param tokenType token tokenType */ public void setTokenType(String tokenType) { this.tokenType = tokenType; } /** * Shall the output stream contain the input tokens (unigrams) as well as * shingles? (default: true.) * * @param outputUnigrams Whether or not the output stream shall contain * the input tokens (unigrams) */ public void setOutputUnigrams(boolean outputUnigrams) { this.outputUnigrams = outputUnigrams; gramSize = new CircularSequence(); } /** * <p>Shall we override the behavior of outputUnigrams==false for those * times when no shingles are available (because there are fewer than * minShingleSize tokens in the input stream)? (default: false.) * <p>Note that if outputUnigrams==true, then unigrams are always output, * regardless of whether any shingles are available. * * @param outputUnigramsIfNoShingles Whether or not to output a single * unigram when no shingles are available. */ public void setOutputUnigramsIfNoShingles(boolean outputUnigramsIfNoShingles) { this.outputUnigramsIfNoShingles = outputUnigramsIfNoShingles; } /** * Set the max shingle size (default: 2) * * @param maxShingleSize max size of output shingles */ public void setMaxShingleSize(int maxShingleSize) { if (maxShingleSize < 2) { throw new IllegalArgumentException("Max shingle size must be >= 2"); } this.maxShingleSize = maxShingleSize; } /** * <p>Set the min shingle size (default: 2). * <p>This method requires that the passed in minShingleSize is not greater * than maxShingleSize, so make sure that maxShingleSize is set before * calling this method. * <p>The unigram output option is independent of the min shingle size. * * @param minShingleSize min size of output shingles */ public void setMinShingleSize(int minShingleSize) { if (minShingleSize < 2) { throw new IllegalArgumentException("Min shingle size must be >= 2"); } if (minShingleSize > maxShingleSize) { throw new IllegalArgumentException ("Min shingle size must be <= max shingle size"); } this.minShingleSize = minShingleSize; gramSize = new CircularSequence(); } /** * Sets the string to use when joining adjacent tokens to form a shingle * @param tokenSeparator used to separate input stream tokens in output shingles */ public void setTokenSeparator(String tokenSeparator) { this.tokenSeparator = null == tokenSeparator ? "" : tokenSeparator; } /** * Sets the string to insert for each position at which there is no token * (i.e., when position increment is greater than one). * * @param fillerToken string to insert at each position where there is no token */ public void setFillerToken(String fillerToken) { this.fillerToken = null == fillerToken ? new char[0] : fillerToken.toCharArray(); } @Override public boolean incrementToken() throws IOException { boolean tokenAvailable = false; int builtGramSize = 0; if (gramSize.atMinValue() || inputWindow.size() < gramSize.getValue()) { shiftInputWindow(); gramBuilder.setLength(0); } else { builtGramSize = gramSize.getPreviousValue(); } if (inputWindow.size() >= gramSize.getValue()) { boolean isAllFiller = true; InputWindowToken nextToken = null; Iterator<InputWindowToken> iter = inputWindow.iterator(); for (int gramNum = 1 ; iter.hasNext() && builtGramSize < gramSize.getValue() ; ++gramNum) { nextToken = iter.next(); if (builtGramSize < gramNum) { if (builtGramSize > 0) { gramBuilder.append(tokenSeparator); } gramBuilder.append(nextToken.termAtt.buffer(), 0, nextToken.termAtt.length()); ++builtGramSize; } if (isAllFiller && nextToken.isFiller) { if (gramNum == gramSize.getValue()) { gramSize.advance(); } } else { isAllFiller = false; } } if ( ! isAllFiller && builtGramSize == gramSize.getValue()) { inputWindow.getFirst().attSource.copyTo(this); posIncrAtt.setPositionIncrement(isOutputHere ? 0 : 1); termAtt.setEmpty().append(gramBuilder); if (gramSize.getValue() > 1) { typeAtt.setType(tokenType); noShingleOutput = false; } offsetAtt.setOffset(offsetAtt.startOffset(), nextToken.offsetAtt.endOffset()); if (outputUnigrams) { posLenAtt.setPositionLength(builtGramSize); } else { // position length for this token is the number of position created by shingles of smaller size. posLenAtt.setPositionLength(Math.max(1, (builtGramSize - minShingleSize) + 1)); } isOutputHere = true; gramSize.advance(); tokenAvailable = true; } } return tokenAvailable; } private boolean exhausted; /** * <p>Get the next token from the input stream. * <p>If the next token has <code>positionIncrement > 1</code>, * <code>positionIncrement - 1</code> {@link #fillerToken}s are * inserted first. * @param target Where to put the new token; if null, a new instance is created. * @return On success, the populated token; null otherwise * @throws IOException if the input stream has a problem */ private InputWindowToken getNextToken(InputWindowToken target) throws IOException { InputWindowToken newTarget = target; if (numFillerTokensToInsert > 0) { if (null == target) { newTarget = new InputWindowToken(nextInputStreamToken.cloneAttributes()); } else { nextInputStreamToken.copyTo(target.attSource); } // A filler token occupies no space newTarget.offsetAtt.setOffset(newTarget.offsetAtt.startOffset(), newTarget.offsetAtt.startOffset()); newTarget.termAtt.copyBuffer(fillerToken, 0, fillerToken.length); newTarget.isFiller = true; --numFillerTokensToInsert; } else if (isNextInputStreamToken) { if (null == target) { newTarget = new InputWindowToken(nextInputStreamToken.cloneAttributes()); } else { nextInputStreamToken.copyTo(target.attSource); } isNextInputStreamToken = false; newTarget.isFiller = false; } else if (!exhausted) { if (input.incrementToken()) { if (null == target) { newTarget = new InputWindowToken(cloneAttributes()); } else { this.copyTo(target.attSource); } if (posIncrAtt.getPositionIncrement() > 1) { // Each output shingle must contain at least one input token, // so no more than (maxShingleSize - 1) filler tokens will be inserted. numFillerTokensToInsert = Math.min(posIncrAtt.getPositionIncrement() - 1, maxShingleSize - 1); // Save the current token as the next input stream token if (null == nextInputStreamToken) { nextInputStreamToken = cloneAttributes(); } else { this.copyTo(nextInputStreamToken); } isNextInputStreamToken = true; // A filler token occupies no space newTarget.offsetAtt.setOffset(offsetAtt.startOffset(), offsetAtt.startOffset()); newTarget.termAtt.copyBuffer(fillerToken, 0, fillerToken.length); newTarget.isFiller = true; --numFillerTokensToInsert; } else { newTarget.isFiller = false; } } else { exhausted = true; input.end(); endState = captureState(); numFillerTokensToInsert = Math.min(posIncrAtt.getPositionIncrement(), maxShingleSize - 1); if (numFillerTokensToInsert > 0) { nextInputStreamToken = new AttributeSource(getAttributeFactory()); nextInputStreamToken.addAttribute(CharTermAttribute.class); OffsetAttribute newOffsetAtt = nextInputStreamToken.addAttribute(OffsetAttribute.class); newOffsetAtt.setOffset(offsetAtt.endOffset(), offsetAtt.endOffset()); // Recurse/loop just once: return getNextToken(target); } else { newTarget = null; } } } else { newTarget = null; } return newTarget; } @Override public void end() throws IOException { if (!exhausted) { super.end(); } else { restoreState(endState); } } /** * <p>Fills {@link #inputWindow} with input stream tokens, if available, * shifting to the right if the window was previously full. * <p>Resets {@link #gramSize} to its minimum value. * * @throws IOException if there's a problem getting the next token */ private void shiftInputWindow() throws IOException { InputWindowToken firstToken = null; if (inputWindow.size() > 0) { firstToken = inputWindow.removeFirst(); } while (inputWindow.size() < maxShingleSize) { if (null != firstToken) { // recycle the firstToken, if available if (null != getNextToken(firstToken)) { inputWindow.add(firstToken); // the firstToken becomes the last firstToken = null; } else { break; // end of input stream } } else { InputWindowToken nextToken = getNextToken(null); if (null != nextToken) { inputWindow.add(nextToken); } else { break; // end of input stream } } } if (outputUnigramsIfNoShingles && noShingleOutput && gramSize.minValue > 1 && inputWindow.size() < minShingleSize) { gramSize.minValue = 1; } gramSize.reset(); isOutputHere = false; } @Override public void reset() throws IOException { super.reset(); gramSize.reset(); inputWindow.clear(); nextInputStreamToken = null; isNextInputStreamToken = false; numFillerTokensToInsert = 0; isOutputHere = false; noShingleOutput = true; exhausted = false; endState = null; if (outputUnigramsIfNoShingles && ! outputUnigrams) { // Fix up gramSize if minValue was reset for outputUnigramsIfNoShingles gramSize.minValue = minShingleSize; } } /** * <p>An instance of this class is used to maintain the number of input * stream tokens that will be used to compose the next unigram or shingle: * {@link #gramSize}. * <p><code>gramSize</code> will take on values from the circular sequence * <b>{ [ 1, ] {@link #minShingleSize} [ , ... , {@link #maxShingleSize} ] }</b>. * <p>1 is included in the circular sequence only if * {@link #outputUnigrams} = true. */ private class CircularSequence { private int value; private int previousValue; private int minValue; public CircularSequence() { minValue = outputUnigrams ? 1 : minShingleSize; reset(); } /** * @return the current value. * @see #advance() */ public int getValue() { return value; } /** * <p>Increments this circular number's value to the next member in the * circular sequence * <code>gramSize</code> will take on values from the circular sequence * <b>{ [ 1, ] {@link #minShingleSize} [ , ... , {@link #maxShingleSize} ] }</b>. * <p>1 is included in the circular sequence only if * {@link #outputUnigrams} = true. */ public void advance() { previousValue = value; if (value == 1) { value = minShingleSize; } else if (value == maxShingleSize) { reset(); } else { ++value; } } /** * <p>Sets this circular number's value to the first member of the * circular sequence * <p><code>gramSize</code> will take on values from the circular sequence * <b>{ [ 1, ] {@link #minShingleSize} [ , ... , {@link #maxShingleSize} ] }</b>. * <p>1 is included in the circular sequence only if * {@link #outputUnigrams} = true. */ public void reset() { previousValue = value = minValue; } /** * <p>Returns true if the current value is the first member of the circular * sequence. * <p>If {@link #outputUnigrams} = true, the first member of the circular * sequence will be 1; otherwise, it will be {@link #minShingleSize}. * * @return true if the current value is the first member of the circular * sequence; false otherwise */ public boolean atMinValue() { return value == minValue; } /** * @return the value this instance had before the last advance() call */ public int getPreviousValue() { return previousValue; } } private static class InputWindowToken { final AttributeSource attSource; final CharTermAttribute termAtt; final OffsetAttribute offsetAtt; boolean isFiller = false; public InputWindowToken(AttributeSource attSource) { this.attSource = attSource; this.termAtt = attSource.getAttribute(CharTermAttribute.class); this.offsetAtt = attSource.getAttribute(OffsetAttribute.class); } } }