package org.apache.lucene.analysis;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.IdentityHashMap;

import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Payload;
import org.apache.lucene.util.Attribute;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.AttributeSource;

/**
 * A <code>TokenStream</code> enumerates the sequence of tokens, either from
 * {@link Field}s of a {@link Document} or from query text.
 * <p>
 * This is an abstract class; concrete subclasses are:
 * <ul>
 * <li>{@link Tokenizer}, a <code>TokenStream</code> whose input is a Reader; and
 * <li>{@link TokenFilter}, a <code>TokenStream</code> whose input is another
 * <code>TokenStream</code>.
 * </ul>
 * A new <code>TokenStream</code> API was introduced with Lucene 2.9. This API
 * has moved from being {@link Token}-based to {@link Attribute}-based. While
 * {@link Token} still exists in 2.9 as a convenience class, the preferred way
 * to store the information of a {@link Token} is to use {@link AttributeImpl}s.
 * <p>
 * <code>TokenStream</code> now extends {@link AttributeSource}, which provides
 * access to all of the token {@link Attribute}s for the <code>TokenStream</code>.
 * Note that only one instance per {@link AttributeImpl} is created and reused
 * for every token. This approach reduces object creation and allows local
 * caching of references to the {@link AttributeImpl}s. See
 * {@link #incrementToken()} for further details.
 * <p>
 * <b>The workflow of the new <code>TokenStream</code> API is as follows:</b>
 * <ol>
 * <li>Instantiation of <code>TokenStream</code>/{@link TokenFilter}s which add/get
 * attributes to/from the {@link AttributeSource}.
 * <li>The consumer calls {@link TokenStream#reset()}.
 * <li>The consumer retrieves attributes from the stream and stores local
 * references to all attributes it wants to access.
 * <li>The consumer calls {@link #incrementToken()} until it returns false,
 * consuming the attributes after each call.
 * <li>The consumer calls {@link #end()} so that any end-of-stream operations
 * can be performed.
 * <li>The consumer calls {@link #close()} to release any resources when finished
 * using the <code>TokenStream</code>.
 * </ol>
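 * As an illustration only, a minimal consumer following this workflow could
 * look like the sketch below (the <code>analyzer</code> and <code>reader</code>
 * are assumed to be set up elsewhere):
 * <pre>
 *   TokenStream stream = analyzer.tokenStream("field", reader);
 *   // retrieve the attribute reference once, up front (step 3 of the workflow)
 *   TermAttribute termAtt = (TermAttribute) stream.addAttribute(TermAttribute.class);
 *   stream.reset();
 *   try {
 *     while (stream.incrementToken()) {
 *       System.out.println(termAtt.term());
 *     }
 *     stream.end();
 *   } finally {
 *     stream.close();
 *   }
 * </pre>
 * <p>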
 * To make sure that filters and consumers know which attributes are available,
 * the attributes must be added during instantiation. Filters and consumers are
 * not required to check for availability of attributes in
 * {@link #incrementToken()}.
 * <p>
 * You can find some example code for the new API in the analysis package level
 * Javadoc.
 * <p>
 * Sometimes it is desirable to capture the current state of a <code>TokenStream</code>,
 * e.g., for buffering purposes (see {@link CachingTokenFilter},
 * {@link TeeSinkTokenFilter}). For this use case
 * {@link AttributeSource#captureState} and {@link AttributeSource#restoreState}
 * can be used.
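 * As a rough sketch, buffering one token inside a {@link TokenFilter} and
 * replaying it later could look as follows (<code>input</code> is the filter's
 * upstream stream; error handling omitted):
 * <pre>
 *   AttributeSource.State buffered = null;
 *   if (input.incrementToken()) {
 *     buffered = captureState();   // snapshot of all attribute values
 *   }
 *   // ... later, e.g., when the buffered token should be emitted:
 *   if (buffered != null) {
 *     restoreState(buffered);      // copies the snapshot back into the attributes
 *   }
 * </pre>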
 */
public abstract class TokenStream extends AttributeSource {

  /** @deprecated Remove this when old API is removed! */
  private static final AttributeFactory DEFAULT_TOKEN_WRAPPER_ATTRIBUTE_FACTORY
    = new TokenWrapperAttributeFactory(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);

  /** @deprecated Remove this when old API is removed! */
  private final TokenWrapper tokenWrapper;

  /** @deprecated Remove this when old API is removed! */
  private static boolean onlyUseNewAPI = false;

  /** @deprecated Remove this when old API is removed! */
  private final MethodSupport supportedMethods = getSupportedMethods(this.getClass());

  /** @deprecated Remove this when old API is removed! */
  private static final class MethodSupport {
    final boolean hasIncrementToken, hasReusableNext, hasNext;

    MethodSupport(Class clazz) {
      hasIncrementToken = isMethodOverridden(clazz, "incrementToken", METHOD_NO_PARAMS);
      hasReusableNext = isMethodOverridden(clazz, "next", METHOD_TOKEN_PARAM);
      hasNext = isMethodOverridden(clazz, "next", METHOD_NO_PARAMS);
    }

    private static boolean isMethodOverridden(Class clazz, String name, Class[] params) {
      try {
        return clazz.getMethod(name, params).getDeclaringClass() != TokenStream.class;
      } catch (NoSuchMethodException e) {
        // should not happen
        throw new RuntimeException(e);
      }
    }

    private static final Class[] METHOD_NO_PARAMS = new Class[0];
    private static final Class[] METHOD_TOKEN_PARAM = new Class[]{Token.class};
  }

  /** @deprecated Remove this when old API is removed! */
  private static final IdentityHashMap/*<Class<? extends TokenStream>,MethodSupport>*/
    knownMethodSupport = new IdentityHashMap();

  /** @deprecated Remove this when old API is removed! */
  private static MethodSupport getSupportedMethods(Class clazz) {
    MethodSupport supportedMethods;
    synchronized(knownMethodSupport) {
      supportedMethods = (MethodSupport) knownMethodSupport.get(clazz);
      if (supportedMethods == null) {
        knownMethodSupport.put(clazz, supportedMethods = new MethodSupport(clazz));
      }
    }
    return supportedMethods;
  }

  /** @deprecated Remove this when old API is removed! */
  private static final class TokenWrapperAttributeFactory extends AttributeFactory {
    private final AttributeFactory delegate;

    private TokenWrapperAttributeFactory(AttributeFactory delegate) {
      this.delegate = delegate;
    }

    public AttributeImpl createAttributeInstance(Class attClass) {
      return attClass.isAssignableFrom(TokenWrapper.class)
        ? new TokenWrapper()
        : delegate.createAttributeInstance(attClass);
    }

    // this is needed for TeeSinkTokenStream's check for compatibility of AttributeSource,
    // so two TokenStreams using old API have the same AttributeFactory wrapped by this one.
    public boolean equals(Object other) {
      if (this == other) return true;
      if (other instanceof TokenWrapperAttributeFactory) {
        final TokenWrapperAttributeFactory af = (TokenWrapperAttributeFactory) other;
        return this.delegate.equals(af.delegate);
      }
      return false;
    }

    public int hashCode() {
      return delegate.hashCode() ^ 0x0a45ff31;
    }
  }

  /**
   * A TokenStream using the default attribute factory.
   */
  protected TokenStream() {
    super(onlyUseNewAPI
      ? AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY
      : TokenStream.DEFAULT_TOKEN_WRAPPER_ATTRIBUTE_FACTORY);
    tokenWrapper = initTokenWrapper(null);
    check();
  }

  /**
   * A TokenStream that uses the same attributes as the supplied one.
   */
  protected TokenStream(AttributeSource input) {
    super(input);
    tokenWrapper = initTokenWrapper(input);
    check();
  }

  /**
   * A TokenStream using the supplied AttributeFactory for creating new {@link Attribute} instances.
   */
  protected TokenStream(AttributeFactory factory) {
    super(onlyUseNewAPI
      ? factory
      : new TokenWrapperAttributeFactory(factory));
    tokenWrapper = initTokenWrapper(null);
    check();
  }

  /** @deprecated Remove this when old API is removed! */
  private TokenWrapper initTokenWrapper(AttributeSource input) {
    if (onlyUseNewAPI) {
      // no wrapper needed
      return null;
    } else {
      // if possible get the wrapper from the filter's input stream
      if (input instanceof TokenStream && ((TokenStream) input).tokenWrapper != null) {
        return ((TokenStream) input).tokenWrapper;
      }
      // check that all attributes are implemented by the same TokenWrapper instance
      final Attribute att = addAttribute(TermAttribute.class);
      if (att instanceof TokenWrapper &&
        addAttribute(TypeAttribute.class) == att &&
        addAttribute(PositionIncrementAttribute.class) == att &&
        addAttribute(FlagsAttribute.class) == att &&
        addAttribute(OffsetAttribute.class) == att &&
        addAttribute(PayloadAttribute.class) == att
      ) {
        return (TokenWrapper) att;
      } else {
        throw new UnsupportedOperationException(
          "If onlyUseNewAPI is disabled, all basic Attributes must be implemented by the internal class "+
          "TokenWrapper. Please make sure that all TokenStreams/TokenFilters in this chain have been "+
          "instantiated with this flag disabled and do not add any custom instances for the basic Attributes!"
        );
      }
    }
  }

  /** @deprecated Remove this when old API is removed! */
  private void check() {
    if (onlyUseNewAPI && !supportedMethods.hasIncrementToken) {
      throw new UnsupportedOperationException(getClass().getName()
        + " does not implement incrementToken() which is needed for onlyUseNewAPI.");
    }
    // a TokenStream subclass must at least implement one of the methods!
    if (!(supportedMethods.hasIncrementToken || supportedMethods.hasNext || supportedMethods.hasReusableNext)) {
      throw new UnsupportedOperationException(getClass().getName()
        + " does not implement any of incrementToken(), next(Token), next().");
    }
  }

  /**
   * For extra performance you can globally enable the new
   * {@link #incrementToken} API using {@link Attribute}s. There will be a
   * small, but in most cases negligible, performance increase by enabling this,
   * but it only works if <b>all</b> <code>TokenStream</code>s use the new API and
   * implement {@link #incrementToken}. This setting can only be enabled
   * globally.
   * <p>
   * This setting only affects <code>TokenStream</code>s instantiated after this
   * call. All <code>TokenStream</code>s already created keep the setting that
   * was in effect when they were instantiated.
   * <p>
   * All core {@link Analyzer}s are compatible with this setting; if you have
   * your own <code>TokenStream</code>s that are also compatible, you should enable
   * this.
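   * <p>
   * As an illustrative sketch only, the switch would typically be flipped once
   * at application startup, before any streams are created (<code>analyzer</code>
   * and <code>reader</code> are assumed to exist):
   * <pre>
   *   TokenStream.setOnlyUseNewAPI(true);
   *   // every TokenStream instantiated from here on must implement incrementToken()
   *   TokenStream stream = analyzer.tokenStream("field", reader);
   * </pre>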
   * <p>
   * When enabled, tokenization may throw an {@link UnsupportedOperationException}
   * if the whole tokenizer chain is not compatible, e.g., if one of the
   * <code>TokenStream</code>s does not implement the new <code>TokenStream</code> API.
   * <p>
   * The default is <code>false</code>, so the fallback to the old API is
   * available.
   *
   * @deprecated This setting will no longer be needed in Lucene 3.0 as the old
   *             API will be removed.
   */
  public static void setOnlyUseNewAPI(boolean onlyUseNewAPI) {
    TokenStream.onlyUseNewAPI = onlyUseNewAPI;
  }

  /**
   * Returns <code>true</code> if only the new API is used.
   *
   * @see #setOnlyUseNewAPI
   * @deprecated This setting will no longer be needed in Lucene 3.0 as
   *             the old API will be removed.
   */
  public static boolean getOnlyUseNewAPI() {
    return onlyUseNewAPI;
  }

  /**
   * Consumers (i.e., {@link IndexWriter}) use this method to advance the stream to
   * the next token. Implementing classes must implement this method and update
   * the appropriate {@link AttributeImpl}s with the attributes of the next
   * token.
   * <p>
   * The producer must make no assumptions about the attributes after the method
   * has returned: the caller may arbitrarily change them. If the producer
   * needs to preserve the state for subsequent calls, it can use
   * {@link #captureState} to create a copy of the current attribute state.
   * <p>
   * This method is called for every token of a document, so an efficient
   * implementation is crucial for good performance. To avoid calls to
   * {@link #addAttribute(Class)} and {@link #getAttribute(Class)} or downcasts,
   * references to all {@link AttributeImpl}s that this stream uses should be
   * retrieved during instantiation.
   * <p>
   * To ensure that filters and consumers know which attributes are available,
   * the attributes must be added during instantiation. Filters and consumers
   * are not required to check for availability of attributes in
   * {@link #incrementToken()}.
   *
   * @return false for end of stream; true otherwise
   *
   * <p>
   * <b>Note that this method will be defined abstract in Lucene
   * 3.0.</b>
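   * <p>
   * As a rough sketch of a typical implementation, a hypothetical lowercasing
   * {@link TokenFilter} could look like this (note that the attribute reference
   * is retrieved once, during instantiation, not per token):
   * <pre>
   *   public final class LowerCaseSketchFilter extends TokenFilter {
   *     private final TermAttribute termAtt;
   *
   *     public LowerCaseSketchFilter(TokenStream input) {
   *       super(input);
   *       termAtt = (TermAttribute) addAttribute(TermAttribute.class);
   *     }
   *
   *     public boolean incrementToken() throws IOException {
   *       if (!input.incrementToken()) return false;  // end of stream
   *       // lowercase the term in place in the shared attribute
   *       final char[] buffer = termAtt.termBuffer();
   *       final int length = termAtt.termLength();
   *       for (int i = 0; i &lt; length; i++) {
   *         buffer[i] = Character.toLowerCase(buffer[i]);
   *       }
   *       return true;
   *     }
   *   }
   * </pre>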
   */
  public boolean incrementToken() throws IOException {
    assert tokenWrapper != null;

    final Token token;
    if (supportedMethods.hasReusableNext) {
      token = next(tokenWrapper.delegate);
    } else {
      assert supportedMethods.hasNext;
      token = next();
    }
    if (token == null) return false;
    tokenWrapper.delegate = token;
    return true;
  }

  /**
   * This method is called by the consumer after the last token has been
   * consumed, after {@link #incrementToken()} returned <code>false</code>
   * (using the new <code>TokenStream</code> API). Streams implementing the old API
   * should upgrade to use this feature.
   * <p>
   * This method can be used to perform any end-of-stream operations, such as
   * setting the final offset of a stream. The final offset of a stream might
   * differ from the offset of the last token, e.g., in case one or more
   * whitespace characters followed the last token and a {@link WhitespaceTokenizer}
   * was used.
   *
   * @throws IOException
   */
  public void end() throws IOException {
    // do nothing by default
  }

  /**
   * Returns the next token in the stream, or null at EOS. When possible, the
   * input Token should be used as the returned Token (this gives fastest
   * tokenization performance), but this is not required and a new Token may be
   * returned. Callers may re-use a single Token instance for successive calls
   * to this method.
   * <p>
   * This implicitly defines a "contract" between consumers (callers of this
   * method) and producers (implementations of this method that are the source
   * for tokens):
   * <ul>
   * <li>A consumer must fully consume the previously returned {@link Token}
   * before calling this method again.</li>
   * <li>A producer must call {@link Token#clear()} before setting the fields in
   * it and returning it.</li>
   * </ul>
   * Also, the producer must make no assumptions about a {@link Token} after it
   * has been returned: the caller may arbitrarily change it. If the producer
   * needs to hold onto the {@link Token} for subsequent calls, it must clone()
   * it before storing it. Note that a {@link TokenFilter} is considered a
   * consumer.
   *
   * @param reusableToken a {@link Token} that may or may not be used as the
   *        returned Token; this parameter should never be null (the callee is
   *        not required to check for null before using it, but it is a good
   *        idea to assert that it is not null.)
   * @return next {@link Token} in the stream or null if end-of-stream was hit
   * @deprecated The new {@link #incrementToken()} and {@link AttributeSource}
   *             APIs should be used instead.
   */
  public Token next(final Token reusableToken) throws IOException {
    assert reusableToken != null;

    if (tokenWrapper == null)
      throw new UnsupportedOperationException("This TokenStream only supports the new Attributes API.");

    if (supportedMethods.hasIncrementToken) {
      tokenWrapper.delegate = reusableToken;
      return incrementToken() ? tokenWrapper.delegate : null;
    } else {
      assert supportedMethods.hasNext;
      return next();
    }
  }

  /**
   * Returns the next {@link Token} in the stream, or null at EOS.
   *
   * @deprecated The returned Token is a "full private copy" (not re-used across
   *             calls to {@link #next()}) but will be slower than calling
   *             {@link #next(Token)} or using the new {@link #incrementToken()}
   *             method with the new {@link AttributeSource} API.
   */
  public Token next() throws IOException {
    if (tokenWrapper == null)
      throw new UnsupportedOperationException("This TokenStream only supports the new Attributes API.");

    final Token nextToken;
    if (supportedMethods.hasIncrementToken) {
      final Token savedDelegate = tokenWrapper.delegate;
      tokenWrapper.delegate = new Token();
      nextToken = incrementToken() ? tokenWrapper.delegate : null;
      tokenWrapper.delegate = savedDelegate;
    } else {
      assert supportedMethods.hasReusableNext;
      nextToken = next(new Token());
    }

    if (nextToken != null) {
      Payload p = nextToken.getPayload();
      if (p != null) {
        nextToken.setPayload((Payload) p.clone());
      }
    }
    return nextToken;
  }

  /**
   * Resets this stream to the beginning. This is an optional operation, so
   * subclasses may or may not implement this method. {@link #reset()} is not needed for
   * the standard indexing process. However, if the tokens of a
   * <code>TokenStream</code> are intended to be consumed more than once, it is
   * necessary to implement {@link #reset()}. Note that if your TokenStream
   * caches tokens and feeds them back again after a reset, it is imperative
   * that you clone the tokens when you store them away (on the first pass) as
   * well as when you return them (on future passes after {@link #reset()}).
   */
  public void reset() throws IOException {}

  /** Releases resources associated with this stream. */
  public void close() throws IOException {}
}