package com.antbrains.wordseg.luceneanalyzer;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Closeable;
import java.lang.reflect.Modifier;
/**
 * A <code>TokenStream</code> enumerates the sequence of tokens, either from {@link Field}s of a
 * {@link Document} or from query text.
 * <p>
 * This is an abstract class; concrete subclasses are:
 * <ul>
 * <li>{@link Tokenizer}, a <code>TokenStream</code> whose input is a Reader; and
 * <li>{@link TokenFilter}, a <code>TokenStream</code> whose input is another
 * <code>TokenStream</code>.
 * </ul>
 * A new <code>TokenStream</code> API has been introduced with Lucene 2.9. This API has moved from
 * being {@link Token}-based to {@link Attribute}-based. While {@link Token} still exists in 2.9 as
 * a convenience class, the preferred way to store the information of a {@link Token} is to use
 * {@link AttributeImpl}s.
 * <p>
 * <code>TokenStream</code> now extends {@link AttributeSource}, which provides access to all of the
 * token {@link Attribute}s for the <code>TokenStream</code>. Note that only one instance per
 * {@link AttributeImpl} is created and reused for every token. This approach reduces object
 * creation and allows local caching of references to the {@link AttributeImpl}s. See
 * {@link #incrementToken()} for further details.
 * <p>
 * <b>The workflow of the new <code>TokenStream</code> API is as follows:</b>
 * <ol>
 * <li>Instantiation of <code>TokenStream</code>/{@link TokenFilter}s which add/get attributes
 * to/from the {@link AttributeSource}.
 * <li>The consumer calls {@link TokenStream#reset()}.
 * <li>The consumer retrieves attributes from the stream and stores local references to all
 * attributes it wants to access.
 * <li>The consumer calls {@link #incrementToken()} until it returns false, consuming the
 * attributes after each call.
 * <li>The consumer calls {@link #end()} so that any end-of-stream operations can be performed.
 * <li>The consumer calls {@link #close()} to release any resource when finished using the
 * <code>TokenStream</code>.
 * </ol>
 * To make sure that filters and consumers know which attributes are available, the attributes must
 * be added during instantiation. Filters and consumers are not required to check for availability
 * of attributes in {@link #incrementToken()}.
 * <p>
 * You can find some example code for the new API in the analysis package level Javadoc.
 * <p>
 * Sometimes it is desirable to capture a current state of a <code>TokenStream</code>, e.g., for
 * buffering purposes (see {@link CachingTokenFilter}, TeeSinkTokenFilter). For this use case
 * {@link AttributeSource#captureState} and {@link AttributeSource#restoreState} can be used.
 * <p>
 * The {@code TokenStream}-API in Lucene is based on the decorator pattern. Therefore all
 * non-abstract subclasses must be final or have at least a final implementation of
 * {@link #incrementToken}! This is checked when Java assertions are enabled.
 */
public abstract class TokenStream extends AttributeSource implements Closeable {

  /**
   * A TokenStream using the default attribute factory.
   */
  protected TokenStream() {
    super();
    assert assertFinal();
  }

  /**
   * A TokenStream that uses the same attributes as the supplied one.
   *
   * @param input the attribute source whose attributes this stream shares
   */
  protected TokenStream(AttributeSource input) {
    super(input);
    assert assertFinal();
  }

  /**
   * A TokenStream using the supplied AttributeFactory for creating new {@link Attribute} instances.
   *
   * @param factory the factory used to create new attribute instances
   */
  protected TokenStream(AttributeFactory factory) {
    super(factory);
    assert assertFinal();
  }

  /**
   * Checks the finality rule stated in the class documentation: the runtime class must be
   * anonymous, or carry the {@code final} or {@code private} modifier, or declare a final
   * {@code incrementToken()}.
   * <p>
   * This method is only invoked from {@code assert} statements in the constructors, so it is not
   * executed when assertions are disabled for this class.
   *
   * @return {@code true} if the check passed; {@code false} if {@code incrementToken()} could not
   *         be looked up reflectively, which makes the calling {@code assert} fail
   */
  private boolean assertFinal() {
    try {
      final Class<?> clazz = getClass();
      // The three alternatives are evaluated left to right; the reflective lookup only
      // happens when the cheaper class-level checks did not already succeed.
      assert clazz.isAnonymousClass()
          || (clazz.getModifiers() & (Modifier.FINAL | Modifier.PRIVATE)) != 0
          || Modifier.isFinal(clazz.getMethod("incrementToken").getModifiers()) : "TokenStream implementation classes or at least their incrementToken() implementation must be final";
      return true;
    } catch (NoSuchMethodException nsme) {
      // incrementToken() is declared public on this class, so the lookup is expected to
      // succeed for every subclass; report a failed check if it unexpectedly does not.
      return false;
    }
  }

  /**
   * Consumers (i.e., {@link IndexWriter}) use this method to advance the stream to the next token.
   * Implementing classes must implement this method and update the appropriate
   * {@link AttributeImpl}s with the attributes of the next token.
   * <p>
   * The producer must make no assumptions about the attributes after the method has returned:
   * the caller may arbitrarily change them. If the producer needs to preserve the state for
   * subsequent calls, it can use {@link #captureState} to create a copy of the current attribute
   * state.
   * <p>
   * This method is called for every token of a document, so an efficient implementation is crucial
   * for good performance. To avoid calls to {@link #addAttribute(Class)} and
   * {@link #getAttribute(Class)}, references to all {@link AttributeImpl}s that this stream uses
   * should be retrieved during instantiation.
   * <p>
   * To ensure that filters and consumers know which attributes are available, the attributes must
   * be added during instantiation. Filters and consumers are not required to check for availability
   * of attributes in {@link #incrementToken()}.
   *
   * @return false for end of stream; true otherwise
   * @throws IOException declared so that implementations may report I/O failures
   */
  public abstract boolean incrementToken() throws IOException;

  /**
   * This method is called by the consumer after the last token has been consumed, after
   * {@link #incrementToken()} returned <code>false</code> (using the new <code>TokenStream</code>
   * API). Streams implementing the old API should upgrade to use this feature.
   * <p>
   * This method can be used to perform any end-of-stream operations, such as setting the final
   * offset of a stream. The final offset of a stream might differ from the offset of the last
   * token, e.g., in case one or more whitespace characters followed the last token, but a
   * WhitespaceTokenizer was used.
   * <p>
   * The default implementation does nothing.
   *
   * @throws IOException declared so that overriding implementations may report I/O failures
   */
  public void end() throws IOException {
    // do nothing by default
  }

  /**
   * Resets this stream to the beginning. This is an optional operation, so subclasses may or may
   * not implement this method. {@link #reset()} is not needed for the standard indexing process.
   * However, if the tokens of a <code>TokenStream</code> are intended to be consumed more than
   * once, it is necessary to implement {@link #reset()}. Note that if your TokenStream caches
   * tokens and feeds them back again after a reset, it is imperative that you clone the tokens when
   * you store them away (on the first pass) as well as when you return them (on future passes after
   * {@link #reset()}).
   * <p>
   * The default implementation does nothing.
   *
   * @throws IOException declared so that overriding implementations may report I/O failures
   */
  public void reset() throws IOException {
    // do nothing by default
  }

  /**
   * Releases resources associated with this stream.
   * <p>
   * The default implementation does nothing.
   *
   * @throws IOException declared so that overriding implementations may report I/O failures
   */
  @Override
  public void close() throws IOException {
    // do nothing by default
  }
}