package org.apache.lucene.analysis;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.util.AttributeSource.AttributeFactory;

/**
 * Tokenizer for testing.
 * <p>
 * This tokenizer is a replacement for the {@link #WHITESPACE}, {@link #SIMPLE}, and {@link #KEYWORD}
 * tokenizers. If you are writing a component such as a TokenFilter, it's a great idea to test
 * it by wrapping this tokenizer instead, for the extra checks. This tokenizer has the following behavior:
 * <ul>
 *   <li>An internal state machine is used for checking consumer consistency. These checks can
 *       be disabled with {@link #setEnableChecks(boolean)}.
 *   <li>For convenience, it optionally lowercases the terms it outputs.
 * </ul>
 */
public class MockTokenizer extends Tokenizer {
  /** Acts similarly to WhitespaceTokenizer. */
  public static final int WHITESPACE = 0;
  /** Acts similarly to KeywordTokenizer.
   * TODO: Keyword returns an "empty" token for an empty reader... */
  public static final int KEYWORD = 1;
  /** Acts like LetterTokenizer. */
  public static final int SIMPLE = 2;

  private final int pattern;
  private final boolean lowerCase;
  private final int maxTokenLength;
  public static final int DEFAULT_MAX_TOKEN_LENGTH = Integer.MAX_VALUE;

  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
  int off = 0;

  // TODO: "register" with LuceneTestCase to ensure all streams are closed() ?
  // currently, we can only check that the lifecycle is correct if someone is reusing,
  // but not for "one-offs".
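  /*
   * The state machine below checks that a consumer drives this stream through
   * the expected lifecycle. A conforming test looks roughly like this sketch
   * (MyTokenFilter is a hypothetical filter under test, not a real class):
   *
   *   TokenStream ts = new MyTokenFilter(
   *       new MockTokenizer(reader, MockTokenizer.WHITESPACE, true));
   *   ts.reset();                      // -> RESET
   *   while (ts.incrementToken()) {    // -> INCREMENT, then INCREMENT_FALSE
   *     // inspect term/offset attributes here
   *   }
   *   ts.end();                        // -> END
   *   ts.close();                      // -> CLOSE
   */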
  private static enum State {
    SETREADER,       // consumer set a reader input either via ctor or via reset(Reader)
    RESET,           // consumer has called reset()
    INCREMENT,       // consumer is consuming, has called incrementToken() == true
    INCREMENT_FALSE, // consumer has called incrementToken() which returned false
    END,             // consumer has called end() to perform end of stream operations
    CLOSE            // consumer has called close() to release any resources
  };

  private State streamState = State.CLOSE;
  private boolean enableChecks = true;

  public MockTokenizer(AttributeFactory factory, Reader input, int pattern, boolean lowerCase, int maxTokenLength) {
    super(factory, input);
    this.pattern = pattern;
    this.lowerCase = lowerCase;
    this.streamState = State.SETREADER;
    this.maxTokenLength = maxTokenLength;
  }

  public MockTokenizer(Reader input, int pattern, boolean lowerCase, int maxTokenLength) {
    this(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, input, pattern, lowerCase, maxTokenLength);
  }

  public MockTokenizer(Reader input, int pattern, boolean lowerCase) {
    this(input, pattern, lowerCase, DEFAULT_MAX_TOKEN_LENGTH);
  }

  @Override
  public final boolean incrementToken() throws IOException {
    assert !enableChecks || (streamState == State.RESET || streamState == State.INCREMENT)
        : "incrementToken() called while in wrong state: " + streamState;
    clearAttributes();
    for (;;) {
      int startOffset = off;
      int cp = readCodePoint();
      if (cp < 0) {
        break;
      } else if (isTokenChar(cp)) {
        int endOffset;
        do {
          char chars[] = Character.toChars(normalize(cp));
          for (int i = 0; i < chars.length; i++)
            termAtt.append(chars[i]);
          endOffset = off;
          if (termAtt.length() >= maxTokenLength) {
            break;
          }
          cp = readCodePoint();
        } while (cp >= 0 && isTokenChar(cp));
        offsetAtt.setOffset(correctOffset(startOffset), correctOffset(endOffset));
        streamState = State.INCREMENT;
        return true;
      }
    }
    streamState = State.INCREMENT_FALSE;
    return false;
  }

  protected int readCodePoint() throws IOException {
    int ch = input.read();
    if (ch < 0) {
      return ch;
    } else {
      assert ch != 0xffff; /* only on 3.x */
      assert !Character.isLowSurrogate((char) ch);
      off++;
      if (Character.isHighSurrogate((char) ch)) {
        int ch2 = input.read();
        if (ch2 >= 0) {
          off++;
          assert Character.isLowSurrogate((char) ch2);
          return Character.toCodePoint((char) ch, (char) ch2);
        }
      }
      return ch;
    }
  }
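  // Worked example for readCodePoint(): a supplementary character such as
  // U+1D11E (MUSICAL SYMBOL G CLEF) arrives from the Reader as a surrogate
  // pair of two chars; the pair is joined into a single code point and off
  // advances by 2, so offsets remain in Java char units for correctOffset().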
  protected boolean isTokenChar(int c) {
    switch (pattern) {
      case WHITESPACE: return !Character.isWhitespace(c);
      case KEYWORD: return true;
      case SIMPLE: return Character.isLetter(c);
      default: throw new RuntimeException("invalid pattern constant:" + pattern);
    }
  }

  protected int normalize(int c) {
    return lowerCase ? Character.toLowerCase(c) : c;
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    off = 0;
    assert !enableChecks || streamState != State.RESET : "double reset()";
    streamState = State.RESET;
  }

  @Override
  public void close() throws IOException {
    super.close();
    // in some exceptional cases (e.g. TestIndexWriterExceptions) a test can prematurely close().
    // Such tests should disable this check; by default we check the normal workflow.
    // TODO: investigate the CachingTokenFilter "double-close"... for now we ignore this
    assert !enableChecks || streamState == State.END || streamState == State.CLOSE
        : "close() called in wrong state: " + streamState;
    streamState = State.CLOSE;
  }

  @Override
  public void reset(Reader input) throws IOException {
    super.reset(input);
    assert !enableChecks || streamState == State.CLOSE
        : "setReader() called in wrong state: " + streamState;
    streamState = State.SETREADER;
  }

  @Override
  public void end() throws IOException {
    int finalOffset = correctOffset(off);
    offsetAtt.setOffset(finalOffset, finalOffset);
    // some tokenizers, such as limiting tokenizers, call end() before incrementToken() returns false.
    // Such tests should disable this check (in general you should consume the entire stream).
    assert !enableChecks || streamState == State.INCREMENT_FALSE
        : "end() called before incrementToken() returned false!";
    streamState = State.END;
  }

  /**
   * Toggle consumer workflow checking: if your test consumes token streams normally, you
   * should leave this enabled.
   */
  public void setEnableChecks(boolean enableChecks) {
    this.enableChecks = enableChecks;
  }
}