/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis;

import java.io.IOException;
import java.nio.CharBuffer;
import java.util.Random;

import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.util.AttributeFactory;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.RegExp;

import com.carrotsearch.randomizedtesting.RandomizedContext;

/**
 * Tokenizer for testing.
 * <p>
 * This tokenizer is a replacement for {@link #WHITESPACE}, {@link #SIMPLE}, and {@link #KEYWORD}
 * tokenizers. If you are writing a component such as a TokenFilter, it's a great idea to test
 * it by wrapping this tokenizer instead, for the extra checks it performs. This tokenizer has
 * the following behavior:
 * <ul>
 *   <li>An internal state-machine is used for checking consumer consistency. These checks can
 *       be disabled with {@link #setEnableChecks(boolean)}.
 *   <li>For convenience, optionally lowercases terms that it outputs.
 * </ul>
 */
public class MockTokenizer extends Tokenizer {
  /** Acts like WhitespaceTokenizer. */
  public static final CharacterRunAutomaton WHITESPACE =
      new CharacterRunAutomaton(new RegExp("[^ \t\r\n]+").toAutomaton());
  /** Acts like KeywordTokenizer.
   *  TODO: Keyword returns an "empty" token for an empty reader... */
  public static final CharacterRunAutomaton KEYWORD =
      new CharacterRunAutomaton(new RegExp(".*").toAutomaton());
  /** Acts like LetterTokenizer. */
  // the ugly regex below is incomplete Unicode 5.2 [:Letter:]
  public static final CharacterRunAutomaton SIMPLE =
      new CharacterRunAutomaton(new RegExp("[A-Za-zªµºÀ-ÖØ-öø-ˁ一-鿌]+").toAutomaton());

  private final CharacterRunAutomaton runAutomaton;
  private final boolean lowerCase;
  private final int maxTokenLength;
  public static final int DEFAULT_MAX_TOKEN_LENGTH = Integer.MAX_VALUE;
  private int state;

  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
  int off = 0;

  // buffered state (previous codepoint and offset). we replay this once we
  // hit a reject state in case it's permissible as the start of a new term.
  int bufferedCodePoint = -1; // -1 indicates empty buffer
  int bufferedOff = -1;

  // TODO: "register" with LuceneTestCase to ensure all streams are closed() ?
  // currently, we can only check that the lifecycle is correct if someone is reusing,
  // but not for "one-offs".
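
  // For orientation, a minimal usage sketch (not part of this class's API; "MyTokenFilter" is a
  // hypothetical filter under test, assumed to pass tokens through unchanged, and
  // assertTokenStreamContents is BaseTokenStreamTestCase's standard consumer):
  //
  //   MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true);
  //   tokenizer.setReader(new java.io.StringReader("Foo BAR"));
  //   TokenStream stream = new MyTokenFilter(tokenizer);
  //   BaseTokenStreamTestCase.assertTokenStreamContents(stream, new String[] {"foo", "bar"});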
  private static enum State {
    SETREADER,       // consumer set a reader input either via ctor or via reset(Reader)
    RESET,           // consumer has called reset()
    INCREMENT,       // consumer is consuming, has called incrementToken() == true
    INCREMENT_FALSE, // consumer has called incrementToken() which returned false
    END,             // consumer has called end() to perform end of stream operations
    CLOSE            // consumer has called close() to release any resources
  };

  private State streamState = State.CLOSE;
  private int lastOffset = 0; // only for checks
  private boolean enableChecks = true;

  // evil: but we don't change the behavior with this random, we only switch up how we read
  private final Random random = new Random(RandomizedContext.current().getRandom().nextLong());

  public MockTokenizer(AttributeFactory factory, CharacterRunAutomaton runAutomaton,
                       boolean lowerCase, int maxTokenLength) {
    super(factory);
    this.runAutomaton = runAutomaton;
    this.lowerCase = lowerCase;
    this.state = 0;
    this.maxTokenLength = maxTokenLength;
  }

  public MockTokenizer(CharacterRunAutomaton runAutomaton, boolean lowerCase, int maxTokenLength) {
    this(BaseTokenStreamTestCase.newAttributeFactory(), runAutomaton, lowerCase, maxTokenLength);
  }

  public MockTokenizer(CharacterRunAutomaton runAutomaton, boolean lowerCase) {
    this(runAutomaton, lowerCase, DEFAULT_MAX_TOKEN_LENGTH);
  }

  /** Calls {@link #MockTokenizer(CharacterRunAutomaton, boolean) MockTokenizer(WHITESPACE, true)} */
  public MockTokenizer() {
    this(WHITESPACE, true);
  }

  public MockTokenizer(AttributeFactory factory, CharacterRunAutomaton runAutomaton, boolean lowerCase) {
    this(factory, runAutomaton, lowerCase, DEFAULT_MAX_TOKEN_LENGTH);
  }

  /** Calls {@link #MockTokenizer(AttributeFactory,CharacterRunAutomaton,boolean)
   *  MockTokenizer(factory, WHITESPACE, true)} */
  public MockTokenizer(AttributeFactory factory) {
    this(factory, WHITESPACE, true);
  }

  // we allow some checks (e.g. state machine) to be turned off.
  // turning off checks just means we suppress exceptions from them
  private void fail(String message) {
    if (enableChecks) {
      throw new IllegalStateException(message);
    }
  }

  private void failAlways(String message) {
    throw new IllegalStateException(message);
  }

  @Override
  public final boolean incrementToken() throws IOException {
    if (streamState != State.RESET && streamState != State.INCREMENT) {
      fail("incrementToken() called while in wrong state: " + streamState);
    }

    clearAttributes();
    for (;;) {
      int startOffset;
      int cp;
      if (bufferedCodePoint >= 0) {
        cp = bufferedCodePoint;
        startOffset = bufferedOff;
        bufferedCodePoint = -1;
      } else {
        startOffset = off;
        cp = readCodePoint();
      }
      if (cp < 0) {
        break;
      } else if (isTokenChar(cp)) {
        int endOffset;
        do {
          char chars[] = Character.toChars(normalize(cp));
          for (int i = 0; i < chars.length; i++) {
            termAtt.append(chars[i]);
          }
          endOffset = off;
          if (termAtt.length() >= maxTokenLength) {
            break;
          }
          cp = readCodePoint();
        } while (cp >= 0 && isTokenChar(cp));

        if (termAtt.length() < maxTokenLength) {
          // buffer up, in case the "rejected" char can start a new word of its own
          bufferedCodePoint = cp;
          bufferedOff = endOffset;
        } else {
          // otherwise, it's because we hit term limit.
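          // nothing to replay: the last codepoint read was already appended to the term above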
          bufferedCodePoint = -1;
        }

        int correctedStartOffset = correctOffset(startOffset);
        int correctedEndOffset = correctOffset(endOffset);
        if (correctedStartOffset < 0) {
          failAlways("invalid start offset: " + correctedStartOffset + ", before correction: " + startOffset);
        }
        if (correctedEndOffset < 0) {
          failAlways("invalid end offset: " + correctedEndOffset + ", before correction: " + endOffset);
        }
        if (correctedStartOffset < lastOffset) {
          failAlways("start offset went backwards: " + correctedStartOffset
              + ", before correction: " + startOffset + ", lastOffset: " + lastOffset);
        }
        lastOffset = correctedStartOffset;
        if (correctedEndOffset < correctedStartOffset) {
          failAlways("end offset: " + correctedEndOffset + " is before start offset: " + correctedStartOffset);
        }
        offsetAtt.setOffset(correctedStartOffset, correctedEndOffset);
        if (state == -1 || runAutomaton.isAccept(state)) {
          // either we hit a reject state (longest match), or end-of-text, but in an accept state
          streamState = State.INCREMENT;
          return true;
        }
      }
    }
    streamState = State.INCREMENT_FALSE;
    return false;
  }

  protected int readCodePoint() throws IOException {
    int ch = readChar();
    if (ch < 0) {
      return ch;
    } else {
      if (Character.isLowSurrogate((char) ch)) {
        failAlways("unpaired low surrogate: " + Integer.toHexString(ch));
      }
      off++;
      if (Character.isHighSurrogate((char) ch)) {
        int ch2 = readChar();
        if (ch2 >= 0) {
          off++;
          if (!Character.isLowSurrogate((char) ch2)) {
            failAlways("unpaired high surrogate: " + Integer.toHexString(ch)
                + ", followed by: " + Integer.toHexString(ch2));
          }
          return Character.toCodePoint((char) ch, (char) ch2);
        } else {
          failAlways("stream ends with unpaired high surrogate: " + Integer.toHexString(ch));
        }
      }
      return ch;
    }
  }

  protected int readChar() throws IOException {
    switch (random.nextInt(10)) {
      case 0: {
        // read(char[])
        char c[] = new char[1];
        int ret = input.read(c);
        return ret < 0 ? ret : c[0];
      }
      case 1: {
        // read(char[], int, int)
        char c[] = new char[2];
        int ret = input.read(c, 1, 1);
        return ret < 0 ? ret : c[1];
      }
      case 2: {
        // read(CharBuffer)
        char c[] = new char[1];
        CharBuffer cb = CharBuffer.wrap(c);
        int ret = input.read(cb);
        return ret < 0 ? ret : c[0];
      }
      default:
        // read()
        return input.read();
    }
  }

  protected boolean isTokenChar(int c) {
    if (state < 0) {
      state = 0;
    }
    state = runAutomaton.step(state, c);
    return state >= 0;
  }

  protected int normalize(int c) {
    return lowerCase ? Character.toLowerCase(c) : c;
  }

  @Override
  public void reset() throws IOException {
    try {
      super.reset();
      state = 0;
      lastOffset = off = 0;
      bufferedCodePoint = -1;
      if (streamState == State.RESET) {
        fail("double reset()");
      }
    } finally {
      streamState = State.RESET;
    }
  }

  @Override
  public void close() throws IOException {
    try {
      super.close();
      // in some exceptional cases (e.g. TestIndexWriterExceptions) a test can prematurely close()
      // these tests should disable this check, by default we check the normal workflow.
      // TODO: investigate the CachingTokenFilter "double-close"... for now we ignore this
      if (!(streamState == State.END || streamState == State.CLOSE)) {
        fail("close() called in wrong state: " + streamState);
      }
    } finally {
      streamState = State.CLOSE;
    }
  }

  @Override
  void setReaderTestPoint() {
    try {
      if (streamState != State.CLOSE) {
        fail("setReader() called in wrong state: " + streamState);
      }
    } finally {
      streamState = State.SETREADER;
    }
  }

  @Override
  public void end() throws IOException {
    try {
      super.end();
      int finalOffset = correctOffset(off);
      offsetAtt.setOffset(finalOffset, finalOffset);
      // some tokenizers, such as limiting tokenizers, call end() before incrementToken() returns false.
      // these tests should disable this check (in general you should consume the entire stream)
      if (streamState != State.INCREMENT_FALSE) {
        fail("end() called in wrong state=" + streamState + "!");
      }
    } finally {
      streamState = State.END;
    }
  }

  /**
   * Toggle consumer workflow checking: if your test consumes tokenstreams normally you
   * should leave this enabled.
   */
  public void setEnableChecks(boolean enableChecks) {
    this.enableChecks = enableChecks;
  }
}