/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.analysis.cjk;

import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.ArrayUtil;

/**
 * Forms bigrams of CJK terms that are generated from StandardTokenizer
 * or ICUTokenizer.
 * <p>
 * CJK types are set by these tokenizers, but you can also use
 * {@link #CJKBigramFilter(TokenStream, int)} to explicitly control which
 * of the CJK scripts are turned into bigrams.
 * <p>
 * By default, when a CJK character has no adjacent characters to form
 * a bigram, it is output in unigram form. If you want to always output
 * both unigrams and bigrams, set the <code>outputUnigrams</code>
 * flag in {@link CJKBigramFilter#CJKBigramFilter(TokenStream, int, boolean)}.
 * This can be used for a combined unigram+bigram approach.
 * <p>
 * In all cases, all non-CJK input is passed thru unmodified.
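 * <p>
 * A minimal usage sketch (illustrative only; {@code tokenizer} stands for a
 * StandardTokenizer or ICUTokenizer created elsewhere):
 * <pre class="prettyprint">
 *   // form bigrams for Han and Hangul only; Hiragana and Katakana pass thru as-is
 *   TokenStream stream = new CJKBigramFilter(tokenizer,
 *       CJKBigramFilter.HAN | CJKBigramFilter.HANGUL);
 * </pre>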
 */
public final class CJKBigramFilter extends TokenFilter {
  // configuration
  /** bigram flag for Han Ideographs */
  public static final int HAN = 1;
  /** bigram flag for Hiragana */
  public static final int HIRAGANA = 2;
  /** bigram flag for Katakana */
  public static final int KATAKANA = 4;
  /** bigram flag for Hangul */
  public static final int HANGUL = 8;

  /** when we emit a bigram, it's then marked as this type */
  public static final String DOUBLE_TYPE = "<DOUBLE>";
  /** when we emit a unigram, it's then marked as this type */
  public static final String SINGLE_TYPE = "<SINGLE>";

  // the types from standardtokenizer
  private static final String HAN_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.IDEOGRAPHIC];
  private static final String HIRAGANA_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HIRAGANA];
  private static final String KATAKANA_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.KATAKANA];
  private static final String HANGUL_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HANGUL];

  // sentinel value for ignoring a script
  private static final Object NO = new Object();

  // these are set to either their type or NO if we want to pass them thru
  private final Object doHan;
  private final Object doHiragana;
  private final Object doKatakana;
  private final Object doHangul;

  // true if we should output unigram tokens always
  private final boolean outputUnigrams;
  private boolean ngramState; // false = output unigram, true = output bigram

  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
  private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
  private final PositionLengthAttribute posLengthAtt = addAttribute(PositionLengthAttribute.class);

  // buffers containing codepoint and offsets in parallel
  int buffer[] = new int[8];
  int startOffset[] = new int[8];
  int endOffset[] = new int[8];

  // length of valid buffer
  int bufferLen;
  // current buffer index
  int index;

  // the last end offset, to determine if we should bigram across tokens
  int lastEndOffset;

  private boolean exhausted;

  /**
   * Calls {@link CJKBigramFilter#CJKBigramFilter(TokenStream, int)
   *       CJKBigramFilter(in, HAN | HIRAGANA | KATAKANA | HANGUL)}
   */
  public CJKBigramFilter(TokenStream in) {
    this(in, HAN | HIRAGANA | KATAKANA | HANGUL);
  }

  /**
   * Calls {@link CJKBigramFilter#CJKBigramFilter(TokenStream, int, boolean)
   *       CJKBigramFilter(in, flags, false)}
   */
  public CJKBigramFilter(TokenStream in, int flags) {
    this(in, flags, false);
  }

  /**
   * Create a new CJKBigramFilter, specifying which writing systems should be bigrammed,
   * and whether or not unigrams should also be output.
   * @param flags OR'ed set from {@link CJKBigramFilter#HAN}, {@link CJKBigramFilter#HIRAGANA},
   *        {@link CJKBigramFilter#KATAKANA}, {@link CJKBigramFilter#HANGUL}
   * @param outputUnigrams true if unigrams for the selected writing systems should also be output.
   *        when this is false, this is only done when there are no adjacent characters to form
   *        a bigram.
   */
  public CJKBigramFilter(TokenStream in, int flags, boolean outputUnigrams) {
    super(in);
    doHan = (flags & HAN) == 0 ? NO : HAN_TYPE;
    doHiragana = (flags & HIRAGANA) == 0 ? NO : HIRAGANA_TYPE;
    doKatakana = (flags & KATAKANA) == 0 ? NO : KATAKANA_TYPE;
    doHangul = (flags & HANGUL) == 0 ? NO : HANGUL_TYPE;
    this.outputUnigrams = outputUnigrams;
  }

  /*
   * much of this complexity revolves around handling the special case of a
   * "lone cjk character" where cjktokenizer would output a unigram. this
   * is also the only time we ever have to captureState.
   */
  @Override
  public boolean incrementToken() throws IOException {
    while (true) {
      if (hasBufferedBigram()) {

        // case 1: we have multiple remaining codepoints buffered,
        // so we can emit a bigram here.

        if (outputUnigrams) {

          // when also outputting unigrams, we output the unigram first,
          // then rewind back to revisit the bigram.
          // so an input of ABC is A + (rewind)AB + B + (rewind)BC + C
          // the logic in hasBufferedUnigram ensures we output the C,
          // even though it did actually have adjacent CJK characters.

          if (ngramState) {
            flushBigram();
          } else {
            flushUnigram();
            index--;
          }
          ngramState = !ngramState;
        } else {
          flushBigram();
        }
        return true;
      } else if (doNext()) {

        // case 2: look at the token type. should we form any n-grams?

        String type = typeAtt.type();
        if (type == doHan || type == doHiragana || type == doKatakana || type == doHangul) {

          // acceptable CJK type: we form n-grams from these.
          // as long as the offsets are aligned, we just add these to our current buffer.
          // otherwise, we clear the buffer and start over.

          if (offsetAtt.startOffset() != lastEndOffset) { // unaligned, clear queue
            if (hasBufferedUnigram()) {

              // we have a buffered unigram, and we peeked ahead to see if we could form
              // a bigram, but we can't, because the offsets are unaligned. capture the state
              // of this peeked data to be revisited next time thru the loop, and dump our unigram.

              loneState = captureState();
              flushUnigram();
              return true;
            }
            index = 0;
            bufferLen = 0;
          }
          refill();
        } else {

          // not a CJK type: we just return these as-is.

          if (hasBufferedUnigram()) {

            // we have a buffered unigram, and we peeked ahead to see if we could form
            // a bigram, but we can't, because it's not a CJK type. capture the state
            // of this peeked data to be revisited next time thru the loop, and dump our unigram.

            loneState = captureState();
            flushUnigram();
            return true;
          }
          return true;
        }
      } else {

        // case 3: we have only zero or 1 codepoints buffered,
        // so not enough to form a bigram. But, we also have no
        // more input. So if we have a buffered codepoint, emit
        // a unigram, otherwise, it's end of stream.

        if (hasBufferedUnigram()) {
          flushUnigram(); // flush our remaining unigram
          return true;
        }
        return false;
      }
    }
  }
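
  /*
   * To make the rewind logic above concrete: for adjacent CJK input ABC,
   * outputUnigrams=true emits A, AB, B, BC, C, where each bigram is a synonym
   * spanning its two unigrams (position increment 0, position length 2);
   * outputUnigrams=false emits just AB, BC.
   */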

  private State loneState; // rarely used: only for "lone cjk characters", where we emit unigrams

  /**
   * looks at next input token, returning false if none is available
   */
  private boolean doNext() throws IOException {
    if (loneState != null) {
      restoreState(loneState);
      loneState = null;
      return true;
    } else {
      if (exhausted) {
        return false;
      } else if (input.incrementToken()) {
        return true;
      } else {
        exhausted = true;
        return false;
      }
    }
  }

  /**
   * refills buffers with new data from the current token.
   */
  private void refill() {
    // compact buffers to keep them smallish if they become large
    // just a safety check, but technically we only need the last codepoint
    if (bufferLen > 64) {
      int last = bufferLen - 1;
      buffer[0] = buffer[last];
      startOffset[0] = startOffset[last];
      endOffset[0] = endOffset[last];
      bufferLen = 1;
      index -= last;
    }

    char termBuffer[] = termAtt.buffer();
    int len = termAtt.length();
    int start = offsetAtt.startOffset();
    int end = offsetAtt.endOffset();

    int newSize = bufferLen + len;
    buffer = ArrayUtil.grow(buffer, newSize);
    startOffset = ArrayUtil.grow(startOffset, newSize);
    endOffset = ArrayUtil.grow(endOffset, newSize);
    lastEndOffset = end;

    if (end - start != len) {
      // crazy offsets (modified by synonym or charfilter): just preserve
      for (int i = 0, cp = 0; i < len; i += Character.charCount(cp)) {
        cp = buffer[bufferLen] = Character.codePointAt(termBuffer, i, len);
        startOffset[bufferLen] = start;
        endOffset[bufferLen] = end;
        bufferLen++;
      }
    } else {
      // normal offsets
      for (int i = 0, cp = 0, cpLen = 0; i < len; i += cpLen) {
        cp = buffer[bufferLen] = Character.codePointAt(termBuffer, i, len);
        cpLen = Character.charCount(cp);
        startOffset[bufferLen] = start;
        start = endOffset[bufferLen] = start + cpLen;
        bufferLen++;
      }
    }
  }

  /**
   * Flushes a bigram token to output from our buffer.
   * This is the normal case, e.g. ABC -> AB BC
   */
  private void flushBigram() {
    clearAttributes();
    char termBuffer[] = termAtt.resizeBuffer(4); // maximum bigram length in code units (2 supplementaries)
    int len1 = Character.toChars(buffer[index], termBuffer, 0);
    int len2 = len1 + Character.toChars(buffer[index+1], termBuffer, len1);
    termAtt.setLength(len2);
    offsetAtt.setOffset(startOffset[index], endOffset[index+1]);
    typeAtt.setType(DOUBLE_TYPE);
    // when outputting unigrams, all bigrams are synonyms that span two unigrams
    if (outputUnigrams) {
      posIncAtt.setPositionIncrement(0);
      posLengthAtt.setPositionLength(2);
    }
    index++;
  }

  /**
   * Flushes a unigram token to output from our buffer.
   * This happens when we encounter isolated CJK characters, either the whole
   * CJK string is a single character, or we encounter a CJK character surrounded
   * by space, punctuation, English, etc, but not beside any other CJK.
   */
  private void flushUnigram() {
    clearAttributes();
    char termBuffer[] = termAtt.resizeBuffer(2); // maximum unigram length (2 surrogates)
    int len = Character.toChars(buffer[index], termBuffer, 0);
    termAtt.setLength(len);
    offsetAtt.setOffset(startOffset[index], endOffset[index]);
    typeAtt.setType(SINGLE_TYPE);
    index++;
  }

  /**
   * True if we have multiple codepoints sitting in our buffer
   */
  private boolean hasBufferedBigram() {
    return bufferLen - index > 1;
  }

  /**
   * True if we have a single codepoint sitting in our buffer, where its future
   * (whether it is emitted as unigram or forms a bigram) depends upon not-yet-seen
   * inputs.
   */
  private boolean hasBufferedUnigram() {
    if (outputUnigrams) {
      // when outputting unigrams always
      return bufferLen - index == 1;
    } else {
      // otherwise it's only when we have a lone CJK character
      return bufferLen == 1 && index == 0;
    }
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    bufferLen = 0;
    index = 0;
    lastEndOffset = 0;
    loneState = null;
    exhausted = false;
    ngramState = false;
  }
}