WordDelimiterIterator.java example

Explorer
solrcene-master
package org.apache.lucene.analysis.miscellaneous;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.*;

/**
 * A BreakIterator-like API for iterating over subwords in text, according to WordDelimiterFilter rules.
 * @lucene.internal
 */
public final class WordDelimiterIterator {

  /** Indicates the end of iteration */
  public static final int DONE = -1;
  
  public static final byte[] DEFAULT_WORD_DELIM_TABLE;

  char text[];
  int length;
  
  /** start position of text, excluding leading delimiters */
  int startBounds;
  /** end position of text, excluding trailing delimiters */
  int endBounds;
  
  /** Beginning of subword */
  int current;
  /** End of subword */
  int end;
  
  /* does this string end with a possessive such as 's */
  private boolean hasFinalPossessive = false;
  
  /**
   * If false, causes case changes to be ignored (subwords will only be generated
   * given SUBWORD_DELIM tokens). (Defaults to true)
   */
  final boolean splitOnCaseChange;
  
  /**
   * If false, causes numeric changes to be ignored (subwords will only be generated
   * given SUBWORD_DELIM tokens). (Defaults to true)
   */
  final boolean splitOnNumerics;

  /**
   * If true, causes trailing "'s" to be removed for each subword. (Defaults to true)
   * <p/>
   * "O'Neil's" => "O", "Neil"
   */
  final boolean stemEnglishPossessive;
  
  private final byte[] charTypeTable;
  
  /** if true, need to skip over a possessive found in the last call to next() */
  private boolean skipPossessive = false;

  // TODO: should there be a WORD_DELIM category for chars that only separate words (no catenation of subwords will be
  // done if separated by these chars?) "," would be an obvious candidate...
  static {
    byte[] tab = new byte[256];
    for (int i = 0; i < 256; i++) {
      byte code = 0;
      if (Character.isLowerCase(i)) {
        code |= LOWER;
      }
      else if (Character.isUpperCase(i)) {
        code |= UPPER;
      }
      else if (Character.isDigit(i)) {
        code |= DIGIT;
      }
      if (code == 0) {
        code = SUBWORD_DELIM;
      }
      tab[i] = code;
    }
    DEFAULT_WORD_DELIM_TABLE = tab;
  }

  /**
   * Create a new WordDelimiterIterator operating with the supplied rules.
   * 
   * @param charTypeTable table containing character types
   * @param splitOnCaseChange if true, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regards)
   * @param splitOnNumerics if true, causes "j2se" to be three tokens; "j" "2" "se"
   * @param stemEnglishPossessive if true, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
   */
  WordDelimiterIterator(byte[] charTypeTable, boolean splitOnCaseChange, boolean splitOnNumerics, boolean stemEnglishPossessive) {
    this.charTypeTable = charTypeTable;
    this.splitOnCaseChange = splitOnCaseChange;
    this.splitOnNumerics = splitOnNumerics;
    this.stemEnglishPossessive = stemEnglishPossessive;
  }
  
  /**
   * Advance to the next subword in the string.
   *
   * @return index of the next subword, or {@link #DONE} if all subwords have been returned
   */
  int next() {
    current = end;
    if (current == DONE) {
      return DONE;
    }
    
    if (skipPossessive) {
      current += 2;
      skipPossessive = false;
    }

    int lastType = 0;
    
    while (current < endBounds && (isSubwordDelim(lastType = charType(text[current])))) {
      current++;
    }

    if (current >= endBounds) {
      return end = DONE;
    }
    
    for (end = current + 1; end < endBounds; end++) {
      int type = charType(text[end]);
      if (isBreak(lastType, type)) {
        break;
      }
      lastType = type;
    }
    
    if (end < endBounds - 1 && endsWithPossessive(end + 2)) {
      skipPossessive = true;
    }
    
    return end;
  }


  /**
   * Return the type of the current subword.
   * This currently uses the type of the first character in the subword.
   *
   * @return type of the current word
   */
  int type() {
    if (end == DONE) {
      return 0;
    }
    
    int type = charType(text[current]);
    switch (type) {
      // return ALPHA word type for both lower and upper
      case LOWER:
      case UPPER:
        return ALPHA;
      default:
        return type;
    }
  }

  /**
   * Reset the text to a new value, and reset all state
   *
   * @param text New text
   * @param length length of the text
   */
  void setText(char text[], int length) {
    this.text = text;
    this.length = this.endBounds = length;
    current = startBounds = end = 0;
    skipPossessive = hasFinalPossessive = false;
    setBounds();
  }

  // ================================================= Helper Methods ================================================

  /**
   * Determines whether the transition from lastType to type indicates a break
   *
   * @param lastType Last subword type
   * @param type Current subword type
   * @return {@code true} if the transition indicates a break, {@code false} otherwise
   */
  private boolean isBreak(int lastType, int type) {
    if ((type & lastType) != 0) {
      return false;
    }
    
    if (!splitOnCaseChange && isAlpha(lastType) && isAlpha(type)) {
      // ALPHA->ALPHA: always ignore if case isn't considered.
      return false;
    } else if (isUpper(lastType) && isAlpha(type)) {
      // UPPER->letter: Don't split
      return false;
    } else if (!splitOnNumerics && ((isAlpha(lastType) && isDigit(type)) || (isDigit(lastType) && isAlpha(type)))) {
      // ALPHA->NUMERIC, NUMERIC->ALPHA :Don't split
      return false;
    }

    return true;
  }
  
  /**
   * Determines if the current word contains only one subword.  Note, it could be potentially surrounded by delimiters
   *
   * @return {@code true} if the current word contains only one subword, {@code false} otherwise
   */
  boolean isSingleWord() {
    if (hasFinalPossessive) {
      return current == startBounds && end == endBounds - 2;
    }
    else {
      return current == startBounds && end == endBounds;
    }
  }
   
  /**
   * Set the internal word bounds (remove leading and trailing delimiters). Note, if a possessive is found, don't remove
   * it yet, simply note it.
   */
  private void setBounds() {
    while (startBounds < length && (isSubwordDelim(charType(text[startBounds])))) {
      startBounds++;
    }
    
    while (endBounds > startBounds && (isSubwordDelim(charType(text[endBounds - 1])))) {
      endBounds--;
    }
    if (endsWithPossessive(endBounds)) {
      hasFinalPossessive = true;
    }
    current = startBounds;
  }
  
  /**
   * Determines if the text at the given position indicates an English possessive which should be removed
   *
   * @param pos Position in the text to check if it indicates an English possessive
   * @return {@code true} if the text at the position indicates an English posessive, {@code false} otherwise
   */
  private boolean endsWithPossessive(int pos) {
    return (stemEnglishPossessive &&
            pos > 2 &&
            text[pos - 2] == '\'' &&
            (text[pos - 1] == 's' || text[pos - 1] == 'S') &&
            isAlpha(charType(text[pos - 3])) &&
            (pos == endBounds || isSubwordDelim(charType(text[pos]))));
  }

  /**
   * Determines the type of the given character
   *
   * @param ch Character whose type is to be determined
   * @return Type of the character
   */
  private int charType(int ch) {
    if (ch < charTypeTable.length) {
      return charTypeTable[ch];
    }
    return getType(ch);
  }
  
  /**
   * Computes the type of the given character
   *
   * @param ch Character whose type is to be determined
   * @return Type of the character
   */
  public static byte getType(int ch) {
    switch (Character.getType(ch)) {
      case Character.UPPERCASE_LETTER: return UPPER;
      case Character.LOWERCASE_LETTER: return LOWER;

      case Character.TITLECASE_LETTER:
      case Character.MODIFIER_LETTER:
      case Character.OTHER_LETTER:
      case Character.NON_SPACING_MARK:
      case Character.ENCLOSING_MARK:  // depends what it encloses?
      case Character.COMBINING_SPACING_MARK:
        return ALPHA; 

      case Character.DECIMAL_DIGIT_NUMBER:
      case Character.LETTER_NUMBER:
      case Character.OTHER_NUMBER:
        return DIGIT;

      // case Character.SPACE_SEPARATOR:
      // case Character.LINE_SEPARATOR:
      // case Character.PARAGRAPH_SEPARATOR:
      // case Character.CONTROL:
      // case Character.FORMAT:
      // case Character.PRIVATE_USE:

      case Character.SURROGATE:  // prevent splitting
        return ALPHA|DIGIT;  

      // case Character.DASH_PUNCTUATION:
      // case Character.START_PUNCTUATION:
      // case Character.END_PUNCTUATION:
      // case Character.CONNECTOR_PUNCTUATION:
      // case Character.OTHER_PUNCTUATION:
      // case Character.MATH_SYMBOL:
      // case Character.CURRENCY_SYMBOL:
      // case Character.MODIFIER_SYMBOL:
      // case Character.OTHER_SYMBOL:
      // case Character.INITIAL_QUOTE_PUNCTUATION:
      // case Character.FINAL_QUOTE_PUNCTUATION:

      default: return SUBWORD_DELIM;
    }
  }
}