package org.apache.lucene.analysis.miscellaneous; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.*; /** * A BreakIterator-like API for iterating over subwords in text, according to WordDelimiterFilter rules. * @lucene.internal */ public final class WordDelimiterIterator { /** Indicates the end of iteration */ public static final int DONE = -1; public static final byte[] DEFAULT_WORD_DELIM_TABLE; char text[]; int length; /** start position of text, excluding leading delimiters */ int startBounds; /** end position of text, excluding trailing delimiters */ int endBounds; /** Beginning of subword */ int current; /** End of subword */ int end; /* does this string end with a possessive such as 's */ private boolean hasFinalPossessive = false; /** * If false, causes case changes to be ignored (subwords will only be generated * given SUBWORD_DELIM tokens). (Defaults to true) */ final boolean splitOnCaseChange; /** * If false, causes numeric changes to be ignored (subwords will only be generated * given SUBWORD_DELIM tokens). (Defaults to true) */ final boolean splitOnNumerics; /** * If true, causes trailing "'s" to be removed for each subword. (Defaults to true) * <p/> * "O'Neil's" => "O", "Neil" */ final boolean stemEnglishPossessive; private final byte[] charTypeTable; /** if true, need to skip over a possessive found in the last call to next() */ private boolean skipPossessive = false; // TODO: should there be a WORD_DELIM category for chars that only separate words (no catenation of subwords will be // done if separated by these chars?) "," would be an obvious candidate... static { byte[] tab = new byte[256]; for (int i = 0; i < 256; i++) { byte code = 0; if (Character.isLowerCase(i)) { code |= LOWER; } else if (Character.isUpperCase(i)) { code |= UPPER; } else if (Character.isDigit(i)) { code |= DIGIT; } if (code == 0) { code = SUBWORD_DELIM; } tab[i] = code; } DEFAULT_WORD_DELIM_TABLE = tab; } /** * Create a new WordDelimiterIterator operating with the supplied rules. * * @param charTypeTable table containing character types * @param splitOnCaseChange if true, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regards) * @param splitOnNumerics if true, causes "j2se" to be three tokens; "j" "2" "se" * @param stemEnglishPossessive if true, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil" */ WordDelimiterIterator(byte[] charTypeTable, boolean splitOnCaseChange, boolean splitOnNumerics, boolean stemEnglishPossessive) { this.charTypeTable = charTypeTable; this.splitOnCaseChange = splitOnCaseChange; this.splitOnNumerics = splitOnNumerics; this.stemEnglishPossessive = stemEnglishPossessive; } /** * Advance to the next subword in the string. * * @return index of the next subword, or {@link #DONE} if all subwords have been returned */ int next() { current = end; if (current == DONE) { return DONE; } if (skipPossessive) { current += 2; skipPossessive = false; } int lastType = 0; while (current < endBounds && (isSubwordDelim(lastType = charType(text[current])))) { current++; } if (current >= endBounds) { return end = DONE; } for (end = current + 1; end < endBounds; end++) { int type = charType(text[end]); if (isBreak(lastType, type)) { break; } lastType = type; } if (end < endBounds - 1 && endsWithPossessive(end + 2)) { skipPossessive = true; } return end; } /** * Return the type of the current subword. * This currently uses the type of the first character in the subword. * * @return type of the current word */ int type() { if (end == DONE) { return 0; } int type = charType(text[current]); switch (type) { // return ALPHA word type for both lower and upper case LOWER: case UPPER: return ALPHA; default: return type; } } /** * Reset the text to a new value, and reset all state * * @param text New text * @param length length of the text */ void setText(char text[], int length) { this.text = text; this.length = this.endBounds = length; current = startBounds = end = 0; skipPossessive = hasFinalPossessive = false; setBounds(); } // ================================================= Helper Methods ================================================ /** * Determines whether the transition from lastType to type indicates a break * * @param lastType Last subword type * @param type Current subword type * @return {@code true} if the transition indicates a break, {@code false} otherwise */ private boolean isBreak(int lastType, int type) { if ((type & lastType) != 0) { return false; } if (!splitOnCaseChange && isAlpha(lastType) && isAlpha(type)) { // ALPHA->ALPHA: always ignore if case isn't considered. return false; } else if (isUpper(lastType) && isAlpha(type)) { // UPPER->letter: Don't split return false; } else if (!splitOnNumerics && ((isAlpha(lastType) && isDigit(type)) || (isDigit(lastType) && isAlpha(type)))) { // ALPHA->NUMERIC, NUMERIC->ALPHA :Don't split return false; } return true; } /** * Determines if the current word contains only one subword. Note, it could be potentially surrounded by delimiters * * @return {@code true} if the current word contains only one subword, {@code false} otherwise */ boolean isSingleWord() { if (hasFinalPossessive) { return current == startBounds && end == endBounds - 2; } else { return current == startBounds && end == endBounds; } } /** * Set the internal word bounds (remove leading and trailing delimiters). Note, if a possessive is found, don't remove * it yet, simply note it. */ private void setBounds() { while (startBounds < length && (isSubwordDelim(charType(text[startBounds])))) { startBounds++; } while (endBounds > startBounds && (isSubwordDelim(charType(text[endBounds - 1])))) { endBounds--; } if (endsWithPossessive(endBounds)) { hasFinalPossessive = true; } current = startBounds; } /** * Determines if the text at the given position indicates an English possessive which should be removed * * @param pos Position in the text to check if it indicates an English possessive * @return {@code true} if the text at the position indicates an English posessive, {@code false} otherwise */ private boolean endsWithPossessive(int pos) { return (stemEnglishPossessive && pos > 2 && text[pos - 2] == '\'' && (text[pos - 1] == 's' || text[pos - 1] == 'S') && isAlpha(charType(text[pos - 3])) && (pos == endBounds || isSubwordDelim(charType(text[pos])))); } /** * Determines the type of the given character * * @param ch Character whose type is to be determined * @return Type of the character */ private int charType(int ch) { if (ch < charTypeTable.length) { return charTypeTable[ch]; } return getType(ch); } /** * Computes the type of the given character * * @param ch Character whose type is to be determined * @return Type of the character */ public static byte getType(int ch) { switch (Character.getType(ch)) { case Character.UPPERCASE_LETTER: return UPPER; case Character.LOWERCASE_LETTER: return LOWER; case Character.TITLECASE_LETTER: case Character.MODIFIER_LETTER: case Character.OTHER_LETTER: case Character.NON_SPACING_MARK: case Character.ENCLOSING_MARK: // depends what it encloses? case Character.COMBINING_SPACING_MARK: return ALPHA; case Character.DECIMAL_DIGIT_NUMBER: case Character.LETTER_NUMBER: case Character.OTHER_NUMBER: return DIGIT; // case Character.SPACE_SEPARATOR: // case Character.LINE_SEPARATOR: // case Character.PARAGRAPH_SEPARATOR: // case Character.CONTROL: // case Character.FORMAT: // case Character.PRIVATE_USE: case Character.SURROGATE: // prevent splitting return ALPHA|DIGIT; // case Character.DASH_PUNCTUATION: // case Character.START_PUNCTUATION: // case Character.END_PUNCTUATION: // case Character.CONNECTOR_PUNCTUATION: // case Character.OTHER_PUNCTUATION: // case Character.MATH_SYMBOL: // case Character.CURRENCY_SYMBOL: // case Character.MODIFIER_SYMBOL: // case Character.OTHER_SYMBOL: // case Character.INITIAL_QUOTE_PUNCTUATION: // case Character.FINAL_QUOTE_PUNCTUATION: default: return SUBWORD_DELIM; } } }