WordDelimiterFilter.java example

Explorer
mdrill-master
- trunk
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 
package org.apache.solr.analysis;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.RamUsageEstimator;

import java.io.IOException;

/**
 * Splits words into subwords and performs optional transformations on subword groups.
 * Words are split into subwords with the following rules:
 *  - split on intra-word delimiters (by default, all non alpha-numeric characters).
 *     - "Wi-Fi" -> "Wi", "Fi"
 *  - split on case transitions
 *     - "PowerShot" -> "Power", "Shot"
 *  - split on letter-number transitions
 *     - "SD500" -> "SD", "500"
 *  - leading and trailing intra-word delimiters on each subword are ignored
 *     - "//hello---there, 'dude'" -> "hello", "there", "dude"
 *  - trailing "'s" are removed for each subword
 *     - "O'Neil's" -> "O", "Neil"
 *     - Note: this step isn't performed in a separate filter because of possible subword combinations.
 *
 * The <b>combinations</b> parameter affects how subwords are combined:
 *  - combinations="0" causes no subword combinations.
 *     - "PowerShot" -> 0:"Power", 1:"Shot"  (0 and 1 are the token positions)
 *  - combinations="1" means that in addition to the subwords, maximum runs of non-numeric subwords are catenated and produced at the same position of the last subword in the run.
 *     - "PowerShot" -> 0:"Power", 1:"Shot" 1:"PowerShot"
 *     - "A's+B's&C's" -> 0:"A", 1:"B", 2:"C", 2:"ABC"
 *     - "Super-Duper-XL500-42-AutoCoder!" -> 0:"Super", 1:"Duper", 2:"XL", 2:"SuperDuperXL", 3:"500" 4:"42", 5:"Auto", 6:"Coder", 6:"AutoCoder"
 *
 *  One use for WordDelimiterFilter is to help match words with different subword delimiters.
 *  For example, if the source text contained "wi-fi" one may want "wifi" "WiFi" "wi-fi" "wi+fi" queries to all match.
 *  One way of doing so is to specify combinations="1" in the analyzer used for indexing, and combinations="0" (the default)
 *  in the analyzer used for querying.  Given that the current StandardTokenizer immediately removes many intra-word
 *  delimiters, it is recommended that this filter be used after a tokenizer that does not do this (such as WhitespaceTokenizer).
 *
 *  @version $Id: WordDelimiterFilter.java 1166766 2011-09-08 15:52:10Z rmuir $
 */

final class WordDelimiterFilter extends TokenFilter {
  
  // Character-type bits. A word type is a combination of these bits and is
  // tested with isAlpha / isDigit / isSubwordDelim / isUpper.
  public static final int LOWER = 0x01;
  public static final int UPPER = 0x02;
  public static final int DIGIT = 0x04;
  public static final int SUBWORD_DELIM = 0x08;

  // combinations: for testing, not for setting bits
  // ALPHA == LOWER | UPPER, ALPHANUM == LOWER | UPPER | DIGIT
  public static final int ALPHA = 0x03;
  public static final int ALPHANUM = 0x07;

  // Configuration flags. These are OR-ed together into the configurationFlags
  // constructor argument.

  /**
   * Causes parts of words to be generated:
   * <p/>
   * "PowerShot" => "Power" "Shot"
   */
  public static final int GENERATE_WORD_PARTS = 1;

  /**
   * Causes number subwords to be generated:
   * <p/>
   * "500-42" => "500" "42"
   */
  public static final int GENERATE_NUMBER_PARTS = 2;

  /**
   * Causes maximum runs of word parts to be catenated:
   * <p/>
   * "wi-fi" => "wifi"
   */
  public static final int CATENATE_WORDS = 4;

  /**
   * Causes maximum runs of number parts to be catenated:
   * <p/>
   * "500-42" => "50042"
   */
  public static final int CATENATE_NUMBERS = 8;

  /**
   * Causes all subword parts to be catenated:
   * <p/>
   * "wi-fi-4000" => "wifi4000"
   */
  public static final int CATENATE_ALL = 16;

  /**
   * Causes original words to be preserved and added to the subword list (not set by default)
   * <p/>
   * "500-42" => "500" "42" "500-42"
   */
  public static final int PRESERVE_ORIGINAL = 32;

  /**
   * If not set, causes case changes to be ignored (subwords will only be generated
   * given SUBWORD_DELIM tokens)
   */
  public static final int SPLIT_ON_CASE_CHANGE = 64;

  /**
   * If not set, causes numeric changes to be ignored (subwords will only be generated
   * given SUBWORD_DELIM tokens).
   */
  public static final int SPLIT_ON_NUMERICS = 128;

  /**
   * Causes trailing "'s" to be removed for each subword
   * <p/>
   * "O'Neil's" => "O", "Neil"
   */
  public static final int STEM_ENGLISH_POSSESSIVE = 256;
  
  /**
   * If not null, the set of tokens to protect from being delimited; a token found
   * in this set is returned by {@link #incrementToken()} unchanged.
   */
  final CharArraySet protWords;

  // bitwise OR of the configuration flag constants; queried through has(int)
  private final int flags;
    
  // attributes shared with the wrapped stream; read for each input token and
  // rewritten for each generated subword or concatenation
  private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
  private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
  private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class);
  private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class);

  // used for iterating word delimiter breaks
  private final WordDelimiterIterator iterator;

  // used for concatenating runs of similar typed subwords (word,number)
  private final WordDelimiterConcatenation concat = new WordDelimiterConcatenation();
  // number of subwords last output by concat.
  private int lastConcatCount = 0;

  // used for catenate all
  private final WordDelimiterConcatenation concatAll = new WordDelimiterConcatenation();

  // used for accumulating position increment gaps
  private int accumPosInc = 0;

  // snapshot of the input token taken by saveState(): term text, offsets and type.
  // savedBuffer is replaced by a larger array when a term does not fit.
  private char savedBuffer[] = new char[1024];
  private int savedStartOffset;
  private int savedEndOffset;
  private String savedType;
  // true while subwords of the saved input token are still being produced
  private boolean hasSavedState = false;
  // if length by start + end offsets doesn't match the term text then assume
  // this is a synonym and don't adjust the offsets.
  private boolean hasIllegalOffsets = false;

  // for a run of the same subword type within a word, have we output anything?
  private boolean hasOutputToken = false;
  // when preserve original is on, have we output any token following it?
  // this token must have posInc=0!
  private boolean hasOutputFollowingOriginal = false;

  /**
   * Creates a new WordDelimiterFilter
   *
   * @param in TokenStream to be filtered
   * @param charTypeTable table containing character types
   * @param configurationFlags Flags configuring the filter
   * @param protWords If not null is the set of tokens to protect from being delimited
   */
  public WordDelimiterFilter(TokenStream in, byte[] charTypeTable, int configurationFlags, CharArraySet protWords) {
    super(in);
    this.flags = configurationFlags;
    this.protWords = protWords;
    this.iterator = new WordDelimiterIterator(
        charTypeTable, has(SPLIT_ON_CASE_CHANGE), has(SPLIT_ON_NUMERICS), has(STEM_ENGLISH_POSSESSIVE));
  }

  /**
   * Builds a filter that classifies characters with
   * {@link WordDelimiterIterator#DEFAULT_WORD_DELIM_TABLE}.
   *
   * @param in TokenStream to be filtered
   * @param configurationFlags bitwise OR of the flag constants declared on this class
   * @param protWords If not null is the set of tokens to protect from being delimited
   */
  public WordDelimiterFilter(TokenStream in, int configurationFlags, CharArraySet protWords) {
    this(
        in,
        WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE,
        configurationFlags,
        protWords);
  }

  /**
   * Legacy constructor taking one int switch per option. A switch is treated as
   * enabled when it is non-zero.
   *
   * @param in Token stream to be filtered.
   * @param charTypeTable table containing character types
   * @param generateWordParts If non-zero, causes parts of words to be generated: "PowerShot" => "Power" "Shot"
   * @param generateNumberParts If non-zero, causes number subwords to be generated: "500-42" => "500" "42"
   * @param catenateWords If non-zero, causes maximum runs of word parts to be catenated: "wi-fi" => "wifi"
   * @param catenateNumbers If non-zero, causes maximum runs of number parts to be catenated: "500-42" => "50042"
   * @param catenateAll If non-zero, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000"
   * @param splitOnCaseChange If non-zero, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regardless)
   * @param preserveOriginal If non-zero, includes original words in subwords: "500-42" => "500" "42" "500-42"
   * @param splitOnNumerics If non-zero, causes "j2se" to be three tokens; "j" "2" "se"
   * @param stemEnglishPossessive If non-zero, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
   * @param protWords If not null is the set of tokens to protect from being delimited
   * @deprecated Use {@link #WordDelimiterFilter(TokenStream, byte[], int, CharArraySet)}
   */
  @Deprecated
  public WordDelimiterFilter(TokenStream in,
                             byte[] charTypeTable,
                             int generateWordParts,
                             int generateNumberParts,
                             int catenateWords,
                             int catenateNumbers,
                             int catenateAll,
                             int splitOnCaseChange,
                             int preserveOriginal,
                             int splitOnNumerics,
                             int stemEnglishPossessive,
                             CharArraySet protWords) {
    super(in);

    // the three options that the iterator itself needs to know about
    final boolean splitCase = splitOnCaseChange != 0;
    final boolean splitNumerics = splitOnNumerics != 0;
    final boolean stemPossessive = stemEnglishPossessive != 0;

    // translate every int switch into its flag bit
    int bits = 0;
    bits |= (generateWordParts != 0) ? GENERATE_WORD_PARTS : 0;
    bits |= (generateNumberParts != 0) ? GENERATE_NUMBER_PARTS : 0;
    bits |= (catenateWords != 0) ? CATENATE_WORDS : 0;
    bits |= (catenateNumbers != 0) ? CATENATE_NUMBERS : 0;
    bits |= (catenateAll != 0) ? CATENATE_ALL : 0;
    bits |= (preserveOriginal != 0) ? PRESERVE_ORIGINAL : 0;
    bits |= splitCase ? SPLIT_ON_CASE_CHANGE : 0;
    bits |= splitNumerics ? SPLIT_ON_NUMERICS : 0;
    bits |= stemPossessive ? STEM_ENGLISH_POSSESSIVE : 0;

    this.flags = bits;
    this.protWords = protWords;
    this.iterator = new WordDelimiterIterator(charTypeTable, splitCase, splitNumerics, stemPossessive);
  }
  
  /**
   * Compatibility constructor: same as the full int-switch constructor, with
   * English-possessive stemming always enabled.
   *
   * @param in Token stream to be filtered.
   * @param charTypeTable table containing character types
   * @param generateWordParts If non-zero, causes parts of words to be generated
   * @param generateNumberParts If non-zero, causes number subwords to be generated
   * @param catenateWords If non-zero, causes maximum runs of word parts to be catenated
   * @param catenateNumbers If non-zero, causes maximum runs of number parts to be catenated
   * @param catenateAll If non-zero, causes all subword parts to be catenated
   * @param splitOnCaseChange If non-zero, splits on case transitions
   * @param preserveOriginal If non-zero, includes original words in subwords
   * @param splitOnNumerics If non-zero, splits on letter-number transitions
   * @param protWords If not null is the set of tokens to protect from being delimited
   * @deprecated Use
   *             {@link #WordDelimiterFilter(TokenStream, byte[], int, int, int, int, int, int, int, int, int, CharArraySet)}
   *             instead.
   */
  @Deprecated
  public WordDelimiterFilter(TokenStream in,
                             byte[] charTypeTable,
                             int generateWordParts,
                             int generateNumberParts,
                             int catenateWords,
                             int catenateNumbers,
                             int catenateAll,
                             int splitOnCaseChange,
                             int preserveOriginal,
                             int splitOnNumerics,
                             CharArraySet protWords) {
    // Forward splitOnNumerics and protWords instead of hard-coding (1, null): the
    // previous delegation silently discarded both arguments. Only
    // stemEnglishPossessive, which this overload does not expose, is defaulted to 1,
    // matching the equivalent default-table overload.
    this(in, charTypeTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll,
        splitOnCaseChange, preserveOriginal, splitOnNumerics, 1, protWords);
  }

  /**
   * Compatibility constructor without the splitOnNumerics and protWords arguments;
   * it delegates with splitOnNumerics = 1 and protWords = null.
   * 
   * @deprecated Use
   *             {@link #WordDelimiterFilter(TokenStream, byte[], int, int, int, int, int, int, int, int, int, CharArraySet)}
   *             instead.
   */
  @Deprecated
  public WordDelimiterFilter(TokenStream in,
                             byte[] charTypeTable,
                             int generateWordParts,
                             int generateNumberParts,
                             int catenateWords,
                             int catenateNumbers,
                             int catenateAll,
                             int splitOnCaseChange,
                             int preserveOriginal) {
    this(
        in,
        charTypeTable,
        generateWordParts,
        generateNumberParts,
        catenateWords,
        catenateNumbers,
        catenateAll,
        splitOnCaseChange,
        preserveOriginal,
        1,      // splitOnNumerics
        null);  // protWords
  }

  /**
   * Legacy int-switch constructor that classifies characters with
   * {@link WordDelimiterIterator#DEFAULT_WORD_DELIM_TABLE}.
   *
   * @param in Token stream to be filtered.
   * @param generateWordParts If 1, causes parts of words to be generated: "PowerShot", "Power-Shot" => "Power" "Shot"
   * @param generateNumberParts If 1, causes number subwords to be generated: "500-42" => "500" "42"
   * @param catenateWords If 1, causes maximum runs of word parts to be catenated: "wi-fi" => "wifi"
   * @param catenateNumbers If 1, causes maximum runs of number parts to be catenated: "500-42" => "50042"
   * @param catenateAll If 1, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000"
   * @param splitOnCaseChange If 1, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regardless)
   * @param preserveOriginal If 1, includes original words in subwords: "500-42" => "500" "42" "500-42"
   * @param splitOnNumerics If 1, causes "j2se" to be three tokens; "j" "2" "se"
   * @param stemEnglishPossessive If 1, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
   * @param protWords If not null is the set of tokens to protect from being delimited
   * @deprecated Use {@link #WordDelimiterFilter(TokenStream, int, CharArraySet)}
   */
  @Deprecated
  public WordDelimiterFilter(TokenStream in,
                             int generateWordParts,
                             int generateNumberParts,
                             int catenateWords,
                             int catenateNumbers,
                             int catenateAll,
                             int splitOnCaseChange,
                             int preserveOriginal,
                             int splitOnNumerics,
                             int stemEnglishPossessive,
                             CharArraySet protWords) {
    this(
        in,
        WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE,
        generateWordParts,
        generateNumberParts,
        catenateWords,
        catenateNumbers,
        catenateAll,
        splitOnCaseChange,
        preserveOriginal,
        splitOnNumerics,
        stemEnglishPossessive,
        protWords);
  }
  
  /**
   * Compatibility constructor without the stemEnglishPossessive argument; it
   * delegates with stemEnglishPossessive = 1 and the default character table.
   *
   * @deprecated Use
   *             {@link #WordDelimiterFilter(TokenStream, int, int, int, int, int, int, int, int, int, CharArraySet)}
   *             instead.
   */
  @Deprecated
  public WordDelimiterFilter(TokenStream in,
                             int generateWordParts,
                             int generateNumberParts,
                             int catenateWords,
                             int catenateNumbers,
                             int catenateAll,
                             int splitOnCaseChange,
                             int preserveOriginal,
                             int splitOnNumerics,
                             CharArraySet protWords) {
    this(
        in,
        WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE,
        generateWordParts,
        generateNumberParts,
        catenateWords,
        catenateNumbers,
        catenateAll,
        splitOnCaseChange,
        preserveOriginal,
        splitOnNumerics,
        1,  // stemEnglishPossessive
        protWords);
  }

  /**
   * Compatibility constructor: delegates to the seven-switch constructor that takes a
   * character table, supplying {@link WordDelimiterIterator#DEFAULT_WORD_DELIM_TABLE}.
   * 
   * @deprecated Use
   *             {@link #WordDelimiterFilter(TokenStream, int, int, int, int, int, int, int, int, int, CharArraySet)}
   *             instead.
   */
  @Deprecated
  public WordDelimiterFilter(TokenStream in,
                             int generateWordParts,
                             int generateNumberParts,
                             int catenateWords,
                             int catenateNumbers,
                             int catenateAll,
                             int splitOnCaseChange,
                             int preserveOriginal) {
    this(
        in,
        WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE,
        generateWordParts,
        generateNumberParts,
        catenateWords,
        catenateNumbers,
        catenateAll,
        splitOnCaseChange,
        preserveOriginal);
  }
  /**
   * Compatibility constructor taking only the generate/catenate switches; it delegates
   * with splitOnCaseChange = 1, preserveOriginal = 0, splitOnNumerics = 1 and
   * protWords = null.
   * 
   * @deprecated Use
   *             {@link #WordDelimiterFilter(TokenStream, byte[], int, int, int, int, int, int, int, int, int, CharArraySet)}
   *             instead.
   */
  @Deprecated
  public WordDelimiterFilter(TokenStream in,
                             byte[] charTypeTable,
                             int generateWordParts,
                             int generateNumberParts,
                             int catenateWords,
                             int catenateNumbers,
                             int catenateAll) {
    this(
        in,
        charTypeTable,
        generateWordParts,
        generateNumberParts,
        catenateWords,
        catenateNumbers,
        catenateAll,
        1,      // splitOnCaseChange
        0,      // preserveOriginal
        1,      // splitOnNumerics
        null);  // protWords
  }
  /**
   * Compatibility constructor taking only the generate/catenate switches and using
   * {@link WordDelimiterIterator#DEFAULT_WORD_DELIM_TABLE}; it delegates with
   * splitOnCaseChange = 1, preserveOriginal = 0, splitOnNumerics = 1 and
   * protWords = null.
   * 
   * @deprecated Use
   *             {@link #WordDelimiterFilter(TokenStream, int, int, int, int, int, int, int, int, int, CharArraySet)}
   *             instead.
   */
  @Deprecated
  public WordDelimiterFilter(TokenStream in,
                             int generateWordParts,
                             int generateNumberParts,
                             int catenateWords,
                             int catenateNumbers,
                             int catenateAll) {
    this(
        in,
        WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE,
        generateWordParts,
        generateNumberParts,
        catenateWords,
        catenateNumbers,
        catenateAll,
        1,      // splitOnCaseChange
        0,      // preserveOriginal
        1,      // splitOnNumerics
        null);  // protWords
  }
  
  /**
   * Advances to the next output token. Each input token is either passed through
   * unchanged (it has no delimiters or is contained in {@link #protWords}), skipped
   * (it consists only of delimiters and {@link #PRESERVE_ORIGINAL} is not set), or
   * expanded into subword parts and/or concatenations according to the configured
   * flags. State is kept between calls so that a single input token can yield
   * several output tokens.
   *
   * @return {@code true} if a token was produced, {@code false} once the wrapped
   *         stream has no more tokens
   * @throws IOException propagated from the wrapped input stream
   */
  @Override
  public boolean incrementToken() throws IOException {
    while (true) {
      if (!hasSavedState) {
        // process a new input word
        if (!input.incrementToken()) {
          return false;
        }

        int termLength = termAttribute.length();
        char[] termBuffer = termAttribute.buffer();
        
        // carry over increments of input tokens that produced no output
        accumPosInc += posIncAttribute.getPositionIncrement();

        iterator.setText(termBuffer, termLength);
        iterator.next();

        // word of no delimiters, or protected word: just return it
        if ((iterator.current == 0 && iterator.end == termLength) ||
            (protWords != null && protWords.contains(termBuffer, 0, termLength))) {
          posIncAttribute.setPositionIncrement(accumPosInc);
          accumPosInc = 0;
          return true;
        }
        
        // word of simply delimiters
        if (iterator.end == WordDelimiterIterator.DONE && !has(PRESERVE_ORIGINAL)) {
          // if the posInc is 1, simply ignore it in the accumulation
          if (posIncAttribute.getPositionIncrement() == 1) {
            accumPosInc--;
          }
          continue;
        }

        // the token will be split: snapshot its text, offsets and type
        saveState();

        hasOutputToken = false;
        hasOutputFollowingOriginal = !has(PRESERVE_ORIGINAL);
        lastConcatCount = 0;
        
        // emit the untouched original first; the attributes still hold the input token
        if (has(PRESERVE_ORIGINAL)) {
          posIncAttribute.setPositionIncrement(accumPosInc);
          accumPosInc = 0;
          return true;
        }
      }
      
      // at the end of the string, output any concatenations
      if (iterator.end == WordDelimiterIterator.DONE) {
        if (!concat.isEmpty()) {
          if (flushConcatenation(concat)) {
            return true;
          }
        }
        
        if (!concatAll.isEmpty()) {
          // only if we haven't output this same combo above!
          if (concatAll.subwordCount > lastConcatCount) {
            concatAll.writeAndClear();
            return true;
          }
          concatAll.clear();
        }
        
        // no saved concatenations, on to the next input word
        hasSavedState = false;
        continue;
      }
      
      // word surrounded by delimiters: always output
      if (iterator.isSingleWord()) {
        generatePart(true);
        iterator.next();
        return true;
      }
      
      int wordType = iterator.type();
      
      // do we already have queued up incompatible concatenations?
      if (!concat.isEmpty() && (concat.type & wordType) == 0) {
        // the iterator is NOT advanced here: when the flush emits a token, the
        // current subword is handled on the next call
        if (flushConcatenation(concat)) {
          hasOutputToken = false;
          return true;
        }
        hasOutputToken = false;
      }
      
      // add subwords depending upon options
      if (shouldConcatenate(wordType)) {
        if (concat.isEmpty()) {
          concat.type = wordType;
        }
        concatenate(concat);
      }
      
      // add all subwords (catenateAll)
      if (has(CATENATE_ALL)) {
        concatenate(concatAll);
      }
      
      // if we should output the word or number part
      if (shouldGenerateParts(wordType)) {
        generatePart(false);
        iterator.next();
        return true;
      }
        
      // nothing emitted for this subword; move on to the next one
      iterator.next();
    }
  }

  /**
   * Resets the wrapped stream and discards any partially processed word: pending
   * concatenations, the saved-state marker and the accumulated position increment.
   */
  @Override
  public void reset() throws IOException {
    super.reset();
    accumPosInc = 0;
    hasSavedState = false;
    concatAll.clear();
    concat.clear();
  }

  // ================================================= Helper Methods ================================================

  /**
   * Snapshots the current input token (offsets, type and term text) so that subwords
   * can be generated from it after the attributes have been overwritten, and points
   * the iterator at the saved copy of the text.
   */
  private void saveState() {
    final int termLength = termAttribute.length();

    savedStartOffset = offsetAttribute.startOffset();
    savedEndOffset = offsetAttribute.endOffset();
    savedType = typeAttribute.type();
    // an offset span that differs from the term length is taken to mean a synonym;
    // its offsets are then not adjusted per subword
    hasIllegalOffsets = (savedEndOffset - savedStartOffset) != termLength;

    // grow the private buffer when the term does not fit
    if (termLength > savedBuffer.length) {
      savedBuffer = new char[ArrayUtil.oversize(termLength, RamUsageEstimator.NUM_BYTES_CHAR)];
    }
    System.arraycopy(termAttribute.buffer(), 0, savedBuffer, 0, termLength);

    iterator.text = savedBuffer;
    hasSavedState = true;
  }

  /**
   * Empties the given concatenation, emitting it as a token first unless it consists of
   * exactly one subword whose type is also generated as a part. Records the number
   * of subwords it held in {@code lastConcatCount}.
   *
   * @param concatenation WordDelimiterConcatenation that will be flushed
   * @return {@code true} if the concatenation was written before it was cleared, {@code false} otherwise
   */
  private boolean flushConcatenation(WordDelimiterConcatenation concatenation) {
    final int subwords = concatenation.subwordCount;
    lastConcatCount = subwords;

    final boolean duplicatesPart = subwords == 1 && shouldGenerateParts(concatenation.type);
    if (duplicatesPart) {
      concatenation.clear();
      return false;
    }

    concatenation.writeAndClear();
    return true;
  }

  /**
   * Tells whether a subword of the given type takes part in a word or number
   * concatenation under the configured flags.
   *
   * @param wordType Type of the current word used to determine if it should be concatenated
   * @return {@code true} if concatenation should occur, {@code false} otherwise
   */
  private boolean shouldConcatenate(int wordType) {
    if (isAlpha(wordType) && has(CATENATE_WORDS)) {
      return true;
    }
    return isDigit(wordType) && has(CATENATE_NUMBERS);
  }

  /**
   * Tells whether a subword of the given type is emitted as its own token under the
   * configured flags.
   *
   * @param wordType Type of the word used to determine if a word/number part should be generated
   * @return {@code true} if a word/number part should be generated, {@code false} otherwise
   */
  private boolean shouldGenerateParts(int wordType) {
    if (isAlpha(wordType) && has(GENERATE_WORD_PARTS)) {
      return true;
    }
    return isDigit(wordType) && has(GENERATE_NUMBER_PARTS);
  }

  /**
   * Appends the iterator's current subword (taken from the saved buffer) to the given
   * concatenation and extends the concatenation's offsets to cover it.
   *
   * @param concatenation WordDelimiterConcatenation to concatenate the buffer to
   */
  private void concatenate(WordDelimiterConcatenation concatenation) {
    final int subwordStart = iterator.current;
    final int subwordEnd = iterator.end;

    // the first subword fixes where the concatenation starts
    if (concatenation.isEmpty()) {
      concatenation.startOffset = savedStartOffset + subwordStart;
    }
    concatenation.append(savedBuffer, subwordStart, subwordEnd - subwordStart);
    concatenation.endOffset = savedStartOffset + subwordEnd;
  }

  /**
   * Emits the iterator's current subword as a token: clears the attributes, then sets
   * term text, offsets, position increment and type.
   *
   * @param isSingleWord {@code true} if the generation is occurring from a single word, {@code false} otherwise
   */
  private void generatePart(boolean isSingleWord) {
    clearAttributes();

    final int partStart = iterator.current;
    final int partEnd = iterator.end;
    termAttribute.copyBuffer(savedBuffer, partStart, partEnd - partStart);

    // with illegal offsets the saved offsets are reused as-is; only a single word
    // still gets its start offset shifted
    final int startOffSet;
    if (hasIllegalOffsets && !isSingleWord) {
      startOffSet = savedStartOffset;
    } else {
      startOffSet = savedStartOffset + partStart;
    }

    final int endOffSet;
    if (hasIllegalOffsets) {
      endOffSet = savedEndOffset;
    } else {
      endOffSet = savedStartOffset + partEnd;
    }

    offsetAttribute.setOffset(startOffSet, endOffSet);
    posIncAttribute.setPositionIncrement(position(false));
    typeAttribute.setType(savedType);
  }

  /**
   * Computes the position increment for the next subword or concatenation and updates
   * the bookkeeping flags and the accumulated increment accordingly.
   *
   * @param inject true if this token wants to be injected
   * @return position increment gap
   */
  private int position(boolean inject) {
    final int pending = accumPosInc;

    if (!hasOutputToken) {
      // first token of the current run
      hasOutputToken = true;

      if (hasOutputFollowingOriginal) {
        // clear the accumulated position increment
        accumPosInc = 0;
        return Math.max(1, pending);
      }

      // the first token following the original is 0 regardless;
      // the accumulated increment is left untouched here
      hasOutputFollowingOriginal = true;
      return 0;
    }

    // something was already output for this run
    accumPosInc = 0;
    if (inject) {
      return 0;
    }
    return Math.max(1, pending);
  }

  /**
   * Tests whether the given word type has any of the {@link #ALPHA} bits set.
   *
   * @param type Word type to check
   * @return {@code true} if the type contains ALPHA, {@code false} otherwise
   */
  static boolean isAlpha(int type) {
    final int alphaBits = type & ALPHA;
    return alphaBits != 0;
  }

  /**
   * Tests whether the given word type has the {@link #DIGIT} bit set.
   *
   * @param type Word type to check
   * @return {@code true} if the type contains DIGIT, {@code false} otherwise
   */
  static boolean isDigit(int type) {
    final int digitBits = type & DIGIT;
    return digitBits != 0;
  }

  /**
   * Tests whether the given word type has the {@link #SUBWORD_DELIM} bit set.
   *
   * @param type Word type to check
   * @return {@code true} if the type contains SUBWORD_DELIM, {@code false} otherwise
   */
  static boolean isSubwordDelim(int type) {
    final int delimBits = type & SUBWORD_DELIM;
    return delimBits != 0;
  }

  /**
   * Tests whether the given word type has the {@link #UPPER} bit set.
   *
   * @param type Word type to check
   * @return {@code true} if the type contains UPPER, {@code false} otherwise
   */
  static boolean isUpper(int type) {
    final int upperBits = type & UPPER;
    return upperBits != 0;
  }

  /**
   * Tests the configuration flags of this filter against the given flag.
   *
   * @param flag Flag to see if set
   * @return {@code true} if flag is set, {@code false} otherwise
   */
  private boolean has(int flag) {
    final int masked = flags & flag;
    return masked != 0;
  }

  // ================================================= Inner Classes =================================================

  /**
   * A WDF concatenated 'run': accumulates the text of consecutive subwords together
   * with the offsets, the type and the number of subwords collected so far, and can
   * emit the result through the attributes of the enclosing filter.
   */
  final class WordDelimiterConcatenation {
    final StringBuilder buffer = new StringBuilder();
    int startOffset;
    int endOffset;
    int type;
    int subwordCount;

    /**
     * Appends one subword to the concatenation and counts it.
     *
     * @param text Array holding the characters to append
     * @param offset Index in {@code text} of the first character to append
     * @param length Number of characters to append
     */
    void append(char text[], int offset, int length) {
      buffer.append(text, offset, length);
      subwordCount++;
    }

    /**
     * Emits the concatenation as the current token: clears the attributes, then sets
     * term text, offsets, position increment and type.
     */
    void write() {
      clearAttributes();

      final int length = buffer.length();
      if (termAttribute.length() < length) {
        termAttribute.resizeBuffer(length);
      }
      // fetch the buffer only after the resize, which may have replaced the array
      buffer.getChars(0, length, termAttribute.buffer(), 0);
      termAttribute.setLength(length);

      // with illegal offsets the offsets of the saved input token are reused
      final int start = hasIllegalOffsets ? savedStartOffset : startOffset;
      final int end = hasIllegalOffsets ? savedEndOffset : endOffset;
      offsetAttribute.setOffset(start, end);

      posIncAttribute.setPositionIncrement(position(true));
      typeAttribute.setType(savedType);
      accumPosInc = 0;
    }

    /**
     * Tells whether any text has been appended since the last clear.
     *
     * @return {@code true} if the concatenation is empty, {@code false} otherwise
     */
    boolean isEmpty() {
      return buffer.length() == 0;
    }

    /**
     * Drops the collected text and zeroes offsets, type and subword count.
     */
    void clear() {
      buffer.setLength(0);
      subwordCount = 0;
      type = 0;
      endOffset = 0;
      startOffset = 0;
    }

    /**
     * Convenience method for the common scenario of writing the concatenation and
     * then clearing its state.
     */
    void writeAndClear() {
      write();
      clear();
    }
  }
  // questions:
  // negative numbers?  -42 indexed as just 42?
  // dollar sign?  $42
  // percent sign?  33%
  // downsides:  if source text is "powershot" then a query of "PowerShot" won't match!
}