JapaneseNumberFilter.java example

Explorer
lucene-solr-master
- lucene
- solr
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.ja;


import java.io.IOException;
import java.math.BigDecimal;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;

/**
 * A {@link TokenFilter} that normalizes Japanese numbers (kansūji) to regular Arabic
 * decimal numbers in half-width characters.
 * <p>
 * Japanese numbers are often written using a combination of kanji and Arabic numbers with
 * various kinds of punctuation. For example, ３．２千 means 3200. This filter does this kind
 * of normalization and allows a search for 3200 to match ３．２千 in text, but can also be
 * used to make range facets based on the normalized numbers and so on.
 * <p>
 * Notice that this analyzer uses a token composition scheme and relies on punctuation
 * tokens being found in the token stream. Please make sure your {@link JapaneseTokenizer}
 * has {@code discardPunctuation} set to false. In case punctuation characters, such as ．
 * (U+FF0E FULLWIDTH FULL STOP), are removed from the token stream, this filter would find
 * input tokens ３ and ２千 and give outputs 3 and 2000 instead of 3200, which is
 * likely not the intended result. If you want to remove punctuation characters from your
 * index that are not part of normalized numbers, add a
 * {@link org.apache.lucene.analysis.StopFilter} with the punctuation you wish to
 * remove after {@link JapaneseNumberFilter} in your analyzer chain.
 * <p>
 * Below are some examples of normalizations this filter supports. The input is untokenized
 * text and the result is the single term attribute emitted for the input.
 * <ul>
 * <li>〇〇七 becomes 7</li>
 * <li>一〇〇〇 becomes 1000</li>
 * <li>三千2百２十三 becomes 3223</li>
 * <li>兆六百万五千一 becomes 1000006005001</li>
 * <li>３．２千 becomes 3200</li>
 * <li>１．２万３４５．６７ becomes 12345.67</li>
 * <li>4,647.100 becomes 4647.1</li>
 * <li>15,7 becomes 157 (be aware of this weakness)</li>
 * </ul>
 * <p>
 * Tokens preceded by a token with {@link PositionIncrementAttribute} of zero are left
 * untouched and emitted as-is.
 * <p>
 * This filter does not use any part-of-speech information for its normalization and
 * the motivation for this is to also support n-grammed token streams in the future.
 * <p>
 * This filter may in some cases normalize tokens that are not numbers in their context.
 * For example, 田中京一 is a name and means Tanaka Kyōichi, but 京一 (Kyōichi) out of
 * context can strictly speaking also represent the number 10000000000000001. This filter
 * respects the {@link KeywordAttribute}, which can be used to prevent specific
 * normalizations from happening.
 * <p>
 * Also notice that token attributes such as
 * {@link org.apache.lucene.analysis.ja.tokenattributes.PartOfSpeechAttribute},
 * {@link org.apache.lucene.analysis.ja.tokenattributes.ReadingAttribute},
 * {@link org.apache.lucene.analysis.ja.tokenattributes.InflectionAttribute} and
 * {@link org.apache.lucene.analysis.ja.tokenattributes.BaseFormAttribute} are left
 * unchanged and will inherit the values of the last token used to compose the normalized
 * number and can be wrong. Hence, for １０万 (100000), we will have
 * {@link org.apache.lucene.analysis.ja.tokenattributes.ReadingAttribute}
 * set to マン. This is a known issue and is subject to a future improvement.
 * <p>
 * Japanese formal numbers (daiji), accounting numbers and decimal fractions are currently
 * not supported.
 */
public class JapaneseNumberFilter extends TokenFilter {

  // Attributes of the token currently exposed by this filter
  private final CharTermAttribute termAttr = addAttribute(CharTermAttribute.class);
  private final OffsetAttribute offsetAttr = addAttribute(OffsetAttribute.class);
  private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
  private final PositionIncrementAttribute posIncrAttr = addAttribute(PositionIncrementAttribute.class);
  private final PositionLengthAttribute posLengthAttr = addAttribute(PositionLengthAttribute.class);

  /** Marker stored in {@link #numerals} for characters that are not a 0-9 kanji numeral. */
  private static final char NO_NUMERAL = Character.MAX_VALUE;

  /** Maps a UTF-16 code unit to its kanji digit value (0 to 9), or to {@link #NO_NUMERAL}. */
  private static final char[] numerals = new char[0x10000];

  /** Maps a UTF-16 code unit to the power of ten its kanji stands for, or to 0 if it has none. */
  private static final char[] exponents = new char[0x10000];

  /** Token that was read past while composing a number; emitted by the next incrementToken() call. */
  private State state;

  /** Accumulates the terms of the input tokens that make up the number being composed. */
  private StringBuilder numeral;

  /** Number of upcoming input tokens that are emitted unchanged. */
  private int fallThroughTokens;

  /** Set once the wrapped input has reported that it has no more tokens. */
  private boolean exhausted = false;

  static {
    // Every character is "not a numeral" unless listed below
    java.util.Arrays.fill(numerals, NO_NUMERAL);
    numerals['〇'] = 0; // 〇 U+3007 0
    numerals['一'] = 1; // 一 U+4E00 1
    numerals['二'] = 2; // 二 U+4E8C 2
    numerals['三'] = 3; // 三 U+4E09 3
    numerals['四'] = 4; // 四 U+56DB 4
    numerals['五'] = 5; // 五 U+4E94 5
    numerals['六'] = 6; // 六 U+516D 6
    numerals['七'] = 7; // 七 U+4E03 7
    numerals['八'] = 8; // 八 U+516B 8
    numerals['九'] = 9; // 九 U+4E5D 9

    // No explicit fill is needed for exponents: a freshly allocated char[] is all zeros,
    // and zero is the "no exponent" value the lookups test for
    exponents['十'] = 1;  // 十 U+5341 10
    exponents['百'] = 2;  // 百 U+767E 100
    exponents['千'] = 3;  // 千 U+5343 1,000
    exponents['万'] = 4;  // 万 U+4E07 10,000
    exponents['億'] = 8;  // 億 U+5104 100,000,000
    exponents['兆'] = 12; // 兆 U+5146 1,000,000,000,000
    exponents['京'] = 16; // 京 U+4EAC 10,000,000,000,000,000
    exponents['垓'] = 20; // 垓 U+5793 100,000,000,000,000,000,000
  }

  /**
   * Creates a number filter on top of the given token stream.
   *
   * @param input token stream whose tokens are read and, where they are numerals, normalized
   */
  public JapaneseNumberFilter(TokenStream input) {
    super(input);
  }

  /**
   * Advances to the next token, composing runs of numeral tokens into one normalized number.
   * <p>
   * Keyword tokens, tokens with a position increment of zero and the tokens counted by
   * {@code fallThroughTokens} are passed through unchanged. Otherwise, if the current term is
   * a numeral, input tokens are consumed for as long as their terms are numerals or numeral
   * punctuation. The consumed terms are concatenated and a single token is emitted whose term
   * is the result of {@link #normalizeNumber(String)} and whose offsets run from the start of
   * the first composed token to the end of the last one. The first non-numeral token read
   * while composing is captured and emitted by the following call.
   *
   * @return true if a token is available, false once the input is exhausted
   * @throws IOException if the wrapped input stream throws it
   */
  @Override
  public final boolean incrementToken() throws IOException {

    // Emit previously captured token we read past earlier
    if (state != null) {
      restoreState(state);
      state = null;
      return true;
    }

    // The input already reported its end during an earlier composition; do not advance it again
    if (exhausted) {
      return false;
    }
    
    if (!input.incrementToken()) {
      exhausted = true;
      return false;
    }

    // Keyword tokens are never normalized
    if (keywordAttr.isKeyword()) {
      return true;
    }

    // Still emitting the tokens that sit "under" a stacked token seen earlier
    if (fallThroughTokens > 0) {
      fallThroughTokens--;
      return true;
    }

    // Stacked token: emit it as-is and let the tokens it spans (minus one) fall through too
    if (posIncrAttr.getPositionIncrement() == 0) {
      fallThroughTokens = posLengthAttr.getPositionLength() - 1;
      return true;
    }

    boolean moreTokens = true;
    boolean composedNumberToken = false;
    int startOffset = 0;
    int endOffset = 0;
    // Snapshot of the first token, restored if composition has to be abandoned
    State preCompositionState = captureState();
    String term = termAttr.toString();
    // Only a numeral may start a number; punctuation is accepted for following tokens only
    boolean numeralTerm = isNumeral(term);
    
    while (moreTokens && numeralTerm) {

      if (!composedNumberToken) {
        startOffset = offsetAttr.startOffset();
        composedNumberToken = true;
      }

      // Record the end of the token being consumed before advancing past it
      endOffset = offsetAttr.endOffset();
      moreTokens = input.incrementToken();
      if (moreTokens == false) {
        exhausted = true;
      }

      if (posIncrAttr.getPositionIncrement() == 0) {
        // This token is a stacked/synonym token, capture number of tokens "under" this token,
        // except the first token, which we will emit below after restoring state
        // NOTE(review): numeral is not cleared on this path; if this is not the first
        // iteration, terms appended earlier appear to carry over into the next composition
        // and the tokens consumed after the first one are not emitted - confirm
        fallThroughTokens = posLengthAttr.getPositionLength() - 1;
        state = captureState();
        restoreState(preCompositionState);
        return moreTokens;
      }

      // term still holds the text of the token consumed above, not of the one just read
      numeral.append(term);

      if (moreTokens) {
        term = termAttr.toString();
        numeralTerm = isNumeral(term) || isNumeralPunctuation(term);
      }
    }

    if (composedNumberToken) {
      if (moreTokens) {
        // We have read past all numerals and there are still tokens left, so
        // capture the state of this token and emit it on our next incrementToken()
        state = captureState();
      }

      String normalizedNumber = normalizeNumber(numeral.toString());

      // Only term text and offsets are rewritten; all other attributes keep the values of
      // the last token read from the input
      termAttr.setEmpty();
      termAttr.append(normalizedNumber);
      offsetAttr.setOffset(startOffset, endOffset);

      numeral = new StringBuilder();
      return true;
    }
    // Not a numeral: the current token is passed through unchanged
    return moreTokens;
  }

  /**
   * Resets the wrapped stream and discards all state kept from a previous run: the pending
   * captured token, the exhausted flag, the fall-through counter and the number being composed.
   *
   * @throws IOException if resetting the wrapped stream fails
   */
  @Override
  public void reset() throws IOException {
    super.reset();
    state = null;
    exhausted = false;
    fallThroughTokens = 0;
    numeral = new StringBuilder();
  }

  /**
   * Normalizes a Japanese number to a plain decimal string without trailing zeros.
   *
   * @param number number to normalize
   * @return normalized number, or the unchanged input if it cannot be parsed (no op)
   */
  public String normalizeNumber(String number) {
    try {
      BigDecimal parsed = parseNumber(new NumberBuffer(number));
      // A null result means nothing could be parsed; hand the input back untouched
      return parsed == null ? number : parsed.stripTrailingZeros().toPlainString();
    } catch (NumberFormatException | ArithmeticException e) {
      // Malformed input, so the source number is returned as-is
      return number;
    }
  }

  /**
   * Parses a Japanese number by summing all consecutive large pairs found in the buffer.
   *
   * @param buffer buffer to parse
   * @return sum of the parsed pairs, or null if not even one pair could be parsed
   */
  private BigDecimal parseNumber(NumberBuffer buffer) {
    BigDecimal pair = parseLargePair(buffer);
    if (pair == null) {
      return null;
    }

    BigDecimal total = BigDecimal.ZERO;
    do {
      total = total.add(pair);
      pair = parseLargePair(buffer);
    } while (pair != null);

    return total;
  }

  /**
   * Parses a pair consisting of a medium number followed by a large kanji factor,
   * i.e. a factor of 10,000（万）or larger. Either part may be missing.
   *
   * @param buffer buffer to parse
   * @return the product of both parts, the single part that is present, or null if neither is
   */
  private BigDecimal parseLargePair(NumberBuffer buffer) {
    // Both parts are always attempted, in this order, since each one advances the buffer
    BigDecimal mantissa = parseMediumNumber(buffer);
    BigDecimal magnitude = parseLargeKanjiNumeral(buffer);

    if (magnitude == null) {
      // No large factor: the number is smaller than 10,000 (万), or there is nothing at all,
      // in which case mantissa is null as well
      return mantissa;
    }

    if (mantissa == null) {
      // A bare factor such as 万 counts as one times that factor
      return magnitude;
    }

    return mantissa.multiply(magnitude);
  }

  /**
   * Parses a "medium sized" number by summing all consecutive medium pairs. The result is
   * typically less than 10,000（万）, but can be larger when
   * {@link #parseBasicNumber(NumberBuffer)} yields a large basic number.
   *
   * @param buffer buffer to parse
   * @return sum of the parsed pairs, or null if not even one pair could be parsed
   */
  private BigDecimal parseMediumNumber(NumberBuffer buffer) {
    BigDecimal pair = parseMediumPair(buffer);
    if (pair == null) {
      return null;
    }

    BigDecimal total = BigDecimal.ZERO;
    do {
      total = total.add(pair);
      pair = parseMediumPair(buffer);
    } while (pair != null);

    return total;
  }

  /**
   * Parses a pair consisting of a basic number followed by a medium kanji factor,
   * i.e. a factor of at most 1,000（千）. Either part may be missing.
   *
   * @param buffer buffer to parse
   * @return the product of both parts, the single part that is present, or null if neither is
   */
  private BigDecimal parseMediumPair(NumberBuffer buffer) {
    // Both parts are always attempted, in this order, since each one advances the buffer
    BigDecimal digits = parseBasicNumber(buffer);
    BigDecimal factor = parseMediumKanjiNumeral(buffer);

    if (factor == null) {
      // Just a plain number such as 五, or nothing at all, in which case digits is null too
      return digits;
    }

    if (digits == null) {
      // A bare factor such as 千 counts as one times that factor
      return factor;
    }

    return digits.multiply(factor);
  }

  /**
   * Parses a basic number, which is a sequence of Arabic numerals and/or 0-9 kanji numerals
   * (〇 to 九). Decimal points are kept and thousand separators are skipped.
   *
   * @param buffer buffer to parse; advanced past every character that was consumed
   * @return parsed number, or null if no digit or decimal point was found
   */
  private BigDecimal parseBasicNumber(NumberBuffer buffer) {
    StringBuilder digits = new StringBuilder();

    for (int index = buffer.position(); index < buffer.length(); index++) {
      char current = buffer.charAt(index);

      if (isArabicNumeral(current)) {
        // 0 to 9 or ０ to ９ (full-width)
        digits.append(arabicNumeralValue(current));
      } else if (isKanjiNumeral(current)) {
        // 〇, 一, 二, 三, 四, 五, 六, 七, 八, or 九
        digits.append(kanjiNumeralValue(current));
      } else if (isDecimalPoint(current)) {
        digits.append('.');
      } else if (!isThousandSeparator(current)) {
        // Neither a numeral nor number punctuation, so the basic number ends here
        break;
      }

      // Reached for thousand separators as well, which are consumed without being recorded
      buffer.advance();
    }

    // Nothing collected means there was no number at this position
    return digits.length() == 0 ? null : new BigDecimal(digits.toString());
  }

  /**
   * Parses a large kanji numeral (ten thousand or larger) at the buffer's current position.
   *
   * @param buffer buffer to parse; advanced by one character on success
   * @return the corresponding power of ten, or null at end of input or if the current
   *         character is not a large kanji numeral
   */
  public BigDecimal parseLargeKanjiNumeral(NumberBuffer buffer) {
    int index = buffer.position();

    if (index < buffer.length()) {
      int exponent = exponents[buffer.charAt(index)];
      // Exponents of 4 and above belong to 万 and the larger units
      if (exponent >= 4) {
        buffer.advance();
        return BigDecimal.TEN.pow(exponent);
      }
    }

    return null;
  }

  /**
   * Parses a medium kanji numeral (ten, hundred or thousand) at the buffer's current position.
   *
   * @param buffer buffer to parse; advanced by one character on success
   * @return the corresponding power of ten, or null at end of input or if the current
   *         character is not a medium kanji numeral
   */
  public BigDecimal parseMediumKanjiNumeral(NumberBuffer buffer) {
    int index = buffer.position();
    if (index >= buffer.length()) {
      return null;
    }

    int exponent = exponents[buffer.charAt(index)];
    // Only 十, 百 and 千 (exponents 1 to 3) qualify
    if (exponent < 1 || exponent > 3) {
      return null;
    }

    buffer.advance();
    return BigDecimal.TEN.pow(exponent);
  }

  /**
   * Numeral predicate for strings. An empty string yields true as it has no
   * non-numeral character.
   *
   * @param input string to test
   * @return true if and only if every character of input is a numeral
   */
  public boolean isNumeral(String input) {
    int index = 0;
    // Walk forward until the end or the first character that is not a numeral
    while (index < input.length() && isNumeral(input.charAt(index))) {
      index++;
    }
    return index == input.length();
  }

  /**
   * Numeral predicate for a single character. Arabic numerals, the kanji numerals 〇 to 九
   * and the kanji that stand for a power of ten all count as numerals.
   *
   * @param c character to test
   * @return true if and only if c is a numeral
   */
  public boolean isNumeral(char c) {
    if (isArabicNumeral(c) || isKanjiNumeral(c)) {
      return true;
    }
    // A non-zero table entry marks a power-of-ten kanji such as 十 or 万
    return exponents[c] != 0;
  }

  /**
   * Numeral punctuation predicate for strings. An empty string yields true as it has no
   * character that fails the test.
   *
   * @param input string to test
   * @return true if and only if every character of input is a numeral punctuation character
   */
  public boolean isNumeralPunctuation(String input) {
    final int length = input.length();
    int index = 0;
    while (index < length) {
      if (!isNumeralPunctuation(input.charAt(index++))) {
        return false;
      }
    }
    return true;
  }

  /**
   * Numeral punctuation predicate for a single character.
   *
   * @param c character to test
   * @return true if and only if c is a thousand separator or a decimal point
   */
  public boolean isNumeralPunctuation(char c) {
    return isThousandSeparator(c) || isDecimalPoint(c);
  }

  /**
   * Arabic numeral predicate that accepts both the half-width and the full-width digits.
   *
   * @param c character to test
   * @return true if and only if c is an Arabic numeral
   */
  public boolean isArabicNumeral(char c) {
    if (isHalfWidthArabicNumeral(c)) {
      return true;
    }
    return isFullWidthArabicNumeral(c);
  }

  /**
   * Half-width Arabic numeral predicate.
   *
   * @param c character to test
   * @return true if and only if c lies between 0 (U+0030) and 9 (U+0039), inclusive
   */
  private boolean isHalfWidthArabicNumeral(char c) {
    return c >= '0' && c <= '9';
  }

  /**
   * Full-width Arabic numeral predicate.
   *
   * @param c character to test
   * @return true if and only if c lies between ０ (U+FF10) and ９ (U+FF19), inclusive
   */
  private boolean isFullWidthArabicNumeral(char c) {
    return c >= '０' && c <= '９';
  }

  /**
   * Returns the numeric value of the given Arabic numeral character, half-width or full-width.
   * Behavior is undefined if a non-Arabic numeral is provided.
   *
   * @param c arabic numeral character
   * @return numeral value
   */
  private int arabicNumeralValue(char c) {
    // Anything that is not half-width is taken to be full-width
    return isHalfWidthArabicNumeral(c) ? c - '0' : c - '０';
  }

  /**
   * Kanji numeral predicate that tests if the provided character is one of
   * 〇, 一, 二, 三, 四, 五, 六, 七, 八, or 九. Kanji for larger numbers give a false value.
   *
   * @param c character to test
   * @return true if and only if the character is one of 〇, 一, 二, 三, 四, 五, 六, 七, 八, or 九 (0 to 9)
   */
  private boolean isKanjiNumeral(char c) {
    char tableEntry = numerals[c];
    return tableEntry != NO_NUMERAL;
  }

  /**
   * Returns the value for the provided kanji numeral by looking it up in the numerals table.
   * Only characters for which {@link #isKanjiNumeral(char)} returns true have a meaningful
   * value - behavior is undefined for other characters.
   *
   * @param c kanji numeral character
   * @return numeral value
   * @see #isKanjiNumeral(char)
   */
  private int kanjiNumeralValue(char c) {
    return numerals[c];
  }

  /**
   * Decimal point predicate that accepts the half-width and the full-width full stop.
   *
   * @param c character to test
   * @return true if and only if c is a decimal point
   */
  private boolean isDecimalPoint(char c) {
    switch (c) {
      case '.':  // U+002E FULL STOP
      case '．': // U+FF0E FULLWIDTH FULL STOP
        return true;
      default:
        return false;
    }
  }

  /**
   * Thousand separator predicate that accepts the half-width and the full-width comma.
   *
   * @param c character to test
   * @return true if and only if c is a thousand separator
   */
  private boolean isThousandSeparator(char c) {
    switch (c) {
      case ',':  // U+002C COMMA
      case '，': // U+FF0C FULLWIDTH COMMA
        return true;
      default:
        return false;
    }
  }

  /**
   * Cursor over a Japanese number string: keeps the text being parsed together with the
   * index up to which it has been consumed.
   */
  public static class NumberBuffer {

    /** Text being parsed. */
    private final String text;

    /** Index of the next character to parse; starts at the beginning of the text. */
    private int cursor = 0;

    /**
     * Creates a buffer positioned at the first character of the given string.
     *
     * @param string number string to parse
     */
    public NumberBuffer(String string) {
      this.text = string;
    }

    /**
     * Returns the character at an absolute index, independent of the current position.
     *
     * @param index index into the underlying string
     * @return character at that index
     */
    public char charAt(int index) {
      return text.charAt(index);
    }

    /**
     * @return length of the underlying string
     */
    public int length() {
      return text.length();
    }

    /**
     * Moves the position one character forward. No bounds check is made.
     */
    public void advance() {
      cursor++;
    }

    /**
     * @return index of the next character to parse
     */
    public int position() {
      return cursor;
    }
  }
}