JapaneseIterationMarkCharFilter.java example

Explorer
heliosearch-master
- lucene
- solr
package org.apache.lucene.analysis.ja;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.analysis.CharFilter;
import org.apache.lucene.analysis.util.RollingCharBuffer;

import java.io.IOException;
import java.io.Reader;

/**
 * Normalizes Japanese horizontal iteration marks (odoriji) to their expanded form.
 * <p>
 * Sequences of iteration marks are supported.  In case an illegal sequence of iteration
 * marks is encountered, the implementation emits the illegal source character as-is
 * without considering its script.  For example, with input "?ゝ", we get
 * "??" even though "?" isn't hiragana.
 * </p>
 * <p>
 * Note that a full stop punctuation character "。" (U+3002) can not be iterated
 * (see below). Iteration marks themselves can be emitted in case they are illegal,
 * i.e. if they go back past the beginning of the character stream.
 * </p>
 * <p>
 * The implementation buffers input until a full stop punctuation character (U+3002)
 * or EOF is reached in order to not keep a copy of the character stream in memory.
 * Vertical iteration marks, which are even rarer than horizontal iteration marks in
 * contemporary Japanese, are unsupported.
 * </p>
 */
public class JapaneseIterationMarkCharFilter extends CharFilter {

  /** Normalize kanji iteration marks by default */
  public static final boolean NORMALIZE_KANJI_DEFAULT = true;

  /** Normalize kana iteration marks by default */
  public static final boolean NORMALIZE_KANA_DEFAULT = true;

  private static final char KANJI_ITERATION_MARK = '\u3005';           // 々

  private static final char HIRAGANA_ITERATION_MARK = '\u309d';        // ゝ

  private static final char HIRAGANA_VOICED_ITERATION_MARK = '\u309e'; // ゞ

  private static final char KATAKANA_ITERATION_MARK = '\u30fd';        // ヽ

  private static final char KATAKANA_VOICED_ITERATION_MARK = '\u30fe'; // ヾ

  private static final char FULL_STOP_PUNCTUATION = '\u3002';           // 。

  /** First code point covered by the hiragana dakuten map */
  private static final char HIRAGANA_KA = '\u304b';                     // か

  /** First code point covered by the katakana dakuten map */
  private static final char KATAKANA_KA = '\u30ab';                     // カ

  // Hiragana to dakuten map (lookup using code point - 0x304b（か）).
  // Characters without a voiced variant map to themselves.
  private static final char[] h2d = new char[50];

  // Katakana to dakuten map (lookup using code point - 0x30ab（カ）).
  // Derived from the hiragana map in the static initializer.
  private static final char[] k2d = new char[50];

  // Rolling window over the input; lets us look back at the characters iteration marks refer to
  private final RollingCharBuffer buffer = new RollingCharBuffer();

  // Position in the input of the next character that read() will return
  private int bufferPosition = 0;

  // Number of iteration marks in the span currently being expanded
  private int iterationMarksSpanSize = 0;

  // Position just past the current span; also the lowest position a new span may refer back to
  private int iterationMarkSpanEndPosition = 0;

  private final boolean normalizeKanji;

  private final boolean normalizeKana;

  static {
    // Hiragana dakuten map
    h2d[0] = '\u304c';  // か => が
    h2d[1] = '\u304c';  // が => が
    h2d[2] = '\u304e';  // き => ぎ
    h2d[3] = '\u304e';  // ぎ => ぎ
    h2d[4] = '\u3050';  // く => ぐ
    h2d[5] = '\u3050';  // ぐ => ぐ
    h2d[6] = '\u3052';  // け => げ
    h2d[7] = '\u3052';  // げ => げ
    h2d[8] = '\u3054';  // こ => ご
    h2d[9] = '\u3054';  // ご => ご
    h2d[10] = '\u3056'; // さ => ざ
    h2d[11] = '\u3056'; // ざ => ざ
    h2d[12] = '\u3058'; // し => じ
    h2d[13] = '\u3058'; // じ => じ
    h2d[14] = '\u305a'; // す => ず
    h2d[15] = '\u305a'; // ず => ず
    h2d[16] = '\u305c'; // せ => ぜ
    h2d[17] = '\u305c'; // ぜ => ぜ
    h2d[18] = '\u305e'; // そ => ぞ
    h2d[19] = '\u305e'; // ぞ => ぞ
    h2d[20] = '\u3060'; // た => だ
    h2d[21] = '\u3060'; // だ => だ
    h2d[22] = '\u3062'; // ち => ぢ
    h2d[23] = '\u3062'; // ぢ => ぢ
    h2d[24] = '\u3063'; // っ (no voiced variant)
    h2d[25] = '\u3065'; // つ => づ
    h2d[26] = '\u3065'; // づ => づ
    h2d[27] = '\u3067'; // て => で
    h2d[28] = '\u3067'; // で => で
    h2d[29] = '\u3069'; // と => ど
    h2d[30] = '\u3069'; // ど => ど
    h2d[31] = '\u306a'; // な (no voiced variant)
    h2d[32] = '\u306b'; // に (no voiced variant)
    h2d[33] = '\u306c'; // ぬ (no voiced variant)
    h2d[34] = '\u306d'; // ね (no voiced variant)
    h2d[35] = '\u306e'; // の (no voiced variant)
    h2d[36] = '\u3070'; // は => ば
    h2d[37] = '\u3070'; // ば => ば
    h2d[38] = '\u3071'; // ぱ (left unchanged)
    h2d[39] = '\u3073'; // ひ => び
    h2d[40] = '\u3073'; // び => び
    h2d[41] = '\u3074'; // ぴ (left unchanged)
    h2d[42] = '\u3076'; // ふ => ぶ
    h2d[43] = '\u3076'; // ぶ => ぶ
    h2d[44] = '\u3077'; // ぷ (left unchanged)
    h2d[45] = '\u3079'; // へ => べ
    h2d[46] = '\u3079'; // べ => べ
    h2d[47] = '\u307a'; // ぺ (left unchanged)
    h2d[48] = '\u307c'; // ほ => ぼ
    h2d[49] = '\u307c'; // ぼ => ぼ

    // Make katakana dakuten map from hiragana map by shifting every entry by (カ - か)
    final int codePointDifference = KATAKANA_KA - HIRAGANA_KA;
    assert h2d.length == k2d.length;
    for (int i = 0; i < k2d.length; i++) {
      k2d[i] = (char) (h2d[i] + codePointDifference);
    }
  }

  /**
   * Constructor. Normalizes both kanji and kana iteration marks by default.
   *
   * @param input char stream
   */
  public JapaneseIterationMarkCharFilter(Reader input) {
    this(input, NORMALIZE_KANJI_DEFAULT, NORMALIZE_KANA_DEFAULT);
  }


  /**
   * Constructor
   *
   * @param input          char stream
   * @param normalizeKanji indicates whether kanji iteration marks should be normalized
   * @param normalizeKana indicates whether kana iteration marks should be normalized
   */
  public JapaneseIterationMarkCharFilter(Reader input, boolean normalizeKanji, boolean normalizeKana) {
    super(input);
    this.normalizeKanji = normalizeKanji;
    this.normalizeKana = normalizeKana;
    buffer.reset(input);
  }

  /**
   * {@inheritDoc}
   * <p>
   * Fills the target array one character at a time through {@link #read()}.  Returns 0 for a
   * zero-length request and -1 only when no character could be read because the input is exhausted.
   * </p>
   */
  @Override
  public int read(char[] buffer, int offset, int length) throws IOException {
    // NOTE: the parameter "buffer" shadows the rolling buffer field inside this method.

    // A zero-length request reads nothing and must not be reported as end of stream
    if (length == 0) {
      return 0;
    }

    int read = 0;

    for (int i = offset; i < offset + length; i++) {
      int c = read();
      if (c == -1) {
        break;
      }
      buffer[i] = (char) c;
      read++;
    }

    return read == 0 ? -1 : read;
  }

  /**
   * {@inheritDoc}
   * <p>
   * Returns the next character with iteration marks replaced by the character they repeat,
   * or -1 at end of input.
   * </p>
   */
  @Override
  public int read() throws IOException {
    int ic = buffer.get(bufferPosition);

    // End of input
    if (ic == -1) {
      buffer.freeBefore(bufferPosition);
      return ic;
    }

    char c = (char) ic;

    // Skip surrogate pair characters: an iteration mark must not refer back to half a pair
    if (Character.isHighSurrogate(c) || Character.isLowSurrogate(c)) {
      iterationMarkSpanEndPosition = bufferPosition + 1;
    }

    // Free rolling buffer on full stop; nothing before it can be referred to any more
    if (c == FULL_STOP_PUNCTUATION) {
      buffer.freeBefore(bufferPosition);
      iterationMarkSpanEndPosition = bufferPosition + 1;
    }

    // Normalize iteration mark
    if (isIterationMark(c)) {
      c = normalizeIterationMark(c);
    }

    bufferPosition++;
    return c;
  }

  /**
   * Normalizes the iteration mark character c found at the current buffer position
   *
   * @param c iteration mark character to normalize
   * @return normalized iteration mark, or c itself if the mark is illegal at this position
   * @throws IOException If there is a low-level I/O error.
   */
  private char normalizeIterationMark(char c) throws IOException {

    // Case 1: Inside an iteration mark span
    if (bufferPosition < iterationMarkSpanEndPosition) {
      return normalize(sourceCharacter(bufferPosition, iterationMarksSpanSize), c);
    }

    // Case 2: New iteration mark spans starts where the previous one ended, which is illegal
    if (bufferPosition == iterationMarkSpanEndPosition) {
      // Emit the illegal iteration mark and increase end position to indicate that we can't
      // start a new span on the next position either
      iterationMarkSpanEndPosition++;
      return c;
    }

    // Case 3: New iteration mark span
    iterationMarksSpanSize = nextIterationMarkSpanSize();
    iterationMarkSpanEndPosition = bufferPosition + iterationMarksSpanSize;
    return normalize(sourceCharacter(bufferPosition, iterationMarksSpanSize), c);
  }

  /**
   * Finds the number of subsequent next iteration marks
   *
   * @return number of iteration marks starting at the current buffer position, capped so that
   *         the span never refers back past the previous span end position
   * @throws IOException If there is a low-level I/O error.
   */
  private int nextIterationMarkSpanSize() throws IOException {
    int spanSize = 0;
    for (int i = bufferPosition; ; i++) {
      int next = buffer.get(i);
      if (next == -1 || !isIterationMark((char) next)) {
        break;
      }
      spanSize++;
    }
    // Restrict span size so that we don't go past the previous end position
    if (bufferPosition - spanSize < iterationMarkSpanEndPosition) {
      spanSize = bufferPosition - iterationMarkSpanEndPosition;
    }
    return spanSize;
  }

  /**
   * Returns the source character for a given position and iteration mark span size
   *
   * @param position buffer position (should not exceed bufferPosition)
   * @param spanSize iteration mark span size
   * @return source character, i.e. the character spanSize positions before position
   * @throws IOException If there is a low-level I/O error.
   */
  private char sourceCharacter(int position, int spanSize) throws IOException {
    return (char) buffer.get(position - spanSize);
  }

  /**
   * Normalize a character
   *
   * @param c character to normalize
   * @param m repetition mark referring to c
   * @return normalized character - return c on illegal iteration marks
   */
  private char normalize(char c, char m) {
    if (isHiraganaIterationMark(m)) {
      return normalizedHiragana(c, m);
    }

    if (isKatakanaIterationMark(m)) {
      return normalizedKatakana(c, m);
    }

    return c; // If m is not kana and we are to normalize it, we assume it is kanji and simply return it
  }

  /**
   * Normalize hiragana character
   *
   * @param c hiragana character
   * @param m repetition mark referring to c
   * @return normalized character - return c on illegal iteration marks
   */
  private char normalizedHiragana(char c, char m) {
    switch (m) {
      case HIRAGANA_ITERATION_MARK:
        // Unvoiced mark: a voiced source character repeats as its unvoiced base (code point - 1)
        return isHiraganaDakuten(c) ? (char) (c - 1) : c;
      case HIRAGANA_VOICED_ITERATION_MARK:
        return lookupHiraganaDakuten(c);
      default:
        return c;
    }
  }

  /**
   * Normalize katakana character
   *
   * @param c katakana character
   * @param m repetition mark referring to c
   * @return normalized character - return c on illegal iteration marks
   */
  private char normalizedKatakana(char c, char m) {
    switch (m) {
      case KATAKANA_ITERATION_MARK:
        // Unvoiced mark: a voiced source character repeats as its unvoiced base (code point - 1)
        return isKatakanaDakuten(c) ? (char) (c - 1) : c;
      case KATAKANA_VOICED_ITERATION_MARK:
        return lookupKatakanaDakuten(c);
      default:
        return c;
    }
  }

  /**
   * Iteration mark character predicate
   *
   * @param c character to test
   * @return true if c is an iteration mark character that this filter is configured to normalize.
   *         Otherwise false.
   */
  private boolean isIterationMark(char c) {
    return isKanjiIterationMark(c) || isHiraganaIterationMark(c) || isKatakanaIterationMark(c);
  }

  /**
   * Hiragana iteration mark character predicate
   *
   * @param c character to test
   * @return true if kana normalization is enabled and c is a hiragana iteration mark character.
   *         Otherwise false.
   */
  private boolean isHiraganaIterationMark(char c) {
    return normalizeKana && (c == HIRAGANA_ITERATION_MARK || c == HIRAGANA_VOICED_ITERATION_MARK);
  }

  /**
   * Katakana iteration mark character predicate
   *
   * @param c character to test
   * @return true if kana normalization is enabled and c is a katakana iteration mark character.
   *         Otherwise false.
   */
  private boolean isKatakanaIterationMark(char c) {
    return normalizeKana && (c == KATAKANA_ITERATION_MARK || c == KATAKANA_VOICED_ITERATION_MARK);
  }

  /**
   * Kanji iteration mark character predicate
   *
   * @param c character to test
   * @return true if kanji normalization is enabled and c is a kanji iteration mark character.
   *         Otherwise false.
   */
  private boolean isKanjiIterationMark(char c) {
    return normalizeKanji && c == KANJI_ITERATION_MARK;
  }

  /**
   * Look up hiragana dakuten
   *
   * @param c character to look up
   * @return hiragana dakuten variant of c or c itself if no dakuten variant exists
   */
  private char lookupHiraganaDakuten(char c) {
    return lookup(c, h2d, HIRAGANA_KA);
  }

  /**
   * Look up katakana dakuten. Only full-width katakana are supported.
   *
   * @param c character to look up
   * @return katakana dakuten variant of c or c itself if no dakuten variant exists
   */
  private char lookupKatakanaDakuten(char c) {
    return lookup(c, k2d, KATAKANA_KA);
  }

  /**
   * Hiragana dakuten predicate
   *
   * @param c character to check
   * @return true if c is a hiragana dakuten and otherwise false
   */
  private boolean isHiraganaDakuten(char c) {
    return isDakuten(c, h2d, HIRAGANA_KA);
  }

  /**
   * Katakana dakuten predicate
   *
   * @param c character to check
   * @return true if c is a katakana dakuten and otherwise false
   */
  private boolean isKatakanaDakuten(char c) {
    return isDakuten(c, k2d, KATAKANA_KA);
  }

  /**
   * Dakuten predicate shared by the hiragana and katakana variants.
   * <p>
   * A character is a dakuten when the character directly before it in code point order is
   * mapped to it.  Checking only that the character maps to itself is not sufficient, because
   * characters without any voiced variant (e.g. な or っ) are mapped to themselves as well.
   * </p>
   *
   * @param c      character to check
   * @param map    dakuten map
   * @param offset code point of the first character covered by map
   * @return true if c is the voiced variant of the character preceding it and otherwise false
   */
  private static boolean isDakuten(char c, char[] map, char offset) {
    // c == offset has no predecessor inside the map, so it can't be a voiced variant
    if (c == offset || !inside(c, map, offset)) {
      return false;
    }
    return map[c - 1 - offset] == c;
  }

  /**
   * Looks up a character in dakuten map and returns the dakuten variant if it exists.
   * Otherwise return the character being looked up itself
   *
   * @param c      character to look up
   * @param map    dakuten map
   * @param offset code point of the first character covered by map
   * @return mapped character or c if no mapping exists
   */
  private static char lookup(char c, char[] map, char offset) {
    return inside(c, map, offset) ? map[c - offset] : c;
  }

  /**
   * Predicate indicating if the lookup character is within dakuten map range
   *
   * @param c      character to look up
   * @param map    dakuten map
   * @param offset code point of the first character covered by map
   * @return true if c is mapped by map and otherwise false
   */
  private static boolean inside(char c, char[] map, char offset) {
    return c >= offset && c < offset + map.length;
  }


  /**
   * {@inheritDoc}
   */
  @Override
  protected int correct(int currentOff) {
    return currentOff; // this filter doesn't change the length of strings
  }
}