package org.apache.lucene.analysis.icu.segmentation;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.text.CharacterIterator;

import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.RuleBasedBreakIterator;
import com.ibm.icu.text.UnicodeSet;

/**
 * Syllable iterator for Lao text.
 * <p>
 * This breaks Lao text into syllables according to:
 * <i>Syllabification of Lao Script for Line Breaking</i>
 * Phonpasit Phissamay, Valaxay Dalolay, Chitaphone Chanhsililath, Oulaiphone Silimasak,
 * Sarmad Hussain, Nadir Durrani, Science Technology and Environment Agency, CRULP.
 * <ul>
 *  <li>http://www.panl10n.net/english/final%20reports/pdf%20files/Laos/LAO06.pdf
 *  <li>http://www.panl10n.net/Presentations/Cambodia/Phonpassit/LineBreakingAlgo.pdf
 * </ul>
 * <p>
 * Most work is accomplished with RBBI rules, however some additional special logic is needed
 * that cannot be coded in a grammar, and this is implemented here.
 * <p>
 * For example, what appears to be a final consonant might instead be part of the next syllable.
 * Rules match in a greedy fashion, leaving an illegal sequence that matches no rules.
 * <p>
 * Take for instance the text ກວ່າດອກ
 * The first rule greedily matches ກວ່າດ, but then ອກ is encountered, which is illegal.
 * What LaoBreakIterator does, according to the paper:
 * <ol>
 *  <li>backtrack and remove the ດ from the last syllable, placing it on the current syllable.
 *  <li>verify the modified previous syllable (ກວ່າ ) is still legal.
 *  <li>verify the modified current syllable (ດອກ) is now legal.
 *  <li>If 2 or 3 fails, then restore the ດ to the last syllable and skip the current character.
 * </ol>
 * <p>
 * Finally, LaoBreakIterator also takes care of the second concern mentioned in the paper.
 * This is the issue of combining marks being in the wrong order (typos).
 * <p>
 * Only forward iteration ({@link #first()}, {@link #next()}, {@link #next(int)}) is
 * implemented; {@link #previous()}, {@link #last()} and {@link #following(int)} throw
 * {@link UnsupportedOperationException}.
 * @lucene.experimental
 */
public class LaoBreakIterator extends BreakIterator {
  /** Primary rule-based iterator; it is always attached to {@link #working}. */
  RuleBasedBreakIterator rules;
  /** The full text handed to {@link #setText(CharacterIterator)}. */
  CharArrayIterator text;

  /** Window over {@link #text} that {@link #rules} iterates; re-anchored after a push-back. */
  CharArrayIterator working = new CharArrayIterator();
  /** Offset of the start of {@link #working} relative to the start of {@link #text}. */
  int workingOffset = 0;

  /** Scratch window over {@link #text} used only by {@link #verifyPushBack(int, int)}. */
  CharArrayIterator verifyText = new CharArrayIterator();
  /** Second copy of the rules, used to validate candidate syllables without disturbing {@link #rules}. */
  RuleBasedBreakIterator verify;

  /** Frozen set of all characters in the Lao script. */
  private static final UnicodeSet laoSet;
  static {
    laoSet = new UnicodeSet("[:Lao:]");
    laoSet.compact();
    laoSet.freeze();
  }

  /**
   * Creates a new iterator driven by the given syllable rules.
   *
   * @param rules rule set to use; it is cloned twice (once for iteration, once for
   *              verification), so the caller's instance is not retained.
   */
  public LaoBreakIterator(RuleBasedBreakIterator rules) {
    this.rules = (RuleBasedBreakIterator) rules.clone();
    this.verify = (RuleBasedBreakIterator) rules.clone();
  }

  /**
   * Returns the current boundary, expressed relative to the start of the full text.
   *
   * @return {@link BreakIterator#DONE} if the underlying rules report DONE, otherwise the
   *         rules' position shifted by {@link #workingOffset}.
   */
  @Override
  public int current() {
    final int position = rules.current();
    if (position == BreakIterator.DONE) {
      return BreakIterator.DONE;
    }
    return workingOffset + position;
  }

  /**
   * Rewinds iteration to the start of the text, discarding any push-back adjustment.
   *
   * @return the first boundary reported by the rules, or {@link BreakIterator#DONE}.
   */
  @Override
  public int first() {
    resetWorking();
    final int first = rules.first();
    if (first == BreakIterator.DONE) {
      return BreakIterator.DONE;
    }
    return workingOffset + first;
  }

  /**
   * Not supported.
   *
   * @throws UnsupportedOperationException always
   */
  @Override
  public int following(int offset) {
    throw new UnsupportedOperationException("following(int) is unsupported");
  }

  /**
   * Returns the iterator most recently passed to {@link #setText(CharacterIterator)}
   * ({@code null} if no text has been set on this instance).
   */
  @Override
  public CharacterIterator getText() {
    return text;
  }

  /**
   * Not supported.
   *
   * @throws UnsupportedOperationException always
   */
  @Override
  public int last() {
    throw new UnsupportedOperationException("last() is unsupported");
  }

  /**
   * Advances to the next syllable boundary.
   * <p>
   * After the rules find the next boundary, one further boundary is looked ahead. If the
   * looked-ahead span has rule status 0 (per the class description: a sequence matching no
   * rule), begins with a Lao character, and {@link #verifyPushBack(int, int)} accepts the
   * change, the boundary is moved back by one character so that the last character of the
   * previous syllable starts the next one. The working window is then re-anchored at that
   * new boundary.
   *
   * @return the next boundary relative to the start of the full text, or
   *         {@link BreakIterator#DONE} when the rules are exhausted.
   */
  @Override
  public int next() {
    final int current = current();
    int next = rules.next();
    if (next == BreakIterator.DONE) {
      return next;
    }
    next += workingOffset;

    // Character at the working iterator's current index; presumably the first character
    // after the boundary just found (relies on the rules leaving the shared iterator there).
    final char c = working.current();
    final int following = rules.next(); // lookahead
    if (following != BreakIterator.DONE) {
      if (rules.getRuleStatus() == 0 && laoSet.contains(c) && verifyPushBack(current, next)) {
        // Push-back accepted: restart the working window one character earlier.
        workingOffset = next - 1;
        working.setText(text.getText(), text.getStart() + workingOffset, text.getLength() - workingOffset);
        return next - 1;
      }
      rules.previous(); // undo the lookahead
    }

    return next;
  }

  /**
   * Advances {@code n} boundaries forward.
   *
   * @param n number of boundaries to advance; {@code 0} returns {@link #current()}.
   * @return the result of the last {@link #next()} call, or {@link #current()} if {@code n} is 0.
   * @throws UnsupportedOperationException if {@code n} is negative
   */
  @Override
  public int next(int n) {
    if (n < 0) {
      throw new UnsupportedOperationException("Backwards traversal is unsupported");
    }

    int result = current();
    for (int remaining = n; remaining > 0; remaining--) {
      result = next();
    }
    return result;
  }

  /**
   * Not supported.
   *
   * @throws UnsupportedOperationException always
   */
  @Override
  public int previous() {
    throw new UnsupportedOperationException("Backwards traversal is unsupported");
  }

  /**
   * Sets the text to iterate over.
   * <p>
   * Note that misordered combining marks are fixed up by {@link #ccReorder(char[], int, int)},
   * which swaps characters in place in the array returned by the iterator's {@code getText()}.
   *
   * @param text must be a {@link CharArrayIterator}
   * @throws UnsupportedOperationException if {@code text} is not a {@link CharArrayIterator}
   */
  @Override
  public void setText(CharacterIterator text) {
    if (!(text instanceof CharArrayIterator)) {
      throw new UnsupportedOperationException("unsupported CharacterIterator");
    }
    this.text = (CharArrayIterator) text;
    ccReorder(this.text.getText(), this.text.getStart(), this.text.getLength());
    resetWorking();
  }

  /**
   * Sets the text to iterate over from a String, by copying it into a fresh
   * {@link CharArrayIterator}.
   */
  @Override
  public void setText(String newText) {
    final CharArrayIterator ci = new CharArrayIterator();
    ci.setText(newText.toCharArray(), 0, newText.length());
    setText(ci);
  }

  /** Points the working window at the whole text, attaches the rules to it, and clears the offset. */
  private void resetWorking() {
    working.setText(this.text.getText(), this.text.getStart(), this.text.getLength());
    rules.setText(working);
    workingOffset = 0;
  }

  /**
   * Checks whether moving the last character of the syllable {@code [current, next)} onto
   * the following syllable yields two legal syllables.
   *
   * @param current start of the previous syllable, relative to the start of the text
   * @param next    end of the previous syllable, relative to the start of the text
   * @return {@code true} only if (1) the shortened previous syllable is matched by the
   *         verification rules as a single span with non-zero rule status, and (2) the text
   *         starting at {@code next - 1} yields a first span with non-zero rule status.
   */
  private boolean verifyPushBack(int current, int next) {
    final int shortenedSyllable = next - current - 1;

    // Step 1: the previous syllable, minus its last character, must still be one legal unit.
    verifyText.setText(text.getText(), text.getStart() + current, shortenedSyllable);
    verify.setText(verifyText);
    if (verify.next() != shortenedSyllable || verify.getRuleStatus() == 0) {
      return false;
    }

    // Step 2: the remainder, now starting with the pushed-back character, must begin legally.
    verifyText.setText(text.getText(), text.getStart() + next - 1, text.getLength() - next + 1);
    verify.setText(verifyText);
    return verify.next() != BreakIterator.DONE && verify.getRuleStatus() != 0;
  }

  /**
   * Reorders combining marks in place so that, within a run, marks with a non-zero
   * combining class never follow a mark with a higher combining class. Repeats full passes
   * (bubble sort) until a pass makes no swap.
   *
   * @param text   array to modify in place
   * @param start  index of the first character to consider
   * @param length number of characters to consider
   */
  // TODO: only bubblesort around runs of combining marks, instead of the entire text.
  private void ccReorder(char[] text, int start, int length) {
    final int end = start + length;
    boolean reordered;
    do {
      int prevCC = 0;
      reordered = false;
      for (int i = start; i < end; i++) {
        final char c = text[i];
        final int cc = UCharacter.getCombiningClass(c);
        if (cc > 0 && cc < prevCC) {
          // swap; prevCC is deliberately left unchanged because the higher-class
          // mark now sits at index i and is the predecessor of the next character.
          text[i] = text[i - 1];
          text[i - 1] = c;
          reordered = true;
        } else {
          prevCC = cc;
        }
      }
    } while (reordered);
  }

  /**
   * Clone method.  Creates another LaoBreakIterator with the same behavior
   * and current state as this one.
   * <p>
   * Both rule iterators and all three character iterators are cloned individually.
   * @return The clone.
   */
  @Override
  public Object clone() {
    LaoBreakIterator other = (LaoBreakIterator) super.clone();
    other.rules = (RuleBasedBreakIterator) rules.clone();
    other.verify = (RuleBasedBreakIterator) verify.clone();
    if (text != null) {
      other.text = (CharArrayIterator) text.clone();
    }
    // NOTE(review): other.rules is not re-attached to other.working here, whereas next()
    // reads and re-anchors `working` expecting `rules` to iterate over that same object.
    // Confirm how RuleBasedBreakIterator.clone() treats its text before relying on a
    // clone taken mid-iteration.
    if (working != null) {
      other.working = (CharArrayIterator) working.clone();
    }
    if (verifyText != null) {
      other.verifyText = (CharArrayIterator) verifyText.clone();
    }
    return other;
  }
}