package org.apache.lucene.analysis.icu.segmentation;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.text.CharacterIterator;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.RuleBasedBreakIterator;
import com.ibm.icu.text.UnicodeSet;
/**
* Syllable iterator for Lao text.
* <p>
* This breaks Lao text into syllables according to:
* <i>Syllabification of Lao Script for Line Breaking</i>
* Phonpasit Phissamay, Valaxay Dalolay, Chitaphone Chanhsililath, Oulaiphone Silimasak,
* Sarmad Hussain, Nadir Durrani, Science Technology and Environment Agency, CRULP.
* <ul>
* <li>http://www.panl10n.net/english/final%20reports/pdf%20files/Laos/LAO06.pdf
* <li>http://www.panl10n.net/Presentations/Cambodia/Phonpassit/LineBreakingAlgo.pdf
* </ul>
* <p>
* Most work is accomplished with RBBI rules, however some additional special logic is needed
* that cannot be coded in a grammar, and this is implemented here.
* <p>
* For example, what appears to be a final consonant might instead be part of the next syllable.
* Rules match in a greedy fashion, leaving an illegal sequence that matches no rules.
* <p>
* Take for instance the text ກວ່າດອກ
* The first rule greedily matches ກວ່າດ, but then ອກ is encountered, which is illegal.
* What LaoBreakIterator does, according to the paper:
* <ol>
* <li>backtrack and remove the ດ from the last syllable, placing it on the current syllable.
* <li>verify the modified previous syllable (ກວ່າ ) is still legal.
* <li>verify the modified current syllable (ດອກ) is now legal.
* <li>If 2 or 3 fails, then restore the ດ to the last syllable and skip the current character.
* </ol>
* <p>
* Finally, LaoBreakIterator also takes care of the second concern mentioned in the paper.
* This is the issue of combining marks being in the wrong order (typos).
* @lucene.experimental
*/
public class LaoBreakIterator extends BreakIterator {
  /** Rule engine that proposes syllable boundaries; it scans {@link #working}. */
  RuleBasedBreakIterator rules;
  /** The text handed to {@link #setText(CharacterIterator)}; null until then. */
  CharArrayIterator text;

  /** Window onto {@link #text} starting at {@link #workingOffset}; re-windowed after a push-back. */
  CharArrayIterator working = new CharArrayIterator();
  /** Offset of the start of {@link #working}, relative to the start of {@link #text}. */
  int workingOffset = 0;

  /** Scratch window used only by {@link #verifyPushBack(int, int)}. */
  CharArrayIterator verifyText = new CharArrayIterator();
  /** Independent copy of the rules, used to validate candidate syllables without disturbing {@link #rules}. */
  RuleBasedBreakIterator verify;

  /** Frozen set of characters in the Lao script. */
  private static final UnicodeSet laoSet;
  static {
    laoSet = new UnicodeSet("[:Lao:]");
    laoSet.compact();
    laoSet.freeze();
  }

  /**
   * Creates an iterator driven by the given syllable rules.
   * The argument is cloned (twice), so the caller's instance is not retained.
   *
   * @param rules syllable grammar; must not be null
   */
  public LaoBreakIterator(RuleBasedBreakIterator rules) {
    this.rules = (RuleBasedBreakIterator) rules.clone();
    this.verify = (RuleBasedBreakIterator) rules.clone();
  }

  /**
   * Returns the current boundary as an offset relative to the start of the text,
   * or {@link BreakIterator#DONE} if the rule engine reports DONE.
   */
  @Override
  public int current() {
    int current = rules.current();
    // The rule engine's positions are relative to the working window; translate back.
    return current == BreakIterator.DONE ? BreakIterator.DONE : workingOffset + current;
  }

  /**
   * Resets iteration to the beginning of the text: the working window is restored
   * to cover the whole text and the rule engine is re-bound to it.
   *
   * @return the first boundary, or {@link BreakIterator#DONE}
   */
  @Override
  public int first() {
    working.setText(this.text.getText(), this.text.getStart(), this.text.getLength());
    rules.setText(working);
    workingOffset = 0;
    int first = rules.first();
    return first == BreakIterator.DONE ? BreakIterator.DONE : workingOffset + first;
  }

  /**
   * Not supported.
   *
   * @throws UnsupportedOperationException always
   */
  @Override
  public int following(int offset) {
    throw new UnsupportedOperationException();
  }

  /** Returns the iterator most recently passed to {@link #setText(CharacterIterator)}. */
  @Override
  public CharacterIterator getText() {
    return text;
  }

  /**
   * Not supported.
   *
   * @throws UnsupportedOperationException always
   */
  @Override
  public int last() {
    throw new UnsupportedOperationException();
  }

  /**
   * Advances to the next syllable boundary.
   * <p>
   * After the rule engine proposes a boundary, one more segment is examined as a
   * lookahead. If that following segment has rule status 0 (treated here as "no
   * syllable rule matched"), begins with a Lao character, and
   * {@link #verifyPushBack(int, int)} accepts the adjustment, the boundary is moved
   * back by one char so the last char of this syllable starts the next one.
   *
   * @return the boundary relative to the start of the text, or {@link BreakIterator#DONE}
   */
  @Override
  public int next() {
    final int current = current();
    int next = rules.next();
    if (next == BreakIterator.DONE) {
      return BreakIterator.DONE;
    }
    next += workingOffset;

    // Char under the working iterator after the engine advanced -- presumably the
    // first char of the segment that follows the proposed boundary.
    final char c = working.current();
    final int following = rules.next(); // lookahead
    if (following != BreakIterator.DONE) {
      if (rules.getRuleStatus() == 0 && laoSet.contains(c) && verifyPushBack(current, next)) {
        // Push back: restart the working window one char before the proposed boundary.
        workingOffset = next - 1;
        working.setText(text.getText(), text.getStart() + workingOffset, text.getLength() - workingOffset);
        // Re-bind the engine to the re-windowed text, as first()/setText() do, so it
        // restarts at the window start rather than depending on any state it kept
        // from the previous window.
        rules.setText(working);
        return next - 1;
      }
      rules.previous(); // undo the lookahead
    }

    return next;
  }

  /**
   * Advances {@code n} boundaries by calling {@link #next()} repeatedly.
   *
   * @param n number of boundaries to advance; 0 returns {@link #current()}
   * @return the result of the last {@link #next()} call
   * @throws UnsupportedOperationException if {@code n} is negative
   */
  @Override
  public int next(int n) {
    if (n < 0) {
      throw new UnsupportedOperationException("Backwards traversal is unsupported");
    }

    int result = current();
    while (n > 0) {
      result = next();
      --n;
    }
    return result;
  }

  /**
   * Not supported.
   *
   * @throws UnsupportedOperationException always
   */
  @Override
  public int previous() {
    throw new UnsupportedOperationException("Backwards traversal is unsupported");
  }

  /**
   * Sets the text to segment and resets iteration to its start.
   * <p>
   * The char array obtained from the iterator is passed to the combining-mark
   * reordering step, which swaps elements of that array in place.
   * NOTE(review): if {@code CharArrayIterator.getText()} exposes the caller's
   * backing array (not a copy), the caller's data is modified -- confirm.
   *
   * @param text must be a {@code CharArrayIterator}
   * @throws UnsupportedOperationException for any other CharacterIterator type
   */
  @Override
  public void setText(CharacterIterator text) {
    if (!(text instanceof CharArrayIterator)) {
      throw new UnsupportedOperationException("unsupported CharacterIterator");
    }
    this.text = (CharArrayIterator) text;
    ccReorder(this.text.getText(), this.text.getStart(), this.text.getLength());
    working.setText(this.text.getText(), this.text.getStart(), this.text.getLength());
    rules.setText(working);
    workingOffset = 0;
  }

  /**
   * Convenience overload: copies the string into a fresh char array and delegates
   * to {@link #setText(CharacterIterator)}.
   */
  @Override
  public void setText(String newText) {
    CharArrayIterator ci = new CharArrayIterator();
    ci.setText(newText.toCharArray(), 0, newText.length());
    setText(ci);
  }

  /**
   * Checks whether moving the last char of the syllable {@code [current, next)}
   * onto the following syllable leaves both sides legal.
   *
   * @param current start of the syllable just matched, relative to the text start
   * @param next proposed end of that syllable, relative to the text start
   * @return true if the shortened syllable is matched in full with a non-zero rule
   *         status, and the text starting at {@code next - 1} also begins with a
   *         segment of non-zero rule status
   */
  private boolean verifyPushBack(int current, int next) {
    int shortenedSyllable = next - current - 1;

    // Step 1: the previous syllable minus its last char must still be one whole,
    // rule-matched segment.
    verifyText.setText(text.getText(), text.getStart() + current, shortenedSyllable);
    verify.setText(verifyText);
    if (verify.next() != shortenedSyllable || verify.getRuleStatus() == 0) {
      return false;
    }

    // Step 2: the remainder, now starting with the pushed-back char, must begin
    // with a rule-matched segment.
    verifyText.setText(text.getText(), text.getStart() + next - 1, text.getLength() - next + 1);
    verify.setText(verifyText);

    return (verify.next() != BreakIterator.DONE && verify.getRuleStatus() != 0);
  }

  // TODO: only bubblesort around runs of combining marks, instead of the entire text.
  /**
   * Bubble-sorts the range in place so that a char with a non-zero combining class
   * never directly follows a char with a higher combining class. Chars with
   * combining class 0 are never moved and reset the comparison.
   *
   * @param text array to reorder in place
   * @param start index of the first char to consider
   * @param length number of chars to consider
   */
  private void ccReorder(char[] text, int start, int length) {
    boolean reordered;
    do {
      int prevCC = 0;
      reordered = false;
      for (int i = start; i < start + length; i++) {
        final char c = text[i];
        final int cc = UCharacter.getCombiningClass(c);
        if (cc > 0 && cc < prevCC) {
          // swap; prevCC is left alone because the higher-class char now sits at i
          text[i] = text[i - 1];
          text[i - 1] = c;
          reordered = true;
        } else {
          prevCC = cc;
        }
      }
    } while (reordered);
  }

  /**
   * Clone method.  Creates another LaoBreakIterator with the same behavior
   * and current state as this one. The rule engines and the text iterators
   * are themselves cloned rather than shared.
   * @return The clone.
   */
  @Override
  public Object clone() {
    LaoBreakIterator other = (LaoBreakIterator) super.clone();
    other.rules = (RuleBasedBreakIterator) rules.clone();
    other.verify = (RuleBasedBreakIterator) verify.clone();
    if (text != null) {
      other.text = (CharArrayIterator) text.clone();
    }
    if (working != null) {
      other.working = (CharArrayIterator) working.clone();
    }
    if (verifyText != null) {
      other.verifyText = (CharArrayIterator) verifyText.clone();
    }
    return other;
  }
}