package org.apache.lucene.analysis.th;
/**
* Copyright 2006 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Locale;
import java.lang.Character.UnicodeBlock;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import java.text.BreakIterator;
/**
* {@link TokenFilter} that uses {@link java.text.BreakIterator} to split each
* Thai Token into separate Token(s), one for each Thai word.
* <p>WARNING: this filter may not work correctly with all JREs.
* It is known to work with Sun/Oracle and Harmony JREs.
*/
public final class ThaiWordFilter extends TokenFilter {

  /** Word-boundary iterator for the Thai locale; re-targeted at each Thai token via setText. */
  private final BreakIterator breaker;
  private final TermAttribute termAtt;
  private final OffsetAttribute offsetAtt;

  /**
   * Snapshot of the Thai token currently being split, or {@code null} when no
   * token is in progress. It is restored before each follow-up word is emitted
   * so the term text and offsets are always computed from the original token.
   */
  private State thaiState;

  /**
   * Creates a filter that splits the Thai tokens produced by {@code input}.
   *
   * @param input the upstream token stream
   */
  public ThaiWordFilter(TokenStream input) {
    super(input);
    breaker = BreakIterator.getWordInstance(new Locale("th"));
    termAtt = addAttribute(TermAttribute.class);
    offsetAtt = addAttribute(OffsetAttribute.class);
  }

  /**
   * Advances to the next output token.
   * <ul>
   * <li>While a Thai token is being split, emits its next word, with offsets
   *     shifted relative to the start offset of the original token.</li>
   * <li>Otherwise pulls the next token from the input. A token whose first
   *     character is not in the Thai Unicode block is lower-cased and passed
   *     on; an empty token is passed on unchanged; a Thai token is handed to
   *     the break iterator and its first word is emitted.</li>
   * </ul>
   *
   * @return {@code true} if a token was produced, {@code false} once the input is exhausted
   * @throws IOException if the underlying stream fails
   */
  @Override
  public final boolean incrementToken() throws IOException {
    if (thaiState != null) {
      // Still inside a Thai token: the previous boundary is the start of the next word.
      final int start = breaker.current();
      final int end = breaker.next();
      if (end != BreakIterator.DONE) {
        restoreState(thaiState);
        // After the restore the term buffer holds the full original text again,
        // so the word can be sliced straight out of it.
        termAtt.setTermBuffer(termAtt.termBuffer(), start, end - start);
        final int base = offsetAtt.startOffset();
        offsetAtt.setOffset(base + start, base + end);
        return true;
      }
      // All words of the pending token have been emitted.
      thaiState = null;
    }

    if (!input.incrementToken()) {
      return false;
    }

    // An empty term has nothing to split or lower-case. Pass it through instead
    // of returning false, which would signal end-of-stream and drop every token
    // that follows it.
    if (termAtt.termLength() == 0) {
      return true;
    }

    final String text = termAtt.term();
    if (UnicodeBlock.of(text.charAt(0)) != UnicodeBlock.THAI) {
      // Use a fixed locale so the produced terms do not depend on the JVM's
      // default locale (e.g. the Turkish dotless-i mapping).
      termAtt.setTermBuffer(text.toLowerCase(Locale.ENGLISH));
      return true;
    }

    // Remember the complete token so later calls can restore it for each word.
    thaiState = captureState();
    breaker.setText(text);
    final int end = breaker.next();
    if (end == BreakIterator.DONE) {
      // Not expected for non-empty text; if the iterator reports no boundary,
      // emit the token unchanged rather than ending the stream early.
      thaiState = null;
      return true;
    }
    termAtt.setTermBuffer(text, 0, end);
    final int base = offsetAtt.startOffset();
    offsetAtt.setOffset(base, base + end);
    return true;
  }

  /**
   * Resets the upstream chain and discards any partially split Thai token.
   *
   * @throws IOException if resetting the underlying stream fails
   */
  @Override
  public void reset() throws IOException {
    super.reset();
    thaiState = null;
  }
}