package org.fnlp.app.lucene; import java.io.IOException; import java.io.Reader; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.TypeAttribute; import org.apache.lucene.util.AttributeSource; public final class SentenceTokenizer extends Tokenizer { /** * End of sentence punctuation: 。,!?;,!?; */ private final static String PUNCTION = "。,!?;,!?;"; /** * Space-like characters that need to be skipped: such as space, tab, newline, carriage return. */ public static final String SPACES = "  \t\r\n"; private final StringBuilder buffer = new StringBuilder(); private int tokenStart = 0, tokenEnd = 0; private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class); public SentenceTokenizer(Reader reader) { super(reader); } public SentenceTokenizer(AttributeSource source, Reader reader) { super(source, reader); } public SentenceTokenizer(AttributeFactory factory, Reader reader) { super(factory, reader); } @Override public boolean incrementToken() throws IOException { clearAttributes(); buffer.setLength(0); int ci; char ch, pch; boolean atBegin = true; tokenStart = tokenEnd; ci = input.read(); ch = (char) ci; while (true) { if (ci == -1) { break; } else if (PUNCTION.indexOf(ch) != -1) { // End of a sentence buffer.append(ch); tokenEnd++; break; } else if (atBegin && SPACES.indexOf(ch) != -1) { tokenStart++; tokenEnd++; ci = input.read(); ch = (char) ci; } else { buffer.append(ch); atBegin = false; tokenEnd++; pch = ch; ci = input.read(); ch = (char) ci; // Two spaces, such as CR, LF if (SPACES.indexOf(ch) != -1 && SPACES.indexOf(pch) != -1) { // buffer.append(ch); tokenEnd++; break; } } } if (buffer.length() == 0) return false; else { termAtt.setEmpty().append(buffer); offsetAtt.setOffset(correctOffset(tokenStart), correctOffset(tokenEnd)); typeAtt.setType("sentence"); return true; } } @Override public void reset() throws IOException { tokenStart = tokenEnd = 0; } @Override public void end() { // set final offset final int finalOffset = correctOffset(tokenEnd); offsetAtt.setOffset(finalOffset, finalOffset); } }