/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.lucene.analysis.util; import java.io.IOException; import java.io.Reader; import java.text.BreakIterator; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.util.AttributeFactory; /** * Breaks text into sentences with a {@link BreakIterator} and * allows subclasses to decompose these sentences into words. * <p> * This can be used by subclasses that need sentence context * for tokenization purposes, such as CJK segmenters. * <p> * Additionally it can be used by subclasses that want to mark * sentence boundaries (with a custom attribute, extra token, position * increment, etc) for downstream processing. * * @lucene.experimental */ public abstract class SegmentingTokenizerBase extends Tokenizer { protected static final int BUFFERMAX = 1024; protected final char buffer[] = new char[BUFFERMAX]; /** true length of text in the buffer */ private int length = 0; /** length in buffer that can be evaluated safely, up to a safe end point */ private int usableLength = 0; /** accumulated offset of previous buffers for this reader, for offsetAtt */ protected int offset = 0; private final BreakIterator iterator; private final CharArrayIterator wrapper = CharArrayIterator.newSentenceInstance(); private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); /** * Construct a new SegmenterBase, using * the provided BreakIterator for sentence segmentation. * <p> * Note that you should never share BreakIterators across different * TokenStreams, instead a newly created or cloned one should always * be provided to this constructor. */ public SegmentingTokenizerBase(BreakIterator iterator) { this(DEFAULT_TOKEN_ATTRIBUTE_FACTORY, iterator); } /** * Construct a new SegmenterBase, also supplying the AttributeFactory */ public SegmentingTokenizerBase(AttributeFactory factory, BreakIterator iterator) { super(factory); this.iterator = iterator; } @Override public final boolean incrementToken() throws IOException { if (length == 0 || !incrementWord()) { while (!incrementSentence()) { refill(); if (length <= 0) // no more bytes to read; return false; } } return true; } @Override public void reset() throws IOException { super.reset(); wrapper.setText(buffer, 0, 0); iterator.setText(wrapper); length = usableLength = offset = 0; } @Override public final void end() throws IOException { super.end(); final int finalOffset = correctOffset(length < 0 ? offset : offset + length); offsetAtt.setOffset(finalOffset, finalOffset); } /** Returns the last unambiguous break position in the text. */ private int findSafeEnd() { for (int i = length - 1; i >= 0; i--) if (isSafeEnd(buffer[i])) return i + 1; return -1; } /** For sentence tokenization, these are the unambiguous break positions. */ protected boolean isSafeEnd(char ch) { switch(ch) { case 0x000D: case 0x000A: case 0x0085: case 0x2028: case 0x2029: return true; default: return false; } } /** * Refill the buffer, accumulating the offset and setting usableLength to the * last unambiguous break position */ private void refill() throws IOException { offset += usableLength; int leftover = length - usableLength; System.arraycopy(buffer, usableLength, buffer, 0, leftover); int requested = buffer.length - leftover; int returned = read(input, buffer, leftover, requested); length = returned < 0 ? leftover : returned + leftover; if (returned < requested) /* reader has been emptied, process the rest */ usableLength = length; else { /* still more data to be read, find a safe-stopping place */ usableLength = findSafeEnd(); if (usableLength < 0) usableLength = length; /* * more than IOBUFFER of text without breaks, * gonna possibly truncate tokens */ } wrapper.setText(buffer, 0, Math.max(0, usableLength)); iterator.setText(wrapper); } // TODO: refactor to a shared readFully somewhere // (NGramTokenizer does this too): /** commons-io's readFully, but without bugs if offset != 0 */ private static int read(Reader input, char[] buffer, int offset, int length) throws IOException { assert length >= 0 : "length must not be negative: " + length; int remaining = length; while (remaining > 0) { int location = length - remaining; int count = input.read(buffer, offset + location, remaining); if (-1 == count) { // EOF break; } remaining -= count; } return length - remaining; } /** * return true if there is a token from the buffer, or null if it is * exhausted. */ private boolean incrementSentence() throws IOException { if (length == 0) // we must refill the buffer return false; while (true) { int start = iterator.current(); if (start == BreakIterator.DONE) return false; // BreakIterator exhausted // find the next set of boundaries int end = iterator.next(); if (end == BreakIterator.DONE) return false; // BreakIterator exhausted setNextSentence(start, end); if (incrementWord()) { return true; } } } /** Provides the next input sentence for analysis */ protected abstract void setNextSentence(int sentenceStart, int sentenceEnd); /** Returns true if another word is available */ protected abstract boolean incrementWord(); }