/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.uhighlight;
import java.text.BreakIterator;
import java.text.CharacterIterator;
/**
 * Wraps another {@link BreakIterator} to skip past breaks that would result in passages that are too
 * short. It's still possible to get a short passage but only at the very end of the input text.
 * <p>
 * Two modes are available: {@link #createMinLength(BreakIterator, int)} never returns a break that is
 * closer than the goal (unless the text ends first), while
 * {@link #createClosestToLength(BreakIterator, int)} picks whichever underlying break, before or after
 * the goal, is nearest to it.
 * <p>
 * Important: This is not a general purpose {@link BreakIterator}; it's only designed to work in a way
 * compatible with the {@link UnifiedHighlighter}. Some assumptions are checked with Java assertions.
 *
 * @lucene.experimental
 */
public class LengthGoalBreakIterator extends BreakIterator {

  /** The wrapped iterator that supplies the candidate breaks and holds the current position. */
  private final BreakIterator baseIter;
  /** Desired passage length, in chars, measured from the current position. */
  private final int lengthGoal;
  private final boolean isMinimumLength; // if false then is "closest to" length

  /** Breaks will be at least {@code minLength} apart (to the extent possible). */
  public static LengthGoalBreakIterator createMinLength(BreakIterator baseIter, int minLength) {
    return new LengthGoalBreakIterator(baseIter, minLength, true);
  }

  /**
   * Breaks will be on average {@code targetLength} apart; the closest break to this target (before or
   * after) is chosen.
   */
  public static LengthGoalBreakIterator createClosestToLength(BreakIterator baseIter, int targetLength) {
    return new LengthGoalBreakIterator(baseIter, targetLength, false);
  }

  private LengthGoalBreakIterator(BreakIterator baseIter, int lengthGoal, boolean isMinimumLength) {
    this.baseIter = baseIter;
    this.lengthGoal = lengthGoal;
    this.isMinimumLength = isMinimumLength;
  }

  // note: the only methods that will get called are setText(txt), getText(),
  // getSummaryPassagesNoHighlight: current(), first(), next()
  // highlightOffsetsEnums: preceding(int), and following(int)
  // Nonetheless we make some attempt to implement the rest; mostly delegating.

  @Override
  public String toString() {
    String goalDesc = isMinimumLength ? "minLen" : "targetLen";
    return getClass().getSimpleName() + "{" + goalDesc + "=" + lengthGoal + ", baseIter=" + baseIter + "}";
  }

  /** Returns an independent copy that wraps a clone of the underlying iterator. */
  @Override
  public Object clone() {
    return new LengthGoalBreakIterator((BreakIterator) baseIter.clone(), lengthGoal, isMinimumLength);
  }

  @Override
  public CharacterIterator getText() {
    return baseIter.getText();
  }

  @Override
  public void setText(String newText) {
    baseIter.setText(newText);
  }

  @Override
  public void setText(CharacterIterator newText) {
    baseIter.setText(newText);
  }

  @Override
  public int current() {
    return baseIter.current();
  }

  @Override
  public int first() {
    return baseIter.first();
  }

  @Override
  public int last() {
    return baseIter.last();
  }

  /** Not supported; trips an assertion and otherwise delegates without applying the length goal. */
  @Override
  public int next(int n) {
    assert false : "Not supported";
    return baseIter.next(n); // probably wrong
  }

  /**
   * Advances to the end of the passage that starts at the current position, honoring the length goal.
   * Called by getSummaryPassagesNoHighlight to generate default summary.
   *
   * @return the new position, or {@link #DONE} if the current position is already the last boundary
   */
  @Override
  public int next() {
    return following(current());
  }

  /** Not supported; trips an assertion and otherwise delegates. */
  @Override
  public int previous() {
    assert false : "Not supported";
    return baseIter.previous();
  }

  /**
   * Finds the end of the passage that starts at the current position and that must extend past
   * {@code followingIdx}. Called while the current position is the start of a new passage.
   *
   * @param followingIdx an offset at or after the current position; the result is a break after it
   * @return the chosen break, or {@link #DONE} if there is no break after {@code followingIdx}
   */
  @Override
  public int following(int followingIdx) {
    final int startIdx = current();
    if (followingIdx < startIdx) {
      assert false : "Not supported";
      return baseIter.following(followingIdx);
    }
    // Saturating add: a very large goal (e.g. Integer.MAX_VALUE meaning "whole text") must not wrap
    // around to a negative target, which would be mistaken for "already past the target" below.
    final int targetIdx = (int) Math.min((long) startIdx + lengthGoal, Integer.MAX_VALUE);
    // When followingIdx >= targetIdx, we can simply delegate since it will be >= the target
    if (followingIdx >= targetIdx - 1) {
      return baseIter.following(followingIdx);
    }
    // If target exceeds the text length, the only candidate left is the last index.
    if (targetIdx >= getText().getEndIndex()) {
      final int lastIdx = baseIter.last();
      // Nothing lies beyond the last boundary; report DONE (as the delegate branch above would)
      // instead of returning the last index over and over.
      return followingIdx >= lastIdx ? DONE : lastIdx;
    }

    // Find closest break >= the target
    final int afterIdx = baseIter.following(targetIdx - 1);
    if (afterIdx == DONE) { // we're at the end; can this happen?
      return current();
    }
    if (afterIdx == targetIdx) { // right on the money
      return afterIdx;
    }
    if (isMinimumLength) { // thus never undershoot
      return afterIdx;
    }

    // note: it is a shame that we invoke preceding() *in addition to* following(); BI's are sometimes expensive.

    // Find closest break < target
    final int beforeIdx = baseIter.preceding(targetIdx); // or could do baseIter.previous() but we hope the BI implements preceding()
    if (beforeIdx <= followingIdx) { // too far back
      return moveToBreak(afterIdx);
    }

    // Ties go to the earlier break; the base iterator is already positioned there by preceding().
    if (targetIdx - beforeIdx <= afterIdx - targetIdx) {
      return beforeIdx;
    }
    return moveToBreak(afterIdx);
  }

  /**
   * Repositions the underlying iterator onto {@code idx}, which was located earlier but is no longer
   * the current position because {@code preceding()} has been called since.
   *
   * @param idx a known break of the underlying iterator
   * @return {@code idx}
   */
  private int moveToBreak(int idx) { // precondition: idx is a known break
    // bi.isBoundary(idx) has side-effect of moving the position.  Not obvious!
    //boolean moved = baseIter.isBoundary(idx); // probably not particularly expensive
    //assert moved && current() == idx;

    // TODO fix: Would prefer to do "- 1" instead of "- 2" but CustomSeparatorBreakIterator has a bug.
    int current = baseIter.following(idx - 2);
    assert current == idx : "following() didn't move us to the expected index.";
    return idx;
  }

  /**
   * Delegates to the underlying iterator; the length goal is not applied to the start of a passage.
   * Called at start of new Passage given first word start offset.
   */
  @Override
  public int preceding(int offset) {
    return baseIter.preceding(offset); // no change needed
  }

  /** Not supported; trips an assertion and otherwise delegates. */
  @Override
  public boolean isBoundary(int offset) {
    assert false : "Not supported";
    return baseIter.isBoundary(offset);
  }
}