package org.elasticsearch.common.lucene.search.vectorhighlight;
import gnu.trove.set.hash.TCharHashSet;
import org.apache.lucene.search.vectorhighlight.BoundaryScanner;
/**
* A copy of Lucene {@link org.apache.lucene.search.vectorhighlight.SimpleBoundaryScanner}.
* <p/>
* Uses a specialized char set to look up boundary characters, and fixes a problem with the start
* offset at the beginning of the text: https://issues.apache.org/jira/browse/LUCENE-3697 (which
* has a problem with multiple empty fields to highlight...).
*/
public class SimpleBoundaryScanner2 implements BoundaryScanner {

    /** Scan limit used by the no-argument constructor. */
    public static final int DEFAULT_MAX_SCAN = 20;

    /** Boundary characters used by the no-argument constructor. */
    public static final char[] DEFAULT_BOUNDARY_CHARS = {'.', ',', '!', '?', ' ', '\t', '\n'};

    /** Instance built from {@link #DEFAULT_MAX_SCAN} and {@link #DEFAULT_BOUNDARY_CHARS}. */
    public static final SimpleBoundaryScanner2 DEFAULT = new SimpleBoundaryScanner2();

    /** Upper bound on the number of characters inspected by a single scan. */
    public int maxScan;

    /** Characters that count as a boundary. */
    public TCharHashSet boundaryChars;

    /**
     * Creates a scanner with {@link #DEFAULT_MAX_SCAN} and {@link #DEFAULT_BOUNDARY_CHARS}.
     */
    public SimpleBoundaryScanner2() {
        this(DEFAULT_MAX_SCAN, DEFAULT_BOUNDARY_CHARS);
    }

    /**
     * Creates a scanner with an explicit scan limit and boundary character set.
     *
     * @param maxScan       maximum number of characters to inspect per scan
     * @param boundaryChars characters to treat as boundaries; copied into a {@link TCharHashSet}
     */
    public SimpleBoundaryScanner2(int maxScan, char[] boundaryChars) {
        this.maxScan = maxScan;
        this.boundaryChars = new TCharHashSet(boundaryChars);
    }

    /**
     * Scans backwards from {@code start} for a boundary character, inspecting at most
     * {@link #maxScan} characters.
     *
     * @param buffer text to scan
     * @param start  offset to start scanning from
     * @return the offset just after the first boundary character found; {@code 0} if the scan
     *         reached the beginning of the buffer without finding one (LUCENE-3697); otherwise
     *         {@code start}. {@code start} is also returned unchanged when it is less than 1 or
     *         greater than the buffer length.
     */
    public int findStartOffset(StringBuilder buffer, int start) {
        // Out-of-range start offsets are handed back untouched.
        if (start < 1 || start > buffer.length()) {
            return start;
        }

        final int limit = maxScan;
        int pos = start;
        for (int scanned = 0; scanned < limit && pos > 0; scanned++, pos--) {
            // The character immediately before pos decides whether pos is a start boundary.
            if (isBoundary(buffer.charAt(pos - 1))) {
                return pos;
            }
        }

        // LUCENE-3697: hitting the beginning of the text counts as a boundary.
        return pos == 0 ? 0 : start;
    }

    /**
     * Scans forwards from {@code start} for a boundary character, inspecting at most
     * {@link #maxScan} characters.
     *
     * @param buffer text to scan
     * @param start  offset to start scanning from
     * @return the offset of the first boundary character found; otherwise {@code start}.
     *         {@code start} is also returned unchanged when it is negative or greater than the
     *         buffer length.
     */
    public int findEndOffset(StringBuilder buffer, int start) {
        // Out-of-range start offsets are handed back untouched.
        if (start < 0 || start > buffer.length()) {
            return start;
        }

        final int limit = maxScan;
        final int length = buffer.length();
        // Strictly less than length: charAt(length) would be out of range.
        for (int scanned = 0, pos = start; scanned < limit && pos < length; scanned++, pos++) {
            if (isBoundary(buffer.charAt(pos))) {
                return pos;
            }
        }

        // No boundary within reach.
        return start;
    }

    /** Tells whether {@code c} is one of the configured boundary characters. */
    private boolean isBoundary(char c) {
        return boundaryChars.contains(c);
    }
}