/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.exoplatform.services.jcr.impl.core.query.lucene;
import org.apache.lucene.document.Field; //NOSONAR
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermPositionVector;
import org.apache.lucene.index.TermVectorOffsetInfo;
import org.exoplatform.services.jcr.util.Text;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
/**
* This is an adapted version of the <code>FulltextHighlighter</code> posted in
* issue: <a href="http://issues.apache.org/jira/browse/LUCENE-644">LUCENE-644</a>.
* <br>
* Important: for this highlighter to function properly, the field must be
* stored with token offsets.<br> Use the Field constructor {@link
* Field#Field(String,String,Field.Store,Field.Index,Field.TermVector)
* Field(String, String, Field.Store, Field.Index, Field.TermVector)} where the
* last argument is either {@link Field.TermVector#WITH_POSITIONS_OFFSETS} or
* {@link org.apache.lucene.document.Field.TermVector#WITH_OFFSETS}.
*
* @see org.apache.lucene.index.TermPositionVector
* @see org.apache.lucene.index.TermFreqVector
*/
public class DefaultHighlighter {

    /**
     * A default value of <tt>3</tt>
     */
    public static final int DEFAULT_MAXFRAGMENTS = 3;

    /**
     * A default value of <tt>75</tt>
     */
    public static final int DEFAULT_SURROUND = 75;

    /** Default string prepended to the excerpt. */
    public static final String START_EXCERPT = "<excerpt>";

    /** Default string appended to the excerpt. */
    public static final String END_EXCERPT = "</excerpt>";

    /** Default string prepended to every fragment. */
    public static final String START_FRAGMENT_SEPARATOR = "<fragment>";

    /** Default string appended to every fragment. */
    public static final String END_FRAGMENT_SEPARATOR = "</fragment>";

    /** Default string prepended to a highlighted token. */
    public static final String START_HIGHLIGHT = "<highlight>";

    /** Default string appended to a highlighted token. */
    public static final String END_HIGHLIGHT = "</highlight>";

    /**
     * Kept <code>protected</code> so that the static entry points can create
     * an instance and subclasses can reuse the instance methods.
     */
    protected DefaultHighlighter() {
    }

    /**
     * Creates an excerpt of <code>text</code> with the query terms highlighted,
     * using caller supplied markup.
     *
     * @param tvec          the term position vector for this hit
     * @param queryTerms    the query terms.
     * @param text          the original text that was used to create the
     *                      tokens.
     * @param excerptStart  this string is prepended to the excerpt
     * @param excerptEnd    this string is appended to the excerpt
     * @param fragmentStart this string is prepended to every fragment
     * @param fragmentEnd   this string is appended to the end of every
     *                      fragment.
     * @param hlStart       the string used to prepend a highlighted token, for
     *                      example <tt>"&lt;b&gt;"</tt>
     * @param hlEnd         the string used to append a highlighted token, for
     *                      example <tt>"&lt;/b&gt;"</tt>
     * @param maxFragments  the maximum number of fragments
     * @param surround      the maximum number of chars surrounding a
     *                      highlighted token
     * @return a String with text fragments where tokens from the query are
     *         highlighted
     * @throws IOException if an error occurs while reading from the text.
     */
    public static String highlight(TermPositionVector tvec,
                                   Set<Term> queryTerms,
                                   String text,
                                   String excerptStart,
                                   String excerptEnd,
                                   String fragmentStart,
                                   String fragmentEnd,
                                   String hlStart,
                                   String hlEnd,
                                   int maxFragments,
                                   int surround)
        throws IOException {
        return new DefaultHighlighter().doHighlight(tvec, queryTerms, text,
            excerptStart, excerptEnd, fragmentStart, fragmentEnd, hlStart,
            hlEnd, maxFragments, surround);
    }

    /**
     * Creates an excerpt of <code>text</code> with the query terms highlighted,
     * using the default markup constants of this class.
     *
     * @param tvec         the term position vector for this hit
     * @param queryTerms   the query terms.
     * @param text         the original text that was used to create the tokens.
     * @param maxFragments the maximum number of fragments
     * @param surround     the maximum number of chars surrounding a highlighted
     *                     token
     * @return a String with text fragments where tokens from the query are
     *         highlighted
     * @throws IOException if an error occurs while reading from the text.
     */
    public static String highlight(TermPositionVector tvec,
                                   Set<Term> queryTerms,
                                   String text,
                                   int maxFragments,
                                   int surround)
        throws IOException {
        return highlight(tvec, queryTerms, text, START_EXCERPT, END_EXCERPT,
            START_FRAGMENT_SEPARATOR, END_FRAGMENT_SEPARATOR,
            START_HIGHLIGHT, END_HIGHLIGHT, maxFragments, surround);
    }

    /**
     * Collects the offsets of all query terms found in the term vector, sorts
     * them by start offset when more than one term was queried, and delegates
     * the rendering to {@link #mergeFragments}.
     *
     * @see #highlight(TermPositionVector, Set, String, String, String, String, String, String, String, int, int)
     */
    protected String doHighlight(TermPositionVector tvec,
                                 Set<Term> queryTerms,
                                 String text,
                                 String excerptStart,
                                 String excerptEnd,
                                 String fragmentStart,
                                 String fragmentEnd,
                                 String hlStart,
                                 String hlEnd,
                                 int maxFragments,
                                 int surround) throws IOException {
        String[] terms = new String[queryTerms.size()];
        Iterator<Term> it = queryTerms.iterator();
        for (int i = 0; it.hasNext(); i++) {
            terms[i] = it.next().text();
        }

        List<TermVectorOffsetInfo> list = new ArrayList<TermVectorOffsetInfo>();
        int[] tvecindexes = tvec.indexesOf(terms, 0, terms.length);
        for (int i = 0; i < tvecindexes.length; i++) {
            if (tvecindexes[i] < 0) {
                // the query term does not occur in this term vector
                continue;
            }
            TermVectorOffsetInfo[] termoffsets = tvec.getOffsets(tvecindexes[i]);
            if (termoffsets == null) {
                // no offsets were stored for this field; nothing to collect
                continue;
            }
            list.addAll(Arrays.asList(termoffsets));
        }

        TermVectorOffsetInfo[] offsets = list.toArray(new TermVectorOffsetInfo[list.size()]);
        // offsets of a single term are used in the order the vector returned
        // them; offsets of several terms have to be interleaved by position
        if (terms.length > 1) {
            Arrays.sort(offsets, new TermVectorOffsetInfoSorter());
        }

        return mergeFragments(offsets, text, excerptStart,
            excerptEnd, fragmentStart, fragmentEnd, hlStart, hlEnd,
            maxFragments, surround);
    }

    /**
     * Groups the given offsets into fragments, keeps the
     * <code>maxFragments</code> fragments containing the most terms and
     * renders them, in text order, into one excerpt string.
     * <p>
     * All text copied from <code>text</code> into the excerpt is passed through
     * <code>Text.encodeIllegalXMLCharacters</code>; the markup strings supplied
     * by the caller are appended as they are.
     *
     * @param offsets       the offsets of the terms to highlight, ordered by
     *                      start offset; may be <code>null</code> or empty.
     * @param text          the original text.
     * @param excerptStart  this string is prepended to the excerpt
     * @param excerptEnd    this string is appended to the excerpt
     * @param fragmentStart this string is prepended to every fragment
     * @param fragmentEnd   this string is appended to the end of every
     *                      fragment.
     * @param hlStart       the string used to prepend a highlighted token
     * @param hlEnd         the string used to append a highlighted token
     * @param maxFragments  the maximum number of fragments
     * @param surround      the maximum number of chars surrounding a
     *                      highlighted token
     * @return the excerpt, or the default excerpt created by
     *         {@link #createDefaultExcerpt} if there is nothing to highlight.
     * @throws IOException if an error occurs while reading from the text.
     */
    protected String mergeFragments(TermVectorOffsetInfo[] offsets,
                                    String text,
                                    String excerptStart,
                                    String excerptEnd,
                                    String fragmentStart,
                                    String fragmentEnd,
                                    String hlStart,
                                    String hlEnd,
                                    int maxFragments,
                                    int surround) throws IOException {
        if (offsets == null || offsets.length == 0) {
            // nothing to highlight
            return createDefaultExcerpt(text, excerptStart, excerptEnd,
                fragmentStart, fragmentEnd, surround * 2);
        }

        // build fragments; offsets that point beyond the text are ignored,
        // together with everything that follows them
        List<FragmentInfo> fragmentInfoList = new ArrayList<FragmentInfo>();
        if (offsets[0].getEndOffset() <= text.length()) {
            FragmentInfo fi = new FragmentInfo(offsets[0], surround * 2);
            for (int i = 1; i < offsets.length; i++) {
                if (offsets[i].getEndOffset() > text.length()) {
                    break;
                }
                if (fi.add(offsets[i])) {
                    continue;
                }
                fragmentInfoList.add(fi);
                fi = new FragmentInfo(offsets[i], surround * 2);
            }
            fragmentInfoList.add(fi);
        }

        if (fragmentInfoList.isEmpty()) {
            // nothing to highlight
            return createDefaultExcerpt(text, excerptStart, excerptEnd,
                fragmentStart, fragmentEnd, surround * 2);
        }

        // sort with score
        Collections.sort(fragmentInfoList, new FragmentInfoScoreSorter());

        // extract best fragments
        List<FragmentInfo> bestFragmentsList = new ArrayList<FragmentInfo>();
        for (int i = 0; i < Math.min(fragmentInfoList.size(), maxFragments); i++) {
            bestFragmentsList.add(fragmentInfoList.get(i));
        }

        // re-sort with positions
        Collections.sort(bestFragmentsList, new FragmentInfoPositionSorter());

        // merge #maxFragments fragments
        StringReader reader = new StringReader(text);
        StringBuilder sb = new StringBuilder(excerptStart);
        // number of chars consumed from the reader so far
        int pos = 0;
        char[] cbuf;
        int skip;
        int nextStart;
        int skippedChars;
        int firstWhitespace;
        for (int i = 0; i < bestFragmentsList.size(); i++) {
            FragmentInfo fi = bestFragmentsList.get(i);
            fi.trim();
            nextStart = fi.getStartOffset();
            skip = nextStart - pos;
            if (skip > surround * 2) {
                skip -= surround;
                if (i > 0) {
                    // end last fragment
                    cbuf = new char[surround];
                    reader.read(cbuf, 0, surround);
                    // find last whitespace
                    skippedChars = 1;
                    for (; skippedChars < surround + 1; skippedChars++) {
                        if (Character.isWhitespace(cbuf[surround - skippedChars])) {
                            break;
                        }
                    }
                    pos += surround;
                    if (skippedChars > surround) {
                        skippedChars = surround;
                    }
                    sb.append(encode(cbuf, 0, surround - skippedChars));
                    sb.append(fragmentEnd);
                }
            }

            if (skip >= surround) {
                if (i > 0) {
                    skip -= surround;
                }
                // skip
                reader.skip((long) skip);
                pos += skip;
            }

            // start fragment: read the text leading up to the first term
            cbuf = new char[nextStart - pos];
            skippedChars = Math.max(cbuf.length - 1, 0);
            firstWhitespace = skippedChars;
            reader.read(cbuf, 0, nextStart - pos);
            pos = nextStart;
            sb.append(fragmentStart);
            // find last period followed by whitespace
            if (cbuf.length > 0) {
                for (; skippedChars >= 0; skippedChars--) {
                    if (Character.isWhitespace(cbuf[skippedChars])) {
                        firstWhitespace = skippedChars;
                        if (skippedChars - 1 >= 0
                            && cbuf[skippedChars - 1] == '.') {
                            skippedChars++;
                            break;
                        }
                    }
                }
            }
            boolean sentenceStart = true;
            if (skippedChars == -1) {
                // no sentence boundary found in the leading text
                if (pos == cbuf.length) {
                    // this fragment is the start of the text -> skip none
                    skippedChars = 0;
                } else {
                    sentenceStart = false;
                    skippedChars = firstWhitespace + 1;
                }
            }

            if (!sentenceStart) {
                sb.append("... ");
            }
            sb.append(encode(cbuf, skippedChars, cbuf.length - skippedChars));

            // iterate terms
            for (Iterator<TermVectorOffsetInfo> iter = fi.iterator(); iter.hasNext();) {
                TermVectorOffsetInfo ti = iter.next();
                if (ti.getEndOffset() < pos) {
                    // the term lies completely within text that was already
                    // emitted (nested in a previous term); it cannot be
                    // rendered again
                    continue;
                }
                nextStart = ti.getStartOffset();
                if (nextStart - pos > 0) {
                    // text between the previous term and this one; encoded
                    // like the rest of the excerpt text
                    cbuf = new char[nextStart - pos];
                    int charsRead = reader.read(cbuf, 0, nextStart - pos);
                    pos = nextStart;
                    if (charsRead > 0) {
                        sb.append(encode(cbuf, 0, charsRead));
                    }
                }
                sb.append(hlStart);
                nextStart = ti.getEndOffset();
                // print term
                cbuf = new char[nextStart - pos];
                reader.read(cbuf, 0, nextStart - pos);
                pos = nextStart;
                sb.append(encode(cbuf, 0, cbuf.length));
                sb.append(hlEnd);
            }
        }

        if (pos != 0) {
            // end fragment
            cbuf = new char[surround];
            skip = reader.read(cbuf, 0, surround);
            boolean eof = reader.read() == -1;
            if (skip >= 0) {
                if (!eof) {
                    // more text follows: cut at the last whitespace
                    skippedChars = 1;
                    for (; skippedChars < surround + 1; skippedChars++) {
                        if (Character.isWhitespace(cbuf[surround - skippedChars])) {
                            break;
                        }
                    }
                    if (skippedChars > surround) {
                        skippedChars = surround;
                    }
                } else {
                    skippedChars = 0;
                }
                sb.append(encode(cbuf, 0, eof ? skip : (surround - skippedChars)));
                if (!eof) {
                    char lastChar = sb.charAt(sb.length() - 1);
                    if (lastChar != '.' && lastChar != '!' && lastChar != '?') {
                        sb.append(" ...");
                    }
                }
            }
            sb.append(fragmentEnd);
        }
        sb.append(excerptEnd);
        return sb.toString();
    }

    /**
     * Creates a default excerpt with the given text. At most
     * <code>maxLength</code> chars of the text are used. If the text had to be
     * cut, it is shortened to the last whitespace within that range (when
     * there is one after the first char) and <code>" ..."</code> is appended.
     *
     * @param text          the text.
     * @param excerptStart  the excerpt start.
     * @param excerptEnd    the excerpt end.
     * @param fragmentStart the fragment start.
     * @param fragmentEnd   the fragment end.
     * @param maxLength     the maximum length of the fragment.
     * @return a default excerpt.
     * @throws IOException if an error occurs while reading from the text.
     */
    protected String createDefaultExcerpt(String text,
                                          String excerptStart,
                                          String excerptEnd,
                                          String fragmentStart,
                                          String fragmentEnd,
                                          int maxLength) throws IOException {
        StringBuilder excerpt = new StringBuilder(excerptStart);
        excerpt.append(fragmentStart);
        if (!text.isEmpty()) {
            int limit = Math.max(maxLength, 0);
            // only a text that is longer than the limit is actually cut
            boolean truncated = text.length() > limit;
            String head = truncated ? text.substring(0, limit) : text;
            if (truncated) {
                for (int i = head.length() - 1; i > 0; i--) {
                    if (Character.isWhitespace(head.charAt(i))) {
                        head = head.substring(0, i) + " ...";
                        break;
                    }
                }
            }
            excerpt.append(Text.encodeIllegalXMLCharacters(head));
        }
        excerpt.append(fragmentEnd).append(excerptEnd);
        return excerpt.toString();
    }

    /**
     * Encodes a range of a char buffer for inclusion in the excerpt.
     */
    private static String encode(char[] buf, int offset, int length) {
        return Text.encodeIllegalXMLCharacters(new String(buf, offset, length));
    }

    /**
     * Three-way comparison of two ints: negative, zero or positive.
     */
    private static int compareInts(int a, int b) {
        return a < b ? -1 : (a == b ? 0 : 1);
    }

    /**
     * A run of term offsets that are close enough to each other to be shown
     * as one fragment.
     */
    private static class FragmentInfo {

        /** The offsets collected for this fragment, in insertion order. */
        private final List<TermVectorOffsetInfo> offsetInfosList;

        /** Start offset of the first term of this fragment. */
        private final int startOffset;

        /** End offset of the term added last. */
        private int endOffset;

        /** Maximum distance between two terms of the same fragment. */
        private final int mergeGap;

        /** Number of terms added; used as the score of the fragment. */
        private int numTerms;

        public FragmentInfo(TermVectorOffsetInfo offsetinfo, int mergeGap) {
            offsetInfosList = new ArrayList<TermVectorOffsetInfo>();
            offsetInfosList.add(offsetinfo);
            startOffset = offsetinfo.getStartOffset();
            endOffset = offsetinfo.getEndOffset();
            this.mergeGap = mergeGap;
            numTerms = 1;
        }

        /**
         * Adds the offset to this fragment if it starts no more than
         * <code>mergeGap</code> chars after the current end of the fragment.
         *
         * @return <code>true</code> if the offset was added,
         *         <code>false</code> if it is too far away.
         */
        public boolean add(TermVectorOffsetInfo offsetinfo) {
            if (offsetinfo.getStartOffset() > (endOffset + mergeGap)) {
                return false;
            }
            offsetInfosList.add(offsetinfo);
            numTerms++;
            endOffset = offsetinfo.getEndOffset();
            return true;
        }

        public Iterator<TermVectorOffsetInfo> iterator() {
            return offsetInfosList.iterator();
        }

        public int getStartOffset() {
            return startOffset;
        }

        public int numTerms() {
            return numTerms;
        }

        /**
         * Removes all offsets that start more than half the merge gap after
         * the start of this fragment. The term count is left unchanged.
         */
        public void trim() {
            int end = startOffset + (mergeGap / 2);
            Iterator<TermVectorOffsetInfo> it = offsetInfosList.iterator();
            while (it.hasNext()) {
                TermVectorOffsetInfo tvoi = it.next();
                if (tvoi.getStartOffset() > end) {
                    it.remove();
                }
            }
        }
    }

    /**
     * Orders fragments by descending number of terms; fragments with the same
     * number of terms are ordered by ascending start offset.
     */
    private static class FragmentInfoScoreSorter
        implements java.util.Comparator<FragmentInfo> {

        public int compare(FragmentInfo o1, FragmentInfo o2) {
            int s1 = o1.numTerms();
            int s2 = o2.numTerms();
            if (s1 != s2) {
                return s1 > s2 ? -1 : 1;
            }
            return compareInts(o1.getStartOffset(), o2.getStartOffset());
        }
    }

    /**
     * Orders fragments by ascending start offset.
     */
    private static class FragmentInfoPositionSorter
        implements java.util.Comparator<FragmentInfo> {

        public int compare(FragmentInfo o1, FragmentInfo o2) {
            return compareInts(o1.getStartOffset(), o2.getStartOffset());
        }
    }

    /**
     * Orders term offsets by ascending start offset.
     */
    private static class TermVectorOffsetInfoSorter
        implements java.util.Comparator<TermVectorOffsetInfo> {

        public int compare(TermVectorOffsetInfo o1, TermVectorOffsetInfo o2) {
            return compareInts(o1.getStartOffset(), o2.getStartOffset());
        }
    }
}