/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.exoplatform.services.jcr.impl.core.query.lucene;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermPositionVector;
import org.apache.lucene.index.TermVectorOffsetInfo;
import org.apache.lucene.util.PriorityQueue;
import java.io.IOException;
import java.util.ArrayList;
import java.util.BitSet;
import java.util.IdentityHashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
/**
* <code>WeightedHighlighter</code> implements a highlighter that weights the
* fragments based on the proximity of the highlighted terms to each other. The
* returned fragments are not necessarily in sequence as the text occurs in the
* content.
*/
public class WeightedHighlighter extends DefaultHighlighter
{
/**
 * Set of characters (indexed by their code) that are treated as the end of a
 * sentence.
 */
private static final BitSet PUNCTUATION = new BitSet();
static
{
   // '.', '!', inverted exclamation mark, '?', inverted question mark
   // todo add more
   final int[] sentenceEnds = {'.', '!', 0xa1, '?', 0xbf};
   for (int i = 0; i < sentenceEnds.length; i++)
   {
      PUNCTUATION.set(sentenceEnds[i]);
   }
}
/**
 * Creates a new highlighter instance. The constructor is not public; within
 * this class instances are only created by the static
 * <code>highlight</code> methods.
 */
protected WeightedHighlighter()
{
}
/**
 * Creates an excerpt of <code>text</code> using the given markup strings.
 * The work is delegated to <code>doHighlight</code> on a new
 * <code>WeightedHighlighter</code> instance.
 *
 * @param tvec the term position vector for this hit
 * @param queryTerms the query terms.
 * @param text the original text that was used to create the tokens.
 * @param excerptStart this string is prepended to the excerpt
 * @param excerptEnd this string is appended to the excerpt
 * @param fragmentStart this string is prepended to every fragment
 * @param fragmentEnd this string is appended to the end of every fragment.
 * @param hlStart the string used to prepend a highlighted token, for
 *           example <code>"&lt;b&gt;"</code>
 * @param hlEnd the string used to append a highlighted token, for example
 *           <code>"&lt;/b&gt;"</code>
 * @param maxFragments the maximum number of fragments
 * @param surround the maximum number of chars surrounding a highlighted
 *           token
 * @return a String with text fragments where tokens from the query are
 *         highlighted
 * @throws IOException if <code>doHighlight</code> throws it
 */
public static String highlight(TermPositionVector tvec, Set<Term> queryTerms, String text, String excerptStart,
   String excerptEnd, String fragmentStart, String fragmentEnd, String hlStart, String hlEnd, int maxFragments,
   int surround) throws IOException
{
   WeightedHighlighter highlighter = new WeightedHighlighter();
   return highlighter.doHighlight(tvec, queryTerms, text, excerptStart, excerptEnd, fragmentStart, fragmentEnd,
      hlStart, hlEnd, maxFragments, surround);
}
/**
 * Convenience variant of
 * {@link #highlight(TermPositionVector, Set, String, String, String, String, String, String, String, int, int)}
 * that passes the constants <code>START_EXCERPT</code>,
 * <code>END_EXCERPT</code>, <code>START_FRAGMENT_SEPARATOR</code>,
 * <code>END_FRAGMENT_SEPARATOR</code>, <code>START_HIGHLIGHT</code> and
 * <code>END_HIGHLIGHT</code> as markup strings.
 *
 * @param tvec the term position vector for this hit
 * @param queryTerms the query terms.
 * @param text the original text that was used to create the tokens.
 * @param maxFragments the maximum number of fragments
 * @param surround the maximum number of chars surrounding a highlighted
 *           token
 * @return a String with text fragments where tokens from the query are
 *         highlighted
 * @throws IOException if the delegate <code>highlight</code> method throws
 *            it
 */
public static String highlight(TermPositionVector tvec, Set<Term> queryTerms, String text, int maxFragments, int surround)
   throws IOException
{
   return highlight(tvec, queryTerms, text,
      START_EXCERPT, END_EXCERPT,
      START_FRAGMENT_SEPARATOR, END_FRAGMENT_SEPARATOR,
      START_HIGHLIGHT, END_HIGHLIGHT,
      maxFragments, surround);
}
/**
 * Assembles the excerpt from the best weighted fragments.
 * <p>
 * For every offset whose end lies within <code>text</code> a candidate
 * fragment is started, and the following offsets are added to it until one
 * no longer fits into <code>surround * 2</code> characters or points beyond
 * the text. At most <code>maxFragments</code> candidates are kept in a
 * <code>FragmentInfoPriorityQueue</code>. Candidates that share an offset
 * with a better candidate are dropped and the remaining ones are written
 * best first, so fragments do not necessarily appear in text order.
 * <p>
 * Within a fragment an offset may overlap or repeat the previously written
 * one. The text is then never written twice: a term that is completely
 * covered by already written text is skipped, and of a partially
 * overlapping term only the part not yet written is highlighted.
 * <p>
 * If there is nothing to highlight, the result of
 * <code>createDefaultExcerpt</code> is returned.
 *
 * @param offsets the offsets of the matching terms, may be
 *           <code>null</code> or empty. NOTE(review): the inner loop stops
 *           at the first offset that does not fit, which assumes the array
 *           is ordered by start offset - confirm against the caller.
 * @param text the original text.
 * @param excerptStart this string is prepended to the excerpt
 * @param excerptEnd this string is appended to the excerpt
 * @param fragmentStart this string is prepended to every fragment
 * @param fragmentEnd this string is appended to every fragment
 * @param hlStart this string is prepended to every highlighted term
 * @param hlEnd this string is appended to every highlighted term
 * @param maxFragments the maximum number of fragments
 * @param surround the maximum number of chars surrounding a highlighted
 *           term
 * @return the excerpt
 * @throws IOException declared by the signature; NOTE(review): presumably
 *            raised by <code>createDefaultExcerpt</code> - confirm.
 */
protected String mergeFragments(TermVectorOffsetInfo[] offsets, String text, String excerptStart, String excerptEnd,
   String fragmentStart, String fragmentEnd, String hlStart, String hlEnd, int maxFragments, int surround)
   throws IOException
{
   if (offsets == null || offsets.length == 0)
   {
      // nothing to highlight
      return createDefaultExcerpt(text, excerptStart, excerptEnd, fragmentStart, fragmentEnd, surround * 2);
   }

   // build one candidate fragment per usable offset and keep the best ones
   PriorityQueue<FragmentInfo> bestFragments = new FragmentInfoPriorityQueue(maxFragments);
   for (int i = 0; i < offsets.length; i++)
   {
      if (offsets[i].getEndOffset() > text.length())
      {
         // offset points beyond the text and cannot be highlighted
         continue;
      }
      FragmentInfo candidate = new FragmentInfo(offsets[i], surround * 2);
      for (int j = i + 1; j < offsets.length; j++)
      {
         if (offsets[j].getEndOffset() > text.length() || !candidate.add(offsets[j], text))
         {
            break;
         }
      }
      bestFragments.insertWithOverflow(candidate);
   }
   if (bestFragments.size() == 0)
   {
      return createDefaultExcerpt(text, excerptStart, excerptEnd, fragmentStart, fragmentEnd, surround * 2);
   }

   // the queue hands out the least fragment first, prepending each one
   // therefore yields a list with the best fragment at the head
   LinkedList<FragmentInfo> infos = new LinkedList<FragmentInfo>();
   while (bestFragments.size() > 0)
   {
      infos.addFirst(bestFragments.pop());
   }

   // remove fragments that share an offset with a better fragment; the
   // identity map is used as a set of the offsets already claimed
   Map<TermVectorOffsetInfo, Object> usedOffsets = new IdentityHashMap<TermVectorOffsetInfo, Object>();
   Iterator<FragmentInfo> it = infos.iterator();
   while (it.hasNext())
   {
      FragmentInfo fi = it.next();
      boolean overlap = false;
      Iterator<TermVectorOffsetInfo> fit = fi.iterator();
      while (fit.hasNext() && !overlap)
      {
         overlap = usedOffsets.containsKey(fit.next());
      }
      if (overlap)
      {
         it.remove();
         continue;
      }
      Iterator<TermVectorOffsetInfo> oit = fi.iterator();
      while (oit.hasNext())
      {
         usedOffsets.put(oit.next(), null);
      }
   }

   // create excerpts
   StringBuilder sb = new StringBuilder(excerptStart);
   for (FragmentInfo fi : infos)
   {
      sb.append(fragmentStart);
      int limit = Math.max(0, fi.getStartOffset() / 2 + fi.getEndOffset() / 2 - surround);
      int len = startFragment(sb, text, fi.getStartOffset(), limit);

      // end of the text written so far for this fragment, -1 before the
      // first term has been written
      int lastEnd = -1;
      Iterator<TermVectorOffsetInfo> fIt = fi.iterator();
      while (fIt.hasNext())
      {
         TermVectorOffsetInfo oi = fIt.next();
         int termStart = oi.getStartOffset();
         int termEnd = oi.getEndOffset();
         if (lastEnd >= 0)
         {
            if (termStart >= lastEnd)
            {
               // fill in text between terms
               sb.append(text.substring(lastEnd, termStart));
            }
            else if (termEnd <= lastEnd)
            {
               // term is completely covered by text already written
               continue;
            }
            else
            {
               // term overlaps the previous one: only highlight the part
               // that has not been written yet
               termStart = lastEnd;
            }
         }
         sb.append(hlStart);
         sb.append(text.substring(termStart, termEnd));
         sb.append(hlEnd);
         lastEnd = termEnd;
      }

      limit = Math.min(text.length(), fi.getStartOffset() - len + (surround * 2));
      // continue after the last character actually written, which may lie
      // behind the end offset reported by the fragment when terms overlap
      endFragment(sb, text, Math.max(lastEnd, fi.getEndOffset()), limit);
      sb.append(fragmentEnd);
   }
   sb.append(excerptEnd);
   return sb.toString();
}
/**
 * Appends the text that leads up to the first matching term of a fragment
 * to <code>sb</code>. The term starts at <code>offset</code> in
 * <code>text</code>.
 * <p>
 * If <code>limit</code> is <code>0</code> everything before the term is
 * appended. Otherwise the text is scanned backwards from the term down to
 * <code>limit</code>: the fragment starts after the earliest whitespace
 * found, or directly after a whitespace that is preceded by a
 * {@link #PUNCTUATION} character (start of a sentence). Unless a sentence
 * start was found, the lead-in is prefixed with <code>"... "</code>.
 *
 * @param sb where to append the start of the fragment.
 * @param text the original text.
 * @param offset the start offset of the first matching term in the
 *           fragment.
 * @param limit do not go back further than <code>limit</code>.
 * @return the number of characters of <code>text</code> that were appended
 *         to <code>sb</code> (the <code>"... "</code> prefix is not
 *         counted).
 */
private static int startFragment(StringBuilder sb, String text, int offset, int limit)
{
   if (limit == 0)
   {
      // append all
      sb.append(text.substring(0, offset));
      return offset;
   }

   int start = offset;
   boolean sentenceStart = false;
   int pos = offset - 1;
   while (pos >= limit && !sentenceStart)
   {
      if (Character.isWhitespace(text.charAt(pos)))
      {
         // potential start
         start = pos + 1;
         // a punctuation character right before the whitespace marks the
         // start of a sentence, but only if it is still within the limit
         sentenceStart = pos > limit && PUNCTUATION.get(text.charAt(pos - 1));
      }
      pos--;
   }

   if (!sentenceStart)
   {
      sb.append("... ");
   }
   sb.append(text.substring(start, offset));
   return offset - start;
}
/**
 * Appends the text that follows the last matching term of a fragment to
 * <code>sb</code>. The term ends at <code>offset</code> in
 * <code>text</code>.
 * <p>
 * If <code>limit</code> equals the length of <code>text</code> the
 * complete remainder is appended. Otherwise the text up to the last
 * whitespace between <code>offset</code> (inclusive) and
 * <code>limit</code> (exclusive) is appended, followed by
 * <code>" ..."</code>; if there is no whitespace in that range only
 * <code>" ..."</code> is appended.
 *
 * @param sb where to append the end of the fragment.
 * @param text the original text.
 * @param offset the end offset of the last matching term in the fragment.
 * @param limit do not go further than <code>limit</code>.
 */
private static void endFragment(StringBuilder sb, String text, int offset, int limit)
{
   if (limit == text.length())
   {
      // append all
      sb.append(text.substring(offset));
      return;
   }

   // scanning backwards, the first whitespace found is the last one in
   // the range and therefore the end of the fragment
   int end = offset;
   for (int i = limit - 1; i >= offset; i--)
   {
      if (Character.isWhitespace(text.charAt(i)))
      {
         end = i;
         break;
      }
   }
   sb.append(text.substring(offset, end)).append(" ...");
}
/**
 * A candidate fragment: a run of term offsets that fits into a maximum
 * number of characters, together with a quality value that grows with
 * every offset added and grows faster when offsets are adjacent.
 */
private static class FragmentInfo
{
   /**
    * Two terms count as adjacent if at most this many characters lie
    * between them and all of these characters are whitespace.
    */
   private static final int MAX_ADJACENT_GAP = 3;

   /** Quality gained by adding a term that is adjacent to the previous one. */
   private static final int ADJACENT_BOOST = 10;

   /** The offsets that make up this fragment, in the order they were added. */
   private final List<TermVectorOffsetInfo> offsetInfosList = new ArrayList<TermVectorOffsetInfo>();

   /** Start offset of the first term. */
   private final int startOffset;

   /** Maximum distance between the start of the first and the end of any term. */
   private final int maxFragmentSize;

   /** Largest end offset of all terms added so far. */
   private int endOffset;

   /** Quality of this fragment, <code>0</code> for a single term. */
   private int quality;

   /**
    * Creates a fragment that consists of a single term.
    *
    * @param offsetinfo the offset of the first term.
    * @param maxFragmentSize the maximum size of the fragment in characters.
    */
   public FragmentInfo(TermVectorOffsetInfo offsetinfo, int maxFragmentSize)
   {
      offsetInfosList.add(offsetinfo);
      startOffset = offsetinfo.getStartOffset();
      endOffset = offsetinfo.getEndOffset();
      this.maxFragmentSize = maxFragmentSize;
      quality = 0;
   }

   /**
    * Tries to add another term to this fragment. The term is rejected if
    * it ends more than <code>maxFragmentSize</code> characters after the
    * start of the fragment. An accepted term raises the quality by
    * {@link #ADJACENT_BOOST} if it is adjacent to the current end of the
    * fragment and by <code>1</code> otherwise.
    *
    * @param offsetinfo the offset of the term to add.
    * @param text the original text; the characters between the current
    *           end of the fragment and the start of the term are read.
    * @return <code>true</code> if the term was added, <code>false</code>
    *         if it does not fit into this fragment.
    */
   public boolean add(TermVectorOffsetInfo offsetinfo, String text)
   {
      if (offsetinfo.getEndOffset() > (startOffset + maxFragmentSize))
      {
         return false;
      }
      offsetInfosList.add(offsetinfo);
      quality += isAdjacent(offsetinfo, text) ? ADJACENT_BOOST : 1;
      // never move the end backwards: a term that overlaps or is contained
      // in a previous one may end before the current end of the fragment
      endOffset = Math.max(endOffset, offsetinfo.getEndOffset());
      return true;
   }

   /**
    * Checks whether the term starts at most {@link #MAX_ADJACENT_GAP}
    * characters after the current end of the fragment and is only
    * separated from it by whitespace.
    */
   private boolean isAdjacent(TermVectorOffsetInfo offsetinfo, String text)
   {
      int termStart = offsetinfo.getStartOffset();
      if (termStart - endOffset > MAX_ADJACENT_GAP)
      {
         return false;
      }
      for (int i = endOffset; i < termStart; i++)
      {
         if (!Character.isWhitespace(text.charAt(i)))
         {
            return false;
         }
      }
      return true;
   }

   /**
    * @return an iterator over the offsets of this fragment in the order
    *         they were added.
    */
   public Iterator<TermVectorOffsetInfo> iterator()
   {
      return offsetInfosList.iterator();
   }

   /**
    * @return the start offset of the first term.
    */
   public int getStartOffset()
   {
      return startOffset;
   }

   /**
    * @return the largest end offset of the terms in this fragment.
    */
   public int getEndOffset()
   {
      return endOffset;
   }

   /**
    * @return the quality of this fragment.
    */
   public int getQuality()
   {
      return quality;
   }
}
/**
 * Priority queue of {@link FragmentInfo} objects ordered by quality, see
 * {@link #lessThan(FragmentInfo, FragmentInfo)}.
 */
private static class FragmentInfoPriorityQueue extends PriorityQueue<FragmentInfo>
{
   /**
    * @param size the value passed to <code>initialize</code>.
    */
   public FragmentInfoPriorityQueue(int size)
   {
      initialize(size);
   }

   /**
    * Orders two {@link FragmentInfo} objects by quality: the fragment with
    * the lower quality is the lesser one. On equal quality the fragment
    * with the higher start offset is the lesser one.
    *
    * @return <code>true</code> if <code>infoA</code> is less than
    *         <code>infoB</code>.
    */
   protected boolean lessThan(FragmentInfo infoA, FragmentInfo infoB)
   {
      int qualityA = infoA.getQuality();
      int qualityB = infoB.getQuality();
      if (qualityA != qualityB)
      {
         return qualityA < qualityB;
      }
      // same quality: the fragment that starts later is the lesser one
      return infoA.getStartOffset() > infoB.getStartOffset();
   }
}
}