/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.exoplatform.services.jcr.impl.core.query.lucene; import org.apache.lucene.index.Term; import org.apache.lucene.index.TermPositionVector; import org.apache.lucene.index.TermVectorOffsetInfo; import org.apache.lucene.util.PriorityQueue; import java.io.IOException; import java.util.ArrayList; import java.util.BitSet; import java.util.IdentityHashMap; import java.util.Iterator; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Set; /** * <code>WeightedHighlighter</code> implements a highlighter that weights the * fragments based on the proximity of the highlighted terms to each other. The * returned fragments are not necessarily in sequence as the text occurs in the * content. */ public class WeightedHighlighter extends DefaultHighlighter { /** * Punctuation characters that mark the end of a sentence. */ private static final BitSet PUNCTUATION = new BitSet(); static { PUNCTUATION.set('.'); PUNCTUATION.set('!'); PUNCTUATION.set(0xa1); // inverted exclamation mark PUNCTUATION.set('?'); PUNCTUATION.set(0xbf); // inverted question mark // todo add more } protected WeightedHighlighter() { } /** * @param tvec the term position vector for this hit * @param queryTerms the query terms. * @param text the original text that was used to create the * tokens. * @param excerptStart this string is prepended to the excerpt * @param excerptEnd this string is appended to the excerpt * @param fragmentStart this string is prepended to every fragment * @param fragmentEnd this string is appended to the end of every * fragment. * @param hlStart the string used to prepend a highlighted token, for * example <tt>"<b>"</tt> * @param hlEnd the string used to append a highlighted token, for * example <tt>"</b>"</tt> * @param maxFragments the maximum number of fragments * @param surround the maximum number of chars surrounding a * highlighted token * @return a String with text fragments where tokens from the query are * highlighted */ public static String highlight(TermPositionVector tvec, Set<Term> queryTerms, String text, String excerptStart, String excerptEnd, String fragmentStart, String fragmentEnd, String hlStart, String hlEnd, int maxFragments, int surround) throws IOException { return new WeightedHighlighter().doHighlight(tvec, queryTerms, text, excerptStart, excerptEnd, fragmentStart, fragmentEnd, hlStart, hlEnd, maxFragments, surround); } /** * @param tvec the term position vector for this hit * @param queryTerms the query terms. * @param text the original text that was used to create the tokens. * @param maxFragments the maximum number of fragments * @param surround the maximum number of chars surrounding a highlighted * token * @return a String with text fragments where tokens from the query are * highlighted */ public static String highlight(TermPositionVector tvec, Set<Term> queryTerms, String text, int maxFragments, int surround) throws IOException { return highlight(tvec, queryTerms, text, START_EXCERPT, END_EXCERPT, START_FRAGMENT_SEPARATOR, END_FRAGMENT_SEPARATOR, START_HIGHLIGHT, END_HIGHLIGHT, maxFragments, surround); } protected String mergeFragments(TermVectorOffsetInfo[] offsets, String text, String excerptStart, String excerptEnd, String fragmentStart, String fragmentEnd, String hlStart, String hlEnd, int maxFragments, int surround) throws IOException { if (offsets == null || offsets.length == 0) { // nothing to highlight return createDefaultExcerpt(text, excerptStart, excerptEnd, fragmentStart, fragmentEnd, surround * 2); } PriorityQueue<FragmentInfo> bestFragments = new FragmentInfoPriorityQueue(maxFragments); for (int i = 0; i < offsets.length; i++) { if (offsets[i].getEndOffset() <= text.length()) { FragmentInfo fi = new FragmentInfo(offsets[i], surround * 2); for (int j = i + 1; j < offsets.length; j++) { if (offsets[j].getEndOffset() > text.length()) { break; } if (!fi.add(offsets[j], text)) { break; } } bestFragments.insertWithOverflow(fi); } } if (bestFragments.size() == 0) { return createDefaultExcerpt(text, excerptStart, excerptEnd, fragmentStart, fragmentEnd, surround * 2); } // retrieve fragment infos from queue and fill into list, least // fragment comes out first List<FragmentInfo> infos = new LinkedList<FragmentInfo>(); while (bestFragments.size() > 0) { FragmentInfo fi = bestFragments.pop(); infos.add(0, fi); } Map<TermVectorOffsetInfo, Object> offsetInfos = new IdentityHashMap<TermVectorOffsetInfo, Object>(); // remove overlapping fragment infos Iterator<FragmentInfo> it = infos.iterator(); while (it.hasNext()) { FragmentInfo fi = it.next(); boolean overlap = false; Iterator<TermVectorOffsetInfo> fit = fi.iterator(); while (fit.hasNext() && !overlap) { TermVectorOffsetInfo oi = fit.next(); if (offsetInfos.containsKey(oi)) { overlap = true; } } if (overlap) { it.remove(); } else { Iterator<TermVectorOffsetInfo> oit = fi.iterator(); while (oit.hasNext()) { offsetInfos.put(oit.next(), null); } } } // create excerpts StringBuilder sb = new StringBuilder(excerptStart); it = infos.iterator(); while (it.hasNext()) { FragmentInfo fi = (FragmentInfo)it.next(); sb.append(fragmentStart); int limit = Math.max(0, fi.getStartOffset() / 2 + fi.getEndOffset() / 2 - surround); int len = startFragment(sb, text, fi.getStartOffset(), limit); TermVectorOffsetInfo lastOffsetInfo = null; Iterator<TermVectorOffsetInfo> fIt = fi.iterator(); while (fIt.hasNext()) { TermVectorOffsetInfo oi = fIt.next(); if (lastOffsetInfo != null) { // fill in text between terms sb.append(text.substring(lastOffsetInfo.getEndOffset(), oi.getStartOffset())); } sb.append(hlStart); sb.append(text.substring(oi.getStartOffset(), oi.getEndOffset())); sb.append(hlEnd); lastOffsetInfo = oi; } limit = Math.min(text.length(), fi.getStartOffset() - len + (surround * 2)); endFragment(sb, text, fi.getEndOffset(), limit); sb.append(fragmentEnd); } sb.append(excerptEnd); return sb.toString(); } /** * Writes the start of a fragment to the string buffer <code>sb</code>. The * first occurrence of a matching term is indicated by the * <code>offset</code> into the <code>text</code>. * * @param sb where to append the start of the fragment. * @param text the original text. * @param offset the start offset of the first matching term in the * fragment. * @param limit do not go back further than <code>limit</code>. * @return the length of the start fragment that was appended to * <code>sb</code>. */ private static int startFragment(StringBuilder sb, String text, int offset, int limit) { if (limit == 0) { // append all sb.append(text.substring(0, offset)); return offset; } String intro = "... "; int start = offset; for (int i = offset - 1; i >= limit; i--) { if (Character.isWhitespace(text.charAt(i))) { // potential start start = i + 1; if (i - 1 >= limit && PUNCTUATION.get(text.charAt(i - 1))) { // start of sentence found intro = ""; break; } } } sb.append(intro).append(text.substring(start, offset)); return offset - start; } /** * Writes the end of a fragment to the string buffer <code>sb</code>. The * last occurrence of a matching term is indicated by the * <code>offset</code> into the <code>text</code>. * * @param sb where to append the start of the fragment. * @param text the original text. * @param offset the end offset of the last matching term in the fragment. * @param limit do not go further than <code>limit</code>. */ private static void endFragment(StringBuilder sb, String text, int offset, int limit) { if (limit == text.length()) { // append all sb.append(text.substring(offset)); return; } int end = offset; for (int i = end; i < limit; i++) { if (Character.isWhitespace(text.charAt(i))) { // potential end end = i; } } sb.append(text.substring(offset, end)).append(" ..."); } private static class FragmentInfo { List<TermVectorOffsetInfo> offsetInfosList; int startOffset; int endOffset; int maxFragmentSize; int quality; public FragmentInfo(TermVectorOffsetInfo offsetinfo, int maxFragmentSize) { offsetInfosList = new ArrayList<TermVectorOffsetInfo>(); offsetInfosList.add(offsetinfo); startOffset = offsetinfo.getStartOffset(); endOffset = offsetinfo.getEndOffset(); this.maxFragmentSize = maxFragmentSize; quality = 0; } public boolean add(TermVectorOffsetInfo offsetinfo, String text) { if (offsetinfo.getEndOffset() > (startOffset + maxFragmentSize)) { return false; } offsetInfosList.add(offsetinfo); if (offsetinfo.getStartOffset() - endOffset <= 3) { // boost quality when terms are adjacent // and only separated by whitespace character boolean boost = true; for (int i = endOffset; i < offsetinfo.getStartOffset(); i++) { if (!Character.isWhitespace(text.charAt(i))) { boost = false; break; } } if (boost) { quality += 10; } else { quality++; } } else { quality++; } endOffset = offsetinfo.getEndOffset(); return true; } public Iterator<TermVectorOffsetInfo> iterator() { return offsetInfosList.iterator(); } public int getStartOffset() { return startOffset; } public int getEndOffset() { return endOffset; } public int getQuality() { return quality; } } private static class FragmentInfoPriorityQueue extends PriorityQueue<FragmentInfo> { public FragmentInfoPriorityQueue(int size) { initialize(size); } /** * Checks the quality of two {@link FragmentInfo} objects. The one with * the lower quality is considered less than the other. If both * fragments have the same quality, the one with the higher start offset * is considered the lesser. This will result in a queue that keeps the * {@link FragmentInfo} with the best quality. */ protected boolean lessThan(FragmentInfo infoA, FragmentInfo infoB) { if (infoA.getQuality() == infoB.getQuality()) { return infoA.getStartOffset() > infoB.getStartOffset(); } return infoA.getQuality() < infoB.getQuality(); } } }