/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.exoplatform.services.jcr.impl.core.query.lucene; import org.apache.lucene.document.Field; //NOSONAR import org.apache.lucene.index.Term; import org.apache.lucene.index.TermPositionVector; import org.apache.lucene.index.TermVectorOffsetInfo; import org.exoplatform.services.jcr.util.Text; import java.io.IOException; import java.io.StringReader; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.Iterator; import java.util.List; import java.util.Set; /** * This is an adapted version of the <code>FulltextHighlighter</code> posted in * issue: <a href="http://issues.apache.org/jira/browse/LUCENE-644">LUCENE-644</a>. * <br> * Important: for this highlighter to function properly, field must be stored * with token offsets.<br> Use Field constructor {@link * Field#Field(String,String,Field.Store,Field.Index,Field.TermVector) * Field(String, String, Field.Store, Field.Index, Field.TermVector)} where the * last argument is either {@link Field.TermVector#WITH_POSITIONS_OFFSETS} or * {@link org.apache.lucene.document.Field.TermVector#WITH_OFFSETS} * * @see org.apache.lucene.index.TermPositionVector * @see org.apache.lucene.index.TermFreqVector */ public class DefaultHighlighter { /** * A default value of <tt>3</tt> */ public static final int DEFAULT_MAXFRAGMENTS = 3; /** * A default value of <tt>75</tt> */ public static final int DEFAULT_SURROUND = 75; public static final String START_EXCERPT = "<excerpt>"; public static final String END_EXCERPT = "</excerpt>"; public static final String START_FRAGMENT_SEPARATOR = "<fragment>"; public static final String END_FRAGMENT_SEPARATOR = "</fragment>"; public static final String START_HIGHLIGHT = "<highlight>"; public static final String END_HIGHLIGHT = "</highlight>"; protected DefaultHighlighter() { } /** * @param tvec the term position vector for this hit * @param queryTerms the query terms. * @param text the original text that was used to create the * tokens. * @param excerptStart this string is prepended to the excerpt * @param excerptEnd this string is appended to the excerpt * @param fragmentStart this string is prepended to every fragment * @param fragmentEnd this string is appended to the end of every * fragment. * @param hlStart the string used to prepend a highlighted token, for * example <tt>"<b>"</tt> * @param hlEnd the string used to append a highlighted token, for * example <tt>"</b>"</tt> * @param maxFragments the maximum number of fragments * @param surround the maximum number of chars surrounding a * highlighted token * @return a String with text fragments where tokens from the query are * highlighted */ public static String highlight(TermPositionVector tvec, Set<Term> queryTerms, String text, String excerptStart, String excerptEnd, String fragmentStart, String fragmentEnd, String hlStart, String hlEnd, int maxFragments, int surround) throws IOException { return new DefaultHighlighter().doHighlight(tvec, queryTerms, text, excerptStart, excerptEnd, fragmentStart, fragmentEnd, hlStart, hlEnd, maxFragments, surround); } /** * @param tvec the term position vector for this hit * @param queryTerms the query terms. * @param text the original text that was used to create the tokens. * @param maxFragments the maximum number of fragments * @param surround the maximum number of chars surrounding a highlighted * token * @return a String with text fragments where tokens from the query are * highlighted */ public static String highlight(TermPositionVector tvec, Set<Term> queryTerms, String text, int maxFragments, int surround) throws IOException { return highlight(tvec, queryTerms, text, START_EXCERPT, END_EXCERPT, START_FRAGMENT_SEPARATOR, END_FRAGMENT_SEPARATOR, START_HIGHLIGHT, END_HIGHLIGHT, maxFragments, surround); } /** * @see #highlight(TermPositionVector, Set, String, String, String, String, String, String, String, int, int) */ protected String doHighlight(TermPositionVector tvec, Set<Term> queryTerms, String text, String excerptStart, String excerptEnd, String fragmentStart, String fragmentEnd, String hlStart, String hlEnd, int maxFragments, int surround) throws IOException { String[] terms = new String[queryTerms.size()]; Iterator<Term> it = queryTerms.iterator(); for (int i = 0; it.hasNext(); i++) { terms[i] = it.next().text(); } List<TermVectorOffsetInfo> list = new ArrayList<TermVectorOffsetInfo>(); int[] tvecindexes = tvec.indexesOf(terms, 0, terms.length); for (int i = 0; i < tvecindexes.length; i++) { TermVectorOffsetInfo[] termoffsets = tvec.getOffsets(tvecindexes[i]); list.addAll(Arrays.asList(termoffsets)); } TermVectorOffsetInfo[] offsets = (TermVectorOffsetInfo[]) list.toArray(new TermVectorOffsetInfo[list.size()]); // sort offsets if (terms.length > 1) { Arrays.sort(offsets, new TermVectorOffsetInfoSorter()); } return mergeFragments(offsets, text, excerptStart, excerptEnd, fragmentStart, fragmentEnd, hlStart, hlEnd, maxFragments, surround); } protected String mergeFragments(TermVectorOffsetInfo[] offsets, String text, String excerptStart, String excerptEnd, String fragmentStart, String fragmentEnd, String hlStart, String hlEnd, int maxFragments, int surround) throws IOException { if (offsets == null || offsets.length == 0) { // nothing to highlight return createDefaultExcerpt(text, excerptStart, excerptEnd, fragmentStart, fragmentEnd, surround * 2); } int lastOffset = offsets.length; // Math.min(10, offsets.length); // 10 terms is plenty? List<FragmentInfo> fragmentInfoList = new ArrayList<FragmentInfo>(); if (offsets[0].getEndOffset() <= text.length()) { FragmentInfo fi = new FragmentInfo(offsets[0], surround * 2); for (int i = 1; i < lastOffset; i++) { if (offsets[i].getEndOffset() > text.length()) { break; } if (fi.add(offsets[i])) { continue; } fragmentInfoList.add(fi); fi = new FragmentInfo(offsets[i], surround * 2); } fragmentInfoList.add(fi); } if (fragmentInfoList.isEmpty()) { // nothing to highlight return createDefaultExcerpt(text, excerptStart, excerptEnd, fragmentStart, fragmentEnd, surround * 2); } // sort with score Collections.sort(fragmentInfoList, new FragmentInfoScoreSorter()); // extract best fragments List<FragmentInfo> bestFragmentsList = new ArrayList<FragmentInfo>(); for (int i = 0; i < Math.min(fragmentInfoList.size(), maxFragments); i++) { bestFragmentsList.add(fragmentInfoList.get(i)); } // re-sort with positions Collections.sort(bestFragmentsList, new FragmentInfoPositionSorter()); // merge #maxFragments fragments StringReader reader = new StringReader(text); StringBuilder sb = new StringBuilder(excerptStart); int pos = 0; char[] cbuf; int skip; int nextStart; int skippedChars; int firstWhitespace; for (int i = 0; i < bestFragmentsList.size(); i++) { FragmentInfo fi = bestFragmentsList.get(i); fi.trim(); nextStart = fi.getStartOffset(); skip = nextStart - pos; if (skip > surround * 2) { skip -= surround; if (i > 0) { // end last fragment cbuf = new char[surround]; reader.read(cbuf, 0, surround); // find last whitespace skippedChars = 1; for (; skippedChars < surround + 1; skippedChars++) { if (Character.isWhitespace(cbuf[surround - skippedChars])) { break; } } pos += surround; if (skippedChars > surround) { skippedChars = surround; } sb.append(Text.encodeIllegalXMLCharacters( new String(cbuf, 0, surround - skippedChars))); sb.append(fragmentEnd); } } if (skip >= surround) { if (i > 0) { skip -= surround; } // skip reader.skip((long) skip); pos += skip; } // start fragment cbuf = new char[nextStart - pos]; skippedChars = Math.max(cbuf.length - 1, 0); firstWhitespace = skippedChars; reader.read(cbuf, 0, nextStart - pos); pos += (nextStart - pos); sb.append(fragmentStart); // find last period followed by whitespace if (cbuf.length > 0) { for (; skippedChars >= 0; skippedChars--) { if (Character.isWhitespace(cbuf[skippedChars])) { firstWhitespace = skippedChars; if (skippedChars - 1 >= 0 && cbuf[skippedChars - 1] == '.') { skippedChars++; break; } } } } boolean sentenceStart = true; if (skippedChars == -1) { if (pos == cbuf.length) { // this fragment is the start of the text -> skip none skippedChars = 0; } else { sentenceStart = false; skippedChars = firstWhitespace + 1; } } if (!sentenceStart) { sb.append("... "); } sb.append(Text.encodeIllegalXMLCharacters( new String(cbuf, skippedChars, cbuf.length - skippedChars))); // iterate terms for (Iterator<TermVectorOffsetInfo> iter = fi.iterator(); iter.hasNext();) { TermVectorOffsetInfo ti = iter.next(); nextStart = ti.getStartOffset(); if (nextStart - pos > 0) { cbuf = new char[nextStart - pos]; int charsRead = reader.read(cbuf, 0, nextStart - pos); pos += (nextStart - pos); sb.append(cbuf, 0, charsRead); } sb.append(hlStart); nextStart = ti.getEndOffset(); // print term cbuf = new char[nextStart - pos]; reader.read(cbuf, 0, nextStart - pos); pos += (nextStart - pos); sb.append(cbuf); sb.append(hlEnd); } } if (pos != 0) { // end fragment if (offsets.length > lastOffset) { surround = Math.min(offsets[lastOffset].getStartOffset() - pos, surround); } cbuf = new char[surround]; skip = reader.read(cbuf, 0, surround); boolean EOF = reader.read() == -1; if (skip >= 0) { if (!EOF) { skippedChars = 1; for (; skippedChars < surround + 1; skippedChars++) { if (Character.isWhitespace(cbuf[surround - skippedChars])) { break; } } if (skippedChars > surround) { skippedChars = surround; } } else { skippedChars = 0; } sb.append(Text.encodeIllegalXMLCharacters( new String(cbuf, 0, EOF ? skip : (surround - skippedChars)))); if (!EOF) { char lastChar = sb.charAt(sb.length() - 1); if (lastChar != '.' && lastChar != '!' && lastChar != '?') { sb.append(" ..."); } } } sb.append(fragmentEnd); } sb.append(excerptEnd); return sb.toString(); } /** * Creates a default excerpt with the given text. * * @param text the text. * @param excerptStart the excerpt start. * @param excerptEnd the excerpt end. * @param fragmentStart the fragement start. * @param fragmentEnd the fragment end. * @param maxLength the maximum length of the fragment. * @return a default excerpt. * @throws IOException if an error occurs while reading from the text. */ protected String createDefaultExcerpt(String text, String excerptStart, String excerptEnd, String fragmentStart, String fragmentEnd, int maxLength) throws IOException { StringReader reader = new StringReader(text); StringBuilder excerpt = new StringBuilder(excerptStart); excerpt.append(fragmentStart); if (!text.isEmpty()) { int min = excerpt.length(); char[] buf = new char[maxLength]; int len = reader.read(buf); StringBuilder tmp = new StringBuilder(); tmp.append(buf, 0, len); if (len == buf.length) { for (int i = tmp.length() - 1; i > min; i--) { if (Character.isWhitespace(tmp.charAt(i))) { tmp.delete(i, tmp.length()); tmp.append(" ..."); break; } } } excerpt.append(Text.encodeIllegalXMLCharacters(tmp.toString())); } excerpt.append(fragmentEnd).append(excerptEnd); return excerpt.toString(); } private static class FragmentInfo { List<TermVectorOffsetInfo> offsetInfosList; int startOffset; int endOffset; int mergeGap; int numTerms; public FragmentInfo(TermVectorOffsetInfo offsetinfo, int mergeGap) { offsetInfosList = new ArrayList<TermVectorOffsetInfo>(); offsetInfosList.add(offsetinfo); startOffset = offsetinfo.getStartOffset(); endOffset = offsetinfo.getEndOffset(); this.mergeGap = mergeGap; numTerms = 1; } public boolean add(TermVectorOffsetInfo offsetinfo) { if (offsetinfo.getStartOffset() > (endOffset + mergeGap)) { return false; } offsetInfosList.add(offsetinfo); numTerms++; endOffset = offsetinfo.getEndOffset(); return true; } public Iterator<TermVectorOffsetInfo> iterator() { return offsetInfosList.iterator(); } public int getStartOffset() { return startOffset; } public int numTerms() { return numTerms; } public void trim() { int end = startOffset + (mergeGap / 2); Iterator<TermVectorOffsetInfo> it = offsetInfosList.iterator(); while (it.hasNext()) { TermVectorOffsetInfo tvoi = it.next(); if (tvoi.getStartOffset() > end) { it.remove(); } } } } private static class FragmentInfoScoreSorter implements java.util.Comparator<FragmentInfo> { public int compare(FragmentInfo o1, FragmentInfo o2) { int s1 = o1.numTerms(); int s2 = o2.numTerms(); if (s1 == s2) { return o1.getStartOffset() < o2.getStartOffset() ? -1 : 1; } return s1 > s2 ? -1 : 1; } public boolean equals(Object obj) { return false; } } private static class FragmentInfoPositionSorter implements java.util.Comparator<FragmentInfo> { public int compare(FragmentInfo o1, FragmentInfo o2) { int s1 = o1.getStartOffset(); int s2 = o2.getStartOffset(); if (s1 == s2) { return 0; } return s1 < s2 ? -1 : 1; } public boolean equals(Object obj) { return false; } } private static class TermVectorOffsetInfoSorter implements java.util.Comparator<TermVectorOffsetInfo> { public int compare(TermVectorOffsetInfo o1, TermVectorOffsetInfo o2) { int s1 = o1.getStartOffset(); int s2 = o2.getStartOffset(); if (s1 == s2) { return 0; } return s1 < s2 ? -1 : 1; } public boolean equals(Object obj) { return false; } } }