/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.lucene.search.uhighlight; import java.io.IOException; import java.text.BreakIterator; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.PriorityQueue; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.PostingsEnum; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IOUtils; /** * Internal highlighter abstraction that operates on a per field basis. * * @lucene.internal */ public class FieldHighlighter { protected final String field; protected final FieldOffsetStrategy fieldOffsetStrategy; protected final BreakIterator breakIterator; // note: stateful! protected final PassageScorer passageScorer; protected final int maxPassages; protected final int maxNoHighlightPassages; protected final PassageFormatter passageFormatter; public FieldHighlighter(String field, FieldOffsetStrategy fieldOffsetStrategy, BreakIterator breakIterator, PassageScorer passageScorer, int maxPassages, int maxNoHighlightPassages, PassageFormatter passageFormatter) { this.field = field; this.fieldOffsetStrategy = fieldOffsetStrategy; this.breakIterator = breakIterator; this.passageScorer = passageScorer; this.maxPassages = maxPassages; this.maxNoHighlightPassages = maxNoHighlightPassages; this.passageFormatter = passageFormatter; } public String getField() { return field; } public UnifiedHighlighter.OffsetSource getOffsetSource() { return fieldOffsetStrategy.getOffsetSource(); } /** * The primary method -- highlight this doc, assuming a specific field and given this content. */ public Object highlightFieldForDoc(IndexReader reader, int docId, String content) throws IOException { // TODO accept LeafReader instead? // note: it'd be nice to accept a CharSequence for content, but we need a CharacterIterator impl for it. if (content.length() == 0) { return null; // nothing to do } breakIterator.setText(content); List<OffsetsEnum> offsetsEnums = fieldOffsetStrategy.getOffsetsEnums(reader, docId, content); Passage[] passages; try { // Highlight the offsetsEnum list against the content to produce Passages. passages = highlightOffsetsEnums(offsetsEnums);// and breakIterator & scorer } finally { // Ensure closeable resources get closed IOUtils.close(offsetsEnums); } // Format the resulting Passages. if (passages.length == 0) { // no passages were returned, so ask for a default summary passages = getSummaryPassagesNoHighlight(maxNoHighlightPassages == -1 ? maxPassages : maxNoHighlightPassages); } if (passages.length > 0) { return passageFormatter.format(passages, content); } else { return null; } } /** * Called to summarize a document when no highlights were found. * By default this just returns the first * {@link #maxPassages} sentences; subclasses can override to customize. * The state of {@link #breakIterator} should be at the beginning. */ protected Passage[] getSummaryPassagesNoHighlight(int maxPassages) { assert breakIterator.current() == breakIterator.first(); List<Passage> passages = new ArrayList<>(Math.min(maxPassages, 10)); int pos = breakIterator.current(); assert pos == 0; while (passages.size() < maxPassages) { int next = breakIterator.next(); if (next == BreakIterator.DONE) { break; } Passage passage = new Passage(); passage.setScore(Float.NaN); passage.setStartOffset(pos); passage.setEndOffset(next); passages.add(passage); pos = next; } return passages.toArray(new Passage[passages.size()]); } // algorithm: treat sentence snippets as miniature documents // we can intersect these with the postings lists via BreakIterator.preceding(offset),s // score each sentence as norm(sentenceStartOffset) * sum(weight * tf(freq)) protected Passage[] highlightOffsetsEnums(List<OffsetsEnum> offsetsEnums) throws IOException { PassageScorer scorer = passageScorer; BreakIterator breakIterator = this.breakIterator; final int contentLength = breakIterator.getText().getEndIndex(); PriorityQueue<OffsetsEnum> offsetsEnumQueue = new PriorityQueue<>(offsetsEnums.size() + 1); for (OffsetsEnum off : offsetsEnums) { off.setWeight(scorer.weight(contentLength, off.freq())); off.nextPosition(); // go to first position offsetsEnumQueue.add(off); } offsetsEnumQueue.add(new OffsetsEnum(null, EMPTY)); // a sentinel for termination PriorityQueue<Passage> passageQueue = new PriorityQueue<>(Math.min(64, maxPassages + 1), (left, right) -> { if (left.getScore() < right.getScore()) { return -1; } else if (left.getScore() > right.getScore()) { return 1; } else { return left.getStartOffset() - right.getStartOffset(); } }); Passage passage = new Passage(); // the current passage in-progress. Will either get reset or added to queue. OffsetsEnum off; while ((off = offsetsEnumQueue.poll()) != null) { int start = off.startOffset(); if (start == -1) { throw new IllegalArgumentException("field '" + field + "' was indexed without offsets, cannot highlight"); } int end = off.endOffset(); // LUCENE-5166: this hit would span the content limit... however more valid // hits may exist (they are sorted by start). so we pretend like we never // saw this term, it won't cause a passage to be added to passageQueue or anything. assert EMPTY.startOffset() == Integer.MAX_VALUE; if (start < contentLength && end > contentLength) { continue; } // See if this term should be part of a new passage. if (start >= passage.getEndOffset()) { if (passage.getStartOffset() >= 0) { // true if this passage has terms; otherwise couldn't find any (yet) // finalize passage passage.setScore(passage.getScore() * scorer.norm(passage.getStartOffset())); // new sentence: first add 'passage' to queue if (passageQueue.size() == maxPassages && passage.getScore() < passageQueue.peek().getScore()) { passage.reset(); // can't compete, just reset it } else { passageQueue.offer(passage); if (passageQueue.size() > maxPassages) { passage = passageQueue.poll(); passage.reset(); } else { passage = new Passage(); } } } // if we exceed limit, we are done if (start >= contentLength) { break; } // advance breakIterator passage.setStartOffset(Math.max(breakIterator.preceding(start + 1), 0)); passage.setEndOffset(Math.min(breakIterator.following(start), contentLength)); } // Add this term to the passage. int tf = 0; while (true) { tf++; BytesRef term = off.getTerm();// a reference; safe to refer to assert term != null; passage.addMatch(start, end, term); // see if there are multiple occurrences of this term in this passage. If so, add them. if (!off.hasMorePositions()) { break; // No more in the entire text. Already removed from pq; move on } off.nextPosition(); start = off.startOffset(); end = off.endOffset(); if (start >= passage.getEndOffset() || end > contentLength) { // it's beyond this passage offsetsEnumQueue.offer(off); break; } } passage.setScore(passage.getScore() + off.getWeight() * scorer.tf(tf, passage.getEndOffset() - passage.getStartOffset())); } Passage[] passages = passageQueue.toArray(new Passage[passageQueue.size()]); for (Passage p : passages) { p.sort(); } // sort in ascending order Arrays.sort(passages, (left, right) -> left.getStartOffset() - right.getStartOffset()); return passages; } protected static final PostingsEnum EMPTY = new PostingsEnum() { @Override public int nextPosition() throws IOException { return 0; } @Override public int startOffset() throws IOException { return Integer.MAX_VALUE; } @Override public int endOffset() throws IOException { return Integer.MAX_VALUE; } @Override public BytesRef getPayload() throws IOException { return null; } @Override public int freq() throws IOException { return 0; } @Override public int docID() { return NO_MORE_DOCS; } @Override public int nextDoc() throws IOException { return NO_MORE_DOCS; } @Override public int advance(int target) throws IOException { return NO_MORE_DOCS; } @Override public long cost() { return 0; } }; }