FieldHighlighter.java example

Explorer
lucene-solr-master
- lucene
- solr
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.uhighlight;

import java.io.IOException;
import java.text.BreakIterator;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.PriorityQueue;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;

/**
 * Internal highlighter abstraction that operates on a per field basis.
 *
 * @lucene.internal
 */
public class FieldHighlighter {

  protected final String field;
  protected final FieldOffsetStrategy fieldOffsetStrategy;
  protected final BreakIterator breakIterator; // note: stateful!
  protected final PassageScorer passageScorer;
  protected final int maxPassages;
  protected final int maxNoHighlightPassages;
  protected final PassageFormatter passageFormatter;

  public FieldHighlighter(String field, FieldOffsetStrategy fieldOffsetStrategy, BreakIterator breakIterator,
                          PassageScorer passageScorer, int maxPassages, int maxNoHighlightPassages,
                          PassageFormatter passageFormatter) {
    this.field = field;
    this.fieldOffsetStrategy = fieldOffsetStrategy;
    this.breakIterator = breakIterator;
    this.passageScorer = passageScorer;
    this.maxPassages = maxPassages;
    this.maxNoHighlightPassages = maxNoHighlightPassages;
    this.passageFormatter = passageFormatter;
  }

  public String getField() {
    return field;
  }

  public UnifiedHighlighter.OffsetSource getOffsetSource() {
    return fieldOffsetStrategy.getOffsetSource();
  }

  /**
   * The primary method -- highlight this doc, assuming a specific field and given this content.
   */
  public Object highlightFieldForDoc(IndexReader reader, int docId, String content) throws IOException {
    // TODO accept LeafReader instead?
    // note: it'd be nice to accept a CharSequence for content, but we need a CharacterIterator impl for it.
    if (content.length() == 0) {
      return null; // nothing to do
    }

    breakIterator.setText(content);

    List<OffsetsEnum> offsetsEnums = fieldOffsetStrategy.getOffsetsEnums(reader, docId, content);

    Passage[] passages;
    try {
      // Highlight the offsetsEnum list against the content to produce Passages.
      passages = highlightOffsetsEnums(offsetsEnums);// and breakIterator & scorer
    } finally {
      // Ensure closeable resources get closed
      IOUtils.close(offsetsEnums);
    }

    // Format the resulting Passages.
    if (passages.length == 0) {
      // no passages were returned, so ask for a default summary
      passages = getSummaryPassagesNoHighlight(maxNoHighlightPassages == -1 ? maxPassages : maxNoHighlightPassages);
    }

    if (passages.length > 0) {
      return passageFormatter.format(passages, content);
    } else {
      return null;
    }
  }

  /**
   * Called to summarize a document when no highlights were found.
   * By default this just returns the first
   * {@link #maxPassages} sentences; subclasses can override to customize.
   * The state of {@link #breakIterator} should be at the beginning.
   */
  protected Passage[] getSummaryPassagesNoHighlight(int maxPassages) {
    assert breakIterator.current() == breakIterator.first();

    List<Passage> passages = new ArrayList<>(Math.min(maxPassages, 10));
    int pos = breakIterator.current();
    assert pos == 0;
    while (passages.size() < maxPassages) {
      int next = breakIterator.next();
      if (next == BreakIterator.DONE) {
        break;
      }
      Passage passage = new Passage();
      passage.setScore(Float.NaN);
      passage.setStartOffset(pos);
      passage.setEndOffset(next);
      passages.add(passage);
      pos = next;
    }

    return passages.toArray(new Passage[passages.size()]);
  }

  // algorithm: treat sentence snippets as miniature documents
  // we can intersect these with the postings lists via BreakIterator.preceding(offset),s
  // score each sentence as norm(sentenceStartOffset) * sum(weight * tf(freq))
  protected Passage[] highlightOffsetsEnums(List<OffsetsEnum> offsetsEnums)
      throws IOException {
    PassageScorer scorer = passageScorer;
    BreakIterator breakIterator = this.breakIterator;
    final int contentLength = breakIterator.getText().getEndIndex();

    PriorityQueue<OffsetsEnum> offsetsEnumQueue = new PriorityQueue<>(offsetsEnums.size() + 1);
    for (OffsetsEnum off : offsetsEnums) {
      off.setWeight(scorer.weight(contentLength, off.freq()));
      off.nextPosition(); // go to first position
      offsetsEnumQueue.add(off);
    }
    offsetsEnumQueue.add(new OffsetsEnum(null, EMPTY)); // a sentinel for termination

    PriorityQueue<Passage> passageQueue = new PriorityQueue<>(Math.min(64, maxPassages + 1), (left, right) -> {
      if (left.getScore() < right.getScore()) {
        return -1;
      } else if (left.getScore() > right.getScore()) {
        return 1;
      } else {
        return left.getStartOffset() - right.getStartOffset();
      }
    });
    Passage passage = new Passage(); // the current passage in-progress.  Will either get reset or added to queue.

    OffsetsEnum off;
    while ((off = offsetsEnumQueue.poll()) != null) {
      int start = off.startOffset();
      if (start == -1) {
        throw new IllegalArgumentException("field '" + field + "' was indexed without offsets, cannot highlight");
      }
      int end = off.endOffset();
      // LUCENE-5166: this hit would span the content limit... however more valid
      // hits may exist (they are sorted by start). so we pretend like we never
      // saw this term, it won't cause a passage to be added to passageQueue or anything.
      assert EMPTY.startOffset() == Integer.MAX_VALUE;
      if (start < contentLength && end > contentLength) {
        continue;
      }
      // See if this term should be part of a new passage.
      if (start >= passage.getEndOffset()) {
        if (passage.getStartOffset() >= 0) { // true if this passage has terms; otherwise couldn't find any (yet)
          // finalize passage
          passage.setScore(passage.getScore() * scorer.norm(passage.getStartOffset()));
          // new sentence: first add 'passage' to queue
          if (passageQueue.size() == maxPassages && passage.getScore() < passageQueue.peek().getScore()) {
            passage.reset(); // can't compete, just reset it
          } else {
            passageQueue.offer(passage);
            if (passageQueue.size() > maxPassages) {
              passage = passageQueue.poll();
              passage.reset();
            } else {
              passage = new Passage();
            }
          }
        }
        // if we exceed limit, we are done
        if (start >= contentLength) {
          break;
        }
        // advance breakIterator
        passage.setStartOffset(Math.max(breakIterator.preceding(start + 1), 0));
        passage.setEndOffset(Math.min(breakIterator.following(start), contentLength));
      }
      // Add this term to the passage.
      int tf = 0;
      while (true) {
        tf++;
        BytesRef term = off.getTerm();// a reference; safe to refer to
        assert term != null;
        passage.addMatch(start, end, term);
        // see if there are multiple occurrences of this term in this passage. If so, add them.
        if (!off.hasMorePositions()) {
          break; // No more in the entire text. Already removed from pq; move on
        }
        off.nextPosition();
        start = off.startOffset();
        end = off.endOffset();
        if (start >= passage.getEndOffset() || end > contentLength) { // it's beyond this passage
          offsetsEnumQueue.offer(off);
          break;
        }
      }
      passage.setScore(passage.getScore() + off.getWeight() * scorer.tf(tf, passage.getEndOffset() - passage.getStartOffset()));
    }

    Passage[] passages = passageQueue.toArray(new Passage[passageQueue.size()]);
    for (Passage p : passages) {
      p.sort();
    }
    // sort in ascending order
    Arrays.sort(passages, (left, right) -> left.getStartOffset() - right.getStartOffset());
    return passages;
  }

  protected static final PostingsEnum EMPTY = new PostingsEnum() {

    @Override
    public int nextPosition() throws IOException {
      return 0;
    }

    @Override
    public int startOffset() throws IOException {
      return Integer.MAX_VALUE;
    }

    @Override
    public int endOffset() throws IOException {
      return Integer.MAX_VALUE;
    }

    @Override
    public BytesRef getPayload() throws IOException {
      return null;
    }

    @Override
    public int freq() throws IOException {
      return 0;
    }

    @Override
    public int docID() {
      return NO_MORE_DOCS;
    }

    @Override
    public int nextDoc() throws IOException {
      return NO_MORE_DOCS;
    }

    @Override
    public int advance(int target) throws IOException {
      return NO_MORE_DOCS;
    }

    @Override
    public long cost() {
      return 0;
    }
  };
}