/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.benchmark.byTask.tasks;

import java.text.BreakIterator;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Locale;
import java.util.Map;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.DefaultEncoder;
import org.apache.lucene.search.highlight.Encoder;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.search.highlight.TokenSources;
import org.apache.lucene.search.postingshighlight.PostingsHighlighter;
import org.apache.lucene.search.uhighlight.UnifiedHighlighter;
import org.apache.lucene.search.vectorhighlight.BoundaryScanner;
import org.apache.lucene.search.vectorhighlight.BreakIteratorBoundaryScanner;
import org.apache.lucene.search.vectorhighlight.FastVectorHighlighter;
import org.apache.lucene.search.vectorhighlight.FieldQuery;
import org.apache.lucene.search.vectorhighlight.ScoreOrderFragmentsBuilder;
import org.apache.lucene.search.vectorhighlight.WeightedFragListBuilder;
import org.apache.lucene.util.ArrayUtil;

/**
 * Search and Traverse and Retrieve docs task. Highlight the fields in the retrieved documents.
 *
 * <p>Note: This task reuses the reader if it is already open. Otherwise a reader is opened at
 * start and closed at the end.</p>
 *
 * <p>Takes an optional multivalued, comma separated param string as:
 * {@code type[<enum>],maxFrags[<int>],fields[name1;name2;...]}</p>
 * <ul>
 *   <li>type - the highlighter implementation, e.g. "UH"</li>
 *   <li>maxFrags - the maximum number of fragments to score by the highlighter</li>
 *   <li>fields - the fields to highlight. If not specified, all fields will be highlighted (or
 *       at least attempted)</li>
 * </ul>
 * Example:
 * <pre>"SearchHlgtSameRdr" SearchTravRetHighlight(type[UH],maxFrags[3],fields[body]) &gt; : 1000
 * </pre>
 *
 * <p>Documents must be stored in order for this task to work. Term vector positions can also be
 * used, and offsets in postings are another option.</p>
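 *
 * <p>The recognized {@code type} values, matching the switch in {@code setup()} (the offset
 * requirements noted here are general properties of these highlighters rather than something
 * this task enforces):</p>
 * <ul>
 *   <li>{@code NONE} - no highlighting; just retrieves the stored fields</li>
 *   <li>{@code SH_A} / {@code SH_V} - the standard {@link Highlighter}, re-analyzing the stored
 *       text or reusing term vectors</li>
 *   <li>{@code FVH_V} - {@link FastVectorHighlighter}; needs term vectors with positions and
 *       offsets</li>
 *   <li>{@code UH}, {@code UH_A}, {@code UH_V}, {@code UH_P}, {@code UH_PV} -
 *       {@link UnifiedHighlighter} with an auto-selected, analysis, term-vector, postings, or
 *       postings-with-term-vectors offset source</li>
 *   <li>{@code PH_P} - {@link PostingsHighlighter}; needs offsets indexed in postings</li>
 * </ul>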
 *
 * <p>Other side effects: counts 1 additional (record) for each traversed hit, 1 more for each
 * retrieved (non-null) document, and 1 for each fragment returned.</p>
 */
public class SearchTravRetHighlightTask extends SearchTravTask {

  private int maxDocCharsToAnalyze; // max leading content chars to highlight
  private int maxFrags = 1; // aka passages
  private Set<String> hlFields = Collections.singleton("body");
  private String type;
  private HLImpl hlImpl;
  private Analyzer analyzer;

  public SearchTravRetHighlightTask(PerfRunData runData) {
    super(runData);
  }

  @Override
  public void setParams(String params) {
    // can't call super because super doesn't understand our params syntax
    this.params = params;
    // TODO consider instead using data.getConfig().get("highlighter.*")?
    String[] splits = params.split(",");
    for (String split : splits) {
      if (split.startsWith("type[")) {
        type = split.substring("type[".length(), split.length() - 1);
      } else if (split.startsWith("maxFrags[")) {
        // benchmark params may be written as floats, e.g. maxFrags[3.0]
        maxFrags = (int) Float.parseFloat(split.substring("maxFrags[".length(), split.length() - 1));
      } else if (split.startsWith("fields[")) {
        String fieldNames = split.substring("fields[".length(), split.length() - 1);
        String[] fieldSplits = fieldNames.split(";");
        hlFields = new HashSet<>(Arrays.asList(fieldSplits));
      }
    }
  }

  @Override
  public void setup() throws Exception {
    super.setup();
    // check to make sure the doc is being stored
    PerfRunData data = getRunData();
    if (data.getConfig().get("doc.stored", false) == false) {
      throw new Exception("doc.stored must be set to true");
    }
    maxDocCharsToAnalyze = data.getConfig().get("highlighter.maxDocCharsToAnalyze",
        Highlighter.DEFAULT_MAX_CHARS_TO_ANALYZE);
    analyzer = data.getAnalyzer();
    String type = this.type;
    if (type == null) {
      type = data.getConfig().get("highlighter", null);
    }
    if (type == null) { // otherwise switch(type) below would throw an obscure NPE
      throw new Exception("no highlighter type specified; set the 'type' param or the 'highlighter' config key (try 'UH')");
    }
    switch (type) {
      case "NONE": hlImpl = new NoHLImpl(); break;
      case "SH_A": hlImpl = new StandardHLImpl(false); break;
      case "SH_V": hlImpl = new StandardHLImpl(true); break;

      case "FVH_V": hlImpl = new FastVectorHLImpl(); break;

      case "UH": hlImpl = new UnifiedHLImpl(null); break;
      case "UH_A": hlImpl = new UnifiedHLImpl(UnifiedHighlighter.OffsetSource.ANALYSIS); break;
      case "UH_V": hlImpl = new UnifiedHLImpl(UnifiedHighlighter.OffsetSource.TERM_VECTORS); break;
      case "UH_P": hlImpl = new UnifiedHLImpl(UnifiedHighlighter.OffsetSource.POSTINGS); break;
      case "UH_PV": hlImpl = new UnifiedHLImpl(UnifiedHighlighter.OffsetSource.POSTINGS_WITH_TERM_VECTORS); break;

      case "PH_P": hlImpl = new PostingsHLImpl(); break;

      default: throw new Exception("unrecognized highlighter type: " + type + " (try 'UH')");
    }
  }

  // here is where we intercept ReadTask's logic to do the highlighting, and nothing else (no
  // retrieval of all field vals)
  @Override
  protected int withTopDocs(IndexSearcher searcher, Query q, TopDocs hits) throws Exception {
    hlImpl.withTopDocs(searcher, q, hits);
    // note: it'd be nice if we knew the sum kilobytes of text across these hits so we could
    // return that. It'd be a more useful number to gauge the amount of work. But given "average"
    // document sizes and lots of queries, returning the number of docs is reasonable.
    return hits.scoreDocs.length; // always return # scored docs.
  }
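  // Each supported highlighter is wrapped behind the small HLImpl interface below, so
  // withTopDocs() above can dispatch to whichever implementation setup() selected without
  // re-checking the type string per query.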
  private interface HLImpl {
    void withTopDocs(IndexSearcher searcher, Query q, TopDocs hits) throws Exception;
  }

  private volatile int preventOptimizeAway = 0;

  private class StandardHLImpl implements HLImpl {
    SimpleHTMLFormatter formatter = new SimpleHTMLFormatter("<em>", "</em>");
    DefaultEncoder encoder = new DefaultEncoder();
    Highlighter highlighter = new Highlighter(formatter, encoder, null);
    boolean termVecs;

    StandardHLImpl(boolean termVecs) {
      highlighter.setEncoder(new DefaultEncoder());
      highlighter.setMaxDocCharsToAnalyze(maxDocCharsToAnalyze);
      this.termVecs = termVecs;
    }

    @Override
    public void withTopDocs(IndexSearcher searcher, Query q, TopDocs hits) throws Exception {
      IndexReader reader = searcher.getIndexReader();
      highlighter.setFragmentScorer(new QueryScorer(q));
      // highlighter.setTextFragmenter(); unfortunately no sentence mechanism, not even regex.
      // Default here is trivial
      for (ScoreDoc scoreDoc : docIdOrder(hits.scoreDocs)) {
        Document document = reader.document(scoreDoc.doc, hlFields);
        Fields tvFields = termVecs ? reader.getTermVectors(scoreDoc.doc) : null;
        for (IndexableField indexableField : document) {
          TokenStream tokenStream;
          if (termVecs) {
            tokenStream = TokenSources.getTokenStream(indexableField.name(), tvFields,
                indexableField.stringValue(), analyzer, maxDocCharsToAnalyze);
          } else {
            tokenStream = analyzer.tokenStream(indexableField.name(), indexableField.stringValue());
          }
          // will close TokenStream:
          String[] fragments = highlighter.getBestFragments(tokenStream, indexableField.stringValue(), maxFrags);
          preventOptimizeAway = fragments.length;
        }
      }
    }
  }

  private class FastVectorHLImpl implements HLImpl {
    int fragSize = 100;
    WeightedFragListBuilder fragListBuilder = new WeightedFragListBuilder();
    BoundaryScanner bs = new BreakIteratorBoundaryScanner(BreakIterator.getSentenceInstance(Locale.ENGLISH));
    ScoreOrderFragmentsBuilder fragmentsBuilder = new ScoreOrderFragmentsBuilder(bs);
    String[] preTags = {"<em>"};
    String[] postTags = {"</em>"};
    Encoder encoder = new DefaultEncoder(); // new SimpleHTMLEncoder();
    FastVectorHighlighter highlighter = new FastVectorHighlighter(
        true,   // phraseHighlight
        false); // requireFieldMatch -- not pertinent to our benchmark

    @Override
    public void withTopDocs(IndexSearcher searcher, Query q, TopDocs hits) throws Exception {
      IndexReader reader = searcher.getIndexReader();
      final FieldQuery fq = highlighter.getFieldQuery(q, reader);
      for (ScoreDoc scoreDoc : docIdOrder(hits.scoreDocs)) {
        for (String hlField : hlFields) {
          String[] fragments = highlighter.getBestFragments(fq, reader, scoreDoc.doc, hlField,
              fragSize, maxFrags, fragListBuilder, fragmentsBuilder, preTags, postTags, encoder);
          preventOptimizeAway = fragments.length;
        }
      }
    }
  }

  private ScoreDoc[] docIdOrder(ScoreDoc[] scoreDocs) {
    ScoreDoc[] clone = new ScoreDoc[scoreDocs.length];
    System.arraycopy(scoreDocs, 0, clone, 0, scoreDocs.length);
    ArrayUtil.introSort(clone, (a, b) -> Integer.compare(a.doc, b.doc));
    return clone;
  }

  private class PostingsHLImpl implements HLImpl {
    PostingsHighlighter highlighter;
    String[] fields = hlFields.toArray(new String[hlFields.size()]);
    int[] maxPassages;

    PostingsHLImpl() {
      highlighter = new PostingsHighlighter(maxDocCharsToAnalyze) {
        @Override
        protected Analyzer getIndexAnalyzer(String field) { // thus support wildcards
          return analyzer;
        }

        @Override
        protected BreakIterator getBreakIterator(String field) {
          return BreakIterator.getSentenceInstance(Locale.ENGLISH);
        }
      };
      maxPassages = new int[hlFields.size()];
      Arrays.fill(maxPassages, maxFrags);
    }
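    // highlightFields() takes parallel arrays, one maxPassages entry per field, which is why
    // every field is capped at the same maxFrags count above.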
    @Override
    public void withTopDocs(IndexSearcher searcher, Query q, TopDocs hits) throws Exception {
      Map<String, String[]> result = highlighter.highlightFields(fields, q, searcher, hits, maxPassages);
      preventOptimizeAway = result.size();
    }
  }

  private class UnifiedHLImpl implements HLImpl {
    UnifiedHighlighter highlighter;
    IndexSearcher lastSearcher;
    UnifiedHighlighter.OffsetSource offsetSource; // null means auto select
    String[] fields = hlFields.toArray(new String[hlFields.size()]);
    int[] maxPassages;

    UnifiedHLImpl(final UnifiedHighlighter.OffsetSource offsetSource) {
      this.offsetSource = offsetSource;
      maxPassages = new int[hlFields.size()];
      Arrays.fill(maxPassages, maxFrags);
    }

    private void reset(IndexSearcher searcher) {
      if (lastSearcher == searcher) {
        return;
      }
      lastSearcher = searcher;
      highlighter = new UnifiedHighlighter(searcher, analyzer) {
        @Override
        protected OffsetSource getOffsetSource(String field) {
          return offsetSource != null ? offsetSource : super.getOffsetSource(field);
        }
      };
      highlighter.setBreakIterator(() -> BreakIterator.getSentenceInstance(Locale.ENGLISH));
      highlighter.setMaxLength(maxDocCharsToAnalyze);
      highlighter.setHighlightPhrasesStrictly(true);
      highlighter.setHandleMultiTermQuery(true);
    }

    @Override
    public void withTopDocs(IndexSearcher searcher, Query q, TopDocs hits) throws Exception {
      reset(searcher);
      Map<String, String[]> result = highlighter.highlightFields(fields, q, hits, maxPassages);
      preventOptimizeAway = result.size();
    }
  }

  private class NoHLImpl implements HLImpl {
    @Override
    public void withTopDocs(IndexSearcher searcher, Query q, TopDocs hits) throws Exception {
      // just retrieve the HL fields
      for (ScoreDoc scoreDoc : docIdOrder(hits.scoreDocs)) {
        preventOptimizeAway += searcher.doc(scoreDoc.doc, hlFields).iterator().hasNext() ? 2 : 1;
      }
    }
  }
}
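/*
 * For reference, a minimal hypothetical .alg sketch wiring this task up. The config keys are the
 * ones read in setup() above, and the task line follows the class javadoc example; the
 * OpenReader/CloseReader framing and the repetition syntax are the usual benchmark idiom and are
 * assumed here rather than taken from this file:
 *
 *   doc.stored=true
 *   highlighter=UH
 *   highlighter.maxDocCharsToAnalyze=51200
 *
 *   OpenReader
 *   { "SearchHlgtSameRdr" SearchTravRetHighlight(maxFrags[3],fields[body]) > : 1000
 *   CloseReader
 */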