PostingsSolrHighlighter.java example

Explorer
lucene-solr-master
- lucene
- solr
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.highlight;

import java.io.IOException;
import java.text.BreakIterator;
import java.util.Collections;
import java.util.Locale;
import java.util.Map;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.postingshighlight.CustomSeparatorBreakIterator;
import org.apache.lucene.search.postingshighlight.DefaultPassageFormatter;
import org.apache.lucene.search.postingshighlight.Passage;
import org.apache.lucene.search.postingshighlight.PassageFormatter;
import org.apache.lucene.search.postingshighlight.PassageScorer;
import org.apache.lucene.search.postingshighlight.PostingsHighlighter;
import org.apache.lucene.search.postingshighlight.WholeBreakIterator;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.HighlightParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.core.PluginInfo;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.search.DocIterator;
import org.apache.solr.search.DocList;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.util.plugin.PluginInfoInitialized;

/** 
 * Highlighter impl that uses {@link PostingsHighlighter}
 * <p>
 * Example configuration:
 * <pre class="prettyprint">
 *   <requestHandler name="/select" class="solr.SearchHandler">
 *     <lst name="defaults">
 *       <str name="hl.method">postings</str>
 *       <int name="hl.snippets">1</int>
 *       <str name="hl.tag.pre">&lt;em&gt;</str>
 *       <str name="hl.tag.post">&lt;/em&gt;</str>
 *       <str name="hl.tag.ellipsis">... </str>
 *       <bool name="hl.defaultSummary">true</bool>
 *       <str name="hl.encoder">simple</str>
 *       <float name="hl.score.k1">1.2</float>
 *       <float name="hl.score.b">0.75</float>
 *       <float name="hl.score.pivot">87</float>
 *       <str name="hl.bs.language"></str>
 *       <str name="hl.bs.country"></str>
 *       <str name="hl.bs.variant"></str>
 *       <str name="hl.bs.type">SENTENCE</str>
 *       <int name="hl.maxAnalyzedChars">51200</int>
 *       <str name="hl.multiValuedSeparatorChar"> </str>
 *       <bool name="hl.highlightMultiTerm">false</bool>
 *     </lst>
 *   </requestHandler>
 * </pre>
 * <p>
 * Notes:
 *  <ul>
 *    <li>fields to highlight must be configured with storeOffsetsWithPositions="true"
 *    <li>hl.q (string) can specify the query
 *    <li>hl.fl (string) specifies the field list.
 *    <li>hl.snippets (int) specifies how many underlying passages form the resulting snippet.
 *    <li>hl.tag.pre (string) specifies text which appears before a highlighted term.
 *    <li>hl.tag.post (string) specifies text which appears after a highlighted term.
 *    <li>hl.tag.ellipsis (string) specifies text which joins non-adjacent passages.
 *    <li>hl.defaultSummary (bool) specifies if a field should have a default summary.
 *    <li>hl.encoder (string) can be 'html' (html escapes content) or 'simple' (no escaping).
 *    <li>hl.score.k1 (float) specifies bm25 scoring parameter 'k1'
 *    <li>hl.score.b (float) specifies bm25 scoring parameter 'b'
 *    <li>hl.score.pivot (float) specifies bm25 scoring parameter 'avgdl'
 *    <li>hl.bs.type (string) specifies how to divide text into passages: [SENTENCE, LINE, WORD, CHAR, WHOLE]
 *    <li>hl.bs.language (string) specifies language code for BreakIterator. default is empty string (root locale)
 *    <li>hl.bs.country (string) specifies country code for BreakIterator. default is empty string (root locale)
 *    <li>hl.bs.variant (string) specifies country code for BreakIterator. default is empty string (root locale)
 *    <li>hl.maxAnalyzedChars specifies how many characters at most will be processed in a document.
 *    <li>hl.multiValuedSeparatorChar specifies the logical separator between values for multi-valued fields.
 *    <li>hl.highlightMultiTerm enables highlighting for range/wildcard/fuzzy/prefix queries.
 *        NOTE: currently hl.maxAnalyzedChars cannot yet be specified per-field
 *  </ul>
 *  
 * @lucene.experimental 
 */
public class PostingsSolrHighlighter extends SolrHighlighter implements PluginInfoInitialized {
  
  @Override
  public void init(PluginInfo info) {}

  @Override
  public NamedList<Object> doHighlighting(DocList docs, Query query, SolrQueryRequest req, String[] defaultFields) throws IOException {
    final SolrParams params = req.getParams(); 
    
    // if highlighting isnt enabled, then why call doHighlighting?
    if (!isHighlightingEnabled(params))
      return null;

    SolrIndexSearcher searcher = req.getSearcher();
    int[] docIDs = toDocIDs(docs);

    // fetch the unique keys
    String[] keys = getUniqueKeys(searcher, docIDs);

    // query-time parameters
    String[] fieldNames = getHighlightFields(query, req, defaultFields);

    int maxPassages[] = new int[fieldNames.length];
    for (int i = 0; i < fieldNames.length; i++) {
      maxPassages[i] = params.getFieldInt(fieldNames[i], HighlightParams.SNIPPETS, 1);
    }

    PostingsHighlighter highlighter = getHighlighter(req);
    Map<String,String[]> snippets = highlighter.highlightFields(fieldNames, query, searcher, docIDs, maxPassages);
    return encodeSnippets(keys, fieldNames, snippets);
  }

  /** Creates an instance of the Lucene PostingsHighlighter. Provided for subclass extension so that
   * a subclass can return a subclass of {@link PostingsSolrHighlighter.SolrExtendedPostingsHighlighter}. */
  protected PostingsHighlighter getHighlighter(SolrQueryRequest req) {
    return new SolrExtendedPostingsHighlighter(req);
  }

  /** 
   * Encodes the resulting snippets into a namedlist
   * @param keys the document unique keys
   * @param fieldNames field names to highlight in the order
   * @param snippets map from field name to snippet array for the docs
   * @return encoded namedlist of summaries
   */
  protected NamedList<Object> encodeSnippets(String[] keys, String[] fieldNames, Map<String,String[]> snippets) {
    NamedList<Object> list = new SimpleOrderedMap<>();
    for (int i = 0; i < keys.length; i++) {
      NamedList<Object> summary = new SimpleOrderedMap<>();
      for (String field : fieldNames) {
        String snippet = snippets.get(field)[i];
        // box in an array to match the format of existing highlighters, 
        // even though it's always one element.
        if (snippet == null) {
          summary.add(field, new String[0]);
        } else {
          summary.add(field, new String[] { snippet });
        }
      }
      list.add(keys[i], summary);
    }
    return list;
  }
  
  /** Converts solr's DocList to the int[] docIDs */
  protected int[] toDocIDs(DocList docs) {
    int[] docIDs = new int[docs.size()];
    DocIterator iterator = docs.iterator();
    for (int i = 0; i < docIDs.length; i++) {
      if (!iterator.hasNext()) {
        throw new AssertionError();
      }
      docIDs[i] = iterator.nextDoc();
    }
    if (iterator.hasNext()) {
      throw new AssertionError();
    }
    return docIDs;
  }
  
  /** Retrieves the unique keys for the topdocs to key the results */
  protected String[] getUniqueKeys(SolrIndexSearcher searcher, int[] docIDs) throws IOException {
    IndexSchema schema = searcher.getSchema();
    SchemaField keyField = schema.getUniqueKeyField();
    if (keyField != null) {
      Set<String> selector = Collections.singleton(keyField.getName());
      String uniqueKeys[] = new String[docIDs.length];
      for (int i = 0; i < docIDs.length; i++) {
        int docid = docIDs[i];
        Document doc = searcher.doc(docid, selector);
        String id = schema.printableUniqueKey(doc);
        uniqueKeys[i] = id;
      }
      return uniqueKeys;
    } else {
      return new String[docIDs.length];
    }
  }

  /** From {@link #getHighlighter(org.apache.solr.request.SolrQueryRequest)}. */
  public class SolrExtendedPostingsHighlighter extends PostingsHighlighter {
    protected final SolrParams params;
    protected final IndexSchema schema;

    public SolrExtendedPostingsHighlighter(SolrQueryRequest req) {
      super(req.getParams().getInt(HighlightParams.MAX_CHARS, DEFAULT_MAX_CHARS));
      this.params = req.getParams();
      this.schema = req.getSchema();
    }

    @Override
    protected Passage[] getEmptyHighlight(String fieldName, BreakIterator bi, int maxPassages) {
      boolean defaultSummary = params.getFieldBool(fieldName, HighlightParams.DEFAULT_SUMMARY, true);
      if (defaultSummary) {
        return super.getEmptyHighlight(fieldName, bi, maxPassages);
      } else {
        //TODO reuse logic of DefaultSolrHighlighter.alternateField
        return new Passage[0];
      }
    }

    @Override
    protected PassageFormatter getFormatter(String fieldName) {
      String preTag = params.getFieldParam(fieldName, HighlightParams.TAG_PRE, "<em>");
      String postTag = params.getFieldParam(fieldName, HighlightParams.TAG_POST, "</em>");
      String ellipsis = params.getFieldParam(fieldName, HighlightParams.TAG_ELLIPSIS, "... ");
      String encoder = params.getFieldParam(fieldName, HighlightParams.ENCODER, "simple");
      return new DefaultPassageFormatter(preTag, postTag, ellipsis, "html".equals(encoder));
    }

    @Override
    protected PassageScorer getScorer(String fieldName) {
      float k1 = params.getFieldFloat(fieldName, HighlightParams.SCORE_K1, 1.2f);
      float b = params.getFieldFloat(fieldName, HighlightParams.SCORE_B, 0.75f);
      float pivot = params.getFieldFloat(fieldName, HighlightParams.SCORE_PIVOT, 87f);
      return new PassageScorer(k1, b, pivot);
    }

    @Override
    protected BreakIterator getBreakIterator(String field) {
      String type = params.getFieldParam(field, HighlightParams.BS_TYPE);
      if ("WHOLE".equals(type)) {
        return new WholeBreakIterator();
      } else if ("SEPARATOR".equals(type)) {
        char customSep = parseBiSepChar(params.getFieldParam(field, HighlightParams.BS_SEP));
        return new CustomSeparatorBreakIterator(customSep);
      } else {
        String language = params.getFieldParam(field, HighlightParams.BS_LANGUAGE);
        String country = params.getFieldParam(field, HighlightParams.BS_COUNTRY);
        String variant = params.getFieldParam(field, HighlightParams.BS_VARIANT);
        Locale locale = parseLocale(language, country, variant);
        return parseBreakIterator(type, locale);
      }
    }

    /**
     * parse custom separator char for {@link CustomSeparatorBreakIterator}
     */
    protected char parseBiSepChar(String sepChar) {
      if (sepChar == null) {
        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, HighlightParams.BS_SEP + " not passed");
      }
      if (sepChar.length() != 1) {
        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, HighlightParams.BS_SEP +
            " must be a single char but got: '" + sepChar + "'");
      }
      return sepChar.charAt(0);
    }

    @Override
    protected char getMultiValuedSeparator(String field) {
      String sep = params.getFieldParam(field, HighlightParams.MULTI_VALUED_SEPARATOR, " ");
      if (sep.length() != 1) {
        throw new IllegalArgumentException(HighlightParams.MULTI_VALUED_SEPARATOR + " must be exactly one character.");
      }
      return sep.charAt(0);
    }

    @Override
    protected Analyzer getIndexAnalyzer(String field) {
      if (params.getFieldBool(field, HighlightParams.HIGHLIGHT_MULTI_TERM, false)) {
        return schema.getIndexAnalyzer();
      } else {
        return null;
      }
    }
  }

  /** parse a break iterator type for the specified locale */
  protected BreakIterator parseBreakIterator(String type, Locale locale) {
    if (type == null || "SENTENCE".equals(type)) {
      return BreakIterator.getSentenceInstance(locale);
    } else if ("LINE".equals(type)) {
      return BreakIterator.getLineInstance(locale);
    } else if ("WORD".equals(type)) {
      return BreakIterator.getWordInstance(locale);
    } else if ("CHARACTER".equals(type)) {
      return BreakIterator.getCharacterInstance(locale);
    } else {
      throw new IllegalArgumentException("Unknown " + HighlightParams.BS_TYPE + ": " + type);
    }
  }
  
  /** parse a locale from a language+country+variant spec */
  protected Locale parseLocale(String language, String country, String variant) {
    if (language == null && country == null && variant == null) {
      return Locale.ROOT;
    } else if (language != null && country == null && variant != null) {
      throw new IllegalArgumentException("To specify variant, country is required");
    } else if (language != null && country != null && variant != null) {
      return new Locale(language, country, variant);
    } else if (language != null && country != null) {
      return new Locale(language, country);
    } else { 
      return new Locale(language);
    }
  }
}