/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.search.fetch.subphase.highlight;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.search.highlight.Encoder;
import org.apache.lucene.search.highlight.Formatter;
import org.apache.lucene.search.highlight.Fragmenter;
import org.apache.lucene.search.highlight.NullFragmenter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.search.highlight.SimpleSpanFragmenter;
import org.apache.lucene.search.highlight.TextFragment;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefHash;
import org.apache.lucene.util.CollectionUtil;
import org.elasticsearch.ExceptionsHelper;
import org.elasticsearch.common.text.Text;
import org.elasticsearch.index.mapper.FieldMapper;
import org.elasticsearch.search.fetch.FetchPhaseExecutionException;
import org.elasticsearch.search.fetch.FetchSubPhase;
import org.elasticsearch.search.internal.SearchContext;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class PlainHighlighter implements Highlighter {

    private static final String CACHE_KEY = "highlight-plain";

    @Override
    public HighlightField highlight(HighlighterContext highlighterContext) {
        SearchContextHighlight.Field field = highlighterContext.field;
        SearchContext context = highlighterContext.context;
        FetchSubPhase.HitContext hitContext = highlighterContext.hitContext;
        FieldMapper mapper = highlighterContext.mapper;

        Encoder encoder = field.fieldOptions().encoder().equals("html") ?
            HighlightUtils.Encoders.HTML : HighlightUtils.Encoders.DEFAULT;

        if (!hitContext.cache().containsKey(CACHE_KEY)) {
            Map<FieldMapper, org.apache.lucene.search.highlight.Highlighter> mappers = new HashMap<>();
            hitContext.cache().put(CACHE_KEY, mappers);
        }
        @SuppressWarnings("unchecked")
        Map<FieldMapper, org.apache.lucene.search.highlight.Highlighter> cache =
            (Map<FieldMapper, org.apache.lucene.search.highlight.Highlighter>) hitContext.cache().get(CACHE_KEY);

        org.apache.lucene.search.highlight.Highlighter entry = cache.get(mapper);
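        // lazily build the Lucene highlighter for this field and cache it in the hit context,
        // so it can be reused for the remaining hits of the same fetch phase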
        if (entry == null) {
            QueryScorer queryScorer = new CustomQueryScorer(highlighterContext.query,
                field.fieldOptions().requireFieldMatch() ? mapper.fieldType().name() : null);
            queryScorer.setExpandMultiTermQuery(true);
            Fragmenter fragmenter;
            if (field.fieldOptions().numberOfFragments() == 0) {
                fragmenter = new NullFragmenter();
            } else if (field.fieldOptions().fragmenter() == null) {
                fragmenter = new SimpleSpanFragmenter(queryScorer, field.fieldOptions().fragmentCharSize());
            } else if ("simple".equals(field.fieldOptions().fragmenter())) {
                fragmenter = new SimpleFragmenter(field.fieldOptions().fragmentCharSize());
            } else if ("span".equals(field.fieldOptions().fragmenter())) {
                fragmenter = new SimpleSpanFragmenter(queryScorer, field.fieldOptions().fragmentCharSize());
            } else {
                throw new IllegalArgumentException("unknown fragmenter option [" + field.fieldOptions().fragmenter()
                    + "] for the field [" + highlighterContext.fieldName + "]");
            }
            Formatter formatter = new SimpleHTMLFormatter(field.fieldOptions().preTags()[0],
                field.fieldOptions().postTags()[0]);

            entry = new org.apache.lucene.search.highlight.Highlighter(formatter, encoder, queryScorer);
            entry.setTextFragmenter(fragmenter);
            // always highlight across all data
            entry.setMaxDocCharsToAnalyze(Integer.MAX_VALUE);

            cache.put(mapper, entry);
        }

        // a HACK to make the highlighter do highlighting, even though it's using the single frag list builder
        int numberOfFragments = field.fieldOptions().numberOfFragments() == 0 ?
            1 : field.fieldOptions().numberOfFragments();
        ArrayList<TextFragment> fragsList = new ArrayList<>();
        List<Object> textsToHighlight;
        Analyzer analyzer = context.mapperService().documentMapper(hitContext.hit().getType()).mappers().indexAnalyzer();

        try {
            textsToHighlight = HighlightUtils.loadFieldValues(field, mapper, context, hitContext);

            for (Object textToHighlight : textsToHighlight) {
                String text;
                if (textToHighlight instanceof BytesRef) {
                    text = mapper.fieldType().valueForDisplay(textToHighlight).toString();
                } else {
                    text = textToHighlight.toString();
                }

                try (TokenStream tokenStream = analyzer.tokenStream(mapper.fieldType().name(), text)) {
                    if (!tokenStream.hasAttribute(CharTermAttribute.class)
                            || !tokenStream.hasAttribute(OffsetAttribute.class)) {
                        // can't perform highlighting if the stream has no terms (binary token stream) or no offsets
                        continue;
                    }
                    TextFragment[] bestTextFragments =
                        entry.getBestTextFragments(tokenStream, text, false, numberOfFragments);
                    for (TextFragment bestTextFragment : bestTextFragments) {
                        if (bestTextFragment != null && bestTextFragment.getScore() > 0) {
                            fragsList.add(bestTextFragment);
                        }
                    }
                }
            }
        } catch (Exception e) {
            if (ExceptionsHelper.unwrap(e, BytesRefHash.MaxBytesLengthExceededException.class) != null) {
                // this can happen if, for example, a field is not_analyzed and the ignore_above option is set:
                // the field will be ignored when indexing but the huge term is still in the source and
                // the plain highlighter will parse the source and try to analyze it.
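                // skip highlighting for this field rather than failing the whole fetch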
                return null;
            } else {
                throw new FetchPhaseExecutionException(context,
                    "Failed to highlight field [" + highlighterContext.fieldName + "]", e);
            }
        }

        if (field.fieldOptions().scoreOrdered()) {
            CollectionUtil.introSort(fragsList, new Comparator<TextFragment>() {
                @Override
                public int compare(TextFragment o1, TextFragment o2) {
                    // order fragments by score, highest first
                    return Float.compare(o2.getScore(), o1.getScore());
                }
            });
        }
        String[] fragments;
        // number_of_fragments is set to 0 but we have a multivalued field
        if (field.fieldOptions().numberOfFragments() == 0 && textsToHighlight.size() > 1 && fragsList.size() > 0) {
            fragments = new String[fragsList.size()];
            for (int i = 0; i < fragsList.size(); i++) {
                fragments[i] = fragsList.get(i).toString();
            }
        } else {
            // refine numberOfFragments if needed
            numberOfFragments = fragsList.size() < numberOfFragments ? fragsList.size() : numberOfFragments;
            fragments = new String[numberOfFragments];
            for (int i = 0; i < fragments.length; i++) {
                fragments[i] = fragsList.get(i).toString();
            }
        }

        if (fragments.length > 0) {
            return new HighlightField(highlighterContext.fieldName, Text.convertFromStringArray(fragments));
        }

        int noMatchSize = highlighterContext.field.fieldOptions().noMatchSize();
        if (noMatchSize > 0 && textsToHighlight.size() > 0) {
            // Pull an excerpt from the beginning of the string but make sure to split the string on a term boundary.
            String fieldContents = textsToHighlight.get(0).toString();
            int end;
            try {
                end = findGoodEndForNoHighlightExcerpt(noMatchSize, analyzer, mapper.fieldType().name(), fieldContents);
            } catch (Exception e) {
                throw new FetchPhaseExecutionException(context,
                    "Failed to highlight field [" + highlighterContext.fieldName + "]", e);
            }
            if (end > 0) {
                return new HighlightField(highlighterContext.fieldName,
                    new Text[] { new Text(fieldContents.substring(0, end)) });
            }
        }
        return null;
    }

    @Override
    public boolean canHighlight(FieldMapper fieldMapper) {
        return true;
    }

    private static int findGoodEndForNoHighlightExcerpt(int noMatchSize, Analyzer analyzer, String fieldName,
            String contents) throws IOException {
        try (TokenStream tokenStream = analyzer.tokenStream(fieldName, contents)) {
            if (!tokenStream.hasAttribute(OffsetAttribute.class)) {
                // Can't split on term boundaries without offsets
                return -1;
            }
            int end = -1;
            tokenStream.reset();
            while (tokenStream.incrementToken()) {
                OffsetAttribute attr = tokenStream.getAttribute(OffsetAttribute.class);
                if (attr.endOffset() >= noMatchSize) {
                    // Jump to the end of this token if it wouldn't put us past the boundary
                    if (attr.endOffset() == noMatchSize) {
                        end = noMatchSize;
                    }
                    return end;
                }
                end = attr.endOffset();
            }
            tokenStream.end();
            // We've exhausted the token stream so we should just highlight everything.
            return end;
        }
    }
}