/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.uhighlight;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.PriorityQueue;
import java.util.Set;
import java.util.TreeSet;
import java.util.function.Function;
import java.util.function.Predicate;

import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.FilterLeafReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TwoPhaseIterator;
import org.apache.lucene.search.highlight.WeightedSpanTerm;
import org.apache.lucene.search.highlight.WeightedSpanTermExtractor;
import org.apache.lucene.search.spans.SpanCollector;
import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanWeight;
import org.apache.lucene.search.spans.Spans;
import org.apache.lucene.util.BytesRef;

/**
 * Helps the {@link FieldOffsetStrategy} with strict position highlighting (e.g. highlighting phrases correctly).
 * This is a stateful class holding information about the query, but it can be (and is) re-used across the
 * documents being highlighted.  Despite this state, it's immutable after construction.  The approach taken in
 * this class is very similar to the standard Highlighter's {@link WeightedSpanTermExtractor}, which is in fact
 * re-used here.  However, we ought to completely rewrite it to use the SpanCollector interface to collect
 * offsets directly; that would give us better phrase accuracy.
 *
 * @lucene.internal
 */
public class PhraseHelper {

  public static final PhraseHelper NONE = new PhraseHelper(new MatchAllDocsQuery(), "_ignored_",
      (s) -> false, spanQuery -> null, query -> null, true);
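
  // Hedged usage sketch (not part of this file): a caller such as FieldOffsetStrategy constructs one
  // instance per highlighted field, much like the NONE constant above but with real arguments, e.g.:
  //   PhraseHelper helper = new PhraseHelper(query, "body", "body"::equals,
  //       sq -> null,  // rewriteQueryPred: null defers to WeightedSpanTermExtractor.mustRewriteQuery()
  //       q -> null,   // preExtractRewriteFunction: null means no pre-extract substitution
  //       false);      // don't ignore queries needing rewrite, so wildcards inside phrases still work
  // "body" is a hypothetical field name; the null-returning lambdas mirror the defaults NONE uses.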
  //TODO it seems this ought to be a general thing on Spans?
  private static final Comparator<? super Spans> SPANS_COMPARATOR = (o1, o2) -> {
    int cmp = Integer.compare(o1.docID(), o2.docID());
    if (cmp != 0) {
      return cmp;
    }
    if (o1.docID() == DocIdSetIterator.NO_MORE_DOCS) {
      return 0; // don't ask for start/end position; not sure if we can even call those methods
    }
    cmp = Integer.compare(o1.startPosition(), o2.startPosition());
    if (cmp != 0) {
      return cmp;
    } else {
      return Integer.compare(o1.endPosition(), o2.endPosition());
    }
  };

  private final String fieldName;
  private final Set<Term> positionInsensitiveTerms; // (TermQuery terms)
  private final Set<SpanQuery> spanQueries;
  private final boolean willRewrite;
  private final Predicate<String> fieldMatcher;

  /**
   * Constructor.
   * {@code rewriteQueryPred} is an extension hook to override the default choice of
   * {@link WeightedSpanTermExtractor#mustRewriteQuery(SpanQuery)}. By default unknown query types are rewritten,
   * so use this to return {@link Boolean#FALSE} if you know the query doesn't need to be rewritten.
   * Similarly, {@code preExtractRewriteFunction} is an extension hook that allows substitute queries to be
   * supplied before the {@link WeightedSpanTermExtractor}'s extraction is invoked on a given query.
   * {@code ignoreQueriesNeedingRewrite} effectively ignores any query clause that needs to be "rewritten", which
   * is usually limited to just a {@link SpanMultiTermQueryWrapper} but could be other custom ones.
   * {@code fieldMatcher} is the field name predicate used for extracting the query part that must be highlighted.
   */
  public PhraseHelper(Query query, String field, Predicate<String> fieldMatcher,
                      Function<SpanQuery, Boolean> rewriteQueryPred,
                      Function<Query, Collection<Query>> preExtractRewriteFunction,
                      boolean ignoreQueriesNeedingRewrite) {
    this.fieldName = field;
    this.fieldMatcher = fieldMatcher;
    // filter terms to those we want
    positionInsensitiveTerms = new FieldFilteringTermSet();
    spanQueries = new HashSet<>();

    // TODO Have toSpanQuery(query) Function as an extension point for those with custom Query impls

    boolean[] mustRewriteHolder = {false}; // boolean wrapped in 1-ary array so it's mutable from inner class

    // For TermQueries or other position insensitive queries, collect the Terms.
    // For other Query types, WSTE will convert to an equivalent SpanQuery.  NOT extracting position spans here.
    new WeightedSpanTermExtractor(field) {
      //anonymous constructor
      {
        setExpandMultiTermQuery(true); //necessary for mustRewriteQuery(spanQuery) to work.

        try {
          extract(query, 1f, null); // null because we won't actually extract right now; we're not collecting
        } catch (Exception e) {
          throw new RuntimeException(e);
        }
      }

      @Override
      protected void extract(Query query, float boost, Map<String, WeightedSpanTerm> terms) throws IOException {
        Collection<Query> newQueriesToExtract = preExtractRewriteFunction.apply(query);
        if (newQueriesToExtract != null) {
          for (Query newQuery : newQueriesToExtract) {
            extract(newQuery, boost, terms);
          }
        } else {
          super.extract(query, boost, terms);
        }
      }

      @Override
      protected boolean isQueryUnsupported(Class<? extends Query> clazz) {
        if (clazz.isAssignableFrom(MultiTermQuery.class)) {
          return true; //We do MTQ processing separately in MultiTermHighlighting.java
        }
        return true; //TODO set to false and provide a hook to customize certain queries.
      }

      @Override
      protected void extractWeightedTerms(Map<String, WeightedSpanTerm> terms, Query query, float boost)
          throws IOException {
        query.createWeight(UnifiedHighlighter.EMPTY_INDEXSEARCHER, false, boost)
            .extractTerms(positionInsensitiveTerms);
      }

      @Override
      protected void extractWeightedSpanTerms(Map<String, WeightedSpanTerm> terms, SpanQuery spanQuery,
                                              float boost) throws IOException {
        // if this span query isn't for this field, skip it.
        Set<String> fieldNameSet = new HashSet<>(); //TODO reuse.  note: almost always size 1
        collectSpanQueryFields(spanQuery, fieldNameSet);
        for (String spanField : fieldNameSet) {
          if (!fieldMatcher.test(spanField)) {
            return;
          }
        }

        // TODO allow users to override the answer to mustRewriteQuery
        boolean mustRewriteQuery = mustRewriteQuery(spanQuery);
        if (ignoreQueriesNeedingRewrite && mustRewriteQuery) {
          return; // ignore this query
        }
        mustRewriteHolder[0] |= mustRewriteQuery;

        spanQueries.add(spanQuery);
      }

      @Override
      protected boolean mustRewriteQuery(SpanQuery spanQuery) {
        Boolean rewriteQ = rewriteQueryPred.apply(spanQuery); // allow the predicate to override
        return rewriteQ != null ? rewriteQ : super.mustRewriteQuery(spanQuery);
      }
    }; // calling the constructor triggered the extraction/visiting we want.  Hacky; yes.

    willRewrite = mustRewriteHolder[0];
  }
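
  // Hedged illustration with a hypothetical query: for  +body:apple +body:"fruit salad"  on field "body",
  // the extraction above would typically leave:
  //   positionInsensitiveTerms = { body:apple }   (plain TermQuery; needs no position filtering)
  //   spanQueries              = { the SpanNearQuery-equivalent of the "fruit salad" phrase }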
  Set<SpanQuery> getSpanQueries() {
    return spanQueries;
  }

  /**
   * If there is no position sensitivity then use of the instance of this class can be ignored.
   */
  boolean hasPositionSensitivity() {
    return spanQueries.isEmpty() == false;
  }

  /**
   * Rewrite is needed for handling a {@link SpanMultiTermQueryWrapper} (MTQ / wildcards) or some
   * custom things.  When true, the resulting term list will probably be different than what it was known
   * to be initially.
   */
  boolean willRewrite() {
    return willRewrite;
  }

  /**
   * Collect a list of pre-positioned {@link Spans} for each term, given a reader that has just one document.
   * It returns no mapping for query terms that occur in a position insensitive way and which therefore don't
   * need to be filtered.
   */
  Map<BytesRef, Spans> getTermToSpans(LeafReader leafReader, int doc) throws IOException {
    if (spanQueries.isEmpty()) {
      return Collections.emptyMap();
    }
    final LeafReader filteredReader = new SingleFieldFilterLeafReader(leafReader, fieldName);
    // for each SpanQuery, collect the member spans into a map.
    Map<BytesRef, Spans> result = new HashMap<>();
    for (SpanQuery spanQuery : spanQueries) {
      getTermToSpans(spanQuery, filteredReader.getContext(), doc, result);
    }
    return result;
  }
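
  // Hedged sketch of the result shape: if the only span query is the hypothetical phrase "fruit salad"
  // and it matches the given doc, the returned map would look like
  //   { fruit -> CachedSpans(phrase matches), salad -> CachedSpans(phrase matches) }
  // i.e. every term of a span query shares (a clone of) the positions at which the whole span query matched.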
  // code extracted & refactored from WSTE.extractWeightedSpanTerms()
  private void getTermToSpans(SpanQuery spanQuery, LeafReaderContext readerContext,
                              int doc, Map<BytesRef, Spans> result) throws IOException {
    // note: in WSTE there was some field-specific looping that seemed pointless, so that isn't here.
    final IndexSearcher searcher = new IndexSearcher(readerContext.reader());
    searcher.setQueryCache(null);

    if (willRewrite) {
      spanQuery = (SpanQuery) searcher.rewrite(spanQuery); // searcher.rewrite loops till done
    }

    // Get the underlying query terms
    TreeSet<Term> termSet = new FieldFilteringTermSet(); // sorted so we can loop over results in order shortly...
    searcher.createWeight(spanQuery, false, 1.0f).extractTerms(termSet); //needsScores==false

    // Get Spans by running the query against the reader
    // TODO it might make sense to re-use/cache the Spans instance, to advance forward between docs
    SpanWeight spanWeight = (SpanWeight) searcher.createNormalizedWeight(spanQuery, false);
    Spans spans = spanWeight.getSpans(readerContext, SpanWeight.Postings.POSITIONS);
    if (spans == null) {
      return;
    }
    TwoPhaseIterator twoPhaseIterator = spans.asTwoPhaseIterator();
    if (twoPhaseIterator != null) {
      if (twoPhaseIterator.approximation().advance(doc) != doc || !twoPhaseIterator.matches()) {
        return;
      }
    } else if (spans.advance(doc) != doc) { // pre-position to the doc, and return doing nothing if we find none
      return;
    }

    // Consume the Spans into a cache.  This instance is used as a source for multiple cloned copies.
    // It's important we do this and not re-use the same original Spans instance since these will be iterated
    // independently later on; sometimes in ways that prevent sharing the original Spans.
    CachedSpans cachedSpansSource = new CachedSpans(spans); // consumes spans for this doc only and caches
    spans = null; // we don't use it below

    // Map terms to a Spans instance (aggregate if necessary)
    for (final Term queryTerm : termSet) {
      // note: we expect that at least one query term will pass these filters.  This is because the collected
      //   spanQuery list was already filtered by these conditions.
      if (positionInsensitiveTerms.contains(queryTerm)) {
        continue;
      }
      // copy-constructor refers to same data (shallow) but has iteration state from the beginning
      CachedSpans cachedSpans = new CachedSpans(cachedSpansSource);
      // Add the span to whatever span may or may not exist
      Spans existingSpans = result.get(queryTerm.bytes());
      if (existingSpans != null) {
        if (existingSpans instanceof MultiSpans) {
          ((MultiSpans) existingSpans).addSpans(cachedSpans);
        } else { // upgrade to MultiSpans
          MultiSpans multiSpans = new MultiSpans();
          multiSpans.addSpans(existingSpans);
          multiSpans.addSpans(cachedSpans);
          result.put(queryTerm.bytes(), multiSpans);
        }
      } else {
        result.put(queryTerm.bytes(), cachedSpans);
      }
    }
  }

  /**
   * Returns the terms as a List, expanded to include any terms in this PhraseHelper's term-to-Spans keySet
   * if present.  That can only happen if willRewrite() is true.
   */
  List<BytesRef> expandTermsIfRewrite(BytesRef[] terms, Map<BytesRef, Spans> strictPhrasesTermToSpans) {
    if (willRewrite()) {
      Set<BytesRef> allTermSet = new LinkedHashSet<>(terms.length + strictPhrasesTermToSpans.size());
      Collections.addAll(allTermSet, terms); //FYI already sorted; will keep order if nothing else is added
      if (allTermSet.addAll(strictPhrasesTermToSpans.keySet())) { // true if any were added
        List<BytesRef> sourceTerms = Arrays.asList(allTermSet.toArray(new BytesRef[allTermSet.size()]));
        sourceTerms.sort(Comparator.naturalOrder());
        return sourceTerms;
      }
    }
    return Arrays.asList(terms); // no rewrite; use original terms
  }
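
  // Hedged example for expandTermsIfRewrite() above, with hypothetical terms: given
  //   terms = [apple]  and  strictPhrasesTermToSpans keyed by [applesauce]  (from a rewritten wildcard),
  // the result is the re-sorted union [apple, applesauce]; with no rewrite it's just Arrays.asList(terms).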
  /**
   * Returns a filtered postings where the position must be in the given Spans.
   * The Spans must be in a positioned state (not initial) and should not be shared between other terms.
   * {@code postingsEnum} should be positioned at the document (the same one as the spans) but it must not
   * have iterated the positions yet.
   * The Spans should be the result of a simple lookup from {@link #getTermToSpans(LeafReader, int)}, and
   * so it could be null, which could mean either that it's completely filtered or that there should be no
   * filtering; this method knows what to do.
   * <p>
   * Due to limitations in filtering, the {@link PostingsEnum#freq()} is un-changed even if some positions
   * get filtered.  So when {@link PostingsEnum#nextPosition()}, {@code startOffset}, or {@code endOffset}
   * are called beyond the "real" (unfiltered) positions, these methods return {@link Integer#MAX_VALUE}.
   * <p>
   * <b>This will return null if it's completely filtered out (i.e. effectively has no postings).</b>
   */
  PostingsEnum filterPostings(BytesRef term, PostingsEnum postingsEnum, Spans spans) throws IOException {
    if (spans == null) {
      if (hasPositionSensitivity() == false || positionInsensitiveTerms.contains(new Term(fieldName, term))) {
        return postingsEnum; // no filtering
      } else {
        return null; // completely filtered out
      }
    }
    if (postingsEnum.docID() != spans.docID()) {
      throw new IllegalStateException("Spans & Postings doc ID misaligned or not positioned");
    }

    return new FilterLeafReader.FilterPostingsEnum(postingsEnum) {
      // freq() is the max number of times nextPosition can be called.  We'll set this var to -1 when exhausted.
      int remainingPositions = postingsEnum.freq();

      @Override
      public String toString() {
        String where;
        try {
          where = "[" + startOffset() + ":" + endOffset() + "]";
        } catch (IOException e) {
          where = "[" + e + "]";
        }
        return "'" + term.utf8ToString() + "'@" + where + " filtered by " + spans;
      }

      @Override
      public int nextDoc() throws IOException {
        throw new IllegalStateException("not expected"); // don't need to implement; just used on one doc
      }

      @Override
      public int advance(int target) throws IOException {
        throw new IllegalStateException("not expected"); // don't need to implement; just used on one doc
      }

      @Override
      public int nextPosition() throws IOException {
        // loop over posting positions...
        NEXT_POS_LOOP:
        while (remainingPositions > 0) {
          final int thisPos = super.nextPosition();
          remainingPositions--;

          // loop spans forward (if necessary) while the span end is behind thisPos
          while (spans.endPosition() <= thisPos) {
            if (spans.nextStartPosition() == Spans.NO_MORE_POSITIONS) { // advance
              break NEXT_POS_LOOP;
            }
            assert spans.docID() == postingsEnum.docID();
          }

          // is this position within the span?
          if (thisPos >= spans.startPosition()) {
            assert thisPos < spans.endPosition(); // guaranteed by previous loop
            return thisPos; // yay!
          }
          // else continue and try the next position
        }
        remainingPositions = -1; // signify done
        return Integer.MAX_VALUE;
      }

      @Override
      public int startOffset() throws IOException {
        return remainingPositions >= 0 ? super.startOffset() : Integer.MAX_VALUE;
      }

      @Override
      public int endOffset() throws IOException {
        return remainingPositions >= 0 ? super.endOffset() : Integer.MAX_VALUE;
      }
    };
  }

  /**
   * Simple TreeSet that filters out Terms not matching the provided predicate on {@code add()}.
   */
  private class FieldFilteringTermSet extends TreeSet<Term> {
    @Override
    public boolean add(Term term) {
      if (fieldMatcher.test(term.field())) {
        if (term.field().equals(fieldName)) {
          return super.add(term);
        } else {
          return super.add(new Term(fieldName, term.bytes()));
        }
      } else {
        return false;
      }
    }
  }
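
  // Worked example for filterPostings() above (hedged; positions are made up): say the term occurs at
  // positions [2, 9] but the phrase Spans covers only start=8, end=10 (exclusive end).  nextPosition()
  // skips 2 (it's before spans.startPosition()==8) and returns 9; a further call returns
  // Integer.MAX_VALUE since freq()==2 is exhausted, and startOffset()/endOffset() then do likewise.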
  /**
   * A single {@link Spans} view over multiple spans.  At least one span is mandatory, but you should probably
   * supply more than one.  Furthermore, the given spans are expected to be positioned to a document already
   * via a call to next or advance.
   */
  // TODO move to Lucene core as a Spans utility class?
  static class MultiSpans extends Spans {
    final PriorityQueue<Spans> spansQueue = new PriorityQueue<>(SPANS_COMPARATOR);
    long cost;

    void addSpans(Spans spans) {
      if (spans.docID() < 0 || spans.docID() == NO_MORE_DOCS) {
        throw new IllegalArgumentException("Expecting given spans to be in a positioned state.");
      }
      spansQueue.add(spans);
      cost = Math.max(cost, spans.cost());
    }

    // DocIdSetIterator methods:

    @Override
    public int nextDoc() throws IOException {
      if (spansQueue.isEmpty()) {
        return NO_MORE_DOCS;
      }
      return advance(spansQueue.peek().docID() + 1);
    }

    @Override
    public int advance(int target) throws IOException {
      if (spansQueue.isEmpty()) {
        return NO_MORE_DOCS;
      }
      while (true) {
        Spans spans = spansQueue.peek();
        if (spans.docID() >= target) {
          return spans.docID();
        }
        spansQueue.remove(); // must remove before we modify its state
        if (spans.advance(target) != NO_MORE_DOCS) { // ... otherwise it's not re-added
          spansQueue.add(spans);
        } else if (spansQueue.isEmpty()) {
          return NO_MORE_DOCS;
        }
      }
    }

    @Override
    public int docID() {
      if (spansQueue.isEmpty()) {
        return NO_MORE_DOCS;
      }
      return spansQueue.peek().docID();
    }

    @Override
    public long cost() {
      return cost;
    }

    // Spans methods:

    @Override
    public int nextStartPosition() throws IOException {
      // advance any spans at the initial position per document
      boolean atDocStart = false;
      while (spansQueue.peek().startPosition() == -1) {
        atDocStart = true;
        Spans headSpans = spansQueue.remove(); // remove because we will change its state
        headSpans.nextStartPosition();
        spansQueue.add(headSpans);
      }
      if (!atDocStart) {
        Spans headSpans = spansQueue.remove(); // remove because we will change its state
        headSpans.nextStartPosition();
        spansQueue.add(headSpans);
      }
      return startPosition();
    }

    @Override
    public int startPosition() {
      return spansQueue.peek().startPosition();
    }

    @Override
    public int endPosition() {
      return spansQueue.peek().endPosition();
    }

    @Override
    public int width() {
      return spansQueue.peek().width();
    }

    @Override
    public void collect(SpanCollector collector) throws IOException {
      spansQueue.peek().collect(collector);
    }

    @Override
    public float positionsCost() {
      return 100f; // no idea; and we can't delegate since whether it may be called depends on the TwoPhaseIterator
    }
  }
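
  // Hedged usage sketch for MultiSpans above: given two Spans a and b already positioned on the same doc,
  //   MultiSpans m = new MultiSpans(); m.addSpans(a); m.addSpans(b);
  //   while (m.nextStartPosition() != Spans.NO_MORE_POSITIONS) { /* m.startPosition()..m.endPosition() */ }
  // emits the union of both spans' positions in (startPosition, endPosition) order per SPANS_COMPARATOR.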
  /**
   * This reader will just delegate every call to a single field in the wrapped LeafReader.
   * This way we ensure that all queries going through this reader target the same field.
   */
  static final class SingleFieldFilterLeafReader extends FilterLeafReader {
    final String fieldName;

    SingleFieldFilterLeafReader(LeafReader in, String fieldName) {
      super(in);
      this.fieldName = fieldName;
    }

    @Override
    public FieldInfos getFieldInfos() {
      throw new UnsupportedOperationException();
    }

    @Override
    public Fields fields() throws IOException {
      return new FilterFields(super.fields()) {
        @Override
        public Terms terms(String field) throws IOException {
          return super.terms(fieldName);
        }

        @Override
        public Iterator<String> iterator() {
          return Collections.singletonList(fieldName).iterator();
        }

        @Override
        public int size() {
          return 1;
        }
      };
    }

    @Override
    public NumericDocValues getNumericDocValues(String field) throws IOException {
      return super.getNumericDocValues(fieldName);
    }

    @Override
    public BinaryDocValues getBinaryDocValues(String field) throws IOException {
      return super.getBinaryDocValues(fieldName);
    }

    @Override
    public SortedDocValues getSortedDocValues(String field) throws IOException {
      return super.getSortedDocValues(fieldName);
    }

    @Override
    public NumericDocValues getNormValues(String field) throws IOException {
      return super.getNormValues(fieldName);
    }

    @Override
    public CacheHelper getCoreCacheHelper() {
      return null;
    }

    @Override
    public CacheHelper getReaderCacheHelper() {
      return null;
    }
  }

  /**
   * A Spans based on a list of cached spans for one doc.  It is pre-positioned to this doc.
   */
  private static class CachedSpans extends Spans {

    private static class CachedSpan {
      final int start;
      final int end;

      CachedSpan(int start, int end) {
        this.start = start;
        this.end = end;
      }
    }

    final int docId;
    final ArrayList<CachedSpan> cachedSpanList;
    int index = -1;

    CachedSpans(Spans spans) throws IOException {
      this.docId = spans.docID();
      assert this.docId != -1;
      // Consume the spans for this doc into a list.  There's always at least one; the first/current one.
      cachedSpanList = new ArrayList<>();
      while (spans.nextStartPosition() != NO_MORE_POSITIONS) {
        cachedSpanList.add(new CachedSpan(spans.startPosition(), spans.endPosition()));
      }
      assert !cachedSpanList.isEmpty(); // bad Span impl?
    }

    /**
     * Clone; reset iteration state.
     */
    CachedSpans(CachedSpans cloneMe) {
      docId = cloneMe.docId;
      cachedSpanList = cloneMe.cachedSpanList;
    }

    @Override
    public int nextDoc() throws IOException {
      throw new UnsupportedOperationException("Not expected");
    }

    @Override
    public int advance(int target) throws IOException {
      throw new UnsupportedOperationException("Not expected");
    }

    @Override
    public int docID() {
      return docId;
    }

    @Override
    public long cost() {
      return 1;
    }

    @Override
    public int nextStartPosition() throws IOException {
      index++;
      return startPosition();
    }

    @Override
    public int startPosition() {
      return index < 0 ? -1
          : index >= cachedSpanList.size() ? NO_MORE_POSITIONS : cachedSpanList.get(index).start;
    }

    @Override
    public int endPosition() {
      return index < 0 ? -1
          : index >= cachedSpanList.size() ? NO_MORE_POSITIONS : cachedSpanList.get(index).end;
    }

    @Override
    public int width() {
      return endPosition() - startPosition();
    }

    @Override
    public void collect(SpanCollector collector) throws IOException {
      throw new UnsupportedOperationException("Not expected");
    }

    @Override
    public float positionsCost() {
      return 1f;
    }

  } // class CachedSpans
}