/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.uhighlight;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.spans.Spans;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
/**
* Ultimately returns a list of {@link OffsetsEnum} yielding potentially highlightable words in the text. Needs
* information about the query up front.
*
* @lucene.internal
*/
public abstract class FieldOffsetStrategy {
protected final String field;
protected final PhraseHelper phraseHelper; // Query: position-sensitive information TODO: rename
protected final BytesRef[] terms; // Query: free-standing terms
protected final CharacterRunAutomaton[] automata; // Query: free-standing wildcards (multi-term query)
public FieldOffsetStrategy(String field, BytesRef[] queryTerms, PhraseHelper phraseHelper, CharacterRunAutomaton[] automata) {
this.field = field;
this.terms = queryTerms;
this.phraseHelper = phraseHelper;
this.automata = automata;
}
public String getField() {
return field;
}
public abstract UnifiedHighlighter.OffsetSource getOffsetSource();
/**
* The primary method -- return offsets for highlightable words in the specified document.
* IMPORTANT: remember to close them all.
*/
public abstract List<OffsetsEnum> getOffsetsEnums(IndexReader reader, int docId, String content) throws IOException;
protected List<OffsetsEnum> createOffsetsEnumsFromReader(LeafReader leafReader, int doc) throws IOException {
final Terms termsIndex = leafReader.terms(field);
if (termsIndex == null) {
return Collections.emptyList();
}
// For strict positions, get a Map of term to Spans:
// note: ScriptPhraseHelper.NONE does the right thing for these method calls
final Map<BytesRef, Spans> strictPhrasesTermToSpans =
phraseHelper.getTermToSpans(leafReader, doc);
// Usually simply wraps terms in a List; but if willRewrite() then can be expanded
final List<BytesRef> sourceTerms =
phraseHelper.expandTermsIfRewrite(terms, strictPhrasesTermToSpans);
final List<OffsetsEnum> offsetsEnums = new ArrayList<>(sourceTerms.size() + automata.length);
// Handle sourceTerms:
if (!sourceTerms.isEmpty()) {
TermsEnum termsEnum = termsIndex.iterator();//does not return null
for (BytesRef term : sourceTerms) {
if (termsEnum.seekExact(term)) {
PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.OFFSETS);
if (postingsEnum == null) {
// no offsets or positions available
throw new IllegalArgumentException("field '" + field + "' was indexed without offsets, cannot highlight");
}
if (doc == postingsEnum.advance(doc)) { // now it's positioned, although may be exhausted
postingsEnum = phraseHelper.filterPostings(term, postingsEnum, strictPhrasesTermToSpans.get(term));
if (postingsEnum != null) {
offsetsEnums.add(new OffsetsEnum(term, postingsEnum));
}
}
}
}
}
// Handle automata
if (automata.length > 0) {
offsetsEnums.addAll(createAutomataOffsetsFromTerms(termsIndex, doc));
}
return offsetsEnums;
}
protected List<OffsetsEnum> createAutomataOffsetsFromTerms(Terms termsIndex, int doc) throws IOException {
List<List<PostingsEnum>> automataPostings = new ArrayList<>(automata.length);
for (int i = 0; i < automata.length; i++) {
automataPostings.add(new ArrayList<>());
}
TermsEnum termsEnum = termsIndex.iterator();
BytesRef term;
CharsRefBuilder refBuilder = new CharsRefBuilder();
while ((term = termsEnum.next()) != null) {
for (int i = 0; i < automata.length; i++) {
CharacterRunAutomaton automaton = automata[i];
refBuilder.copyUTF8Bytes(term);
if (automaton.run(refBuilder.chars(), 0, refBuilder.length())) {
PostingsEnum postings = termsEnum.postings(null, PostingsEnum.OFFSETS);
if (doc == postings.advance(doc)) {
automataPostings.get(i).add(postings);
}
}
}
}
List<OffsetsEnum> offsetsEnums = new ArrayList<>(automata.length); //will be at most this long
for (int i = 0; i < automata.length; i++) {
CharacterRunAutomaton automaton = automata[i];
List<PostingsEnum> postingsEnums = automataPostings.get(i);
int size = postingsEnums.size();
if (size > 0) { //only add if we have offsets
BytesRef wildcardTerm = new BytesRef(automaton.toString());
if (size == 1) { //don't wrap in a composite if there's only one OffsetsEnum
offsetsEnums.add(new OffsetsEnum(wildcardTerm, postingsEnums.get(0)));
} else {
offsetsEnums.add(new OffsetsEnum(wildcardTerm, new CompositeOffsetsPostingsEnum(postingsEnums)));
}
}
}
return offsetsEnums;
}
}