/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.vectorhighlight;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.apache.lucene.search.vectorhighlight.FieldPhraseList.WeightedPhraseInfo;

/**
 * An abstract implementation of {@link FragListBuilder}.
 * <p>
 * Provides {@link #createFieldFragList(FieldPhraseList, FieldFragList, int)}, which walks the
 * phrases of a {@link FieldPhraseList} in iteration order and groups them into fragments of
 * (at least) a requested character size, plus the {@link #acceptPhrase} hook that lets
 * subclasses filter individual phrases.
 */
public abstract class BaseFragListBuilder implements FragListBuilder {

  public static final int MARGIN_DEFAULT = 6;
  public static final int MIN_FRAG_CHAR_SIZE_FACTOR = 3;

  /** Number of characters kept in front of the first phrase when the initial span is sized. */
  final int margin;

  /** Smallest {@code fragCharSize} accepted by {@link #createFieldFragList}. */
  final int minFragCharSize;

  /**
   * Creates a builder with the given margin. The minimum fragment size is derived from it as
   * {@code max(1, margin * MIN_FRAG_CHAR_SIZE_FACTOR)}.
   *
   * @param margin the margin in characters; must be 0 or higher
   * @throws IllegalArgumentException if {@code margin} is negative
   */
  public BaseFragListBuilder( int margin ){
    if( margin < 0 ) {
      throw new IllegalArgumentException( "margin(" + margin + ") is too small. It must be 0 or higher." );
    }
    this.margin = margin;
    // NOTE(review): the product is a plain int multiplication and wraps for margins above
    // Integer.MAX_VALUE / MIN_FRAG_CHAR_SIZE_FACTOR; left unchanged to keep existing behavior.
    this.minFragCharSize = Math.max( 1, margin * MIN_FRAG_CHAR_SIZE_FACTOR );
  }

  /** Creates a builder using {@link #MARGIN_DEFAULT} as margin. */
  public BaseFragListBuilder(){
    this( MARGIN_DEFAULT );
  }

  /**
   * Fills {@code fieldFragList} with fragments built from the phrases of {@code fieldPhraseList}.
   * <p>
   * Phrases are consumed in the order returned by {@code fieldPhraseList.getPhraseList()}. A phrase
   * whose start offset lies before the end of the previously emitted fragment is dropped. Otherwise
   * it opens a new span, and following phrases are pulled into the same fragment for as long as
   * their end offset does not exceed the span end. Every pulled phrase is passed to
   * {@link #acceptPhrase}; only accepted phrases are recorded, and a span with no accepted phrase
   * produces no fragment. The emitted fragment is then re-centered around the matched region and is
   * at least {@code fragCharSize} characters long.
   * <p>
   * Offset sums are saturated at {@link Integer#MAX_VALUE}, so a very large {@code fragCharSize}
   * (for example {@code Integer.MAX_VALUE}) yields a span covering all following phrases instead
   * of wrapping around to a negative span end.
   *
   * @param fieldPhraseList the phrases to distribute over fragments
   * @param fieldFragList the list that receives the fragments; it is also the return value
   * @param fragCharSize the requested fragment size in characters
   * @return {@code fieldFragList}, after the fragments have been added
   * @throws IllegalArgumentException if {@code fragCharSize} is smaller than the minimum derived
   *         from the margin
   */
  protected FieldFragList createFieldFragList( FieldPhraseList fieldPhraseList, FieldFragList fieldFragList, int fragCharSize ){
    if( fragCharSize < minFragCharSize ) {
      throw new IllegalArgumentException( "fragCharSize(" + fragCharSize + ") is too small. It must be " + minFragCharSize + " or higher." );
    }

    // Reused for every fragment and cleared at the start of each one.
    // NOTE(review): this relies on fieldFragList.add(...) not retaining the list reference - confirm.
    List<WeightedPhraseInfo> wpil = new ArrayList<>();
    IteratorQueue<WeightedPhraseInfo> queue = new IteratorQueue<>(fieldPhraseList.getPhraseList().iterator());
    WeightedPhraseInfo phraseInfo = null;
    // End offset of the last emitted fragment; the next fragment must not start before it.
    int startOffset = 0;
    while((phraseInfo = queue.top()) != null){
      // if the phrase violates the border of previous fragment, discard it and try next phrase
      if( phraseInfo.getStartOffset() < startOffset ) {
        queue.removeTop();
        continue;
      }

      wpil.clear();
      final int currentPhraseStartOffset = phraseInfo.getStartOffset();
      int currentPhraseEndOffset = phraseInfo.getEndOffset();
      int spanStart = Math.max(currentPhraseStartOffset - margin, startOffset);
      // Saturating add: a plain int sum wraps negative for huge fragCharSize values, which would
      // shrink the span to the first phrase and cause all later phrases to be discarded.
      int spanEnd = Math.max(currentPhraseEndOffset, saturatedAdd(spanStart, fragCharSize));
      if (acceptPhrase(queue.removeTop(), currentPhraseEndOffset - currentPhraseStartOffset, fragCharSize)) {
        wpil.add(phraseInfo);
      }
      while((phraseInfo = queue.top()) != null) { // pull until we crossed the current spanEnd
        if (phraseInfo.getEndOffset() <= spanEnd) {
          // The end offset advances even when the phrase is rejected below.
          currentPhraseEndOffset = phraseInfo.getEndOffset();
          if (acceptPhrase(queue.removeTop(), currentPhraseEndOffset - currentPhraseStartOffset, fragCharSize)) {
            wpil.add(phraseInfo);
          }
        } else {
          break;
        }
      }
      if (wpil.isEmpty()) {
        // Nothing accepted for this span; the consumed phrases are simply dropped.
        continue;
      }

      final int matchLen = currentPhraseEndOffset - currentPhraseStartOffset;
      // now recalculate the start and end position to "center" the result
      // matchLen can be > fragCharSize, so the margin is clamped at 0
      final int newMargin = Math.max(0, (fragCharSize - matchLen) / 2);
      spanStart = currentPhraseStartOffset - newMargin;
      if (spanStart < startOffset) {
        spanStart = startOffset;
      }
      // whatever is bigger here we grow this out (saturated to avoid int overflow)
      spanEnd = saturatedAdd(spanStart, Math.max(matchLen, fragCharSize));
      startOffset = spanEnd;
      fieldFragList.add(spanStart, spanEnd, wpil);
    }
    return fieldFragList;
  }

  /**
   * Adds two ints, clamping the result to the {@code int} range instead of wrapping around.
   *
   * @param a first operand
   * @param b second operand
   * @return {@code a + b}, or {@link Integer#MAX_VALUE} / {@link Integer#MIN_VALUE} when the exact
   *         sum does not fit in an {@code int}
   */
  private static int saturatedAdd(int a, int b) {
    final long sum = (long) a + (long) b;
    if (sum > Integer.MAX_VALUE) {
      return Integer.MAX_VALUE;
    }
    if (sum < Integer.MIN_VALUE) {
      return Integer.MIN_VALUE;
    }
    return (int) sum;
  }

  /**
   * A predicate to decide if the given {@link WeightedPhraseInfo} should be
   * accepted as a highlighted phrase or if it should be discarded.
   * <p>
   * The default implementation discards phrases that are composed of more than one term
   * and where the matchLength exceeds the fragment character size.
   *
   * @param info the phrase info to accept
   * @param matchLength the match length of the current phrase
   * @param fragCharSize the configured fragment character size
   * @return <code>true</code> if this phrase info should be accepted as a highlight phrase
   */
  protected boolean acceptPhrase(WeightedPhraseInfo info, int matchLength, int fragCharSize) {
    return info.getTermsOffsets().size() <= 1 || matchLength <= fragCharSize;
  }

  /**
   * Minimal look-ahead wrapper around an {@link Iterator}: {@link #top()} peeks at the next
   * element without consuming it and {@link #removeTop()} consumes it. A {@code null} top means
   * the underlying iterator is exhausted.
   */
  private static final class IteratorQueue<T> {
    private final Iterator<T> iter;
    private T top;

    /**
     * Wraps {@code iter} and pre-loads its first element (if any) as the current top.
     *
     * @param iter the iterator to read from
     */
    public IteratorQueue(Iterator<T> iter) {
      this.iter = iter;
      // Priming call: there is no previous top yet, so the returned value must be null.
      T removeTop = removeTop();
      assert removeTop == null;
    }

    /**
     * @return the next element without consuming it, or {@code null} if none is left
     */
    public T top() {
      return top;
    }

    /**
     * Consumes the current top and advances to the next element of the iterator.
     *
     * @return the element that was on top before the call, or {@code null} if there was none
     */
    public T removeTop() {
      T currentTop = top;
      if (iter.hasNext()) {
        top = iter.next();
      } else {
        top = null;
      }
      return currentTop;
    }
  }
}