/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.vectorhighlight;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;

import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo.SubInfo;
import org.apache.lucene.search.vectorhighlight.FieldPhraseList.WeightedPhraseInfo;
import org.apache.lucene.search.vectorhighlight.FieldTermStack.TermInfo;

/**
 * A weighted implementation of {@link FieldFragList}: fragment weights are derived from the
 * boosts of the distinct query terms each fragment contains, normalized by fragment length.
 */
public class WeightedFieldFragList extends FieldFragList {

  /**
   * A constructor.
   *
   * @param fragCharSize the length (number of chars) of a fragment
   */
  public WeightedFieldFragList( int fragCharSize ) {
    super( fragCharSize );
  }

  /* (non-Javadoc)
   * @see org.apache.lucene.search.vectorhighlight.FieldFragList#add( int startOffset, int endOffset, List<WeightedPhraseInfo> phraseInfoList )
   */
  @Override
  public void add( int startOffset, int endOffset, List<WeightedPhraseInfo> phraseInfoList ) {
    List<SubInfo> tempSubInfos = new ArrayList<>();
    List<SubInfo> realSubInfos = new ArrayList<>();
    HashSet<String> distinctTerms = new HashSet<>();
    int length = 0;

    // Collect per-phrase boosts; a term contributes its weight only the first time it is seen
    // within this fragment, but every term occurrence counts toward the fragment length.
    for( WeightedPhraseInfo phraseInfo : phraseInfoList ){
      float phraseTotalBoost = 0;
      for ( TermInfo ti : phraseInfo.getTermsInfos()) {
        if ( distinctTerms.add( ti.getText() ) )
          phraseTotalBoost += ti.getWeight() * phraseInfo.getBoost();
        length++;
      }
      tempSubInfos.add( new SubInfo( phraseInfo.getText(), phraseInfo.getTermsOffsets(),
        phraseInfo.getSeqnum(), phraseTotalBoost ) );
    }

    // We want the number of terms per fragment (length) to be reflected in the weight; otherwise a
    // one-word query would give every fragment the same weight regardless of how many words it
    // contains. To keep fragments with many words from "outranking" more relevant fragments, the
    // length is dampened with a square-root normalization:
    // length * (1 / sqrt(length)) == sqrt(length), e.g. length = 4 yields norm = 2.
    float norm = length * ( 1 / (float)Math.sqrt( length ) );

    // Scale each sub-info boost by the norm and accumulate the total fragment boost.
    float totalBoost = 0;
    for ( SubInfo tempSubInfo : tempSubInfos ) {
      float subInfoBoost = tempSubInfo.getBoost() * norm;
      realSubInfos.add( new SubInfo( tempSubInfo.getText(), tempSubInfo.getTermsOffsets(),
        tempSubInfo.getSeqnum(), subInfoBoost ));
      totalBoost += subInfoBoost;
    }

    getFragInfos().add( new WeightedFragInfo( startOffset, endOffset, realSubInfos, totalBoost ) );
  }
}