package org.apache.lucene.search.highlight; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.util.HashMap; import java.util.HashSet; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.index.IndexReader; import org.apache.lucene.search.Query; /** * {@link Scorer} implementation which scores text fragments by the number of * unique query terms found. This class uses the {@link QueryTermExtractor} * class to determine the query terms and their boosts to be used. 
*/ // TODO: provide option to boost score of fragments near beginning of document // based on fragment.getFragNum() public class QueryTermScorer implements Scorer { TextFragment currentTextFragment = null; HashSet<String> uniqueTermsInFragment; float totalScore = 0; float maxTermWeight = 0; private HashMap<String,WeightedTerm> termsToFind; private CharTermAttribute termAtt; /** * * @param query a Lucene query (ideally rewritten using query.rewrite before * being passed to this class and the searcher) */ public QueryTermScorer(Query query) { this(QueryTermExtractor.getTerms(query)); } /** * * @param query a Lucene query (ideally rewritten using query.rewrite before * being passed to this class and the searcher) * @param fieldName the Field name which is used to match Query terms */ public QueryTermScorer(Query query, String fieldName) { this(QueryTermExtractor.getTerms(query, false, fieldName)); } /** * * @param query a Lucene query (ideally rewritten using query.rewrite before * being passed to this class and the searcher) * @param reader used to compute IDF which can be used to a) score selected * fragments better b) use graded highlights eg set font color * intensity * @param fieldName the field on which Inverse Document Frequency (IDF) * calculations are based */ public QueryTermScorer(Query query, IndexReader reader, String fieldName) { this(QueryTermExtractor.getIdfWeightedTerms(query, reader, fieldName)); } public QueryTermScorer(WeightedTerm[] weightedTerms) { termsToFind = new HashMap<String,WeightedTerm>(); for (int i = 0; i < weightedTerms.length; i++) { WeightedTerm existingTerm = termsToFind .get(weightedTerms[i].term); if ((existingTerm == null) || (existingTerm.weight < weightedTerms[i].weight)) { // if a term is defined more than once, always use the highest scoring // weight termsToFind.put(weightedTerms[i].term, weightedTerms[i]); maxTermWeight = Math.max(maxTermWeight, weightedTerms[i].getWeight()); } } } /* (non-Javadoc) * @see 
org.apache.lucene.search.highlight.Scorer#init(org.apache.lucene.analysis.TokenStream) */ public TokenStream init(TokenStream tokenStream) { termAtt = tokenStream.addAttribute(CharTermAttribute.class); return null; } /* * (non-Javadoc) * * @see * org.apache.lucene.search.highlight.FragmentScorer#startFragment(org.apache * .lucene.search.highlight.TextFragment) */ public void startFragment(TextFragment newFragment) { uniqueTermsInFragment = new HashSet<String>(); currentTextFragment = newFragment; totalScore = 0; } /* (non-Javadoc) * @see org.apache.lucene.search.highlight.Scorer#getTokenScore() */ public float getTokenScore() { String termText = termAtt.toString(); WeightedTerm queryTerm = termsToFind.get(termText); if (queryTerm == null) { // not a query term - return return 0; } // found a query term - is it unique in this doc? if (!uniqueTermsInFragment.contains(termText)) { totalScore += queryTerm.getWeight(); uniqueTermsInFragment.add(termText); } return queryTerm.getWeight(); } /* (non-Javadoc) * @see org.apache.lucene.search.highlight.Scorer#getFragmentScore() */ public float getFragmentScore() { return totalScore; } /* * (non-Javadoc) * * @see * org.apache.lucene.search.highlight.FragmentScorer#allFragmentsProcessed() */ public void allFragmentsProcessed() { // this class has no special operations to perform at end of processing } /** * * @return The highest weighted term (useful for passing to GradientFormatter * to set top end of coloring scale. */ public float getMaxTermWeight() { return maxTermWeight; } }