package org.apache.lucene.corpus.stats;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.regex.Pattern;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;

/**
 * IDF calculator backed by a Lucene {@link IndexReader}: document frequencies
 * are read from the index via {@link IndexReader#docFreq(Term)} and converted
 * to IDF values by the inherited {@code getIDF}. The superclass is initialised
 * with {@link IndexReader#numDocs()} of the supplied reader.
 */
public class IDFIndexCalc extends IDFCalc {

  /**
   * Separator used to break a pre-analyzed string into sub-terms.
   * Compiled once instead of on every call.
   */
  private static final Pattern WHITESPACE = Pattern.compile("\\s+");

  private final IndexReader reader;

  /**
   * @param reader index from which document counts and document frequencies
   *               are read; must not be null
   */
  public IDFIndexCalc(IndexReader reader) {
    super(reader.numDocs());
    this.reader = reader;
  }

  /**
   * @param t term
   * @return idf for a single term or {@link #UNSEEN_IDF} if term is not found in
   * index
   * @throws java.io.IOException if encountered by underlying reader
   */
  public double singleTermIDF(Term t) throws IOException {
    return getIDF(reader.docFreq(t));
  }

  /**
   * Splits s on whitespace and then sums idf for each subtoken. This is
   * equivalent to multiplying the probabilities or, calculating the probability
   * if each term is completely independent of the other. The result is an
   * upper bound on the actual idf of the phrase. This is fast to
   * compute and yields decent results in practice. For more exact IDF for
   * phrases, consider indexing ngrams.
   * <p>
   * Make sure to remove stop words before calculating the IDF.
   * A stop word will have an actual DF of 0, which will
   * be converted to {@value #DEFAULT_UNSEEN_COUNT}.
   * <p>
   * An empty or all-whitespace string is looked up as a single empty term.
   *
   * @param s pre-analyzed string whose sub-terms are separated by whitespace
   * @param t term from which to use field; its text is ignored
   * @return sum of idf for individual terms
   * @throws java.io.IOException if encountered by underlying reader
   */
  public double multiTermIDFSum(String s, Term t) throws IOException {
    String field = t.field();
    double sum = 0.0;
    for (String termString : splitOnWhitespace(s)) {
      sum += getIDF(reader.docFreq(new Term(field, termString)));
    }
    return sum;
  }

  /**
   * Computes aggregate statistics over the whitespace-separated sub-terms of s.
   * <p>
   * Be careful: s must be pre-analyzed, with its sub-terms divided by
   * whitespace. An empty or all-whitespace string is looked up as a single
   * empty term.
   *
   * @param s pre-analyzed string whose sub-terms are separated by whitespace
   * @param t term from which to use field; its text is ignored
   * @return double[] of length 2, stats[0] is the sum of the individual term idfs
   * and stats[1] is the minimum raw document frequency (as returned by
   * the reader, not an idf) among the sub-terms
   * @throws java.io.IOException if encountered by underlying reader
   */
  public double[] multiTermIDF(String s, Term t) throws IOException {
    String field = t.field();
    // stats[0] = running idf sum, stats[1] = smallest df seen so far
    double[] stats = new double[]{0.0, Double.MAX_VALUE};
    for (String termString : splitOnWhitespace(s)) {
      int df = reader.docFreq(new Term(field, termString));
      stats[0] += getIDF(df);
      stats[1] = Math.min(stats[1], df);
    }
    return stats;
  }

  /**
   * Convenience form of {@link #multiTermIDF(String, Term)} that takes the
   * field name directly.
   *
   * @param s     pre-analyzed string whose sub-terms are separated by whitespace
   * @param field name of the field in which to look up each sub-term
   * @return double[] of length 2, as described for
   * {@link #multiTermIDF(String, Term)}
   * @throws java.io.IOException if encountered by underlying reader
   */
  public double[] multiTermStats(String s, String field) throws IOException {
    // The Term only carries the field name; its (empty) text is never used.
    return multiTermIDF(s, new Term(field, ""));
  }

  /**
   * Trims s and splits it on runs of whitespace. Always returns at least one
   * element; for an empty or all-whitespace input that element is the empty
   * string.
   */
  private static String[] splitOnWhitespace(String s) {
    return WHITESPACE.split(s.trim());
  }
}