package org.apache.lucene.corpus.stats;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
public class IDFCalc {
private final static int DEFAULT_UNSEEN_COUNT = 2;
private final static int MAX_BUFF = 50;
private final double UNSEEN_IDF;
private final double[] buffered = new double[MAX_BUFF];
private final int D;
private final int D_PLUS_ONE;
IDFCalc(int numDocs) {
D = numDocs;
//add one to avoid log of 1 = 0 in downstream calculations
D_PLUS_ONE = D + 1;
UNSEEN_IDF = getUnbufferedIDF(DEFAULT_UNSEEN_COUNT);
buffered[0] = UNSEEN_IDF;
for (int i = 1; i < MAX_BUFF; i++) {
buffered[i] = getUnbufferedIDF(i);
}
}
/**
* @param df document freq
* @return inverse document frequency for @param df.
* If df <= 0, returns {@link #UNSEEN_IDF}
*/
double getIDF(int df) {
if (df < 0)
return UNSEEN_IDF;
if (df < MAX_BUFF) {
return buffered[df];
}
// TODO: add a check for cnt > maxDoc
return getUnbufferedIDF(df);
}
private double getUnbufferedIDF(int cnt) {
return Math.log((double) (D_PLUS_ONE) / (double) cnt);
}
/**
* calculate the document frequency from an IDF
*
* @param idf idf
* @return estimated frequency based on idf
*/
public double unIDF(double idf) {
return unIDF(D_PLUS_ONE, idf);
}
/**
* calculate the document frequency from D and an idf
*
* @param totalDocs total number of documents
* @param idf idf
* @return estimated document frequency
*/
private double unIDF(int totalDocs, double idf) {
return (double) (totalDocs) / (Math.pow(Math.E, idf)); // make sure the base
// is the same as above
}
public double getIDF(int totalDocs, int cnt) {
return Math.log((double) (totalDocs) / (double) cnt);
}
/**
* @return D -- total number of docs used in IDF calculations. Note that D+1 is
* actually used to calculate idf to avoid idf=0.
*/
public int getD() {
return D;
}
}