package uk.ac.shef.dcs.jate.algorithm;
import uk.ac.shef.dcs.jate.JATEException;
import uk.ac.shef.dcs.jate.feature.AbstractFeature;
import uk.ac.shef.dcs.jate.feature.FrequencyTermBased;
import uk.ac.shef.dcs.jate.model.JATETerm;
import java.util.*;
import org.apache.log4j.Logger;
/**
* Residual IDF, see
* Church, K. and Gale, W. 1995a. Inverse Document Frequency (IDF): A Measure of Deviation from Poisson. In Proceedings of the 3rd Workshop on Very Large Corpora. Cambridge, Massachusetts, USA, pp.121-30.
*/
public class RIDF extends Algorithm{
private static final Logger LOG = Logger.getLogger(RIDF.class.getName());
@Override
public List<JATETerm> execute(Collection<String> candidates) throws JATEException {
AbstractFeature feature = features.get(FrequencyTermBased.class.getName());
validateFeature(feature, FrequencyTermBased.class);
FrequencyTermBased fFeature = (FrequencyTermBased) feature;
double totalDocs = (double) fFeature.getTotalDocs();
List<JATETerm> result = new ArrayList<>();
StringBuilder msg = new StringBuilder("Beginning computing RIDF values,");
msg.append(", total terms=" + candidates.size());
LOG.info(msg.toString());
for(String tString: candidates){
JATETerm term = new JATETerm(tString);
/*int ttf = fFeature.getTTF(tString);
double cf_over_N = (double) ttf / totalDocs;
double exponential = Math.exp(0 - cf_over_N);
double nominator = totalDocs * (1 - exponential);
double denominator = (double) fFeature.getTermFrequencyInDocument(tString).size();
if (denominator == 0) {
denominator=1; //this shouldnt occur. a term that is firstly extracted from the corpus must have a source
}
double ridf = Math.log(nominator / denominator) / Math.log(2.0);
*/
int ttf = fFeature.getTTF(tString);
double attf = (double) ttf / totalDocs;
double pi = Math.exp(0 - attf);
double eidf = 0-(Math.log(1-pi)/Math.log(2));//expected idf
double df = (double) fFeature.getTermFrequencyInDocument(tString).size();
double idf= Math.log(totalDocs / df);
double ridf = idf-eidf;
term.setScore(ridf);
result.add(term);
}
Collections.sort(result);
LOG.info("Complete");
return result;
}
}