package uk.ac.shef.dcs.jate.algorithm;

import uk.ac.shef.dcs.jate.JATEException;
import uk.ac.shef.dcs.jate.feature.AbstractFeature;
import uk.ac.shef.dcs.jate.feature.FrequencyTermBased;
import uk.ac.shef.dcs.jate.model.JATETerm;

import java.util.*;

import org.apache.log4j.Logger;

/**
 * TF-IDF modified to work at corpus level. Namely, "tf" is now "ttf" = total term
 * frequency in the corpus (read here through {@code FrequencyTermBased#getTTFNorm}),
 * and "idf" is {@code ln(totalDocs / df)}, where {@code df} is the number of entries
 * returned by {@code FrequencyTermBased#getTermFrequencyInDocument} for the term.
 */
public class TFIDF extends Algorithm {
    private static final Logger LOG = Logger.getLogger(TFIDF.class.getName());

    /**
     * Scores every candidate term with {@code ttfNorm * ln(totalDocs / df)}.
     *
     * <p>A candidate whose document frequency is zero receives a score of {@code 0}
     * instead of the {@code Infinity}/{@code NaN} that an unguarded division by zero
     * would produce.
     *
     * @param candidates candidate term strings to score
     * @return one {@link JATETerm} per candidate, sorted by {@code JATETerm}'s natural ordering
     * @throws JATEException declared by the overridden method; presumably raised by
     *                       {@code validateFeature} when the required feature is missing
     *                       or of the wrong type (confirm against {@code Algorithm})
     */
    @Override
    public List<JATETerm> execute(Collection<String> candidates) throws JATEException {
        AbstractFeature feature = features.get(FrequencyTermBased.class.getName());
        validateFeature(feature, FrequencyTermBased.class);
        FrequencyTermBased fFeature = (FrequencyTermBased) feature;

        double totalDocs = (double) fFeature.getTotalDocs();
        List<JATETerm> result = new ArrayList<>(candidates.size());

        LOG.info("Beginning computing TFIDF values, total terms=" + candidates.size());

        for (String tString : candidates) {
            JATETerm term = new JATETerm(tString);
            double tf = fFeature.getTTFNorm(tString);
            double df = fFeature.getTermFrequencyInDocument(tString).size();
            // Guard df == 0: totalDocs / 0 is Infinity, and 0 * Infinity is NaN,
            // which would leave a non-comparable score in the list being sorted.
            double idf = df > 0 ? Math.log(totalDocs / df) : 0.0;
            term.setScore(tf * idf);
            result.add(term);
        }

        Collections.sort(result);
        LOG.info("Complete");
        return result;
    }
}