package com.hkorte.elasticsearch.significance.measures; import com.hkorte.elasticsearch.significance.model.ScoredTerm; import org.elasticsearch.common.xcontent.XContentBuilder; import java.io.IOException; /** * The default heuristic used in Elasticsearch to compute significant terms. * * @see org.elasticsearch.search.aggregations.bucket.significant.InternalSignificantTerms */ public class DefaultHeuristic extends SignificanceMeasure { @Override public String shortName() { return "default"; } /** * @param n00 docs which do not contain word with negative class * @param n01 docs which do not contain word with positive class * @param n10 docs which contain word with negative class * @param n11 docs which contain word with positive class * @return The default heuristic used in ES to compute significant terms */ @Override public double compute(long n00, long n01, long n10, long n11) { return apply(n00, n01, n10, n11).getScore(); } @Override public ScoredTerm apply(long n00, long n01, long n10, long n11) { long subsetSize = n11 + n01; long supersetSize = n11 + n10 + n01 + n00; if ((subsetSize == 0) || (supersetSize == 0)) { // avoid any divide by zero issues return new ScoredTerm(0.0, n00, n01, n10, n11); } long subsetFreq = n11; long supersetFreq = n11 + n10; double subsetProbability = (double) subsetFreq / (double) subsetSize; double supersetProbability = (double) supersetFreq / (double) supersetSize; // Using absoluteProbabilityChange alone favours very common words e.g. you, we etc // because a doubling in popularity of a common term is a big percent difference // whereas a rare term would have to achieve a hundred-fold increase in popularity to // achieve the same difference measure. // In favouring common words as suggested features for search we would get high // recall but low precision. double absoluteProbabilityChange = subsetProbability - supersetProbability; if (absoluteProbabilityChange <= 0) { return new ScoredTerm(0.0, n00, n01, n10, n11); } // Using relativeProbabilityChange tends to favour rarer terms e.g.mis-spellings or // unique URLs. // A very low-probability term can very easily double in popularity due to the low // numbers required to do so whereas a high-probability term would have to add many // extra individual sightings to achieve the same shift. // In favouring rare words as suggested features for search we would get high // precision but low recall. double relativeProbabilityChange = (subsetProbability / supersetProbability); // A blend of the above metrics - favours medium-rare terms to strike a useful // balance between precision and recall. double score = absoluteProbabilityChange * relativeProbabilityChange; return new CustomScoredTerm(score, n00, n01, n10, n11, subsetProbability, supersetProbability, absoluteProbabilityChange, relativeProbabilityChange); } private static class CustomScoredTerm extends ScoredTerm { private double subsetProbability; private double supersetProbability; private double absoluteProbabilityChange; private double relativeProbabilityChange; public CustomScoredTerm(double score, long n00, long n01, long n10, long n11, double subsetProbability, double supersetProbability, double absoluteProbabilityChange, double relativeProbabilityChange) { super(score, n00, n01, n10, n11); this.subsetProbability = subsetProbability; this.supersetProbability = supersetProbability; this.absoluteProbabilityChange = absoluteProbabilityChange; this.relativeProbabilityChange = relativeProbabilityChange; } @Override public void addCustomFields(XContentBuilder builder) throws IOException { builder.field("subsetProbability", subsetProbability); builder.field("supersetProbability", supersetProbability); builder.field("absoluteProbabilityChange", absoluteProbabilityChange); builder.field("relativeProbabilityChange", relativeProbabilityChange); } } }