DefaultHeuristic.java example

Explorer

significance-showcase-master
- src
  - main
    - java
      - com
        hkorte
        elasticsearch
        significance
        SignificanceShowcasePlugin.java
        SignificantTermsProvider.java
        measures
        ChiSquared.java
        DefaultHeuristic.java
        KullbackLeiblerDivergence.java
        MutualInformation.java
        SignificanceMeasure.java
        model
        ScoredTerm.java
        rest
        SignificanceShowcaseRestHandler.java
        SignificanceShowcaseRestModule.java
  - test
    - java
      - com
        hkorte
        elasticsearch
        significance
        SignificanceShowcaseStarter.java

package com.hkorte.elasticsearch.significance.measures;

import com.hkorte.elasticsearch.significance.model.ScoredTerm;
import org.elasticsearch.common.xcontent.XContentBuilder;

import java.io.IOException;

/**
 * The default heuristic used in Elasticsearch to compute significant terms.
 *
 * @see org.elasticsearch.search.aggregations.bucket.significant.InternalSignificantTerms
 */
public class DefaultHeuristic extends SignificanceMeasure {

	@Override
	public String shortName() {
		return "default";
	}

	/**
	 * @param n00 docs which do not contain word with negative class
	 * @param n01 docs which do not contain word with positive class
	 * @param n10 docs which contain word with negative class
	 * @param n11 docs which contain word with positive class
	 * @return The default heuristic used in ES to compute significant terms
	 */
	@Override
	public double compute(long n00, long n01, long n10, long n11) {
		return apply(n00, n01, n10, n11).getScore();
	}

	@Override
	public ScoredTerm apply(long n00, long n01, long n10, long n11) {
		long subsetSize = n11 + n01;
		long supersetSize = n11 + n10 + n01 + n00;
		if ((subsetSize == 0) || (supersetSize == 0)) {
			// avoid any divide by zero issues
			return new ScoredTerm(0.0, n00, n01, n10, n11);
		}

		long subsetFreq = n11;
		long supersetFreq = n11 + n10;

		double subsetProbability = (double) subsetFreq / (double) subsetSize;
		double supersetProbability = (double) supersetFreq / (double) supersetSize;

		// Using absoluteProbabilityChange alone favours very common words e.g. you, we etc
		// because a doubling in popularity of a common term is a big percent difference
		// whereas a rare term would have to achieve a hundred-fold increase in popularity to
		// achieve the same difference measure.
		// In favouring common words as suggested features for search we would get high
		// recall but low precision.
		double absoluteProbabilityChange = subsetProbability - supersetProbability;
		if (absoluteProbabilityChange <= 0) {
			return new ScoredTerm(0.0, n00, n01, n10, n11);
		}
		// Using relativeProbabilityChange tends to favour rarer terms e.g.mis-spellings or
		// unique URLs.
		// A very low-probability term can very easily double in popularity due to the low
		// numbers required to do so whereas a high-probability term would have to add many
		// extra individual sightings to achieve the same shift.
		// In favouring rare words as suggested features for search we would get high
		// precision but low recall.
		double relativeProbabilityChange = (subsetProbability / supersetProbability);

		// A blend of the above metrics - favours medium-rare terms to strike a useful
		// balance between precision and recall.
		double score = absoluteProbabilityChange * relativeProbabilityChange;
		return new CustomScoredTerm(score, n00, n01, n10, n11, subsetProbability,
				supersetProbability, absoluteProbabilityChange, relativeProbabilityChange);
	}

	private static class CustomScoredTerm extends ScoredTerm {

		private double subsetProbability;
		private double supersetProbability;
		private double absoluteProbabilityChange;
		private double relativeProbabilityChange;

		public CustomScoredTerm(double score, long n00, long n01, long n10, long n11, double subsetProbability,
								double supersetProbability, double absoluteProbabilityChange,
								double relativeProbabilityChange) {
			super(score, n00, n01, n10, n11);
			this.subsetProbability = subsetProbability;
			this.supersetProbability = supersetProbability;
			this.absoluteProbabilityChange = absoluteProbabilityChange;
			this.relativeProbabilityChange = relativeProbabilityChange;
		}

		@Override
		public void addCustomFields(XContentBuilder builder) throws IOException {
			builder.field("subsetProbability", subsetProbability);
			builder.field("supersetProbability", supersetProbability);
			builder.field("absoluteProbabilityChange", absoluteProbabilityChange);
			builder.field("relativeProbabilityChange", relativeProbabilityChange);
		}
	}
}