package org.basex.util.ft; import static java.lang.StrictMath.*; /** * Default scoring model, assembling all score calculations. * * @author BaseX Team 2005-12, BSD License * @author Christian Gruen */ public final class Scoring { /** Scoring multiplier to store values as integers. */ private static final int MP = 1000; /** Logarithmic base for calculating the score value. */ private static final double LOG = Math.E - 1; /** Scoring step. */ private static final double SCORESTEP = 0.8; /** Private constructor. */ private Scoring() { } /** * Calculates a score value, based on the token length * and complete text length. * @param tl token length * @param l complete length * @return result */ public static double word(final int tl, final double l) { return min(1, log(1 + LOG * tl / l)); } /** * Combines two scoring values. * @param o old value * @param n new value * @return result */ public static double and(final double o, final double n) { return 1 - (1 - o) * (1 - n); } /** * Combines two scoring values. * @param o old value * @param n new value * @return result */ public static double or(final double o, final double n) { return and(o, n); } /** * Inverses the scoring value for FTNot. * @param d scoring value * @return inverse scoring value */ public static double not(final double d) { return 1 - d; } /** * Returns a score for the let clause. * @param s summed up scoring values * @param c number of values * @return new score value */ public static double let(final double s, final int c) { return s / c; } /** * <p>Calculates a TF-IDF value for the specified values. * Used definition:</p> * <p>{@code freq(i, j) / max(l, freq(l, j)) * log(1 + N / n(i))}</p> * <p>The result is multiplied with the {@link #MP} constant to yield * integer values. The value {@code 2} is used as minimum score, * as the total minimum value will be subtracted by 1 to avoid eventual * {@code 0} scores.</p> * @param freq frequency of the token. TF: freq(i, j) * @param mfreq maximum occurrence of a token. TF: max(l, freq(l, j)) * @param docs number of documents in the collection. IDF: N * @param tokens number of documents containing the token. IDF: n(i) * @return score value */ public static int tfIDF(final double freq, final double mfreq, final double docs, final double tokens) { return (int) max(2, MP * freq / mfreq * log(1 + docs / tokens)); } /** * Calculates the score for a text node. * Used if no index score is available. * @param npv number of pos values * @param is total number of index entries * @param tokl token length * @param tl text length * @return score value */ public static double textNode(final int npv, final int is, final int tokl, final int tl) { return max((double) npv / is, log(tokl * npv + 1) / log(tl + 1)); } /** * Returns the scoring value for a phrase. * @param w1 score of word1 * @param w2 score of word2 * @return score of the phrase */ public static double intersect(final double w1, final double w2) { return (w1 + w2) / 2; } /** * Returns the union value. * @param w1 score of word1 * @param w2 score of word2 * @return score of the phrase */ public static double union(final double w1, final double w2) { return max(w1, w2); } /** * Returns a score for a single step. * @param sc current score value * @return new score value */ public static double step(final double sc) { return sc * SCORESTEP; } }