package doser.lucene.query;

import java.io.IOException;

import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.SmallFloat;

/**
 * A {@link Similarity} based on Lucene's classic TF-IDF model, with one
 * deliberate difference: the idf factor is excluded from the query weight, so
 * the final term weight is just the normalized query boost. tf, length norms,
 * coord, slop, and payload factors behave as in DefaultSimilarity.
 */
public class FuzzyLabelSimilarity extends Similarity {

    /**
     * Collection statistics for the TF-IDF model. The only statistic of
     * interest to this model is idf.
     */
    private static class IDFStats extends SimWeight {
        private final String field;
        /** The idf and its explanation. */
        private final Explanation idf;
        private final float queryBoost;
        private float queryNorm;
        private float queryWeight;
        private float value;

        public IDFStats(final String field, final Explanation idf,
                final float queryBoost) {
            this.field = field;
            this.idf = idf;
            this.queryBoost = queryBoost;
            // The idf factor is deliberately left out of the query weight;
            // only the boost contributes.
            queryWeight = queryBoost;
        }

        @Override
        public float getValueForNormalization() {
            // Sum of squared weights.
            return queryWeight * queryWeight;
        }

        @Override
        public void normalize(final float queryNorm, final float topLevelBoost) {
            this.queryNorm = queryNorm * topLevelBoost;
            queryWeight *= this.queryNorm; // normalize query weight
            // As above, idf is intentionally not folded into the final value.
            value = queryWeight;
        }
    }

    private final class TFIDFSimScorer extends SimScorer {
        private final NumericDocValues norms;
        private final IDFStats stats;
        private final float weightValue;

        TFIDFSimScorer(final IDFStats stats, final NumericDocValues norms)
                throws IOException {
            this.stats = stats;
            this.weightValue = stats.value;
            this.norms = norms;
        }

        @Override
        public float computePayloadFactor(final int doc, final int start,
                final int end, final BytesRef payload) {
            return scorePayload(doc, start, end, payload);
        }

        @Override
        public float computeSlopFactor(final int distance) {
            return sloppyFreq(distance);
        }

        @Override
        public Explanation explain(final int doc, final Explanation freq) {
            return explainScore(doc, freq, stats, norms);
        }

        @Override
        public float score(final int doc, final float freq) {
            // Compute tf(freq) * weight, then normalize for field length.
            final float raw = tf(freq) * weightValue;
            return norms == null ? raw : raw * decodeNormValue(norms.get(doc));
        }
    }

    /** Cache of decoded bytes. */
    private static final float[] NORM_TABLE = new float[256];

    static {
        for (int i = 0; i < 256; i++) {
            NORM_TABLE[i] = SmallFloat.byte315ToFloat((byte) i);
        }
    }

    /**
     * True if overlap tokens (tokens with a position increment of zero) are
     * discounted from the document's length.
     */
    private boolean discountOverlaps = true;

    @Override
    public final long computeNorm(final FieldInvertState state) {
        final float normValue = lengthNorm(state);
        return encodeNormValue(normValue);
    }
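    /*
     * Worked example of the weight-normalization handshake, following the
     * arithmetic of IDFStats and queryNorm(float) below (a sketch; the
     * numbers assume a hypothetical single-term query with queryBoost = 2
     * and topLevelBoost = 1):
     *
     *   queryWeight                = queryBoost           = 2.0
     *   getValueForNormalization() = 2.0 * 2.0            = 4.0
     *   queryNorm(4.0)             = 1 / sqrt(4.0)        = 0.5
     *   normalize(0.5, 1.0)        -> queryWeight = value = 1.0
     *
     * Because IDFStats ignores idf, a single-term query under these
     * assumptions normalizes to a weight of 1.0 regardless of its boost; idf
     * still appears in the explain() output but not in the actual score.
     */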
    @Override
    public final SimWeight computeWeight(final float queryBoost,
            final CollectionStatistics collectionStats,
            final TermStatistics... termStats) {
        final Explanation idf = termStats.length == 1 ? this.idfExplain(
                collectionStats, termStats[0]) : this.idfExplain(
                collectionStats, termStats);
        return new IDFStats(collectionStats.field(), idf, queryBoost);
    }

    /**
     * Computes a score factor based on the fraction of all query terms that a
     * document contains. This value is multiplied into scores.
     *
     * <p>
     * The presence of a large portion of the query terms indicates a better
     * match with the query, so implementations of this method usually return
     * larger values when the ratio between these parameters is large and
     * smaller values when the ratio between them is small.
     *
     * @param overlap
     *            the number of query terms matched in the document
     * @param maxOverlap
     *            the total number of terms in the query
     * @return a score factor based on term overlap with the query
     */
    @Override
    public float coord(final int overlap, final int maxOverlap) {
        return overlap / (float) maxOverlap;
    }

    /**
     * Decodes a normalization factor stored in an index.
     *
     * @see #encodeNormValue(float)
     */
    public final float decodeNormValue(final long norm) {
        // & 0xFF maps negative bytes to positive values above 127.
        return NORM_TABLE[(int) (norm & 0xFF)];
    }

    /** Encodes a normalization factor for storage in an index. */
    public final long encodeNormValue(final float floatVal) {
        return SmallFloat.floatToByte315(floatVal);
    }

    private Explanation explainScore(final int doc, final Explanation freq,
            final IDFStats stats, final NumericDocValues norms) {
        final Explanation result = new Explanation();
        result.setDescription("score(doc=" + doc + ",freq=" + freq
                + "), product of:");

        // Explain the query weight.
        final Explanation queryExpl = new Explanation();
        queryExpl.setDescription("queryWeight, product of:");

        final Explanation boostExpl = new Explanation(stats.queryBoost, "boost");
        if (stats.queryBoost != 1.0f) {
            queryExpl.addDetail(boostExpl);
        }
        queryExpl.addDetail(stats.idf);

        final Explanation queryNormExpl = new Explanation(stats.queryNorm,
                "queryNorm");
        queryExpl.addDetail(queryNormExpl);

        queryExpl.setValue(boostExpl.getValue() * stats.idf.getValue()
                * queryNormExpl.getValue());
        result.addDetail(queryExpl);

        // Explain the field weight.
        final Explanation fieldExpl = new Explanation();
        fieldExpl.setDescription("fieldWeight in " + doc + ", product of:");

        final Explanation tfExplanation = new Explanation();
        tfExplanation.setValue(tf(freq.getValue()));
        tfExplanation.setDescription("tf(freq=" + freq.getValue()
                + "), with freq of:");
        tfExplanation.addDetail(freq);
        fieldExpl.addDetail(tfExplanation);
        fieldExpl.addDetail(stats.idf);

        final Explanation fieldNormExpl = new Explanation();
        final float fieldNorm = norms != null ? decodeNormValue(norms.get(doc))
                : 1.0f;
        fieldNormExpl.setValue(fieldNorm);
        fieldNormExpl.setDescription("fieldNorm(doc=" + doc + ")");
        fieldExpl.addDetail(fieldNormExpl);

        fieldExpl.setValue(tfExplanation.getValue() * stats.idf.getValue()
                * fieldNormExpl.getValue());
        result.addDetail(fieldExpl);

        // Combine them.
        result.setValue(queryExpl.getValue() * fieldExpl.getValue());
        if (queryExpl.getValue() == 1.0f) {
            return fieldExpl;
        }
        return result;
    }

    /**
     * Returns true if overlap tokens are discounted from the document's
     * length.
     *
     * @see #setDiscountOverlaps
     */
    public boolean getDiscountOverlaps() {
        return discountOverlaps;
    }
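    /*
     * Note on the norm round trip (a qualitative sketch, not exact values):
     * encodeNormValue(float) stores the norm as a single byte via
     * SmallFloat.floatToByte315, so decodeNormValue(encodeNormValue(f)) only
     * approximates f. Length norms for documents of similar length can
     * therefore collapse to the same byte, and any code comparing decoded
     * norms should tolerate this loss of precision.
     */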
    /**
     * Computes a score factor based on a term's document frequency (the
     * number of documents which contain the term). This value is multiplied
     * by the {@link #tf(float)} factor for each term in the query and these
     * products are then summed to form the initial score for a document.
     *
     * <p>
     * Terms that occur in fewer documents are better indicators of topic, so
     * implementations of this method usually return larger values for rare
     * terms, and smaller values for common terms.
     *
     * @param docFreq
     *            the number of documents which contain the term
     * @param numDocs
     *            the total number of documents in the collection
     * @return a score factor based on the term's document frequency
     */
    public float idf(final long docFreq, final long numDocs) {
        return (float) (Math.log(numDocs / (double) (docFreq + 1)) + 1.0);
    }

    /**
     * Computes a score factor for a simple term and returns an explanation
     * for that score factor.
     *
     * <p>
     * The default implementation uses:
     *
     * <pre class="prettyprint">
     * idf(docFreq, searcher.maxDoc());
     * </pre>
     *
     * Note that {@link CollectionStatistics#maxDoc()} is used instead of
     * {@link org.apache.lucene.index.IndexReader#numDocs()
     * IndexReader#numDocs()} because {@link TermStatistics#docFreq()} is also
     * used, and when the latter is inaccurate, so is
     * {@link CollectionStatistics#maxDoc()}, and in the same direction. In
     * addition, {@link CollectionStatistics#maxDoc()} is more efficient to
     * compute.
     *
     * @param collectionStats
     *            collection-level statistics
     * @param termStats
     *            term-level statistics for the term
     * @return an Explain object that includes both an idf score factor and an
     *         explanation for the term.
     */
    public Explanation idfExplain(final CollectionStatistics collectionStats,
            final TermStatistics termStats) {
        final long docFreq = termStats.docFreq();
        final long max = collectionStats.maxDoc();
        final float idf = idf(docFreq, max);
        return new Explanation(idf, "idf(docFreq=" + docFreq + ", maxDocs="
                + max + ")");
    }

    /**
     * Computes a score factor for a phrase.
     *
     * <p>
     * The default implementation sums the idf factor for each term in the
     * phrase.
     *
     * @param collectionStats
     *            collection-level statistics
     * @param termStats
     *            term-level statistics for the terms in the phrase
     * @return an Explain object that includes both an idf score factor for
     *         the phrase and an explanation for each term.
     */
    public Explanation idfExplain(final CollectionStatistics collectionStats,
            final TermStatistics[] termStats) {
        final long max = collectionStats.maxDoc();
        float idf = 0.0f;
        final Explanation exp = new Explanation();
        exp.setDescription("idf(), sum of:");
        for (final TermStatistics stat : termStats) {
            final long docFreq = stat.docFreq();
            final float termIdf = idf(docFreq, max);
            exp.addDetail(new Explanation(termIdf, "idf(docFreq=" + docFreq
                    + ", maxDocs=" + max + ")"));
            idf += termIdf;
        }
        exp.setValue(idf);
        return exp;
    }

    /**
     * Computes an index-time normalization value for this field instance.
     *
     * <p>
     * This value will be stored in a single-byte lossy representation by
     * {@link #encodeNormValue(float)}.
     *
     * @param state
     *            statistics of the current field (such as length, boost, etc.)
     * @return an index-time normalization value
     */
    public float lengthNorm(final FieldInvertState state) {
        final int numTerms;
        if (discountOverlaps) {
            numTerms = state.getLength() - state.getNumOverlap();
        } else {
            numTerms = state.getLength();
        }
        return state.getBoost() * ((float) (1.0 / Math.sqrt(numTerms)));
    }
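    /*
     * Worked example with hypothetical numbers, applying the formulas above:
     * with maxDoc = 1000 and docFreq = 9,
     *
     *   idf(9, 1000) = ln(1000 / (9 + 1)) + 1 = ln(100) + 1 ~= 5.605
     *
     * and a 25-term field with boost 1.0 gets
     *
     *   lengthNorm = 1.0 * (1.0 / sqrt(25)) = 0.2
     *
     * before the lossy single-byte encoding of encodeNormValue(float).
     */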
    /**
     * Computes the normalization value for a query given the sum of the
     * squared weights of each of the query terms. This value is multiplied
     * into the weight of each query term. While the classic query
     * normalization factor is computed as 1/sqrt(sumOfSquaredWeights), other
     * implementations might completely ignore sumOfSquaredWeights (i.e.
     * return 1).
     *
     * <p>
     * This does not affect ranking, but the default implementation does make
     * scores from different queries more comparable than they would otherwise
     * be, by eliminating the magnitude of the Query vector as a factor in the
     * score.
     *
     * @param sumSquaredWeights
     *            the sum of the squares of query term weights
     * @return a normalization factor for query weights
     */
    @Override
    public float queryNorm(final float sumSquaredWeights) {
        return (float) (1.0 / Math.sqrt(sumSquaredWeights));
    }

    /**
     * Calculates a scoring factor based on the data in the payload.
     * Implementations are responsible for interpreting what is in the
     * payload. Lucene makes no assumptions about what is in the byte array.
     *
     * @param doc
     *            the docId currently being scored
     * @param start
     *            the start position of the payload
     * @param end
     *            the end position of the payload
     * @param payload
     *            the payload byte array to be scored
     * @return an implementation-dependent float to be used as a scoring
     *         factor
     */
    public float scorePayload(final int doc, final int start, final int end,
            final BytesRef payload) {
        return 1;
    }

    /**
     * Determines whether overlap tokens (tokens with a position increment of
     * zero) are ignored when computing norms. By default this is true,
     * meaning overlap tokens do not count when computing norms.
     *
     * @lucene.experimental
     *
     * @see #computeNorm
     */
    public void setDiscountOverlaps(final boolean bool) {
        discountOverlaps = bool;
    }

    @Override
    public final SimScorer simScorer(final SimWeight stats,
            final AtomicReaderContext context) throws IOException {
        final IDFStats idfStats = (IDFStats) stats;
        return new TFIDFSimScorer(idfStats, context.reader().getNormValues(
                idfStats.field));
    }

    /**
     * Computes the amount of a sloppy phrase match, based on an edit
     * distance. This value is summed for each sloppy phrase match in a
     * document to form the frequency to be used in scoring instead of the
     * exact term count.
     *
     * <p>
     * A phrase match with a small edit distance to a document passage more
     * closely matches the document, so implementations of this method usually
     * return larger values when the edit distance is small and smaller values
     * when it is large.
     *
     * @see PhraseQuery#setSlop(int)
     * @param distance
     *            the edit distance of this sloppy phrase match
     * @return the frequency increment for this match
     */
    public float sloppyFreq(final int distance) {
        return 1.0f / (distance + 1);
    }

    /**
     * Computes a score factor based on a term or phrase's frequency in a
     * document. This value is multiplied by the {@link #idf(long, long)}
     * factor for each term in the query and these products are then summed to
     * form the initial score for a document.
     *
     * <p>
     * Terms and phrases repeated in a document indicate the topic of the
     * document, so implementations of this method usually return larger
     * values when <code>freq</code> is large, and smaller values when
     * <code>freq</code> is small.
     *
     * @param freq
     *            the frequency of a term within a document
     * @return a score factor based on a term's within-document frequency
     */
    public float tf(final float freq) {
        return (float) Math.sqrt(freq);
    }

    @Override
    public String toString() {
        return "FuzzyLabelSimilarity";
    }
}
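/*
 * Usage sketch (an assumption for illustration, not part of the original
 * class): with the Lucene 4.x APIs matching the imports above, the similarity
 * must be installed both at index time, so computeNorm encodes norms with
 * lengthNorm, and at search time, so scoring decodes them the same way. The
 * Version and Analyzer arguments are placeholders supplied by the caller.
 */
class FuzzyLabelSimilarityUsageSketch {

    static org.apache.lucene.index.IndexWriterConfig newWriterConfig(
            final org.apache.lucene.util.Version version,
            final org.apache.lucene.analysis.Analyzer analyzer) {
        final org.apache.lucene.index.IndexWriterConfig config =
                new org.apache.lucene.index.IndexWriterConfig(version, analyzer);
        // Index-time: norms are encoded by FuzzyLabelSimilarity.computeNorm.
        config.setSimilarity(new FuzzyLabelSimilarity());
        return config;
    }

    static org.apache.lucene.search.IndexSearcher newSearcher(
            final org.apache.lucene.index.DirectoryReader reader) {
        final org.apache.lucene.search.IndexSearcher searcher =
                new org.apache.lucene.search.IndexSearcher(reader);
        // Search-time: scoring uses the overridden tf/idf and norm decoding.
        searcher.setSimilarity(new FuzzyLabelSimilarity());
        return searcher;
    }
}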