package org.apache.lucene.sandbox.queries; /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.MultiFields; import org.apache.lucene.index.Term; import org.apache.lucene.index.Terms; import org.apache.lucene.search.*; import org.apache.lucene.search.similarities.TFIDFSimilarity; import org.apache.lucene.search.similarities.DefaultSimilarity; import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.PriorityQueue; /** * Fuzzifies ALL terms provided as strings and then picks the best n differentiating terms. * In effect this mixes the behaviour of FuzzyQuery and MoreLikeThis but with special consideration * of fuzzy scoring factors. * This generally produces good results for queries where users may provide details in a number of * fields and have no knowledge of boolean query syntax and also want a degree of fuzzy matching and * a fast query. * * For each source term the fuzzy variants are held in a BooleanQuery with no coord factor (because * we are not looking for matches on multiple variants in any one doc). Additionally, a specialized * TermQuery is used for variants and does not use that variant term's IDF because this would favour rarer * terms eg misspellings. Instead, all variants use the same IDF ranking (the one for the source query * term) and this is factored into the variant's boost. If the source query term does not exist in the * index the average IDF of the variants is used. */ public class FuzzyLikeThisQuery extends Query { // TODO: generalize this query (at least it should not reuse this static sim! // a better way might be to convert this into multitermquery rewrite methods. // the rewrite method can 'average' the TermContext's term statistics (docfreq,totalTermFreq) // provided to TermQuery, so that the general idea is agnostic to any scoring system... static TFIDFSimilarity sim=new DefaultSimilarity(); Query rewrittenQuery=null; ArrayList<FieldVals> fieldVals=new ArrayList<>(); Analyzer analyzer; ScoreTermQueue q; int MAX_VARIANTS_PER_TERM=50; boolean ignoreTF=false; private int maxNumTerms; @Override public int hashCode() { final int prime = 31; int result = super.hashCode(); result = prime * result + ((analyzer == null) ? 0 : analyzer.hashCode()); result = prime * result + ((fieldVals == null) ? 0 : fieldVals.hashCode()); result = prime * result + (ignoreTF ? 1231 : 1237); result = prime * result + maxNumTerms; return result; } @Override public boolean equals(Object obj) { if (this == obj) return true; if (obj == null) return false; if (getClass() != obj.getClass()) return false; if (!super.equals(obj)) { return false; } FuzzyLikeThisQuery other = (FuzzyLikeThisQuery) obj; if (analyzer == null) { if (other.analyzer != null) return false; } else if (!analyzer.equals(other.analyzer)) return false; if (fieldVals == null) { if (other.fieldVals != null) return false; } else if (!fieldVals.equals(other.fieldVals)) return false; if (ignoreTF != other.ignoreTF) return false; if (maxNumTerms != other.maxNumTerms) return false; return true; } /** * * @param maxNumTerms The total number of terms clauses that will appear once rewritten as a BooleanQuery */ public FuzzyLikeThisQuery(int maxNumTerms, Analyzer analyzer) { q=new ScoreTermQueue(maxNumTerms); this.analyzer=analyzer; this.maxNumTerms = maxNumTerms; } class FieldVals { String queryString; String fieldName; float minSimilarity; int prefixLength; public FieldVals(String name, float similarity, int length, String queryString) { fieldName = name; minSimilarity = similarity; prefixLength = length; this.queryString = queryString; } @Override public int hashCode() { final int prime = 31; int result = 1; result = prime * result + ((fieldName == null) ? 0 : fieldName.hashCode()); result = prime * result + Float.floatToIntBits(minSimilarity); result = prime * result + prefixLength; result = prime * result + ((queryString == null) ? 0 : queryString.hashCode()); return result; } @Override public boolean equals(Object obj) { if (this == obj) return true; if (obj == null) return false; if (getClass() != obj.getClass()) return false; FieldVals other = (FieldVals) obj; if (fieldName == null) { if (other.fieldName != null) return false; } else if (!fieldName.equals(other.fieldName)) return false; if (Float.floatToIntBits(minSimilarity) != Float .floatToIntBits(other.minSimilarity)) return false; if (prefixLength != other.prefixLength) return false; if (queryString == null) { if (other.queryString != null) return false; } else if (!queryString.equals(other.queryString)) return false; return true; } } /** * Adds user input for "fuzzification" * @param queryString The string which will be parsed by the analyzer and for which fuzzy variants will be parsed * @param minSimilarity The minimum similarity of the term variants (see FuzzyTermsEnum) * @param prefixLength Length of required common prefix on variant terms (see FuzzyTermsEnum) */ public void addTerms(String queryString, String fieldName,float minSimilarity, int prefixLength) { fieldVals.add(new FieldVals(fieldName,minSimilarity,prefixLength,queryString)); } private void addTerms(IndexReader reader, FieldVals f) throws IOException { if (f.queryString == null) return; final Terms terms = MultiFields.getTerms(reader, f.fieldName); if (terms == null) { return; } try (TokenStream ts = analyzer.tokenStream(f.fieldName, f.queryString)) { CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); int corpusNumDocs = reader.numDocs(); HashSet<String> processedTerms = new HashSet<>(); ts.reset(); while (ts.incrementToken()) { String term = termAtt.toString(); if (!processedTerms.contains(term)) { processedTerms.add(term); ScoreTermQueue variantsQ = new ScoreTermQueue(MAX_VARIANTS_PER_TERM); //maxNum variants considered for any one term float minScore = 0; Term startTerm = new Term(f.fieldName, term); AttributeSource atts = new AttributeSource(); MaxNonCompetitiveBoostAttribute maxBoostAtt = atts.addAttribute(MaxNonCompetitiveBoostAttribute.class); SlowFuzzyTermsEnum fe = new SlowFuzzyTermsEnum(terms, atts, startTerm, f.minSimilarity, f.prefixLength); //store the df so all variants use same idf int df = reader.docFreq(startTerm); int numVariants = 0; int totalVariantDocFreqs = 0; BytesRef possibleMatch; BoostAttribute boostAtt = fe.attributes().addAttribute(BoostAttribute.class); while ((possibleMatch = fe.next()) != null) { numVariants++; totalVariantDocFreqs += fe.docFreq(); float score = boostAtt.getBoost(); if (variantsQ.size() < MAX_VARIANTS_PER_TERM || score > minScore) { ScoreTerm st = new ScoreTerm(new Term(startTerm.field(), BytesRef.deepCopyOf(possibleMatch)), score, startTerm); variantsQ.insertWithOverflow(st); minScore = variantsQ.top().score; // maintain minScore } maxBoostAtt.setMaxNonCompetitiveBoost(variantsQ.size() >= MAX_VARIANTS_PER_TERM ? minScore : Float.NEGATIVE_INFINITY); } if (numVariants > 0) { int avgDf = totalVariantDocFreqs / numVariants; if (df == 0)//no direct match we can use as df for all variants { df = avgDf; //use avg df of all variants } // take the top variants (scored by edit distance) and reset the score // to include an IDF factor then add to the global queue for ranking // overall top query terms int size = variantsQ.size(); for (int i = 0; i < size; i++) { ScoreTerm st = variantsQ.pop(); st.score = (st.score * st.score) * sim.idf(df, corpusNumDocs); q.insertWithOverflow(st); } } } } ts.end(); } } @Override public Query rewrite(IndexReader reader) throws IOException { if(rewrittenQuery!=null) { return rewrittenQuery; } //load up the list of possible terms for (Iterator<FieldVals> iter = fieldVals.iterator(); iter.hasNext(); ) { FieldVals f = iter.next(); addTerms(reader, f); } //clear the list of fields fieldVals.clear(); BooleanQuery bq=new BooleanQuery(); //create BooleanQueries to hold the variants for each token/field pair and ensure it // has no coord factor //Step 1: sort the termqueries by term/field HashMap<Term,ArrayList<ScoreTerm>> variantQueries=new HashMap<>(); int size = q.size(); for(int i = 0; i < size; i++) { ScoreTerm st = q.pop(); ArrayList<ScoreTerm> l= variantQueries.get(st.fuzziedSourceTerm); if(l==null) { l=new ArrayList<>(); variantQueries.put(st.fuzziedSourceTerm,l); } l.add(st); } //Step 2: Organize the sorted termqueries into zero-coord scoring boolean queries for (Iterator<ArrayList<ScoreTerm>> iter = variantQueries.values().iterator(); iter.hasNext();) { ArrayList<ScoreTerm> variants = iter.next(); if(variants.size()==1) { //optimize where only one selected variant ScoreTerm st= variants.get(0); Query tq = ignoreTF ? new ConstantScoreQuery(new TermQuery(st.term)) : new TermQuery(st.term, 1); tq.setBoost(st.score); // set the boost to a mix of IDF and score bq.add(tq, BooleanClause.Occur.SHOULD); } else { BooleanQuery termVariants=new BooleanQuery(true); //disable coord and IDF for these term variants for (Iterator<ScoreTerm> iterator2 = variants.iterator(); iterator2 .hasNext();) { ScoreTerm st = iterator2.next(); // found a match Query tq = ignoreTF ? new ConstantScoreQuery(new TermQuery(st.term)) : new TermQuery(st.term, 1); tq.setBoost(st.score); // set the boost using the ScoreTerm's score termVariants.add(tq, BooleanClause.Occur.SHOULD); // add to query } bq.add(termVariants, BooleanClause.Occur.SHOULD); // add to query } } //TODO possible alternative step 3 - organize above booleans into a new layer of field-based // booleans with a minimum-should-match of NumFields-1? bq.setBoost(getBoost()); this.rewrittenQuery=bq; return bq; } //Holds info for a fuzzy term variant - initially score is set to edit distance (for ranking best // term variants) then is reset with IDF for use in ranking against all other // terms/fields private static class ScoreTerm{ public Term term; public float score; Term fuzziedSourceTerm; public ScoreTerm(Term term, float score, Term fuzziedSourceTerm){ this.term = term; this.score = score; this.fuzziedSourceTerm=fuzziedSourceTerm; } } private static class ScoreTermQueue extends PriorityQueue<ScoreTerm> { public ScoreTermQueue(int size){ super(size); } /* (non-Javadoc) * @see org.apache.lucene.util.PriorityQueue#lessThan(java.lang.Object, java.lang.Object) */ @Override protected boolean lessThan(ScoreTerm termA, ScoreTerm termB) { if (termA.score== termB.score) return termA.term.compareTo(termB.term) > 0; else return termA.score < termB.score; } } /* (non-Javadoc) * @see org.apache.lucene.search.Query#toString(java.lang.String) */ @Override public String toString(String field) { return null; } public boolean isIgnoreTF() { return ignoreTF; } public void setIgnoreTF(boolean ignoreTF) { this.ignoreTF = ignoreTF; } }