/** * */ package querqy.lucene.rewrite; import java.io.IOException; import java.util.LinkedList; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.search.Query; import querqy.CompoundCharSequence; import querqy.lucene.rewrite.BooleanQueryFactory.Clause; import querqy.lucene.rewrite.cache.TermQueryCache; import querqy.model.AbstractNodeVisitor; import querqy.model.BooleanQuery; import querqy.model.DisjunctionMaxQuery; import querqy.model.Term; /** * @author René Kriegler, @renekrie * */ public class LuceneQueryBuilder extends AbstractNodeVisitor<LuceneQueryFactory<?>> { enum ParentType { BQ, DMQ } final boolean normalizeBooleanQueryBoost; final float dmqTieBreakerMultiplier; final DocumentFrequencyAndTermContextProvider dftcp; final SearchFieldsAndBoosting searchFieldsAndBoosting; final TermSubQueryBuilder termSubQueryBuilder; LinkedList<BooleanQueryFactory> clauseStack = new LinkedList<>(); LinkedList<DisjunctionMaxQueryFactory> dmqStack = new LinkedList<>(); boolean useBooleanQueryForDMQ = false; protected ParentType parentType = ParentType.BQ; public LuceneQueryBuilder(DocumentFrequencyAndTermContextProvider dftcp, Analyzer analyzer, SearchFieldsAndBoosting searchFieldsAndBoosting, float dmqTieBreakerMultiplier, TermQueryCache termQueryCache) { this( dftcp, analyzer, searchFieldsAndBoosting, dmqTieBreakerMultiplier, true, termQueryCache); } /** * <p>Field names and boost factors are applied like this:</p> * <p>If a term doesn't already have a field name, generate all term queries for all fields and boost factors * from generatedQueryFieldsAndBoostings - if the term was generated by some rewriter - or from queryFieldsAndBoostings * otherwise.</p> * <p>If a term already has a field name use the boost factor for this field from generatedQueryFieldsAndBoostings if * the term was generated, and from queryFieldsAndBoostings otherwise. If the respective map doesn't contain the field, * use the defaultGeneratedFieldBoostFactor for generated terms. If the term is not generated, treat the field name as * part of the term text (= "fieldname:value").</p> * * @param dftcp * @param analyzer * @param searchFieldsAndBoosting * @param dmqTieBreakerMultiplier * @param normalizeBooleanQueryBoost * @param termQueryCache The term query cache or null */ public LuceneQueryBuilder(DocumentFrequencyAndTermContextProvider dftcp, Analyzer analyzer, SearchFieldsAndBoosting searchFieldsAndBoosting, float dmqTieBreakerMultiplier, boolean normalizeBooleanQueryBoost, TermQueryCache termQueryCache) { this.searchFieldsAndBoosting = searchFieldsAndBoosting; this.dmqTieBreakerMultiplier = dmqTieBreakerMultiplier; this.normalizeBooleanQueryBoost = normalizeBooleanQueryBoost; this.dftcp = dftcp; termSubQueryBuilder = new TermSubQueryBuilder(analyzer, termQueryCache); } public void reset() { clauseStack.clear(); dmqStack.clear(); useBooleanQueryForDMQ = false; } public Query createQuery(querqy.model.Query query, boolean useBooleanQueryForDMQ) throws IOException { boolean tmp = this.useBooleanQueryForDMQ; try { this.useBooleanQueryForDMQ = useBooleanQueryForDMQ; return createQuery(query); } finally { this.useBooleanQueryForDMQ = tmp; } } public Query createQuery(querqy.model.Query query) throws IOException { LuceneQueryFactory<?> factory = visit(query); factory.prepareDocumentFrequencyCorrection(dftcp, false); return factory.createQuery(null, dmqTieBreakerMultiplier, dftcp); } @Override public LuceneQueryFactory<?> visit(querqy.model.Query query) { parentType = ParentType.BQ; return visit((BooleanQuery) query); } @Override public LuceneQueryFactory<?> visit(BooleanQuery booleanQuery) { BooleanQueryFactory bq = new BooleanQueryFactory( booleanQuery.isGenerated(), normalizeBooleanQueryBoost && parentType == ParentType.DMQ); ParentType myParentType = parentType; parentType = ParentType.BQ; clauseStack.add(bq); super.visit(booleanQuery); clauseStack.removeLast(); parentType = myParentType; Clause result; switch (bq.getNumberOfClauses()) { case 0: // no sub-query - this can happen if analysis filters out all tokens (stopwords) return new NeverMatchQueryFactory(); case 1: Clause firstClause = bq.getFirstClause(); if (firstClause.occur == Occur.SHOULD) { // optimise and propagate the single clause up one level, but only // if occur equals neither MUST nor MUST_NOT, which would be lost on the // top level query result = bq.getFirstClause(); } else { result = new Clause(bq, occur(booleanQuery.occur)); } break; default: result = new Clause(bq, occur(booleanQuery.occur)); } switch (parentType) { case BQ: if (!clauseStack.isEmpty()) { clauseStack.getLast().add(result); return bq; } else {// else we are the top BQ return result.queryFactory; } case DMQ: if (result.occur != Occur.SHOULD) { // create a wrapper query BooleanQueryFactory wrapper = new BooleanQueryFactory(true, false); wrapper.add(result); bq = wrapper; } dmqStack.getLast().add(bq); return bq; default: throw new RuntimeException("Unknown parentType " + parentType); } } protected Occur occur(querqy.model.SubQuery.Occur occur) { switch (occur) { case MUST: return Occur.MUST; case MUST_NOT: return Occur.MUST_NOT; case SHOULD: return Occur.SHOULD; } throw new IllegalArgumentException("Cannot handle occur value: " + occur.name()); } @Override public LuceneQueryFactory<?> visit(DisjunctionMaxQuery disjunctionMaxQuery) { ParentType myParentType = parentType; parentType = ParentType.DMQ; DisjunctionMaxQueryFactory dmq = new DisjunctionMaxQueryFactory(); dmqStack.add(dmq); super.visit(disjunctionMaxQuery); dmqStack.removeLast(); parentType = myParentType; switch (dmq.getNumberOfDisjuncts()) { case 0: // no sub-query - this can happen if analysis filters out all tokens (stopwords) return new NeverMatchQueryFactory(); case 1: LuceneQueryFactory<?> firstDisjunct = dmq.getFirstDisjunct(); clauseStack.getLast().add(firstDisjunct, occur(disjunctionMaxQuery.occur)); return firstDisjunct; default: // FIXME: we can decide this earlier --> avoid creating DMQ in case of // MUST_NOT boolean useBQ = this.useBooleanQueryForDMQ || (disjunctionMaxQuery.occur == querqy.model.SubQuery.Occur.MUST_NOT); if (useBQ) { // FIXME: correct to normalize boost? BooleanQueryFactory bq = new BooleanQueryFactory(true, false); for (LuceneQueryFactory<?> queryFactory : dmq.disjuncts) { bq.add(queryFactory, Occur.SHOULD); } clauseStack.getLast().add(bq, occur(disjunctionMaxQuery.occur)); return bq; } clauseStack.getLast().add(dmq, occur(disjunctionMaxQuery.occur)); return dmq; } } @Override public LuceneQueryFactory<?> visit(final Term term) { DisjunctionMaxQueryFactory siblings = dmqStack.getLast(); String fieldname = term.getField(); Term termToUse = null; try { FieldBoost fieldBoost = searchFieldsAndBoosting.getFieldBoost(term); if (fieldBoost == null) { if (fieldname != null && !term.isGenerated() && !searchFieldsAndBoosting.hasSearchField(fieldname, term)) { // someone searches in a field that is not set as a search field or didn't intend to search in a field at all // --> set value to fieldname + ":" + value in search in all fields Term termWithFieldInValue = new Term(null, new CompoundCharSequence(":", fieldname, term.getValue())); fieldBoost = searchFieldsAndBoosting.getFieldBoost(termWithFieldInValue); if (fieldBoost != null) { termToUse = termWithFieldInValue; } } } else { termToUse = term; } if (fieldBoost == null) { throw new RuntimeException("Could not get FieldBoost for term: " + term); } for (String searchField: searchFieldsAndBoosting.getSearchFields(termToUse)) { addTerm(searchField, fieldBoost, siblings, termToUse); } } catch (IOException e) { // REVISIT: throw more specific exception? // - or save exception in Builder and then throw IOException from // build() throw new RuntimeException(e); } return null; } /** * * <p> * Applies analysis to a term and adds the result to the Lucene query factory * tree. * </p> * * <p> * The analysis might emit multiple tokens for the input term. If these * tokens constitute a sequence (according to the position attribute), a * BooleanQuery will be created and each position in the sequence constitutes * a MUST clause of this BooleanQuery. If multiple tokens occur at the same * position, a DismaxQuery will be created in this position and the tokens * constitute its disjuncts. The tiebreak factor will be set to the * dmqTieBreakerMultiplier property of this LuceneQueryBuilder. * </p> * * * @param fieldname * @param boost * @param target * @param sourceTerm * @throws IOException */ void addTerm(String fieldname, FieldBoost boost, DisjunctionMaxQueryFactory target, Term sourceTerm) throws IOException { TermSubQueryFactory queryFactory = termSubQueryBuilder.termToFactory(fieldname, sourceTerm, boost);//termToFactory(fieldname, sourceTerm, boost); if (queryFactory != null) { target.add(queryFactory); boost.registerTermSubQuery(fieldname, queryFactory, sourceTerm); } } }