package org.apache.lucene.search;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.ByteBlockPool;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefHash;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.BytesRefHash.DirectBytesStartArray;

/** Tries to pick the best constant-score rewrite for a {@link MultiTermQuery}:
 *  a constant-score BooleanQuery of the matched terms, or the filter-based
 *  rewrite, depending on how many terms and documents the query hits. */
class ConstantScoreAutoRewrite extends TermCollectingRewrite<BooleanQuery> {

  // Defaults derived from rough tests with a 20.0 million
  // doc Wikipedia index.  With more than 350 terms in the
  // query, the filter method is fastest:
  public static int DEFAULT_TERM_COUNT_CUTOFF = 350;

  // If the query will hit more than 1 in 1000 of the docs
  // in the index (0.1%), the filter method is fastest:
  public static double DEFAULT_DOC_COUNT_PERCENT = 0.1;

  private int termCountCutoff = DEFAULT_TERM_COUNT_CUTOFF;
  private double docCountPercent = DEFAULT_DOC_COUNT_PERCENT;

  /** If the number of terms in this query is equal to or
   *  larger than this setting then {@link
   *  MultiTermQuery#CONSTANT_SCORE_FILTER_REWRITE} is used. */
  public void setTermCountCutoff(int count) {
    termCountCutoff = count;
  }

  /** @see #setTermCountCutoff */
  public int getTermCountCutoff() {
    return termCountCutoff;
  }

  /** If the number of documents to be visited in the
   *  postings exceeds this specified percentage of the
   *  maxDoc() for the index, then {@link
   *  MultiTermQuery#CONSTANT_SCORE_FILTER_REWRITE} is used.
   *  @param percent 0.0 to 100.0 */
  public void setDocCountPercent(double percent) {
    docCountPercent = percent;
  }

  /** @see #setDocCountPercent */
  public double getDocCountPercent() {
    return docCountPercent;
  }

  @Override
  protected BooleanQuery getTopLevelQuery() {
    return new BooleanQuery(true);
  }

  @Override
  protected void addClause(BooleanQuery topLevel, Term term, int docFreq, float boost /*ignored*/, TermContext states) {
    topLevel.add(new TermQuery(term, states), BooleanClause.Occur.SHOULD);
  }

  @Override
  public Query rewrite(final IndexReader reader, final MultiTermQuery query) throws IOException {

    // Get the enum and start visiting terms.  If we
    // exhaust the enum before hitting either of the
    // cutoffs, we rewrite to a constant-score BooleanQuery;
    // else we fall back to CONSTANT_SCORE_FILTER_REWRITE:
    final int docCountCutoff = (int) ((docCountPercent / 100.) * reader.maxDoc());
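    // e.g. with the 0.1% default and the 20 million doc Wikipedia index
    // the defaults were derived from, docCountCutoff works out to 20,000
    // visited docs before the filter rewrite takes over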
    final int termCountLimit = Math.min(BooleanQuery.getMaxClauseCount(), termCountCutoff);

    final CutOffTermCollector col = new CutOffTermCollector(docCountCutoff, termCountLimit);
    collectTerms(reader, query, col);
    final int size = col.pendingTerms.size();
    if (col.hasCutOff) {
      return MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE.rewrite(reader, query);
    } else {
      final BooleanQuery bq = getTopLevelQuery();
      if (size > 0) {
        final BytesRefHash pendingTerms = col.pendingTerms;
        final int sort[] = pendingTerms.sort(BytesRef.getUTF8SortedAsUnicodeComparator());
        for (int i = 0; i < size; i++) {
          final int pos = sort[i];
          // docFreq is not used for constant score here, we pass 1
          // to explicitly set a fake value, so it's not calculated
          addClause(bq, new Term(query.field, pendingTerms.get(pos, new BytesRef())), 1, 1.0f, col.array.termState[pos]);
        }
      }
      // Strip scores
      final Query result = new ConstantScoreQuery(bq);
      result.setBoost(query.getBoost());
      return result;
    }
  }

  static final class CutOffTermCollector extends TermCollector {
    CutOffTermCollector(int docCountCutoff, int termCountLimit) {
      this.docCountCutoff = docCountCutoff;
      this.termCountLimit = termCountLimit;
    }

    @Override
    public void setNextEnum(TermsEnum termsEnum) {
      this.termsEnum = termsEnum;
    }

    @Override
    public boolean collect(BytesRef bytes) throws IOException {
      int pos = pendingTerms.add(bytes);
      docVisitCount += termsEnum.docFreq();
      if (pendingTerms.size() >= termCountLimit || docVisitCount >= docCountCutoff) {
        hasCutOff = true;
        return false;
      }

      final TermState termState = termsEnum.termState();
      assert termState != null;
      if (pos < 0) {
        // BytesRefHash.add returns -(id+1) when the term is already in the
        // hash (seen in an earlier segment); recover the id and fold this
        // segment's stats into the existing TermContext
        pos = (-pos)-1;
        array.termState[pos].register(termState, readerContext.ord, termsEnum.docFreq(), termsEnum.totalTermFreq());
      } else {
        // first occurrence of this term: allocate a fresh TermContext
        array.termState[pos] = new TermContext(topReaderContext, termState, readerContext.ord, termsEnum.docFreq(), termsEnum.totalTermFreq());
      }
      return true;
    }

    int docVisitCount = 0;
    boolean hasCutOff = false;
    TermsEnum termsEnum;

    final int docCountCutoff, termCountLimit;
    final TermStateByteStart array = new TermStateByteStart(16);
    final BytesRefHash pendingTerms = new BytesRefHash(new ByteBlockPool(new ByteBlockPool.DirectAllocator()), 16, array);
  }

  @Override
  public int hashCode() {
    final int prime = 1279;
    return (int) (prime * termCountCutoff + Double.doubleToLongBits(docCountPercent));
  }

  @Override
  public boolean equals(Object obj) {
    if (this == obj) return true;
    if (obj == null) return false;
    if (getClass() != obj.getClass()) return false;

    ConstantScoreAutoRewrite other = (ConstantScoreAutoRewrite) obj;
    if (other.termCountCutoff != termCountCutoff) {
      return false;
    }
    if (Double.doubleToLongBits(other.docCountPercent) != Double.doubleToLongBits(docCountPercent)) {
      return false;
    }
    return true;
  }
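  /* Design note: the BytesRefHash in CutOffTermCollector hands out a stable
   * integer id per collected term; TermStateByteStart below grows a parallel
   * TermContext[] in lockstep with those ids, which is why collect() can
   * index array.termState[pos] directly. */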
  /** Special implementation of BytesStartArray that keeps parallel arrays for {@link TermContext} */
  static final class TermStateByteStart extends DirectBytesStartArray {
    TermContext[] termState;

    public TermStateByteStart(int initSize) {
      super(initSize);
    }

    @Override
    public int[] init() {
      final int[] ord = super.init();
      termState = new TermContext[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
      assert termState.length >= ord.length;
      return ord;
    }

    @Override
    public int[] grow() {
      final int[] ord = super.grow();
      if (termState.length < ord.length) {
        TermContext[] tmpTermState = new TermContext[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
        System.arraycopy(termState, 0, tmpTermState, 0, termState.length);
        termState = tmpTermState;
      }
      assert termState.length >= ord.length;
      return ord;
    }

    @Override
    public int[] clear() {
      termState = null;
      return super.clear();
    }
  }
}
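/* Usage sketch (illustrative only): this package-private class is normally
 * reached through the public wrapper MultiTermQuery.ConstantScoreAutoRewrite,
 * assuming a Lucene version from the same era as this file:
 *
 *   PrefixQuery q = new PrefixQuery(new Term("body", "lucen"));
 *   MultiTermQuery.ConstantScoreAutoRewrite rewrite =
 *       new MultiTermQuery.ConstantScoreAutoRewrite();
 *   rewrite.setTermCountCutoff(512);   // values picked only for illustration
 *   rewrite.setDocCountPercent(1.0);
 *   q.setRewriteMethod(rewrite);
 */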