package org.apache.lucene.search;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.index.DocsEnum; // javadoc @link
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.util.OpenBitSet;
import org.apache.lucene.util.BytesRef;

/**
 * A {@link Filter} that only accepts documents whose single
 * term value in the specified field is contained in the
 * provided set of allowed terms.
 *
 * <p/>
 *
 * This is the same functionality as TermsFilter (from
 * contrib/queries), except this filter requires that the
 * field contains only a single term for all documents.
 * Because of drastically different implementations, they
 * also have different performance characteristics, as
 * described below.
 *
 * <p/>
 *
 * The first invocation of this filter on a given field will
 * be slower, since a {@link FieldCache.DocTermsIndex} must be
 * created.  Subsequent invocations using the same field
 * will re-use this cache.  However, as with all
 * functionality based on {@link FieldCache}, persistent RAM
 * is consumed to hold the cache, and is not freed until the
 * {@link IndexReader} is closed.  In contrast, TermsFilter
 * has no persistent RAM consumption.
 *
 * <p/>
 *
 * With each search, this filter translates the specified
 * set of Terms into a private {@link OpenBitSet} keyed by
 * term number per unique {@link IndexReader} (normally one
 * reader per segment).  Then, during matching, the term
 * number for each docID is retrieved from the cache and
 * then checked for inclusion using the {@link OpenBitSet}.
 * Since all testing is done using RAM resident data
 * structures, performance should be very fast, most likely
 * fast enough to not require further caching of the
 * DocIdSet for each possible combination of terms.
 * However, because docIDs are simply scanned linearly, an
 * index with a great many small documents may find this
 * linear scan too costly.
 *
 * <p/>
 *
 * In contrast, TermsFilter builds up an {@link OpenBitSet},
 * keyed by docID, every time it's created, by enumerating
 * through all matching docs using {@link DocsEnum} to seek
 * and scan through each term's docID list.  While there is
 * no linear scan of all docIDs, besides the allocation of
 * the underlying array in the {@link OpenBitSet}, this
 * approach requires a number of "disk seeks" in proportion
 * to the number of terms, which can be exceptionally costly
 * when there are cache misses in the OS's IO cache.
 *
 * <p/>
 *
 * Generally, this filter will be slower on the first
 * invocation for a given field, but subsequent invocations,
 * even if you change the allowed set of Terms, should be
 * faster than TermsFilter, especially as the number of
 * Terms being matched increases.
 * If you are matching only
 * a very small number of terms, and those terms in turn
 * match a very small number of documents, TermsFilter may
 * perform faster.
 *
 * <p/>
 *
 * Which filter is best is very application dependent.
 */
public class FieldCacheTermsFilter extends Filter {
  private String field;
  private BytesRef[] terms;

  public FieldCacheTermsFilter(String field, BytesRef... terms) {
    this.field = field;
    this.terms = terms;
  }

  public FieldCacheTermsFilter(String field, String... terms) {
    this.field = field;
    this.terms = new BytesRef[terms.length];
    for (int i = 0; i < terms.length; i++)
      this.terms[i] = new BytesRef(terms[i]);
  }

  public FieldCache getFieldCache() {
    return FieldCache.DEFAULT;
  }

  @Override
  public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
    return new FieldCacheTermsFilterDocIdSet(getFieldCache().getTermsIndex(reader, field));
  }

  protected class FieldCacheTermsFilterDocIdSet extends DocIdSet {
    private FieldCache.DocTermsIndex fcsi;

    private OpenBitSet openBitSet;

    public FieldCacheTermsFilterDocIdSet(FieldCache.DocTermsIndex fcsi) {
      this.fcsi = fcsi;
      openBitSet = new OpenBitSet(this.fcsi.size());
      final BytesRef spare = new BytesRef();
      // Mark the ord of each allowed term; only positive ords correspond to
      // terms actually present in this reader's field.
      for (int i = 0; i < terms.length; i++) {
        int termNumber = this.fcsi.binarySearchLookup(terms[i], spare);
        if (termNumber > 0) {
          openBitSet.fastSet(termNumber);
        }
      }
    }

    @Override
    public DocIdSetIterator iterator() {
      return new FieldCacheTermsFilterDocIdSetIterator();
    }

    /** This DocIdSet implementation is cacheable. */
    @Override
    public boolean isCacheable() {
      return true;
    }

    protected class FieldCacheTermsFilterDocIdSetIterator extends DocIdSetIterator {
      private int doc = -1;

      @Override
      public int docID() {
        return doc;
      }

      @Override
      public int nextDoc() {
        try {
          // Scan forward until a doc whose ord is in the allowed set is found;
          // getOrd throws ArrayIndexOutOfBoundsException past the last doc,
          // which signals exhaustion.
          while (!openBitSet.fastGet(fcsi.getOrd(++doc))) {}
        } catch (ArrayIndexOutOfBoundsException e) {
          doc = NO_MORE_DOCS;
        }
        return doc;
      }

      @Override
      public int advance(int target) {
        try {
          doc = target;
          while (!openBitSet.fastGet(fcsi.getOrd(doc))) {
            doc++;
          }
        } catch (ArrayIndexOutOfBoundsException e) {
          doc = NO_MORE_DOCS;
        }
        return doc;
      }
    }
  }
}
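/*
 * A minimal usage sketch (not part of the original source): it wraps the
 * filter around a match-all query so only documents whose single-valued
 * field holds one of the allowed terms are returned.  The field name
 * "category", the allowed values, and the Directory/searcher setup are
 * illustrative assumptions.
 *
 *   IndexSearcher searcher = new IndexSearcher(IndexReader.open(dir, true));
 *
 *   // The DocTermsIndex for "category" is built the first time the filter
 *   // runs against each segment reader and reused on later searches.
 *   Filter categoryFilter = new FieldCacheTermsFilter("category", "books", "music");
 *
 *   TopDocs hits = searcher.search(new MatchAllDocsQuery(), categoryFilter, 10);
 */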