package org.apache.lucene.queries;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.index.*;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.Filter;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
/**
* Constructs a filter for docs matching any of the terms added to this class.
* Unlike a RangeFilter this can be used for filtering on multiple terms that are not necessarily in
* a sequence. An example might be a collection of primary keys from a database query result or perhaps
* a choice of "category" labels picked by the end user. As a filter, this is much faster than the
* equivalent query (a BooleanQuery with many "should" TermQueries)
*/
public final class TermsFilter extends Filter {
/*
* this class is often used for large number of terms in a single field.
* to optimize for this case and to be filter-cache friendly we
* serialize all terms into a single byte array and store offsets
* in a parallel array to keep the # of object constant and speed up
* equals / hashcode.
*
* This adds quite a bit of complexity but allows large term filters to
* be efficient for GC and cache-lookups
*/
private final int[] offsets;
private final byte[] termsBytes;
private final TermsAndField[] termsAndFields;
private final int hashCode; // cached hashcode for fast cache lookups
private static final int PRIME = 31;
/**
* Creates a new {@link TermsFilter} from the given list. The list
* can contain duplicate terms and multiple fields.
*/
public TermsFilter(final List<Term> terms) {
this(new FieldAndTermEnum() {
// we need to sort for deduplication and to have a common cache key
final Iterator<Term> iter = sort(terms).iterator();
@Override
public BytesRef next() {
if (iter.hasNext()) {
Term next = iter.next();
field = next.field();
return next.bytes();
}
return null;
}}, terms.size());
}
/**
* Creates a new {@link TermsFilter} from the given {@link BytesRef} list for
* a single field.
*/
public TermsFilter(final String field, final List<BytesRef> terms) {
this(new FieldAndTermEnum(field) {
// we need to sort for deduplication and to have a common cache key
final Iterator<BytesRef> iter = sort(terms).iterator();
@Override
public BytesRef next() {
if (iter.hasNext()) {
return iter.next();
}
return null;
}
}, terms.size());
}
/**
* Creates a new {@link TermsFilter} from the given {@link BytesRef} array for
* a single field.
*/
public TermsFilter(final String field, final BytesRef...terms) {
// this ctor prevents unnecessary Term creations
this(field, Arrays.asList(terms));
}
/**
* Creates a new {@link TermsFilter} from the given array. The array can
* contain duplicate terms and multiple fields.
*/
public TermsFilter(final Term... terms) {
this(Arrays.asList(terms));
}
private TermsFilter(FieldAndTermEnum iter, int length) {
// TODO: maybe use oal.index.PrefixCodedTerms instead?
// If number of terms is more than a few hundred it
// should be a win
// TODO: we also pack terms in FieldCache/DocValues
// ... maybe we can refactor to share that code
// TODO: yet another option is to build the union of the terms in
// an automaton an call intersect on the termsenum if the density is high
int hash = 9;
byte[] serializedTerms = new byte[0];
this.offsets = new int[length+1];
int lastEndOffset = 0;
int index = 0;
ArrayList<TermsAndField> termsAndFields = new ArrayList<>();
TermsAndField lastTermsAndField = null;
BytesRef previousTerm = null;
String previousField = null;
BytesRef currentTerm;
String currentField;
while((currentTerm = iter.next()) != null) {
currentField = iter.field();
if (currentField == null) {
throw new IllegalArgumentException("Field must not be null");
}
if (previousField != null) {
// deduplicate
if (previousField.equals(currentField)) {
if (previousTerm.bytesEquals(currentTerm)){
continue;
}
} else {
final int start = lastTermsAndField == null ? 0 : lastTermsAndField.end;
lastTermsAndField = new TermsAndField(start, index, previousField);
termsAndFields.add(lastTermsAndField);
}
}
hash = PRIME * hash + currentField.hashCode();
hash = PRIME * hash + currentTerm.hashCode();
if (serializedTerms.length < lastEndOffset+currentTerm.length) {
serializedTerms = ArrayUtil.grow(serializedTerms, lastEndOffset+currentTerm.length);
}
System.arraycopy(currentTerm.bytes, currentTerm.offset, serializedTerms, lastEndOffset, currentTerm.length);
offsets[index] = lastEndOffset;
lastEndOffset += currentTerm.length;
index++;
previousTerm = currentTerm;
previousField = currentField;
}
offsets[index] = lastEndOffset;
final int start = lastTermsAndField == null ? 0 : lastTermsAndField.end;
lastTermsAndField = new TermsAndField(start, index, previousField);
termsAndFields.add(lastTermsAndField);
this.termsBytes = ArrayUtil.shrink(serializedTerms, lastEndOffset);
this.termsAndFields = termsAndFields.toArray(new TermsAndField[termsAndFields.size()]);
this.hashCode = hash;
}
@Override
public DocIdSet getDocIdSet(AtomicReaderContext context, Bits acceptDocs) throws IOException {
final AtomicReader reader = context.reader();
FixedBitSet result = null; // lazy init if needed - no need to create a big bitset ahead of time
final Fields fields = reader.fields();
final BytesRef spare = new BytesRef(this.termsBytes);
if (fields == null) {
return result;
}
Terms terms = null;
TermsEnum termsEnum = null;
DocsEnum docs = null;
for (TermsAndField termsAndField : this.termsAndFields) {
if ((terms = fields.terms(termsAndField.field)) != null) {
termsEnum = terms.iterator(termsEnum); // this won't return null
for (int i = termsAndField.start; i < termsAndField.end; i++) {
spare.offset = offsets[i];
spare.length = offsets[i+1] - offsets[i];
if (termsEnum.seekExact(spare)) {
docs = termsEnum.docs(acceptDocs, docs, DocsEnum.FLAG_NONE); // no freq since we don't need them
if (result == null) {
if (docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
result = new FixedBitSet(reader.maxDoc());
// lazy init but don't do it in the hot loop since we could read many docs
result.set(docs.docID());
}
}
while (docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
result.set(docs.docID());
}
}
}
}
}
return result;
}
@Override
public boolean equals(Object obj) {
if (this == obj) {
return true;
}
if ((obj == null) || (obj.getClass() != this.getClass())) {
return false;
}
TermsFilter test = (TermsFilter) obj;
// first check the fields before even comparing the bytes
if (test.hashCode == hashCode && Arrays.equals(termsAndFields, test.termsAndFields)) {
int lastOffset = termsAndFields[termsAndFields.length - 1].end;
// compare offsets since we sort they must be identical
if (ArrayUtil.equals(offsets, 0, test.offsets, 0, lastOffset + 1)) {
// straight byte comparison since we sort they must be identical
return ArrayUtil.equals(termsBytes, 0, test.termsBytes, 0, offsets[lastOffset]);
}
}
return false;
}
@Override
public int hashCode() {
return hashCode;
}
@Override
public String toString() {
StringBuilder builder = new StringBuilder();
BytesRef spare = new BytesRef(termsBytes);
boolean first = true;
for (int i = 0; i < termsAndFields.length; i++) {
TermsAndField current = termsAndFields[i];
for (int j = current.start; j < current.end; j++) {
spare.offset = offsets[j];
spare.length = offsets[j+1] - offsets[j];
if (!first) {
builder.append(' ');
}
first = false;
builder.append(current.field).append(':');
builder.append(spare.utf8ToString());
}
}
return builder.toString();
}
private static final class TermsAndField {
final int start;
final int end;
final String field;
TermsAndField(int start, int end, String field) {
super();
this.start = start;
this.end = end;
this.field = field;
}
@Override
public int hashCode() {
final int prime = 31;
int result = 1;
result = prime * result + ((field == null) ? 0 : field.hashCode());
result = prime * result + end;
result = prime * result + start;
return result;
}
@Override
public boolean equals(Object obj) {
if (this == obj) return true;
if (obj == null) return false;
if (getClass() != obj.getClass()) return false;
TermsAndField other = (TermsAndField) obj;
if (field == null) {
if (other.field != null) return false;
} else if (!field.equals(other.field)) return false;
if (end != other.end) return false;
if (start != other.start) return false;
return true;
}
}
private static abstract class FieldAndTermEnum {
protected String field;
public abstract BytesRef next();
public FieldAndTermEnum() {}
public FieldAndTermEnum(String field) { this.field = field; }
public String field() {
return field;
}
}
/*
* simple utility that returns the in-place sorted list
*/
private static <T extends Comparable<? super T>> List<T> sort(List<T> toSort) {
if (toSort.isEmpty()) {
throw new IllegalArgumentException("no terms provided");
}
Collections.sort(toSort);
return toSort;
}
}