/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.search;

import java.util.Collection;
import java.util.Collections;

import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.BitUtil;
import org.apache.lucene.util.RamUsageEstimator;

/**
 * <code>HashDocSet</code> represents an unordered set of Lucene Document Ids
 * using a primitive int hash table. It can be a better choice when there are
 * few docs in the set, because it takes up less memory and is faster to
 * iterate and to take set intersections.
 *
 * @since solr 0.9
 */
public final class HashDocSet extends DocSetBase {

  private static final long BASE_RAM_BYTES_USED =
      RamUsageEstimator.shallowSizeOfInstance(HashDocSet.class)
      + RamUsageEstimator.NUM_BYTES_ARRAY_HEADER;

  /** Default load factor to use for HashDocSets.  We keep track of the inverse
   *  since multiplication is so much faster than division.  The default is
   *  1.0f / 0.75f
   */
  static float DEFAULT_INVERSE_LOAD_FACTOR = 1.0f / 0.75f;

  // public final static int MAX_SIZE = SolrConfig.config.getInt("//HashDocSet/@maxSize",-1);

  // Lucene docs are numbered from 0, so a negative number must be used for
  // missing. (An alternative to having to init the array to EMPTY at the
  // start would be possible, but the explicit fill below is kept for now.)
  private final static int EMPTY = -1;

  private final int[] table;
  private final int size;
  private final int mask;

  public HashDocSet(HashDocSet set) {
    this.table = set.table.clone();
    this.size = set.size;
    this.mask = set.mask;
  }

  /** Create a HashDocSet from a list of *unique* ids */
  public HashDocSet(int[] docs, int offset, int len) {
    this(docs, offset, len, DEFAULT_INVERSE_LOAD_FACTOR);
  }

  /** Create a HashDocSet from a list of *unique* ids */
  public HashDocSet(int[] docs, int offset, int len, float inverseLoadFactor) {
    int tsize = Math.max(BitUtil.nextHighestPowerOfTwo(len), 1);
    if (tsize < len * inverseLoadFactor) {
      tsize <<= 1;
    }

    mask = tsize - 1;

    table = new int[tsize];
    // (for now) better than: Arrays.fill(table, EMPTY);
    // https://issues.apache.org/jira/browse/SOLR-390
    for (int i = tsize - 1; i >= 0; i--) table[i] = EMPTY;

    int end = offset + len;
    for (int i = offset; i < end; i++) {
      put(docs[i]);
    }

    size = len;
  }

  void put(int doc) {
    int s = doc & mask;
    while (table[s] != EMPTY) {
      // Adding an odd number to this power-of-two hash table is
      // guaranteed to do a full traversal, so instead of re-hashing
      // we jump straight to a "linear" traversal.
      // The key is that we provide many different ways to do the
      // traversal (tablesize/2) based on the last hash code (the doc).
      // Rely on loop-invariant code motion to evaluate ((doc>>7)|1) only once;
      // otherwise, we would need to pull the first case out of the loop.
      s = (s + ((doc >> 7) | 1)) & mask;
    }
    table[s] = doc;
  }
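  // Worked example of the sizing and probing above (illustrative numbers,
  // not from the original source): building from len=7 unique ids,
  // nextHighestPowerOfTwo(7) = 8, and 8 < 7 * (1.0f/0.75f) ~= 9.33, so the
  // table doubles to tsize=16 (mask=15), keeping occupancy under 75%.
  // For doc=300, the start slot is 300 & 15 = 12 and the collision step is
  // ((300>>7)|1) = (2|1) = 3; any odd step is coprime with a power-of-two
  // table size, so the probe sequence 12, 15, 2, 5, ... visits all 16 slots
  // exactly once before wrapping.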
  @Override
  public boolean exists(int doc) {
    int s = doc & mask;
    for (;;) {
      int v = table[s];
      if (v == EMPTY) return false;
      if (v == doc) return true;
      // see put() for algorithm details.
      s = (s + ((doc >> 7) | 1)) & mask;
    }
  }

  @Override
  public int size() {
    return size;
  }

  @Override
  public DocIterator iterator() {
    return new DocIterator() {
      int pos = 0;

      { goNext(); }

      @Override
      public boolean hasNext() {
        return pos < table.length;
      }

      @Override
      public Integer next() {
        return nextDoc();
      }

      @Override
      public void remove() {
        // removal is not supported; intentionally a no-op.
      }

      void goNext() {
        while (pos < table.length && table[pos] == EMPTY) pos++;
      }

      // modify to return -1 at end of iteration?
      @Override
      public int nextDoc() {
        int doc = table[pos];
        pos++;
        goNext();
        return doc;
      }

      @Override
      public float score() {
        return 0.0f;
      }
    };
  }

  @Override
  public DocSet intersection(DocSet other) {
    if (other instanceof HashDocSet) {
      // set "a" to the smallest doc set for the most efficient
      // intersection.
      final HashDocSet a = size() <= other.size() ? this : (HashDocSet) other;
      final HashDocSet b = size() <= other.size() ? (HashDocSet) other : this;

      int[] result = new int[a.size()];
      int resultCount = 0;
      for (int i = 0; i < a.table.length; i++) {
        int id = a.table[i];
        if (id >= 0 && b.exists(id)) {
          result[resultCount++] = id;
        }
      }
      return new HashDocSet(result, 0, resultCount);
    } else {
      int[] result = new int[size()];
      int resultCount = 0;
      for (int i = 0; i < table.length; i++) {
        int id = table[i];
        if (id >= 0 && other.exists(id)) {
          result[resultCount++] = id;
        }
      }
      return new HashDocSet(result, 0, resultCount);
    }
  }

  @Override
  public int intersectionSize(DocSet other) {
    if (other instanceof HashDocSet) {
      // set "a" to the smallest doc set for the most efficient
      // intersection.
      final HashDocSet a = size() <= other.size() ? this : (HashDocSet) other;
      final HashDocSet b = size() <= other.size() ? (HashDocSet) other : this;

      int resultCount = 0;
      for (int i = 0; i < a.table.length; i++) {
        int id = a.table[i];
        if (id >= 0 && b.exists(id)) {
          resultCount++;
        }
      }
      return resultCount;
    } else {
      int resultCount = 0;
      for (int i = 0; i < table.length; i++) {
        int id = table[i];
        if (id >= 0 && other.exists(id)) {
          resultCount++;
        }
      }
      return resultCount;
    }
  }

  @Override
  public boolean intersects(DocSet other) {
    if (other instanceof HashDocSet) {
      // set "a" to the smallest doc set for the most efficient
      // intersection.
      final HashDocSet a = size() <= other.size() ? this : (HashDocSet) other;
      final HashDocSet b = size() <= other.size() ? (HashDocSet) other : this;

      for (int i = 0; i < a.table.length; i++) {
        int id = a.table[i];
        if (id >= 0 && b.exists(id)) {
          return true;
        }
      }
      return false;
    } else {
      for (int i = 0; i < table.length; i++) {
        int id = table[i];
        if (id >= 0 && other.exists(id)) {
          return true;
        }
      }
      return false;
    }
  }

  @Override
  public DocSet andNot(DocSet other) {
    int[] result = new int[size()];
    int resultCount = 0;
    for (int i = 0; i < table.length; i++) {
      int id = table[i];
      if (id >= 0 && !other.exists(id)) {
        result[resultCount++] = id;
      }
    }
    return new HashDocSet(result, 0, resultCount);
  }
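  // Illustrative use of the set-algebra methods above (a sketch, not part of
  // the original file); intersection/intersectionSize/intersects all scan
  // the smaller set's table and probe the larger one:
  //
  //   HashDocSet a = new HashDocSet(new int[]{1, 2, 3}, 0, 3);
  //   HashDocSet b = new HashDocSet(new int[]{2, 3, 4, 5}, 0, 4);
  //   DocSet inter = a.intersection(b);   // {2, 3}, built by scanning "a"
  //   int n = a.intersectionSize(b);      // 2
  //   DocSet diff = a.andNot(b);          // {1}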
  @Override
  public DocSet union(DocSet other) {
    if (other instanceof HashDocSet) {
      // set "a" to the smallest doc set
      final HashDocSet a = size() <= other.size() ? this : (HashDocSet) other;
      final HashDocSet b = size() <= other.size() ? (HashDocSet) other : this;

      int[] result = new int[a.size() + b.size()];
      int resultCount = 0;
      // iterate over the largest table first, adding w/o checking.
      for (int i = 0; i < b.table.length; i++) {
        int id = b.table[i];
        if (id >= 0) result[resultCount++] = id;
      }
      // now iterate over the smaller set, adding all docs not already in the
      // larger set.
      for (int i = 0; i < a.table.length; i++) {
        int id = a.table[i];
        if (id >= 0 && !b.exists(id)) result[resultCount++] = id;
      }
      return new HashDocSet(result, 0, resultCount);
    } else {
      return other.union(this);
    }
  }

  @Override
  public HashDocSet clone() {
    return new HashDocSet(this);
  }

  // don't implement andNotSize() and unionSize() on purpose... they are
  // implemented in DocSetBase in terms of intersectionSize().

  @Override
  public long ramBytesUsed() {
    // 4 bytes per int slot; widen before shifting to avoid int overflow for
    // very large tables.
    return BASE_RAM_BYTES_USED + ((long) table.length << 2);
  }

  @Override
  public Collection<Accountable> getChildResources() {
    return Collections.emptyList();
  }
}
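// End-to-end usage sketch (illustrative, not part of the original file):
//
//   int[] ids = {7, 42, 311};                        // unique Lucene doc ids
//   HashDocSet docs = new HashDocSet(ids, 0, ids.length);
//   boolean hit = docs.exists(42);                   // true
//   long bytes = docs.ramBytesUsed();                // header + 4 bytes/slot
//   DocIterator it = docs.iterator();                // unordered iteration
//   while (it.hasNext()) {
//     int doc = it.nextDoc();
//     // ... per-document work ...
//   }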