/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.solr.search;

import org.apache.lucene.util.BitUtil;

/**
 * <code>HashDocSet</code> represents an unordered set of Lucene Document Ids
 * using a primitive int hash table.  It can be a better choice if there are few docs
 * in the set because it takes up less memory and is faster to iterate and take
 * set intersections.
 *
 * @version $Id: HashDocSet.java 790938 2009-07-03 15:09:21Z yonik $
 * @since solr 0.9
 */
public final class HashDocSet extends DocSetBase {

  /** Default load factor to use for HashDocSets.  We keep track of the inverse
   *  since multiplication is so much faster than division.  The default
   *  is 1.0f / 0.75f
   */
  static float DEFAULT_INVERSE_LOAD_FACTOR = 1.0f / 0.75f;

  // public final static int MAX_SIZE = SolrConfig.config.getInt("//HashDocSet/@maxSize",-1);

  // Lucene docs are numbered from 0, so a negative number must be used
  // to mark an empty slot.
  private final static int EMPTY = -1;

  private final int[] table;
  private final int size;
  private final int mask;

  /** Create a HashDocSet from a list of *unique* ids */
  public HashDocSet(int[] docs, int offset, int len) {
    this(docs, offset, len, DEFAULT_INVERSE_LOAD_FACTOR);
  }

  /** Create a HashDocSet from a list of *unique* ids */
  public HashDocSet(int[] docs, int offset, int len, float inverseLoadFactor) {
    int tsize = Math.max(BitUtil.nextHighestPowerOfTwo(len), 1);
    if (tsize < len * inverseLoadFactor) {
      tsize <<= 1;
    }

    mask = tsize - 1;

    table = new int[tsize];
    // (for now) better than: Arrays.fill(table, EMPTY);
    // https://issues.apache.org/jira/browse/SOLR-390
    for (int i=tsize-1; i>=0; i--) table[i]=EMPTY;

    int end = offset + len;
    for (int i=offset; i<end; i++) {
      put(docs[i]);
    }

    size = len;
  }

  void put(int doc) {
    int s = doc & mask;
    while (table[s] != EMPTY) {
      // Adding an odd number to this power-of-two hash table is
      // guaranteed to do a full traversal, so instead of re-hashing
      // we jump straight to a "linear" traversal.
      // The key is that we provide many different ways to do the
      // traversal (tablesize/2) based on the last hash code (the doc).
      // Rely on loop-invariant code motion to eval ((doc>>7)|1) only once;
      // otherwise, we would need to pull the first case out of the loop.
      s = (s + ((doc>>7)|1)) & mask;
    }
    table[s] = doc;
  }

  public boolean exists(int doc) {
    int s = doc & mask;
    for (;;) {
      int v = table[s];
      if (v == EMPTY) return false;
      if (v == doc) return true;
      // see put() for algorithm details.
      s = (s + ((doc>>7)|1)) & mask;
    }
  }
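  // Worked example of the probe sequence above (illustrative): with tsize=8
  // (mask=7) and doc=13, the home slot is 13 & 7 = 5 and the probe step is
  // ((13>>7)|1) = 1, so collisions visit slots 5,6,7,0,1,... in turn.
  // Because the step is always odd and the table size is a power of two,
  // gcd(step, tsize) == 1, so the sequence visits every slot before
  // repeating; put() and exists() therefore always terminate as long as the
  // load factor keeps the table from ever being completely full.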
  public int size() { return size; }

  public DocIterator iterator() {
    return new DocIterator() {
      int pos = 0;
      { goNext(); }

      public boolean hasNext() {
        return pos < table.length;
      }

      public Integer next() {
        return nextDoc();
      }

      public void remove() {
      }

      void goNext() {
        // skip over empty slots
        while (pos < table.length && table[pos] == EMPTY) pos++;
      }

      // modify to return -1 at end of iteration?
      public int nextDoc() {
        int doc = table[pos];
        pos++;
        goNext();
        return doc;
      }

      public float score() {
        return 0.0f;
      }
    };
  }

  public long memSize() {
    return (table.length << 2) + 20;
  }

  @Override
  public DocSet intersection(DocSet other) {
    if (other instanceof HashDocSet) {
      // set "a" to the smallest doc set for the most efficient
      // intersection.
      final HashDocSet a = size() <= other.size() ? this : (HashDocSet) other;
      final HashDocSet b = size() <= other.size() ? (HashDocSet) other : this;

      int[] result = new int[a.size()];
      int resultCount = 0;
      for (int i = 0; i < a.table.length; i++) {
        int id = a.table[i];
        if (id >= 0 && b.exists(id)) {
          result[resultCount++] = id;
        }
      }
      return new HashDocSet(result, 0, resultCount);
    } else {
      int[] result = new int[size()];
      int resultCount = 0;
      for (int i = 0; i < table.length; i++) {
        int id = table[i];
        if (id >= 0 && other.exists(id)) {
          result[resultCount++] = id;
        }
      }
      return new HashDocSet(result, 0, resultCount);
    }
  }

  @Override
  public int intersectionSize(DocSet other) {
    if (other instanceof HashDocSet) {
      // set "a" to the smallest doc set for the most efficient
      // intersection.
      final HashDocSet a = size() <= other.size() ? this : (HashDocSet) other;
      final HashDocSet b = size() <= other.size() ? (HashDocSet) other : this;

      int resultCount = 0;
      for (int i = 0; i < a.table.length; i++) {
        int id = a.table[i];
        if (id >= 0 && b.exists(id)) {
          resultCount++;
        }
      }
      return resultCount;
    } else {
      int resultCount = 0;
      for (int i = 0; i < table.length; i++) {
        int id = table[i];
        if (id >= 0 && other.exists(id)) {
          resultCount++;
        }
      }
      return resultCount;
    }
  }

  @Override
  public DocSet andNot(DocSet other) {
    int[] result = new int[size()];
    int resultCount = 0;
    for (int i = 0; i < table.length; i++) {
      int id = table[i];
      if (id >= 0 && !other.exists(id)) {
        result[resultCount++] = id;
      }
    }
    return new HashDocSet(result, 0, resultCount);
  }

  @Override
  public DocSet union(DocSet other) {
    if (other instanceof HashDocSet) {
      // set "a" to the smallest doc set
      final HashDocSet a = size() <= other.size() ? this : (HashDocSet) other;
      final HashDocSet b = size() <= other.size() ? (HashDocSet) other : this;

      int[] result = new int[a.size() + b.size()];
      int resultCount = 0;
      // iterate over the largest table first, adding w/o checking.
      for (int i = 0; i < b.table.length; i++) {
        int id = b.table[i];
        if (id >= 0) result[resultCount++] = id;
      }
      // now iterate over the smaller set, adding all docs not already
      // in the larger set.
      for (int i = 0; i < a.table.length; i++) {
        int id = a.table[i];
        if (id >= 0 && !b.exists(id)) result[resultCount++] = id;
      }
      return new HashDocSet(result, 0, resultCount);
    } else {
      return other.union(this);
    }
  }

  // don't implement andNotSize() and unionSize() on purpose... they are
  // implemented in DocSetBase in terms of intersectionSize().
}
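
// Usage sketch (illustrative only; assumes a Solr 1.x source tree where
// DocSetBase and DocIterator are on the classpath):
//
//   int[] docs = {3, 10, 42, 97};                      // unique Lucene doc ids
//   HashDocSet small = new HashDocSet(docs, 0, docs.length);
//   boolean found = small.exists(42);                  // true
//   DocIterator it = small.iterator();                 // unordered iteration
//   while (it.hasNext()) {
//     int d = it.nextDoc();
//     // ... process doc id d ...
//   }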