package org.apache.lucene.util; /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import static org.apache.lucene.util.ByteBlockPool.BYTE_BLOCK_MASK; import static org.apache.lucene.util.ByteBlockPool.BYTE_BLOCK_SHIFT; import static org.apache.lucene.util.ByteBlockPool.BYTE_BLOCK_SIZE; import java.util.Arrays; import java.util.Comparator; import java.util.concurrent.atomic.AtomicLong; import org.apache.lucene.util.ByteBlockPool.DirectAllocator; /** * {@link BytesRefHash} is a special purpose hash-map like data-structure * optimized for {@link BytesRef} instances. BytesRefHash maintains mappings of * byte arrays to ordinal (Map<BytesRef,int>) storing the hashed bytes * efficiently in continuous storage. The mapping to the ordinal is * encapsulated inside {@link BytesRefHash} and is guaranteed to be increased * for each added {@link BytesRef}. * * <p> * Note: The maximum capacity {@link BytesRef} instance passed to * {@link #add(BytesRef)} must not be longer than {@link ByteBlockPool#BYTE_BLOCK_SIZE}-2. * The internal storage is limited to 2GB total byte storage. * </p> * * @lucene.internal */ public final class BytesRefHash { public static final int DEFAULT_CAPACITY = 16; // the following fields are needed by comparator, // so package private to prevent access$-methods: final ByteBlockPool pool; int[] bytesStart; private final BytesRef scratch1 = new BytesRef(); private int hashSize; private int hashHalfSize; private int hashMask; private int count; private int lastCount = -1; private int[] ords; private final BytesStartArray bytesStartArray; private Counter bytesUsed; /** * Creates a new {@link BytesRefHash} with a {@link ByteBlockPool} using a * {@link DirectAllocator}. */ public BytesRefHash() { this(new ByteBlockPool(new DirectAllocator())); } /** * Creates a new {@link BytesRefHash} */ public BytesRefHash(ByteBlockPool pool) { this(pool, DEFAULT_CAPACITY, new DirectBytesStartArray(DEFAULT_CAPACITY)); } /** * Creates a new {@link BytesRefHash} */ public BytesRefHash(ByteBlockPool pool, int capacity, BytesStartArray bytesStartArray) { hashSize = capacity; hashHalfSize = hashSize >> 1; hashMask = hashSize - 1; this.pool = pool; ords = new int[hashSize]; Arrays.fill(ords, -1); this.bytesStartArray = bytesStartArray; bytesStart = bytesStartArray.init(); bytesUsed = bytesStartArray.bytesUsed() == null? Counter.newCounter() : bytesStartArray.bytesUsed(); bytesUsed.addAndGet(hashSize * RamUsageEstimator.NUM_BYTES_INT); } /** * Returns the number of {@link BytesRef} values in this {@link BytesRefHash}. * * @return the number of {@link BytesRef} values in this {@link BytesRefHash}. */ public int size() { return count; } /** * Populates and returns a {@link BytesRef} with the bytes for the given ord. * <p> * Note: the given ord must be a positive integer less that the current size ( * {@link #size()}) * </p> * * @param ord the ord * @param ref the {@link BytesRef} to populate * * @return the given BytesRef instance populated with the bytes for the given ord */ public BytesRef get(int ord, BytesRef ref) { assert bytesStart != null : "bytesStart is null - not initialized"; assert ord < bytesStart.length: "ord exceeds byteStart len: " + bytesStart.length; return pool.setBytesRef(ref, bytesStart[ord]); } /** * Returns the ords array in arbitrary order. Valid ords start at offset of 0 * and end at a limit of {@link #size()} - 1 * <p> * Note: This is a destructive operation. {@link #clear()} must be called in * order to reuse this {@link BytesRefHash} instance. * </p> */ public int[] compact() { assert bytesStart != null : "Bytesstart is null - not initialized"; int upto = 0; for (int i = 0; i < hashSize; i++) { if (ords[i] != -1) { if (upto < i) { ords[upto] = ords[i]; ords[i] = -1; } upto++; } } assert upto == count; lastCount = count; return ords; } /** * Returns the values array sorted by the referenced byte values. * <p> * Note: This is a destructive operation. {@link #clear()} must be called in * order to reuse this {@link BytesRefHash} instance. * </p> * * @param comp * the {@link Comparator} used for sorting */ public int[] sort(final Comparator<BytesRef> comp) { final int[] compact = compact(); new SorterTemplate() { @Override protected void swap(int i, int j) { final int o = compact[i]; compact[i] = compact[j]; compact[j] = o; } @Override protected int compare(int i, int j) { final int ord1 = compact[i], ord2 = compact[j]; assert bytesStart.length > ord1 && bytesStart.length > ord2; return comp.compare(pool.setBytesRef(scratch1, bytesStart[ord1]), pool.setBytesRef(scratch2, bytesStart[ord2])); } @Override protected void setPivot(int i) { final int ord = compact[i]; assert bytesStart.length > ord; pool.setBytesRef(pivot, bytesStart[ord]); } @Override protected int comparePivot(int j) { final int ord = compact[j]; assert bytesStart.length > ord; return comp.compare(pivot, pool.setBytesRef(scratch2, bytesStart[ord])); } private final BytesRef pivot = new BytesRef(), scratch1 = new BytesRef(), scratch2 = new BytesRef(); }.quickSort(0, count - 1); return compact; } private boolean equals(int ord, BytesRef b) { return pool.setBytesRef(scratch1, bytesStart[ord]).bytesEquals(b); } private boolean shrink(int targetSize) { // Cannot use ArrayUtil.shrink because we require power // of 2: int newSize = hashSize; while (newSize >= 8 && newSize / 4 > targetSize) { newSize /= 2; } if (newSize != hashSize) { bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT * -(hashSize - newSize)); hashSize = newSize; ords = new int[hashSize]; Arrays.fill(ords, -1); hashHalfSize = newSize / 2; hashMask = newSize - 1; return true; } else { return false; } } /** * Clears the {@link BytesRef} which maps to the given {@link BytesRef} */ public void clear(boolean resetPool) { lastCount = count; count = 0; if (resetPool) { pool.dropBuffersAndReset(); } bytesStart = bytesStartArray.clear(); if (lastCount != -1 && shrink(lastCount)) { // shrink clears the hash entries return; } Arrays.fill(ords, -1); } public void clear() { clear(true); } /** * Closes the BytesRefHash and releases all internally used memory */ public void close() { clear(true); ords = null; bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT * -hashSize); } /** * Adds a new {@link BytesRef} * * @param bytes * the bytes to hash * @return the ord the given bytes are hashed if there was no mapping for the * given bytes, otherwise <code>(-(ord)-1)</code>. This guarantees * that the return value will always be >= 0 if the given bytes * haven't been hashed before. * * @throws MaxBytesLengthExceededException * if the given bytes are > 2 + * {@link ByteBlockPool#BYTE_BLOCK_SIZE} */ public int add(BytesRef bytes) { return add(bytes, bytes.hashCode()); } /** * Adds a new {@link BytesRef} with a pre-calculated hash code. * * @param bytes * the bytes to hash * @param code * the bytes hash code * * <p> * Hashcode is defined as: * * <pre class="prettyprint"> * int hash = 0; * for (int i = offset; i < offset + length; i++) { * hash = 31 * hash + bytes[i]; * } * </pre> * * @return the ord the given bytes are hashed if there was no mapping for the * given bytes, otherwise <code>(-(ord)-1)</code>. This guarantees * that the return value will always be >= 0 if the given bytes * haven't been hashed before. * * @throws MaxBytesLengthExceededException * if the given bytes are > * {@link ByteBlockPool#BYTE_BLOCK_SIZE} - 2 */ public int add(BytesRef bytes, int code) { assert bytesStart != null : "Bytesstart is null - not initialized"; final int length = bytes.length; // final position int hashPos = code & hashMask; int e = ords[hashPos]; if (e != -1 && !equals(e, bytes)) { // Conflict: keep searching different locations in // the hash table. final int inc = ((code >> 8) + code) | 1; do { code += inc; hashPos = code & hashMask; e = ords[hashPos]; } while (e != -1 && !equals(e, bytes)); } if (e == -1) { // new entry final int len2 = 2 + bytes.length; if (len2 + pool.byteUpto > BYTE_BLOCK_SIZE) { if (len2 > BYTE_BLOCK_SIZE) { throw new MaxBytesLengthExceededException("bytes can be at most " + (BYTE_BLOCK_SIZE - 2) + " in length; got " + bytes.length); } pool.nextBuffer(); } final byte[] buffer = pool.buffer; final int bufferUpto = pool.byteUpto; if (count >= bytesStart.length) { bytesStart = bytesStartArray.grow(); assert count < bytesStart.length + 1 : "count: " + count + " len: " + bytesStart.length; } e = count++; bytesStart[e] = bufferUpto + pool.byteOffset; // We first encode the length, followed by the // bytes. Length is encoded as vInt, but will consume // 1 or 2 bytes at most (we reject too-long terms, // above). if (length < 128) { // 1 byte to store length buffer[bufferUpto] = (byte) length; pool.byteUpto += length + 1; assert length >= 0: "Length must be positive: " + length; System.arraycopy(bytes.bytes, bytes.offset, buffer, bufferUpto + 1, length); } else { // 2 byte to store length buffer[bufferUpto] = (byte) (0x80 | (length & 0x7f)); buffer[bufferUpto + 1] = (byte) ((length >> 7) & 0xff); pool.byteUpto += length + 2; System.arraycopy(bytes.bytes, bytes.offset, buffer, bufferUpto + 2, length); } assert ords[hashPos] == -1; ords[hashPos] = e; if (count == hashHalfSize) { rehash(2 * hashSize, true); } return e; } return -(e + 1); } public int addByPoolOffset(int offset) { assert bytesStart != null : "Bytesstart is null - not initialized"; // final position int code = offset; int hashPos = offset & hashMask; int e = ords[hashPos]; if (e != -1 && bytesStart[e] != offset) { // Conflict: keep searching different locations in // the hash table. final int inc = ((code >> 8) + code) | 1; do { code += inc; hashPos = code & hashMask; e = ords[hashPos]; } while (e != -1 && bytesStart[e] != offset); } if (e == -1) { // new entry if (count >= bytesStart.length) { bytesStart = bytesStartArray.grow(); assert count < bytesStart.length + 1 : "count: " + count + " len: " + bytesStart.length; } e = count++; bytesStart[e] = offset; assert ords[hashPos] == -1; ords[hashPos] = e; if (count == hashHalfSize) { rehash(2 * hashSize, false); } return e; } return -(e + 1); } /** * Called when hash is too small (> 50% occupied) or too large (< 20% * occupied). */ private void rehash(final int newSize, boolean hashOnData) { final int newMask = newSize - 1; bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT * (newSize)); final int[] newHash = new int[newSize]; Arrays.fill(newHash, -1); for (int i = 0; i < hashSize; i++) { final int e0 = ords[i]; if (e0 != -1) { int code; if (hashOnData) { final int off = bytesStart[e0]; final int start = off & BYTE_BLOCK_MASK; final byte[] bytes = pool.buffers[off >> BYTE_BLOCK_SHIFT]; code = 0; final int len; int pos; if ((bytes[start] & 0x80) == 0) { // length is 1 byte len = bytes[start]; pos = start + 1; } else { len = (bytes[start] & 0x7f) + ((bytes[start + 1] & 0xff) << 7); pos = start + 2; } final int endPos = pos + len; while (pos < endPos) { code = 31 * code + bytes[pos++]; } } else { code = bytesStart[e0]; } int hashPos = code & newMask; assert hashPos >= 0; if (newHash[hashPos] != -1) { final int inc = ((code >> 8) + code) | 1; do { code += inc; hashPos = code & newMask; } while (newHash[hashPos] != -1); } newHash[hashPos] = e0; } } hashMask = newMask; bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT * (-ords.length)); ords = newHash; hashSize = newSize; hashHalfSize = newSize / 2; } /** * reinitializes the {@link BytesRefHash} after a previous {@link #clear()} * call. If {@link #clear()} has not been called previously this method has no * effect. */ public void reinit() { if (bytesStart == null) { bytesStart = bytesStartArray.init(); } if (ords == null) { ords = new int[hashSize]; bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT * hashSize); } } /** * Returns the bytesStart offset into the internally used * {@link ByteBlockPool} for the given ord * * @param ord * the ord to look up * @return the bytesStart offset into the internally used * {@link ByteBlockPool} for the given ord */ public int byteStart(int ord) { assert bytesStart != null : "Bytesstart is null - not initialized"; assert ord >= 0 && ord < count : ord; return bytesStart[ord]; } /** * Thrown if a {@link BytesRef} exceeds the {@link BytesRefHash} limit of * {@link ByteBlockPool#BYTE_BLOCK_SIZE}-2. */ @SuppressWarnings("serial") public static class MaxBytesLengthExceededException extends RuntimeException { MaxBytesLengthExceededException(String message) { super(message); } } /** Manages allocation of the per-term addresses. */ public abstract static class BytesStartArray { /** * Initializes the BytesStartArray. This call will allocate memory * * @return the initialized bytes start array */ public abstract int[] init(); /** * Grows the {@link BytesStartArray} * * @return the grown array */ public abstract int[] grow(); /** * clears the {@link BytesStartArray} and returns the cleared instance. * * @return the cleared instance, this might be <code>null</code> */ public abstract int[] clear(); /** * A {@link Counter} reference holding the number of bytes used by this * {@link BytesStartArray}. The {@link BytesRefHash} uses this reference to * track it memory usage * * @return a {@link AtomicLong} reference holding the number of bytes used * by this {@link BytesStartArray}. */ public abstract Counter bytesUsed(); } /** A simple {@link BytesStartArray} that tracks all * memory allocation using a shared {@link Counter} * instance. */ public static class TrackingDirectBytesStartArray extends BytesStartArray { protected final int initSize; private int[] bytesStart; protected final Counter bytesUsed; public TrackingDirectBytesStartArray(int initSize, Counter bytesUsed) { this.initSize = initSize; this.bytesUsed = bytesUsed; } @Override public int[] clear() { if (bytesStart != null) { bytesUsed.addAndGet(-bytesStart.length * RamUsageEstimator.NUM_BYTES_INT); } return bytesStart = null; } @Override public int[] grow() { assert bytesStart != null; final int oldSize = bytesStart.length; bytesStart = ArrayUtil.grow(bytesStart, bytesStart.length + 1); bytesUsed.addAndGet((bytesStart.length - oldSize) * RamUsageEstimator.NUM_BYTES_INT); return bytesStart; } @Override public int[] init() { bytesStart = new int[ArrayUtil.oversize(initSize, RamUsageEstimator.NUM_BYTES_INT)]; bytesUsed.addAndGet((bytesStart.length) * RamUsageEstimator.NUM_BYTES_INT); return bytesStart; } @Override public Counter bytesUsed() { return bytesUsed; } } /** A simple {@link BytesStartArray} that tracks * memory allocation using a private {@link AtomicLong} * instance. */ public static class DirectBytesStartArray extends BytesStartArray { // TODO: can't we just merge this w/ // TrackingDirectBytesStartArray...? Just add a ctor // that makes a private bytesUsed? protected final int initSize; private int[] bytesStart; private final Counter bytesUsed; public DirectBytesStartArray(int initSize) { this.bytesUsed = Counter.newCounter(); this.initSize = initSize; } @Override public int[] clear() { return bytesStart = null; } @Override public int[] grow() { assert bytesStart != null; return bytesStart = ArrayUtil.grow(bytesStart, bytesStart.length + 1); } @Override public int[] init() { return bytesStart = new int[ArrayUtil.oversize(initSize, RamUsageEstimator.NUM_BYTES_INT)]; } @Override public Counter bytesUsed() { return bytesUsed; } } }