/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.facebook.presto.operator;

import com.facebook.presto.spi.Page;
import com.facebook.presto.spi.PageBuilder;
import io.airlift.units.DataSize;
import it.unimi.dsi.fastutil.HashCommon;
import it.unimi.dsi.fastutil.longs.LongArrayList;

import java.util.Arrays;

import static com.facebook.presto.operator.SyntheticAddress.decodePosition;
import static com.facebook.presto.operator.SyntheticAddress.decodeSliceIndex;
import static com.facebook.presto.util.HashCollisionsEstimator.estimateNumberOfHashCollisions;
import static io.airlift.slice.SizeOf.sizeOf;
import static io.airlift.units.DataSize.Unit.KILOBYTE;
import static java.lang.Math.toIntExact;
import static java.util.Objects.requireNonNull;

// This implementation assumes arrays used in the hash are always a power of 2
public final class PagesHash
{
    private static final DataSize CACHE_SIZE = new DataSize(128, KILOBYTE);

    private final LongArrayList addresses;
    private final PagesHashStrategy pagesHashStrategy;

    private final int channelCount;
    private final int mask;
    private final int[] key;
    private final long size;

    // Native array of hashes for faster collision resolution compared
    // to accessing values in blocks. We use bytes to reduce the memory footprint
    // and there is no performance gain from storing full hashes.
    private final byte[] positionToHashes;
    private final long hashCollisions;
    private final double expectedHashCollisions;

    public PagesHash(
            LongArrayList addresses,
            PagesHashStrategy pagesHashStrategy,
            PositionLinks.Builder positionLinks)
    {
        this.addresses = requireNonNull(addresses, "addresses is null");
        this.pagesHashStrategy = requireNonNull(pagesHashStrategy, "pagesHashStrategy is null");
        this.channelCount = pagesHashStrategy.getChannelCount();

        // reserve memory for the arrays
        int hashSize = HashCommon.arraySize(addresses.size(), 0.75f);

        mask = hashSize - 1;
        key = new int[hashSize];
        Arrays.fill(key, -1);

        positionToHashes = new byte[addresses.size()];

        // We will process addresses in batches, to save memory on the array of full hashes.
        int positionsInStep = Math.min(addresses.size() + 1, (int) CACHE_SIZE.toBytes() / Integer.SIZE);
        long[] positionToFullHashes = new long[positionsInStep];
        long hashCollisionsLocal = 0;

        for (int step = 0; step * positionsInStep <= addresses.size(); step++) {
            int stepBeginPosition = step * positionsInStep;
            int stepEndPosition = Math.min((step + 1) * positionsInStep, addresses.size());
            int stepSize = stepEndPosition - stepBeginPosition;

            // First extract all hashes from blocks to a native array.
            // Somehow having this as a separate loop is much faster compared
            // to extracting hashes on the fly in the loop below.
            for (int position = 0; position < stepSize; position++) {
                int realPosition = position + stepBeginPosition;
                long hash = readHashPosition(realPosition);
                positionToFullHashes[position] = hash;
                positionToHashes[realPosition] = (byte) hash;
            }
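
            // The loop below fills the open-addressed hash table: each non-null
            // position is inserted at getHashPosition(hash, mask) and, on collision,
            // probed linearly via (pos + 1) & mask. Positions whose keys compare
            // equal are not stored twice; the new position is chained to the
            // existing one through positionLinks, so all matching rows stay
            // reachable from a single table slot.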
            // index pages
            for (int position = 0; position < stepSize; position++) {
                int realPosition = position + stepBeginPosition;
                if (isPositionNull(realPosition)) {
                    continue;
                }

                long hash = positionToFullHashes[position];
                int pos = getHashPosition(hash, mask);

                // look for an empty slot or a slot containing this key
                while (key[pos] != -1) {
                    int currentKey = key[pos];

                    if (((byte) hash) == positionToHashes[currentKey] && positionEqualsPositionIgnoreNulls(currentKey, realPosition)) {
                        // found a slot for this key
                        // link the new key position to the current key position
                        realPosition = positionLinks.link(realPosition, currentKey);

                        // key[pos] updated outside of this loop
                        break;
                    }

                    // increment position and mask to handle wrap around
                    pos = (pos + 1) & mask;
                    hashCollisionsLocal++;
                }

                key[pos] = realPosition;
            }
        }

        size = sizeOf(addresses.elements()) + pagesHashStrategy.getSizeInBytes() + sizeOf(key) + sizeOf(positionToHashes);
        hashCollisions = hashCollisionsLocal;
        expectedHashCollisions = estimateNumberOfHashCollisions(addresses.size(), hashSize);
    }

    public int getChannelCount()
    {
        return channelCount;
    }

    public int getPositionCount()
    {
        return addresses.size();
    }

    public long getInMemorySizeInBytes()
    {
        return size;
    }

    public long getHashCollisions()
    {
        return hashCollisions;
    }

    public double getExpectedHashCollisions()
    {
        return expectedHashCollisions;
    }

    public int getAddressIndex(int position, Page hashChannelsPage, Page allChannelsPage)
    {
        return getAddressIndex(position, hashChannelsPage, allChannelsPage, pagesHashStrategy.hashRow(position, hashChannelsPage));
    }

    public int getAddressIndex(int rightPosition, Page hashChannelsPage, Page allChannelsPage, long rawHash)
    {
        int pos = getHashPosition(rawHash, mask);

        while (key[pos] != -1) {
            if (positionEqualsCurrentRowIgnoreNulls(key[pos], (byte) rawHash, rightPosition, hashChannelsPage)) {
                return key[pos];
            }
            // increment position and mask to handle wrap around
            pos = (pos + 1) & mask;
        }
        return -1;
    }

    public void appendTo(long position, PageBuilder pageBuilder, int outputChannelOffset)
    {
        long pageAddress = addresses.getLong(toIntExact(position));

        int blockIndex = decodeSliceIndex(pageAddress);
        int blockPosition = decodePosition(pageAddress);
        pagesHashStrategy.appendTo(blockIndex, blockPosition, pageBuilder, outputChannelOffset);
    }
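
    // The private helpers below decode "synthetic addresses": each entry in
    // addresses is a single long that packs the index of a page (slice) together
    // with a position within that page, taken apart again with decodeSliceIndex
    // and decodePosition from SyntheticAddress.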

    private boolean isPositionNull(int position)
    {
        long pageAddress = addresses.getLong(position);

        int blockIndex = decodeSliceIndex(pageAddress);
        int blockPosition = decodePosition(pageAddress);

        return pagesHashStrategy.isPositionNull(blockIndex, blockPosition);
    }

    private long readHashPosition(int position)
    {
        long pageAddress = addresses.getLong(position);

        int blockIndex = decodeSliceIndex(pageAddress);
        int blockPosition = decodePosition(pageAddress);

        return pagesHashStrategy.hashPosition(blockIndex, blockPosition);
    }

    private boolean positionEqualsCurrentRowIgnoreNulls(int leftPosition, byte rawHash, int rightPosition, Page rightPage)
    {
        if (positionToHashes[leftPosition] != rawHash) {
            return false;
        }

        long pageAddress = addresses.getLong(leftPosition);
        int blockIndex = decodeSliceIndex(pageAddress);
        int blockPosition = decodePosition(pageAddress);

        return pagesHashStrategy.positionEqualsRowIgnoreNulls(blockIndex, blockPosition, rightPosition, rightPage);
    }

    private boolean positionEqualsPositionIgnoreNulls(int leftPosition, int rightPosition)
    {
        long leftPageAddress = addresses.getLong(leftPosition);
        int leftBlockIndex = decodeSliceIndex(leftPageAddress);
        int leftBlockPosition = decodePosition(leftPageAddress);

        long rightPageAddress = addresses.getLong(rightPosition);
        int rightBlockIndex = decodeSliceIndex(rightPageAddress);
        int rightBlockPosition = decodePosition(rightPageAddress);

        return pagesHashStrategy.positionEqualsPositionIgnoreNulls(leftBlockIndex, leftBlockPosition, rightBlockIndex, rightBlockPosition);
    }

    private static int getHashPosition(long rawHash, long mask)
    {
        // Avalanches the bits of a long integer by applying the finalisation step of MurmurHash3.
        //
        // This function implements the finalisation step of Austin Appleby's
        // <a href="http://sites.google.com/site/murmurhash/">MurmurHash3</a>.
        // Its purpose is to avalanche the bits of the argument to within 0.25% bias. It is used,
        // among other things, to scramble quickly (but deeply) the hash values returned by
        // {@link Object#hashCode()}.
        rawHash ^= rawHash >>> 33;
        rawHash *= 0xff51afd7ed558ccdL;
        rawHash ^= rawHash >>> 33;
        rawHash *= 0xc4ceb9fe1a85ec53L;
        rawHash ^= rawHash >>> 33;

        return (int) (rawHash & mask);
    }
}
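
// Illustrative note, not part of the class: because hashSize is a power of two,
// "mask = hashSize - 1" turns the modulo into a bitwise AND. For example, with
// hashSize = 8 the mask is 7 (0b111), so a linear probe starting at slot 6
// visits 6 -> 7 -> 0 -> 1 ..., since (7 + 1) & 7 == 0 wraps around without a
// division. This is what the power-of-2 assumption noted at the top of the
// class buys in both getHashPosition and the probe loops above.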