/** Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved. Contact: SYSTAP, LLC DBA Blazegraph 2501 Calvert ST NW #106 Washington, DC 20008 licenses@blazegraph.com This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; version 2 of the License. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ package com.bigdata.rdf.lexicon; import com.bigdata.btree.BTree; import com.bigdata.btree.BTree.PartitionedCounter; /** * An encoder/decoder for long values formed from a partition identifier in the * high word and a local counter in the low word where the low N bits of the * long value are reversed and rotated into the high N bits of the long value. * <p> * The purpose of this encoding is to cause the N high bits to vary rapidly as * the local counter is driven up by writes on the index partition. This has the * effect of scattering writes on dependent indices (those using the resulting * long value as the sole or leading component of their key). * <p> * Given a source RDF/XML document with M "terms" distributed uniformly over K * TERM2ID index partitions, each term has a uniform likelihood of setting any * of the low bits of the local counter. After encoding, this means that the N * high-bits of encoded term identifier are uniformly distributed. Assuming that * the separator keys for the ID2TERM index divide the key space into equally * sized key-ranges, then the reads and writes on the ID2TERM index partitions * will be uniformly distributed as well. * <p> * The next bits in the encoded values are derived from the partition identifier * followed by the term identifier and therefore have a strong bias for the * index partition and the sequential assignment of local counter values within * an index partition respectively. This means that read / write access within * an index partition tends to have some locality, which improves B+Tree * performance through several mechanisms (mainly improved cache effects, * reduced copy-on-write for dirty leaves and nodes, and less IO costs). * <p> * When the #of ID2TERM index partitions GT <code>2^N</code>, only a subset of * those index partitions can be directly selected by the N high bits with their * uniform distribution. The next bias is the partition identifier, which begins * at ZERO (0), is inflated to (0, [1:P]), where P is the #of index partitions * generated by a scatter split, and grows relatively slowly thereafter as index * partitions are fill up and are split or are moved to redistribute the load on * the cluster. * * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a> * @version $Id$ */ public class TermIdEncoder { /** * The #of low bits from the local counter that will be reversed and * written into the high-bits of the encoded long value. */ private final int N; /** * A mask with the {@link #N} low bits turned on. */ private final int mask; public String toString() { return getClass().getName() + "{N=" + N + ", mask=" + Integer.toBinaryString(mask) + "}"; } /** * Return the #of low bits from the local counter that will be reversed and * written into the high-bits of the encoded long value. */ public int getNBits() { return N; } /** * @param N * The #of low bits from the local counter that will be reversed * and written into the high-bits of the encoded long value. */ public TermIdEncoder(final int N) { if (N < 0) throw new IllegalArgumentException(); if (N > 31) throw new IllegalArgumentException(); this.N = N; /* * Construct the bit mask - this will have zeros in the high bits * that correspond to the bits of the localCounter that WILL NOT be * reversed and ones in the low bits that correspond to bits of the * localCounter that WILL be reversed. */ { int mask = 0; int bit; for (int i = 0; i < N; i++) { bit = (1 << i); mask |= bit; } this.mask = mask; } } /** * Encode a term identifier using the configured value of * {@link #getNBits() NBits}. * * @param v * A 64-bit long counter value as generated by an * {@link BTree.PartitionedCounter}. * * @return A permutation of that long value in which the low <i>N</i> bits * have been reversed and rotated into the high <i>N</i> bits. */ public long encode(final long v) { if (v == 0L) { // 0L is reserved for NULLs. throw new IllegalArgumentException(); } // the partition identifier. final long pid = 0xFFFFFFFFL & getPartitionId(v); // the local counter. final long ctr = 0xFFFFFFFFL & getLocalCounter(v); // the output value. long u = 0L; /* * Move pid to high word. */ u |= pid << (32 - N); /* * Right shift the counter over the bits that are being reversed, * extend to a long value. */ u |= ctr >>> N; /* * Use the mask to isolate the low-N bits of the counter, which are * then reversed into the high-N bits. */ final long rev = Integer.reverse(((int) ctr) & mask); /* * Overwrite the high N bits of the long value using the reversed * low N bits from the local counter. */ u |= rev << 32; return u; } /** * Reverses the effect of {@link #encode(long)}. * * @param u * An encoded long value. * * @return The decode long value. */ public long decode(final long u) { // reverse high word and mask to recover the low-N bits. final int fwd = Integer.reverse(((int) (u >>> 32))) & mask; /* * Left-shift to make room for the (un-)reversed bits and then combine * them back in. */ final int ctr = ((int) (u << N) | fwd); /* * Bring the partition identifier back to an int by shifting it the * same number of bits in the other direction. */ final int pid = ((int) (u >>> (32 - N))); // reconstruct the long counter value. return combine(pid, ctr); } /** * Return the partition identifier from the high word of a partitioned * counter. * * @param v * The partitioned counter. * * @return The high word. */ public static int getPartitionId(final long v) { return BTree.PartitionedCounter.getPartitionId(v); // return (int) (v >>> 32); } /** * Return the local counter from the low word of a partitioned counter. * * @param v * The partitioned counter. * * @return The low word. */ public static int getLocalCounter(final long v) { // return (int) v; return BTree.PartitionedCounter.getLocalCounter(v); } /** * Combines the partition identifier and the local counter using the same * logic as the {@link PartitionedCounter}. * * @param pid * The partition identifier. * @param ctr * The local counter. * * @return The long counter assembled from those values. * * @see BTree.PartitionedCounter, which performs the same operation and MUST * be consistent with this method. */ public static long combine(final int pid, final int ctr) { // return ((long) pid) << 32 | (0xFFFFFFFFL & (long) ctr); return BTree.PartitionedCounter.combine(pid, ctr); } /* * Alternative versions used for debugging. Package private for the unit * tests. */ long encode2(final long v1) { long v2 = v1 >>> N; for (int b = 0; b < N; b++) { if ((v1 & (1L << b)) != 0) { final long sv = 1L << (63 - b); v2 |= sv; } } return v2; } long decode2(final long v2) { long v1 = v2 << N; for (int b = 0; b < N; b++) { if ((v2 & (1L << (63 - b))) != 0) { v1 |= 1L << b; } } return v1; } }