/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.solr.util.hll; /** * A serializer that writes a sequence of fixed bit-width 'words' to a byte array. * Bitwise OR is used to write words into bytes, so a low bit in a word is also * a low bit in a byte. However, a high byte in a word is written at a lower index * in the array than a low byte in a word. The first word is written at the lowest * array index. Each serializer is one time use and returns its backing byte * array.<p/> * * This encoding was chosen so that when reading bytes as octets in the typical * first-octet-is-the-high-nibble fashion, an octet-to-binary conversion * would yield a high-to-low, left-to-right view of the "short words".<p/> * * Example:<p/> * * Say short words are 5 bits wide. Our word sequence is the values * <code>[31, 1, 5]</code>. In big-endian binary format, the values are * <code>[0b11111, 0b00001, 0b00101]</code>. We use 15 of 16 bits in two bytes * and pad the last (lowest) bit of the last byte with a zero: * * <code> * [0b11111000, 0b01001010] = [0xF8, 0x4A] * </code>. */ class BigEndianAscendingWordSerializer implements IWordSerializer { // The number of bits per byte. private static final int BITS_PER_BYTE = 8; // ************************************************************************ // The length in bits of the words to be written. private final int wordLength; // The number of words to be written. private final int wordCount; // The byte array to which the words are serialized. private final byte[] bytes; // ------------------------------------------------------------------------ // Write state // Number of bits that remain writable in the current byte. private int bitsLeftInByte; // Index of byte currently being written to. private int byteIndex; // Number of words written. private int wordsWritten; // ======================================================================== /** * @param wordLength the length in bits of the words to be serialized. Must * be greater than or equal to 1 and less than or equal to 64. * @param wordCount the number of words to be serialized. Must be greater than * or equal to zero. * @param bytePadding the number of leading bytes that should pad the * serialized words. Must be greater than or equal to zero. */ public BigEndianAscendingWordSerializer(final int wordLength, final int wordCount, final int bytePadding) { if((wordLength < 1) || (wordLength > 64)) { throw new IllegalArgumentException("Word length must be >= 1 and <= 64. (was: " + wordLength + ")"); } if(wordCount < 0) { throw new IllegalArgumentException("Word count must be >= 0. (was: " + wordCount + ")"); } if(bytePadding < 0) { throw new IllegalArgumentException("Byte padding must be must be >= 0. (was: " + bytePadding + ")"); } this.wordLength = wordLength; this.wordCount = wordCount; final long bitsRequired = ((long)wordLength) * ((long)wordCount); final boolean leftoverBits = ((bitsRequired % BITS_PER_BYTE) != 0); final int bytesRequired = (int)(bitsRequired / BITS_PER_BYTE) + (leftoverBits ? 1 : 0) + bytePadding; bytes = new byte[bytesRequired]; bitsLeftInByte = BITS_PER_BYTE; byteIndex = bytePadding; wordsWritten = 0; } /* (non-Javadoc) * @see net.agkn.hll.serialization.IWordSerializer#writeWord(long) * @throws RuntimeException if the number of words written is greater than the * <code>wordCount</code> parameter in the constructor. */ @Override public void writeWord(final long word) { if(wordsWritten == wordCount) { throw new RuntimeException("Cannot write more words, backing array full!"); } int bitsLeftInWord = wordLength; while(bitsLeftInWord > 0) { // Move to the next byte if the current one is fully packed. if(bitsLeftInByte == 0) { byteIndex++; bitsLeftInByte = BITS_PER_BYTE; } final long consumedMask; if(bitsLeftInWord == 64) { consumedMask = ~0L; } else { consumedMask = ((1L << bitsLeftInWord) - 1L); } // Fix how many bits will be written in this cycle. Choose the // smaller of the remaining bits in the word or byte. final int numberOfBitsToWrite = Math.min(bitsLeftInByte, bitsLeftInWord); final int bitsInByteRemainingAfterWrite = (bitsLeftInByte - numberOfBitsToWrite); // In general, we write the highest bits of the word first, so we // strip the highest bits that were consumed in previous cycles. final long remainingBitsOfWordToWrite = (word & consumedMask); final long bitsThatTheByteCanAccept; // If there is more left in the word than can be written to this // byte, shift off the bits that can't be written off the bottom. if(bitsLeftInWord > numberOfBitsToWrite) { bitsThatTheByteCanAccept = (remainingBitsOfWordToWrite >>> (bitsLeftInWord - bitsLeftInByte)); } else { // If the byte can accept all remaining bits, there is no need // to shift off the bits that won't be written in this cycle. bitsThatTheByteCanAccept = remainingBitsOfWordToWrite; } // Align the word bits to write up against the byte bits that have // already been written. This shift may do nothing if the remainder // of the byte is being consumed in this cycle. final long alignedBits = (bitsThatTheByteCanAccept << bitsInByteRemainingAfterWrite); // Update the byte with the alignedBits. bytes[byteIndex] |= (byte)alignedBits; // Update state with bit count written. bitsLeftInWord -= numberOfBitsToWrite; bitsLeftInByte = bitsInByteRemainingAfterWrite; } wordsWritten ++; } /* (non-Javadoc) * @see net.agkn.hll.serialization.IWordSerializer#getBytes() * @throws RuntimeException if the number of words written is fewer than the * <code>wordCount</code> parameter in the constructor. */ @Override public byte[] getBytes() { if(wordsWritten < wordCount) { throw new RuntimeException("Not all words have been written! (" + wordsWritten + "/" + wordCount + ")"); } return bytes; } }