/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hive.ql.exec.vector; /** * This class supports string and binary data by value reference -- i.e. each field is * explicitly present, as opposed to provided by a dictionary reference. * In some cases, all the values will be in the same byte array to begin with, * but this need not be the case. If each value is in a separate byte * array to start with, or not all of the values are in the same original * byte array, you can still assign data by reference into this column vector. * This gives flexibility to use this in multiple situations. * <p> * When setting data by reference, the caller * is responsible for allocating the byte arrays used to hold the data. * You can also set data by value, as long as you call the initBuffer() method first. * You can mix "by value" and "by reference" in the same column vector, * though that use is probably not typical. */ public class BytesColumnVector extends ColumnVector { public byte[][] vector; public int[] start; // start offset of each field /* * The length of each field. If the value repeats for every entry, then it is stored * in vector[0] and isRepeating from the superclass is set to true. */ public int[] length; // A call to increaseBufferSpace() or ensureValPreallocated() will ensure that buffer[] points to // a byte[] with sufficient space for the specified size. private byte[] buffer; // optional buffer to use when actually copying in data private int nextFree; // next free position in buffer // Hang onto a byte array for holding smaller byte values private byte[] smallBuffer; private int smallBufferNextFree; private int bufferAllocationCount; // Estimate that there will be 16 bytes per entry static final int DEFAULT_BUFFER_SIZE = 16 * VectorizedRowBatch.DEFAULT_SIZE; // Proportion of extra space to provide when allocating more buffer space. static final float EXTRA_SPACE_FACTOR = (float) 1.2; // Largest size allowed in smallBuffer static final int MAX_SIZE_FOR_SMALL_BUFFER = 1024 * 1024; /** * Use this constructor for normal operation. * All column vectors should be the default size normally. */ public BytesColumnVector() { this(VectorizedRowBatch.DEFAULT_SIZE); } /** * Don't call this constructor except for testing purposes. * * @param size number of elements in the column vector */ public BytesColumnVector(int size) { super(size); vector = new byte[size][]; start = new int[size]; length = new int[size]; } /** * Additional reset work for BytesColumnVector (releasing scratch bytes for by value strings). */ @Override public void reset() { super.reset(); initBuffer(0); } /** Set a field by reference. * * @param elementNum index within column vector to set * @param sourceBuf container of source data * @param start start byte position within source * @param length length of source byte sequence */ public void setRef(int elementNum, byte[] sourceBuf, int start, int length) { vector[elementNum] = sourceBuf; this.start[elementNum] = start; this.length[elementNum] = length; } /** * You must call initBuffer first before using setVal(). * Provide the estimated number of bytes needed to hold * a full column vector worth of byte string data. * * @param estimatedValueSize Estimated size of buffer space needed */ public void initBuffer(int estimatedValueSize) { nextFree = 0; smallBufferNextFree = 0; // if buffer is already allocated, keep using it, don't re-allocate if (buffer != null) { // Free up any previously allocated buffers that are referenced by vector if (bufferAllocationCount > 0) { for (int idx = 0; idx < vector.length; ++idx) { vector[idx] = null; } buffer = smallBuffer; // In case last row was a large bytes value } } else { // allocate a little extra space to limit need to re-allocate int bufferSize = this.vector.length * (int)(estimatedValueSize * EXTRA_SPACE_FACTOR); if (bufferSize < DEFAULT_BUFFER_SIZE) { bufferSize = DEFAULT_BUFFER_SIZE; } buffer = new byte[bufferSize]; smallBuffer = buffer; } bufferAllocationCount = 0; } /** * Initialize buffer to default size. */ public void initBuffer() { initBuffer(0); } /** * @return amount of buffer space currently allocated */ public int bufferSize() { if (buffer == null) { return 0; } return buffer.length; } /** * Set a field by actually copying in to a local buffer. * If you must actually copy data in to the array, use this method. * DO NOT USE this method unless it's not practical to set data by reference with setRef(). * Setting data by reference tends to run a lot faster than copying data in. * * @param elementNum index within column vector to set * @param sourceBuf container of source data * @param start start byte position within source * @param length length of source byte sequence */ public void setVal(int elementNum, byte[] sourceBuf, int start, int length) { if ((nextFree + length) > buffer.length) { increaseBufferSpace(length); } System.arraycopy(sourceBuf, start, buffer, nextFree, length); vector[elementNum] = buffer; this.start[elementNum] = nextFree; this.length[elementNum] = length; nextFree += length; } /** * Set a field by actually copying in to a local buffer. * If you must actually copy data in to the array, use this method. * DO NOT USE this method unless it's not practical to set data by reference with setRef(). * Setting data by reference tends to run a lot faster than copying data in. * * @param elementNum index within column vector to set * @param sourceBuf container of source data */ public void setVal(int elementNum, byte[] sourceBuf) { setVal(elementNum, sourceBuf, 0, sourceBuf.length); } /** * Preallocate space in the local buffer so the caller can fill in the value bytes themselves. * * Always use with getValPreallocatedBytes, getValPreallocatedStart, and setValPreallocated. */ public void ensureValPreallocated(int length) { if ((nextFree + length) > buffer.length) { increaseBufferSpace(length); } } public byte[] getValPreallocatedBytes() { return buffer; } public int getValPreallocatedStart() { return nextFree; } /** * Set the length of the preallocated values bytes used. * @param elementNum * @param length */ public void setValPreallocated(int elementNum, int length) { vector[elementNum] = buffer; this.start[elementNum] = nextFree; this.length[elementNum] = length; nextFree += length; } /** * Set a field to the concatenation of two string values. Result data is copied * into the internal buffer. * * @param elementNum index within column vector to set * @param leftSourceBuf container of left argument * @param leftStart start of left argument * @param leftLen length of left argument * @param rightSourceBuf container of right argument * @param rightStart start of right argument * @param rightLen length of right arugment */ public void setConcat(int elementNum, byte[] leftSourceBuf, int leftStart, int leftLen, byte[] rightSourceBuf, int rightStart, int rightLen) { int newLen = leftLen + rightLen; if ((nextFree + newLen) > buffer.length) { increaseBufferSpace(newLen); } vector[elementNum] = buffer; this.start[elementNum] = nextFree; this.length[elementNum] = newLen; System.arraycopy(leftSourceBuf, leftStart, buffer, nextFree, leftLen); nextFree += leftLen; System.arraycopy(rightSourceBuf, rightStart, buffer, nextFree, rightLen); nextFree += rightLen; } /** * Increase buffer space enough to accommodate next element. * This uses an exponential increase mechanism to rapidly * increase buffer size to enough to hold all data. * As batches get re-loaded, buffer space allocated will quickly * stabilize. * * @param nextElemLength size of next element to be added */ public void increaseBufferSpace(int nextElemLength) { // A call to increaseBufferSpace() or ensureValPreallocated() will ensure that buffer[] points to // a byte[] with sufficient space for the specified size. // This will either point to smallBuffer, or to a newly allocated byte array for larger values. if (nextElemLength > MAX_SIZE_FOR_SMALL_BUFFER) { // Larger allocations will be special-cased and will not use the normal buffer. // buffer/nextFree will be set to a newly allocated array just for the current row. // The next row will require another call to increaseBufferSpace() since this new buffer should be used up. byte[] newBuffer = new byte[nextElemLength]; ++bufferAllocationCount; // If the buffer was pointing to smallBuffer, then nextFree keeps track of the current state // of the free index for smallBuffer. We now need to save this value to smallBufferNextFree // so we don't lose this. A bit of a weird dance here. if (smallBuffer == buffer) { smallBufferNextFree = nextFree; } buffer = newBuffer; nextFree = 0; } else { // This value should go into smallBuffer. if (smallBuffer != buffer) { // Previous row was for a large bytes value ( > MAX_SIZE_FOR_SMALL_BUFFER). // Use smallBuffer if possible. buffer = smallBuffer; nextFree = smallBufferNextFree; } // smallBuffer might still be out of space if ((nextFree + nextElemLength) > buffer.length) { int newLength = smallBuffer.length * 2; while (newLength < nextElemLength) { if (newLength < 0) { throw new RuntimeException("Overflow of newLength. smallBuffer.length=" + smallBuffer.length + ", nextElemLength=" + nextElemLength); } newLength *= 2; } smallBuffer = new byte[newLength]; ++bufferAllocationCount; smallBufferNextFree = 0; // Update buffer buffer = smallBuffer; nextFree = 0; } } } /** Copy the current object contents into the output. Only copy selected entries, * as indicated by selectedInUse and the sel array. */ public void copySelected( boolean selectedInUse, int[] sel, int size, BytesColumnVector output) { // Output has nulls if and only if input has nulls. output.noNulls = noNulls; output.isRepeating = false; // Handle repeating case if (isRepeating) { output.setVal(0, vector[0], start[0], length[0]); output.isNull[0] = isNull[0]; output.isRepeating = true; return; } // Handle normal case // Copy data values over if (selectedInUse) { for (int j = 0; j < size; j++) { int i = sel[j]; output.setVal(i, vector[i], start[i], length[i]); } } else { for (int i = 0; i < size; i++) { output.setVal(i, vector[i], start[i], length[i]); } } // Copy nulls over if needed if (!noNulls) { if (selectedInUse) { for (int j = 0; j < size; j++) { int i = sel[j]; output.isNull[i] = isNull[i]; } } else { System.arraycopy(isNull, 0, output.isNull, 0, size); } } } /** Simplify vector by brute-force flattening noNulls and isRepeating * This can be used to reduce combinatorial explosion of code paths in VectorExpressions * with many arguments, at the expense of loss of some performance. */ public void flatten(boolean selectedInUse, int[] sel, int size) { flattenPush(); if (isRepeating) { isRepeating = false; // setRef is used below and this is safe, because the reference // is to data owned by this column vector. If this column vector // gets re-used, the whole thing is re-used together so there // is no danger of a dangling reference. // Only copy data values if entry is not null. The string value // at position 0 is undefined if the position 0 value is null. if (noNulls || !isNull[0]) { // loops start at position 1 because position 0 is already set if (selectedInUse) { for (int j = 1; j < size; j++) { int i = sel[j]; this.setRef(i, vector[0], start[0], length[0]); } } else { for (int i = 1; i < size; i++) { this.setRef(i, vector[0], start[0], length[0]); } } } flattenRepeatingNulls(selectedInUse, sel, size); } flattenNoNulls(selectedInUse, sel, size); } // Fill the all the vector entries with provided value public void fill(byte[] value) { noNulls = true; isRepeating = true; setRef(0, value, 0, value.length); } // Fill the column vector with nulls public void fillWithNulls() { noNulls = false; isRepeating = true; vector[0] = null; isNull[0] = true; } @Override public void setElement(int outElementNum, int inputElementNum, ColumnVector inputVector) { if (inputVector.isRepeating) { inputElementNum = 0; } if (inputVector.noNulls || !inputVector.isNull[inputElementNum]) { isNull[outElementNum] = false; BytesColumnVector in = (BytesColumnVector) inputVector; setVal(outElementNum, in.vector[inputElementNum], in.start[inputElementNum], in.length[inputElementNum]); } else { isNull[outElementNum] = true; noNulls = false; } } @Override public void init() { initBuffer(0); } public String toString(int row) { if (isRepeating) { row = 0; } if (noNulls || !isNull[row]) { return new String(vector[row], start[row], length[row]); } else { return null; } } @Override public void stringifyValue(StringBuilder buffer, int row) { if (isRepeating) { row = 0; } if (noNulls || !isNull[row]) { buffer.append('"'); buffer.append(new String(vector[row], start[row], length[row])); buffer.append('"'); } else { buffer.append("null"); } } @Override public void ensureSize(int size, boolean preserveData) { super.ensureSize(size, preserveData); if (size > vector.length) { int[] oldStart = start; start = new int[size]; int[] oldLength = length; length = new int[size]; byte[][] oldVector = vector; vector = new byte[size][]; if (preserveData) { if (isRepeating) { vector[0] = oldVector[0]; start[0] = oldStart[0]; length[0] = oldLength[0]; } else { System.arraycopy(oldVector, 0, vector, 0, oldVector.length); System.arraycopy(oldStart, 0, start, 0 , oldStart.length); System.arraycopy(oldLength, 0, length, 0, oldLength.length); } } } } @Override public void shallowCopyTo(ColumnVector otherCv) { BytesColumnVector other = (BytesColumnVector)otherCv; super.shallowCopyTo(other); other.nextFree = nextFree; other.vector = vector; other.start = start; other.length = length; other.buffer = buffer; } }