/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.drill.exec.physical.impl.spill;
import java.util.ArrayList;
import java.util.List;
import org.apache.drill.common.types.TypeProtos.MinorType;
import org.apache.drill.exec.expr.TypeHelper;
import org.apache.drill.exec.memory.BaseAllocator;
import org.apache.drill.exec.record.BatchSchema;
import org.apache.drill.exec.record.MaterializedField;
import org.apache.drill.exec.record.RecordBatch;
import org.apache.drill.exec.record.VectorAccessible;
import org.apache.drill.exec.record.VectorWrapper;
import org.apache.drill.exec.record.selection.SelectionVector2;
import org.apache.drill.exec.vector.ValueVector;
import org.apache.drill.exec.vector.complex.AbstractMapVector;
/**
* Given a record batch or vector container, determines the actual memory
* consumed by each column, the average row, and the entire record batch.
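 * <p>
 * A minimal usage sketch, assuming {@code batch} is a populated incoming
 * {@code RecordBatch} (the variable names here are illustrative only):
 * <pre>{@code
 * RecordBatchSizer sizer = new RecordBatchSizer(batch);
 * int bytesPerRow = sizer.netRowWidth(); // average data bytes per row
 * int batchBytes = sizer.actualSize();   // total allocated vector memory
 * String summary = sizer.toString();     // per-column breakdown
 * }</pre>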
*/
public class RecordBatchSizer {
// private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(RecordBatchSizer.class);
/**
* Column size information.
*/
public static class ColumnSize {
public final MaterializedField metadata;
/**
* Assumed size from Drill metadata.
*/
public int stdSize;
/**
* Actual memory consumed by all the vectors associated with this column.
*/
public int totalSize;
/**
     * Actual average column width as determined from actual memory use. This
     * size is larger than the pure data width since it includes per-column
     * overhead such as any unused vector space.
*/
public int estSize;
    /** Number of values the vector could hold; meaningful mainly for fixed-width vectors. */
    public int capacity;
    /** Percentage of allocated vector memory that actually holds data. */
    public int density;
    /** Size of the data payload actually stored in the vectors for this column. */
    public int dataSize;
public ColumnSize(ValueVector v) {
metadata = v.getField();
stdSize = TypeHelper.getSize(metadata.getType());
// Can't get size estimates if this is an empty batch.
int rowCount = v.getAccessor().getValueCount();
if (rowCount == 0) {
estSize = stdSize;
return;
}
// Total size taken by all vectors (and underlying buffers)
// associated with this vector.
totalSize = v.getAllocatedByteCount();
// Capacity is the number of values that the vector could
// contain. This is useful only for fixed-length vectors.
capacity = v.getValueCapacity();
// The amount of memory consumed by the payload: the actual
// data stored in the vectors.
dataSize = v.getPayloadByteCount();
// Determine "density" the number of rows compared to potential
// capacity. Low-density batches occur at block boundaries, ends
// of files and so on. Low-density batches throw off our estimates
// for Varchar columns because we don't know the actual number of
// bytes consumed (that information is hidden behind the Varchar
// implementation where we can't get at it.)
density = roundUp(dataSize * 100, totalSize);
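      // Example: a column holding 100 KiB of data in a 256 KiB allocation
      // has a density of roundUp(102400 * 100, 262144) = 40 (percent).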
estSize = roundUp(dataSize, rowCount);
}
@Override
public String toString() {
StringBuilder buf = new StringBuilder()
.append(metadata.getName())
.append("(type: ")
.append(metadata.getType().getMinorType().name())
.append(", std col. size: ")
.append(stdSize)
.append(", actual col. size: ")
.append(estSize)
.append(", total size: ")
.append(totalSize)
.append(", data size: ")
.append(dataSize)
.append(", row capacity: ")
.append(capacity)
.append(", density: ")
.append(density)
.append(")");
return buf.toString();
}
}
private List<ColumnSize> columnSizes = new ArrayList<>();
/**
* Number of records (rows) in the batch.
*/
private int rowCount;
/**
   * Standard row width using Drill metadata.
*/
private int stdRowWidth;
/**
* Actual batch size summing all buffers used to store data
* for the batch.
*/
private int totalBatchSize;
/**
* Actual row width computed by dividing total batch memory by the
* record count.
*/
private int grossRowWidth;
/**
* Actual row width computed by summing columns. Use this if the
* vectors are partially full; prevents overestimating row width.
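   * For example, if vectors were allocated for 4096 rows but hold only
   * 1000, the gross row width includes the unused tail of each vector,
   * while the net row width counts only the bytes actually written.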
*/
private int netRowWidth;
  /** True if the batch includes (or has been sized to include) a two-byte selection vector. */
  private boolean hasSv2;
  /** Size of the selection vector, in bytes, if one is present. */
  private int sv2Size;
  /** Average column "density": the percentage of allocated vector memory that holds data. */
  private int avgDensity;
  /** Batch size counting only data bytes, excluding unused vector space. */
  private int netBatchSize;
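  /**
   * Sizes an incoming record batch, including its selection vector when the
   * batch uses two-byte selection mode.
   */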
public RecordBatchSizer(RecordBatch batch) {
this(batch,
(batch.getSchema().getSelectionVectorMode() == BatchSchema.SelectionVectorMode.TWO_BYTE) ?
batch.getSelectionVector2() : null);
}
public RecordBatchSizer(VectorAccessible va) {
this(va, null);
}
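  /**
   * Sizes a vector container. If a selection vector is provided, its memory
   * is folded into the gross row width, and the net row width grows by the
   * two bytes each row consumes in the selection vector.
   *
   * @param va the record batch or vector container to measure
   * @param sv2 the two-byte selection vector, or null if the batch has none
   */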
public RecordBatchSizer(VectorAccessible va, SelectionVector2 sv2) {
rowCount = va.getRecordCount();
for (VectorWrapper<?> vw : va) {
measureColumn(vw);
}
if (rowCount > 0) {
grossRowWidth = roundUp(totalBatchSize, rowCount);
}
    if (sv2 != null) {
      sv2Size = sv2.getBuffer(false).capacity();
      grossRowWidth += roundUp(sv2Size, rowCount);
      netRowWidth += 2;
      hasSv2 = true;
    }
    // Compute the average column density, considering only columns that
    // reported a non-zero density (empty columns would skew the average).
    int totalDensity = 0;
    int usableCount = 0;
for (ColumnSize colSize : columnSizes) {
      if (colSize.density > 0) {
usableCount++;
}
totalDensity += colSize.density;
}
avgDensity = roundUp(totalDensity, usableCount);
}
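  /**
   * Adjusts the computed sizes to account for a two-byte selection vector
   * that an operator plans to add to this batch. Does nothing if the sizes
   * already include an SV2.
   */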
public void applySv2() {
if (hasSv2) {
return;
    }
    hasSv2 = true;
    sv2Size = BaseAllocator.nextPowerOfTwo(2 * rowCount);
grossRowWidth += roundUp(sv2Size, rowCount);
totalBatchSize += sv2Size;
}
private void measureColumn(VectorWrapper<?> vw) {
measureColumn(vw.getValueVector());
}
private void measureColumn(ValueVector v) {
// Maps consume no size themselves. However, their contained
// vectors do consume space, so visit columns recursively.
if (v.getField().getType().getMinorType() == MinorType.MAP) {
expandMap((AbstractMapVector) v);
return;
}
ColumnSize colSize = new ColumnSize(v);
columnSizes.add(colSize);
stdRowWidth += colSize.stdSize;
totalBatchSize += colSize.totalSize;
netBatchSize += colSize.dataSize;
netRowWidth += colSize.estSize;
}
private void expandMap(AbstractMapVector mapVector) {
for (ValueVector vector : mapVector) {
measureColumn(vector);
}
}
  /**
   * Ceiling division: rounds {@code num / denom} up to the next whole
   * number; for example, {@code roundUp(10, 3)} returns 4. Returns 0 when
   * the denominator is zero.
   */
  public static int roundUp(int num, int denom) {
    if (denom == 0) {
      return 0;
    }
    return (int) Math.ceil((double) num / denom);
  }
public int rowCount() { return rowCount; }
public int stdRowWidth() { return stdRowWidth; }
public int grossRowWidth() { return grossRowWidth; }
public int netRowWidth() { return netRowWidth; }
public int actualSize() { return totalBatchSize; }
public boolean hasSv2() { return hasSv2; }
public int avgDensity() { return avgDensity; }
public int netSize() { return netBatchSize; }
public static final int MAX_VECTOR_SIZE = 16 * 1024 * 1024; // 16 MiB
@Override
public String toString() {
StringBuilder buf = new StringBuilder();
buf.append("Actual batch schema & sizes {\n");
for (ColumnSize colSize : columnSizes) {
buf.append(" ");
buf.append(colSize.toString());
buf.append("\n");
}
buf.append( " Records: " );
buf.append(rowCount);
buf.append(", Total size: ");
buf.append(totalBatchSize);
buf.append(", Gross row width:");
buf.append(grossRowWidth);
buf.append(", Net row width:");
buf.append(netRowWidth);
buf.append(", Density:");
buf.append(avgDensity);
buf.append("}");
return buf.toString();
}
}