/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.facebook.presto.hive;

import com.facebook.presto.spi.PrestoException;
import com.facebook.presto.spi.RecordCursor;
import com.facebook.presto.spi.block.Block;
import com.facebook.presto.spi.type.DecimalType;
import com.facebook.presto.spi.type.Decimals;
import com.facebook.presto.spi.type.Type;
import com.facebook.presto.spi.type.TypeManager;
import com.google.common.base.Throwables;
import io.airlift.slice.Slice;
import io.airlift.slice.Slices;
import org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable;
import org.apache.hadoop.hive.serde2.columnar.BytesRefWritable;
import org.apache.hadoop.hive.serde2.lazy.ByteArrayRef;
import org.apache.hadoop.hive.serde2.lazy.LazyFactory;
import org.apache.hadoop.hive.serde2.lazy.LazyObject;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.mapred.RecordReader;
import org.joda.time.DateTimeZone;
import java.io.IOException;
import java.math.BigDecimal;
import java.util.Arrays;
import java.util.List;
import java.util.Properties;
import static com.facebook.presto.hive.HiveBooleanParser.isFalse;
import static com.facebook.presto.hive.HiveBooleanParser.isTrue;
import static com.facebook.presto.hive.HiveColumnHandle.ColumnType.REGULAR;
import static com.facebook.presto.hive.HiveDecimalParser.parseHiveDecimal;
import static com.facebook.presto.hive.HiveErrorCode.HIVE_CURSOR_ERROR;
import static com.facebook.presto.hive.HiveUtil.base64Decode;
import static com.facebook.presto.hive.HiveUtil.closeWithSuppression;
import static com.facebook.presto.hive.HiveUtil.getTableObjectInspector;
import static com.facebook.presto.hive.HiveUtil.isStructuralType;
import static com.facebook.presto.hive.HiveUtil.parseHiveDate;
import static com.facebook.presto.hive.HiveUtil.parseHiveTimestamp;
import static com.facebook.presto.hive.NumberParser.parseDouble;
import static com.facebook.presto.hive.NumberParser.parseFloat;
import static com.facebook.presto.hive.NumberParser.parseLong;
import static com.facebook.presto.hive.util.SerDeUtils.getBlockObject;
import static com.facebook.presto.spi.type.BigintType.BIGINT;
import static com.facebook.presto.spi.type.BooleanType.BOOLEAN;
import static com.facebook.presto.spi.type.Chars.isCharType;
import static com.facebook.presto.spi.type.Chars.trimSpacesAndTruncateToLength;
import static com.facebook.presto.spi.type.DateType.DATE;
import static com.facebook.presto.spi.type.Decimals.isLongDecimal;
import static com.facebook.presto.spi.type.Decimals.isShortDecimal;
import static com.facebook.presto.spi.type.DoubleType.DOUBLE;
import static com.facebook.presto.spi.type.IntegerType.INTEGER;
import static com.facebook.presto.spi.type.RealType.REAL;
import static com.facebook.presto.spi.type.SmallintType.SMALLINT;
import static com.facebook.presto.spi.type.StandardTypes.DECIMAL;
import static com.facebook.presto.spi.type.TimestampType.TIMESTAMP;
import static com.facebook.presto.spi.type.TinyintType.TINYINT;
import static com.facebook.presto.spi.type.VarbinaryType.VARBINARY;
import static com.facebook.presto.spi.type.Varchars.isVarcharType;
import static com.facebook.presto.spi.type.Varchars.truncateToLength;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkState;
import static java.lang.Float.floatToRawIntBits;
import static java.lang.Math.max;
import static java.lang.Math.min;
import static java.lang.String.format;
import static java.util.Objects.requireNonNull;
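
/**
 * Record cursor for Hive tables stored in the columnar text format
 * (RCFile with {@code ColumnarSerDe}). Each row arrives as a
 * {@link BytesRefArrayWritable}; individual columns are decoded lazily,
 * only when a value is first requested for the current row.
 */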
class ColumnarTextHiveRecordCursor<K>
implements RecordCursor
{
private final RecordReader<K, BytesRefArrayWritable> recordReader;
private final K key;
private final BytesRefArrayWritable value;
private final Type[] types;
private final HiveType[] hiveTypes;
private final ObjectInspector[] fieldInspectors; // DON'T USE THESE UNLESS EXTRACTION WILL BE SLOW ANYWAY
private final int[] hiveColumnIndexes;
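    // per-row scratch space: "loaded" records whether a column has been parsed for the
    // current row, and the parsed value lands in the array matching the column's
    // Java stack type (booleans, longs, doubles, slices, objects)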
private final boolean[] loaded;
private final boolean[] booleans;
private final long[] longs;
private final double[] doubles;
private final Slice[] slices;
private final Object[] objects;
private final boolean[] nulls;
private final long totalBytes;
private final DateTimeZone hiveStorageTimeZone;
private long completedBytes;
private boolean closed;
public ColumnarTextHiveRecordCursor(
RecordReader<K, BytesRefArrayWritable> recordReader,
long totalBytes,
Properties splitSchema,
List<HiveColumnHandle> columns,
DateTimeZone hiveStorageTimeZone,
TypeManager typeManager)
{
requireNonNull(recordReader, "recordReader is null");
checkArgument(totalBytes >= 0, "totalBytes is negative");
requireNonNull(splitSchema, "splitSchema is null");
requireNonNull(columns, "columns is null");
requireNonNull(hiveStorageTimeZone, "hiveStorageTimeZone is null");
this.recordReader = recordReader;
this.totalBytes = totalBytes;
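        // Hadoop record readers reuse a single key/value pair across calls to next()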
this.key = recordReader.createKey();
this.value = recordReader.createValue();
this.hiveStorageTimeZone = hiveStorageTimeZone;
int size = columns.size();
this.types = new Type[size];
this.hiveTypes = new HiveType[size];
this.fieldInspectors = new ObjectInspector[size];
this.hiveColumnIndexes = new int[size];
this.loaded = new boolean[size];
this.booleans = new boolean[size];
this.longs = new long[size];
this.doubles = new double[size];
this.slices = new Slice[size];
this.objects = new Object[size];
this.nulls = new boolean[size];
// initialize data columns
StructObjectInspector rowInspector = getTableObjectInspector(splitSchema);
for (int i = 0; i < columns.size(); i++) {
HiveColumnHandle column = columns.get(i);
checkState(column.getColumnType() == REGULAR, "column type must be regular");
types[i] = typeManager.getType(column.getTypeSignature());
hiveTypes[i] = column.getHiveType();
hiveColumnIndexes[i] = column.getHiveColumnIndex();
fieldInspectors[i] = rowInspector.getStructFieldRef(column.getName()).getFieldObjectInspector();
}
}
@Override
public long getTotalBytes()
{
return totalBytes;
}
@Override
public long getCompletedBytes()
{
if (!closed) {
updateCompletedBytes();
}
return completedBytes;
}
@Override
public long getReadTimeNanos()
{
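        // read time is not tracked by this cursor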
return 0;
}
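    // estimate completed bytes from the reader's progress fraction, clamped so the
    // estimate never decreases and never exceeds totalBytes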
private void updateCompletedBytes()
{
try {
long newCompletedBytes = (long) (totalBytes * recordReader.getProgress());
completedBytes = min(totalBytes, max(completedBytes, newCompletedBytes));
}
catch (IOException ignored) {
}
}
@Override
public Type getType(int field)
{
return types[field];
}
@Override
public boolean advanceNextPosition()
{
try {
if (closed || !recordReader.next(key, value)) {
close();
return false;
}
// reset loaded flags
Arrays.fill(loaded, false);
return true;
}
catch (IOException | RuntimeException e) {
closeWithSuppression(this, e);
throw new PrestoException(HIVE_CURSOR_ERROR, e);
}
}
@Override
public boolean getBoolean(int fieldId)
{
checkState(!closed, "Cursor is closed");
validateType(fieldId, boolean.class);
if (!loaded[fieldId]) {
parseBooleanColumn(fieldId);
}
return booleans[fieldId];
}
private void parseBooleanColumn(int column)
{
loaded[column] = true;
if (hiveColumnIndexes[column] >= value.size()) {
// this partition may contain fewer fields than what's declared in the schema
// this happens when additional columns are added to the hive table after a partition has been created
nulls[column] = true;
}
else {
BytesRefWritable fieldData = value.unCheckedGet(hiveColumnIndexes[column]);
byte[] bytes;
try {
bytes = fieldData.getData();
}
catch (IOException e) {
throw Throwables.propagate(e);
}
int start = fieldData.getStart();
int length = fieldData.getLength();
parseBooleanColumn(column, bytes, start, length);
}
}
private void parseBooleanColumn(int column, byte[] bytes, int start, int length)
{
boolean wasNull;
if (isTrue(bytes, start, length)) {
booleans[column] = true;
wasNull = false;
}
else if (isFalse(bytes, start, length)) {
booleans[column] = false;
wasNull = false;
}
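        // a value that is neither TRUE nor FALSE is treated as null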
else {
wasNull = true;
}
nulls[column] = wasNull;
}
@Override
public long getLong(int fieldId)
{
checkState(!closed, "Cursor is closed");
if (!types[fieldId].equals(BIGINT) &&
!types[fieldId].equals(INTEGER) &&
!types[fieldId].equals(SMALLINT) &&
!types[fieldId].equals(TINYINT) &&
!types[fieldId].equals(DATE) &&
!types[fieldId].equals(TIMESTAMP) &&
!isShortDecimal(types[fieldId]) &&
!types[fieldId].equals(REAL)) {
// we don't use Preconditions.checkArgument because it requires boxing fieldId, which affects inner loop performance
throw new IllegalArgumentException(
format("Expected field to be %s, %s, %s, %s, %s, %s, %s or %s , actual %s (field %s)", TINYINT, SMALLINT, INTEGER, BIGINT, DECIMAL, DATE, TIMESTAMP, REAL, types[fieldId], fieldId));
}
        if (!loaded[fieldId]) {
            // short decimals are returned as longs but must go through the decimal parser
            if (isShortDecimal(types[fieldId])) {
                parseDecimalColumn(fieldId);
            }
            else {
                parseLongColumn(fieldId);
            }
        }
return longs[fieldId];
}
private void parseLongColumn(int column)
{
loaded[column] = true;
if (hiveColumnIndexes[column] >= value.size()) {
// this partition may contain fewer fields than what's declared in the schema
// this happens when additional columns are added to the hive table after a partition has been created
nulls[column] = true;
}
else {
BytesRefWritable fieldData = value.unCheckedGet(hiveColumnIndexes[column]);
byte[] bytes;
try {
bytes = fieldData.getData();
}
catch (IOException e) {
throw Throwables.propagate(e);
}
int start = fieldData.getStart();
int length = fieldData.getLength();
parseLongColumn(column, bytes, start, length);
}
}
private void parseLongColumn(int column, byte[] bytes, int start, int length)
{
boolean wasNull;
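        // Hive's text serialization writes SQL NULL as the two-character sequence "\N"
        // (the default serialization.null.format)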
if (length == 0 || (length == "\\N".length() && bytes[start] == '\\' && bytes[start + 1] == 'N')) {
wasNull = true;
}
else if (hiveTypes[column].equals(HiveType.HIVE_DATE)) {
String value = new String(bytes, start, length);
longs[column] = parseHiveDate(value);
wasNull = false;
}
else if (hiveTypes[column].equals(HiveType.HIVE_TIMESTAMP)) {
String value = new String(bytes, start, length);
longs[column] = parseHiveTimestamp(value, hiveStorageTimeZone);
wasNull = false;
}
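        // REAL values travel as the float's raw int bits widened into the long slot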
else if (hiveTypes[column].equals(HiveType.HIVE_FLOAT)) {
longs[column] = floatToRawIntBits(parseFloat(bytes, start, length));
wasNull = false;
}
else {
longs[column] = parseLong(bytes, start, length);
wasNull = false;
}
nulls[column] = wasNull;
}
@Override
public double getDouble(int fieldId)
{
checkState(!closed, "Cursor is closed");
validateType(fieldId, double.class);
if (!loaded[fieldId]) {
parseDoubleColumn(fieldId);
}
return doubles[fieldId];
}
private void parseDoubleColumn(int column)
{
loaded[column] = true;
if (hiveColumnIndexes[column] >= value.size()) {
// this partition may contain fewer fields than what's declared in the schema
// this happens when additional columns are added to the hive table after a partition has been created
nulls[column] = true;
}
else {
BytesRefWritable fieldData = value.unCheckedGet(hiveColumnIndexes[column]);
byte[] bytes;
try {
bytes = fieldData.getData();
}
catch (IOException e) {
throw Throwables.propagate(e);
}
int start = fieldData.getStart();
int length = fieldData.getLength();
parseDoubleColumn(column, bytes, start, length);
}
}
private void parseDoubleColumn(int column, byte[] bytes, int start, int length)
{
boolean wasNull;
if (length == 0 || (length == "\\N".length() && bytes[start] == '\\' && bytes[start + 1] == 'N')) {
wasNull = true;
}
else {
doubles[column] = parseDouble(bytes, start, length);
wasNull = false;
}
nulls[column] = wasNull;
}
@Override
public Slice getSlice(int fieldId)
{
checkState(!closed, "Cursor is closed");
validateType(fieldId, Slice.class);
        if (!loaded[fieldId]) {
            // long decimals are returned as slices but must go through the decimal parser
            if (isLongDecimal(types[fieldId])) {
                parseDecimalColumn(fieldId);
            }
            else {
                parseStringColumn(fieldId);
            }
        }
return slices[fieldId];
}
private void parseStringColumn(int column)
{
loaded[column] = true;
if (hiveColumnIndexes[column] >= value.size()) {
// this partition may contain fewer fields than what's declared in the schema
// this happens when additional columns are added to the hive table after a partition has been created
nulls[column] = true;
}
else {
BytesRefWritable fieldData = value.unCheckedGet(hiveColumnIndexes[column]);
byte[] bytes;
try {
bytes = fieldData.getData();
}
catch (IOException e) {
throw Throwables.propagate(e);
}
int start = fieldData.getStart();
int length = fieldData.getLength();
parseStringColumn(column, bytes, start, length);
}
}
private void parseStringColumn(int column, byte[] bytes, int start, int length)
{
boolean wasNull;
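        // unlike the numeric parsers, a zero-length value here is a valid empty string,
        // so only the "\N" marker is treated as null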
if (length == "\\N".length() && bytes[start] == '\\' && bytes[start + 1] == 'N') {
wasNull = true;
}
else {
Type type = types[column];
Slice value = Slices.wrappedBuffer(Arrays.copyOfRange(bytes, start, start + length));
if (isVarcharType(type)) {
slices[column] = truncateToLength(value, type);
}
else if (isCharType(type)) {
slices[column] = trimSpacesAndTruncateToLength(value, type);
}
            // this is unbelievably stupid but Hive base64 encodes binary data in a text file format
else if (type.equals(VARBINARY)) {
// and yes we end up with an extra copy here because the Base64 only handles whole arrays
slices[column] = base64Decode(value.getBytes());
}
else {
slices[column] = value;
}
wasNull = false;
}
nulls[column] = wasNull;
}
private void parseDecimalColumn(int column)
{
loaded[column] = true;
if (hiveColumnIndexes[column] >= value.size()) {
// this partition may contain fewer fields than what's declared in the schema
// this happens when additional columns are added to the hive table after a partition has been created
nulls[column] = true;
}
else {
BytesRefWritable fieldData = value.unCheckedGet(hiveColumnIndexes[column]);
byte[] bytes;
try {
bytes = fieldData.getData();
}
catch (IOException e) {
throw Throwables.propagate(e);
}
int start = fieldData.getStart();
int length = fieldData.getLength();
parseDecimalColumn(column, bytes, start, length);
}
}
private void parseDecimalColumn(int column, byte[] bytes, int start, int length)
{
boolean wasNull;
if (length == 0 || (length == "\\N".length() && bytes[start] == '\\' && bytes[start + 1] == 'N')) {
wasNull = true;
}
else {
DecimalType columnType = (DecimalType) types[column];
BigDecimal decimal = parseHiveDecimal(bytes, start, length, columnType);
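            // a short decimal's unscaled value fits in a long; a long decimal's unscaled
            // value is carried as a Slice-encoded BigInteger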
if (columnType.isShort()) {
longs[column] = decimal.unscaledValue().longValue();
}
else {
slices[column] = Decimals.encodeUnscaledValue(decimal.unscaledValue());
}
wasNull = false;
}
nulls[column] = wasNull;
}
@Override
public Object getObject(int fieldId)
{
checkState(!closed, "Cursor is closed");
validateType(fieldId, Block.class);
if (!loaded[fieldId]) {
parseObjectColumn(fieldId);
}
return objects[fieldId];
}
private void parseObjectColumn(int column)
{
loaded[column] = true;
if (hiveColumnIndexes[column] >= value.size()) {
// this partition may contain fewer fields than what's declared in the schema
// this happens when additional columns are added to the hive table after a partition has been created
nulls[column] = true;
}
else {
BytesRefWritable fieldData = value.unCheckedGet(hiveColumnIndexes[column]);
byte[] bytes;
try {
bytes = fieldData.getData();
}
catch (IOException e) {
throw Throwables.propagate(e);
}
int start = fieldData.getStart();
int length = fieldData.getLength();
parseObjectColumn(column, bytes, start, length);
}
}
private void parseObjectColumn(int column, byte[] bytes, int start, int length)
{
boolean wasNull;
if (length == "\\N".length() && bytes[start] == '\\' && bytes[start + 1] == 'N') {
wasNull = true;
}
else {
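            // structural types (array/map/struct) are deserialized through Hive's lazy
            // serde and converted to a Presto Block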
LazyObject<? extends ObjectInspector> lazyObject = LazyFactory.createLazyObject(fieldInspectors[column]);
ByteArrayRef byteArrayRef = new ByteArrayRef();
byteArrayRef.setData(bytes);
lazyObject.init(byteArrayRef, start, length);
objects[column] = getBlockObject(types[column], lazyObject.getObject(), fieldInspectors[column]);
wasNull = false;
}
nulls[column] = wasNull;
}
@Override
public boolean isNull(int fieldId)
{
checkState(!closed, "Cursor is closed");
if (!loaded[fieldId]) {
parseColumn(fieldId);
}
return nulls[fieldId];
}
private void parseColumn(int column)
{
Type type = types[column];
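        // dispatch on the Presto type; note that DATE, TIMESTAMP and REAL are all
        // stored in the longs array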
if (type.equals(BOOLEAN)) {
parseBooleanColumn(column);
}
else if (type.equals(BIGINT)) {
parseLongColumn(column);
}
else if (type.equals(INTEGER)) {
parseLongColumn(column);
}
else if (type.equals(SMALLINT)) {
parseLongColumn(column);
}
else if (type.equals(TINYINT)) {
parseLongColumn(column);
}
else if (type.equals(REAL)) {
parseLongColumn(column);
}
else if (type.equals(DOUBLE)) {
parseDoubleColumn(column);
}
else if (isVarcharType(type) || VARBINARY.equals(type) || isCharType(type)) {
parseStringColumn(column);
}
else if (isStructuralType(hiveTypes[column])) {
parseObjectColumn(column);
}
else if (type.equals(DATE)) {
parseLongColumn(column);
}
else if (type.equals(TIMESTAMP)) {
parseLongColumn(column);
}
else if (type instanceof DecimalType) {
parseDecimalColumn(column);
}
else {
throw new UnsupportedOperationException("Unsupported column type: " + type);
}
}
private void validateType(int fieldId, Class<?> type)
{
if (!types[fieldId].getJavaType().equals(type)) {
// we don't use Preconditions.checkArgument because it requires boxing fieldId, which affects inner loop performance
throw new IllegalArgumentException(String.format("Expected field to be %s, actual %s (field %s)", type, types[fieldId], fieldId));
}
}
@Override
public void close()
{
// some hive input formats are broken and bad things can happen if you close them multiple times
if (closed) {
return;
}
closed = true;
updateCompletedBytes();
try {
recordReader.close();
}
catch (IOException e) {
throw Throwables.propagate(e);
}
}
}