/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.facebook.presto.hive;

import com.facebook.presto.spi.PrestoException;
import com.facebook.presto.spi.RecordCursor;
import com.facebook.presto.spi.block.Block;
import com.facebook.presto.spi.type.DecimalType;
import com.facebook.presto.spi.type.Decimals;
import com.facebook.presto.spi.type.Type;
import com.facebook.presto.spi.type.TypeManager;
import com.google.common.base.Throwables;
import io.airlift.slice.Slice;
import io.airlift.slice.Slices;
import org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable;
import org.apache.hadoop.hive.serde2.columnar.BytesRefWritable;
import org.apache.hadoop.hive.serde2.lazy.ByteArrayRef;
import org.apache.hadoop.hive.serde2.lazy.LazyFactory;
import org.apache.hadoop.hive.serde2.lazy.LazyObject;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.mapred.RecordReader;
import org.joda.time.DateTimeZone;

import java.io.IOException;
import java.math.BigDecimal;
import java.util.Arrays;
import java.util.List;
import java.util.Properties;

import static com.facebook.presto.hive.HiveBooleanParser.isFalse;
import static com.facebook.presto.hive.HiveBooleanParser.isTrue;
import static com.facebook.presto.hive.HiveColumnHandle.ColumnType.REGULAR;
import static com.facebook.presto.hive.HiveDecimalParser.parseHiveDecimal;
import static com.facebook.presto.hive.HiveErrorCode.HIVE_CURSOR_ERROR;
import static com.facebook.presto.hive.HiveUtil.base64Decode;
import static com.facebook.presto.hive.HiveUtil.closeWithSuppression;
import static com.facebook.presto.hive.HiveUtil.getTableObjectInspector;
import static com.facebook.presto.hive.HiveUtil.isStructuralType;
import static com.facebook.presto.hive.HiveUtil.parseHiveDate;
import static com.facebook.presto.hive.HiveUtil.parseHiveTimestamp;
import static com.facebook.presto.hive.NumberParser.parseDouble;
import static com.facebook.presto.hive.NumberParser.parseFloat;
import static com.facebook.presto.hive.NumberParser.parseLong;
import static com.facebook.presto.hive.util.SerDeUtils.getBlockObject;
import static com.facebook.presto.spi.type.BigintType.BIGINT;
import static com.facebook.presto.spi.type.BooleanType.BOOLEAN;
import static com.facebook.presto.spi.type.Chars.isCharType;
import static com.facebook.presto.spi.type.Chars.trimSpacesAndTruncateToLength;
import static com.facebook.presto.spi.type.DateType.DATE;
import static com.facebook.presto.spi.type.Decimals.isShortDecimal;
import static com.facebook.presto.spi.type.DoubleType.DOUBLE;
import static com.facebook.presto.spi.type.IntegerType.INTEGER;
import static com.facebook.presto.spi.type.RealType.REAL;
import static com.facebook.presto.spi.type.SmallintType.SMALLINT;
import static com.facebook.presto.spi.type.StandardTypes.DECIMAL;
import static com.facebook.presto.spi.type.TimestampType.TIMESTAMP;
import static com.facebook.presto.spi.type.TinyintType.TINYINT;
import static com.facebook.presto.spi.type.VarbinaryType.VARBINARY;
import static com.facebook.presto.spi.type.Varchars.isVarcharType;
import static com.facebook.presto.spi.type.Varchars.truncateToLength;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkState;
import static java.lang.Float.floatToRawIntBits;
import static java.lang.Math.max;
import static java.lang.Math.min;
import static java.lang.String.format;
import static java.util.Objects.requireNonNull;

class ColumnarTextHiveRecordCursor<K>
        implements RecordCursor
{
    private final RecordReader<K, BytesRefArrayWritable> recordReader;
    private final K key;
    private final BytesRefArrayWritable value;

    private final Type[] types;
    private final HiveType[] hiveTypes;
    private final ObjectInspector[] fieldInspectors; // DON'T USE THESE UNLESS EXTRACTION WILL BE SLOW ANYWAY

    private final int[] hiveColumnIndexes;

    private final boolean[] loaded;
    private final boolean[] booleans;
    private final long[] longs;
    private final double[] doubles;
    private final Slice[] slices;
    private final Object[] objects;
    private final boolean[] nulls;

    private final long totalBytes;
    private final DateTimeZone hiveStorageTimeZone;

    private long completedBytes;
    private boolean closed;

    public ColumnarTextHiveRecordCursor(
            RecordReader<K, BytesRefArrayWritable> recordReader,
            long totalBytes,
            Properties splitSchema,
            List<HiveColumnHandle> columns,
            DateTimeZone hiveStorageTimeZone,
            TypeManager typeManager)
    {
        requireNonNull(recordReader, "recordReader is null");
        checkArgument(totalBytes >= 0, "totalBytes is negative");
        requireNonNull(splitSchema, "splitSchema is null");
        requireNonNull(columns, "columns is null");
        requireNonNull(hiveStorageTimeZone, "hiveStorageTimeZone is null");

        this.recordReader = recordReader;
        this.totalBytes = totalBytes;
        this.key = recordReader.createKey();
        this.value = recordReader.createValue();
        this.hiveStorageTimeZone = hiveStorageTimeZone;

        int size = columns.size();

        this.types = new Type[size];
        this.hiveTypes = new HiveType[size];
        this.fieldInspectors = new ObjectInspector[size];

        this.hiveColumnIndexes = new int[size];

        this.loaded = new boolean[size];
        this.booleans = new boolean[size];
        this.longs = new long[size];
        this.doubles = new double[size];
        this.slices = new Slice[size];
        this.objects = new Object[size];
        this.nulls = new boolean[size];

        // initialize data columns
        StructObjectInspector rowInspector = getTableObjectInspector(splitSchema);

        for (int i = 0; i < columns.size(); i++) {
            HiveColumnHandle column = columns.get(i);
            checkState(column.getColumnType() == REGULAR, "column type must be regular");

            types[i] = typeManager.getType(column.getTypeSignature());
            hiveTypes[i] = column.getHiveType();
            hiveColumnIndexes[i] = column.getHiveColumnIndex();
            fieldInspectors[i] = rowInspector.getStructFieldRef(column.getName()).getFieldObjectInspector();
        }
    }

    @Override
    public long getTotalBytes()
    {
        return totalBytes;
    }

    @Override
    public long getCompletedBytes()
    {
        if (!closed) {
            updateCompletedBytes();
        }
        return completedBytes;
    }

    @Override
    public long getReadTimeNanos()
    {
        return 0;
    }

    private void updateCompletedBytes()
    {
        try {
            long newCompletedBytes = (long) (totalBytes * recordReader.getProgress());
            completedBytes = min(totalBytes, max(completedBytes, newCompletedBytes));
        }
        catch (IOException ignored) {
        }
    }

    @Override
    public Type getType(int field)
    {
        return types[field];
    }

    @Override
    public boolean advanceNextPosition()
    {
        try {
            if (closed || !recordReader.next(key, value)) {
                close();
                return false;
            }

            // reset loaded flags
            Arrays.fill(loaded, false);

            return true;
        }
        catch (IOException | RuntimeException e) {
            closeWithSuppression(this, e);
            throw new PrestoException(HIVE_CURSOR_ERROR, e);
        }
    }
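
    // Columns are decoded lazily: a getXxx() or isNull() call parses the requested field from the
    // current row only if its loaded[] flag has not yet been set since the last advanceNextPosition().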

    @Override
    public boolean getBoolean(int fieldId)
    {
        checkState(!closed, "Cursor is closed");

        validateType(fieldId, boolean.class);
        if (!loaded[fieldId]) {
            parseBooleanColumn(fieldId);
        }
        return booleans[fieldId];
    }

    private void parseBooleanColumn(int column)
    {
        loaded[column] = true;

        if (hiveColumnIndexes[column] >= value.size()) {
            // this partition may contain fewer fields than what's declared in the schema
            // this happens when additional columns are added to the hive table after a partition has been created
            nulls[column] = true;
        }
        else {
            BytesRefWritable fieldData = value.unCheckedGet(hiveColumnIndexes[column]);

            byte[] bytes;
            try {
                bytes = fieldData.getData();
            }
            catch (IOException e) {
                throw Throwables.propagate(e);
            }

            int start = fieldData.getStart();
            int length = fieldData.getLength();

            parseBooleanColumn(column, bytes, start, length);
        }
    }

    private void parseBooleanColumn(int column, byte[] bytes, int start, int length)
    {
        boolean wasNull;
        if (isTrue(bytes, start, length)) {
            booleans[column] = true;
            wasNull = false;
        }
        else if (isFalse(bytes, start, length)) {
            booleans[column] = false;
            wasNull = false;
        }
        else {
            wasNull = true;
        }
        nulls[column] = wasNull;
    }

    @Override
    public long getLong(int fieldId)
    {
        checkState(!closed, "Cursor is closed");

        if (!types[fieldId].equals(BIGINT) &&
                !types[fieldId].equals(INTEGER) &&
                !types[fieldId].equals(SMALLINT) &&
                !types[fieldId].equals(TINYINT) &&
                !types[fieldId].equals(DATE) &&
                !types[fieldId].equals(TIMESTAMP) &&
                !isShortDecimal(types[fieldId]) &&
                !types[fieldId].equals(REAL)) {
            // we don't use Preconditions.checkArgument because it requires boxing fieldId, which affects inner loop performance
            throw new IllegalArgumentException(
                    format("Expected field to be %s, %s, %s, %s, %s, %s, %s or %s, actual %s (field %s)",
                            TINYINT, SMALLINT, INTEGER, BIGINT, DECIMAL, DATE, TIMESTAMP, REAL, types[fieldId], fieldId));
        }

        if (!loaded[fieldId]) {
            parseLongColumn(fieldId);
        }
        return longs[fieldId];
    }

    private void parseLongColumn(int column)
    {
        loaded[column] = true;

        if (hiveColumnIndexes[column] >= value.size()) {
            // this partition may contain fewer fields than what's declared in the schema
            // this happens when additional columns are added to the hive table after a partition has been created
            nulls[column] = true;
        }
        else {
            BytesRefWritable fieldData = value.unCheckedGet(hiveColumnIndexes[column]);

            byte[] bytes;
            try {
                bytes = fieldData.getData();
            }
            catch (IOException e) {
                throw Throwables.propagate(e);
            }

            int start = fieldData.getStart();
            int length = fieldData.getLength();

            parseLongColumn(column, bytes, start, length);
        }
    }

    private void parseLongColumn(int column, byte[] bytes, int start, int length)
    {
        boolean wasNull;
        if (length == 0 || (length == "\\N".length() && bytes[start] == '\\' && bytes[start + 1] == 'N')) {
            wasNull = true;
        }
        else if (hiveTypes[column].equals(HiveType.HIVE_DATE)) {
            String value = new String(bytes, start, length);
            longs[column] = parseHiveDate(value);
            wasNull = false;
        }
        else if (hiveTypes[column].equals(HiveType.HIVE_TIMESTAMP)) {
            String value = new String(bytes, start, length);
            longs[column] = parseHiveTimestamp(value, hiveStorageTimeZone);
            wasNull = false;
        }
        else if (hiveTypes[column].equals(HiveType.HIVE_FLOAT)) {
            longs[column] = floatToRawIntBits(parseFloat(bytes, start, length));
            wasNull = false;
        }
        else {
            longs[column] = parseLong(bytes, start, length);
            wasNull = false;
        }
        nulls[column] = wasNull;
    }

    @Override
    public double getDouble(int fieldId)
    {
        checkState(!closed, "Cursor is closed");

        validateType(fieldId, double.class);
        if (!loaded[fieldId]) {
            parseDoubleColumn(fieldId);
        }
        return doubles[fieldId];
    }

    private void parseDoubleColumn(int column)
    {
        loaded[column] = true;

        if (hiveColumnIndexes[column] >= value.size()) {
            // this partition may contain fewer fields than what's declared in the schema
            // this happens when additional columns are added to the hive table after a partition has been created
            nulls[column] = true;
        }
        else {
            BytesRefWritable fieldData = value.unCheckedGet(hiveColumnIndexes[column]);

            byte[] bytes;
            try {
                bytes = fieldData.getData();
            }
            catch (IOException e) {
                throw Throwables.propagate(e);
            }

            int start = fieldData.getStart();
            int length = fieldData.getLength();

            parseDoubleColumn(column, bytes, start, length);
        }
    }

    private void parseDoubleColumn(int column, byte[] bytes, int start, int length)
    {
        boolean wasNull;
        if (length == 0 || (length == "\\N".length() && bytes[start] == '\\' && bytes[start + 1] == 'N')) {
            wasNull = true;
        }
        else {
            doubles[column] = parseDouble(bytes, start, length);
            wasNull = false;
        }
        nulls[column] = wasNull;
    }

    @Override
    public Slice getSlice(int fieldId)
    {
        checkState(!closed, "Cursor is closed");

        validateType(fieldId, Slice.class);
        if (!loaded[fieldId]) {
            parseStringColumn(fieldId);
        }
        return slices[fieldId];
    }

    private void parseStringColumn(int column)
    {
        loaded[column] = true;

        if (hiveColumnIndexes[column] >= value.size()) {
            // this partition may contain fewer fields than what's declared in the schema
            // this happens when additional columns are added to the hive table after a partition has been created
            nulls[column] = true;
        }
        else {
            BytesRefWritable fieldData = value.unCheckedGet(hiveColumnIndexes[column]);

            byte[] bytes;
            try {
                bytes = fieldData.getData();
            }
            catch (IOException e) {
                throw Throwables.propagate(e);
            }

            int start = fieldData.getStart();
            int length = fieldData.getLength();

            parseStringColumn(column, bytes, start, length);
        }
    }

    private void parseStringColumn(int column, byte[] bytes, int start, int length)
    {
        boolean wasNull;
        if (length == "\\N".length() && bytes[start] == '\\' && bytes[start + 1] == 'N') {
            wasNull = true;
        }
        else {
            Type type = types[column];
            Slice value = Slices.wrappedBuffer(Arrays.copyOfRange(bytes, start, start + length));
            if (isVarcharType(type)) {
                slices[column] = truncateToLength(value, type);
            }
            else if (isCharType(type)) {
                slices[column] = trimSpacesAndTruncateToLength(value, type);
            }
            // this is unbelievably stupid but Hive base64 encodes binary data in a binary file format
            else if (type.equals(VARBINARY)) {
                // and yes we end up with an extra copy here because the Base64 only handles whole arrays
                slices[column] = base64Decode(value.getBytes());
            }
            else {
                slices[column] = value;
            }
            wasNull = false;
        }
        nulls[column] = wasNull;
    }

    private void parseDecimalColumn(int column)
    {
        loaded[column] = true;

        if (hiveColumnIndexes[column] >= value.size()) {
            // this partition may contain fewer fields than what's declared in the schema
            // this happens when additional columns are added to the hive table after a partition has been created
            nulls[column] = true;
        }
        else {
            BytesRefWritable fieldData = value.unCheckedGet(hiveColumnIndexes[column]);

            byte[] bytes;
            try {
                bytes = fieldData.getData();
            }
            catch (IOException e) {
                throw Throwables.propagate(e);
            }

            int start = fieldData.getStart();
            int length = fieldData.getLength();

            parseDecimalColumn(column, bytes, start, length);
        }
    }
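
    // A short decimal fits in a long, so its unscaled value goes into longs[];
    // longer decimals are stored in slices[] as an encoded unscaled value.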
    private void parseDecimalColumn(int column, byte[] bytes, int start, int length)
    {
        boolean wasNull;
        if (length == 0 || (length == "\\N".length() && bytes[start] == '\\' && bytes[start + 1] == 'N')) {
            wasNull = true;
        }
        else {
            DecimalType columnType = (DecimalType) types[column];
            BigDecimal decimal = parseHiveDecimal(bytes, start, length, columnType);

            if (columnType.isShort()) {
                longs[column] = decimal.unscaledValue().longValue();
            }
            else {
                slices[column] = Decimals.encodeUnscaledValue(decimal.unscaledValue());
            }
            wasNull = false;
        }
        nulls[column] = wasNull;
    }

    @Override
    public Object getObject(int fieldId)
    {
        checkState(!closed, "Cursor is closed");

        validateType(fieldId, Block.class);
        if (!loaded[fieldId]) {
            parseObjectColumn(fieldId);
        }
        return objects[fieldId];
    }

    private void parseObjectColumn(int column)
    {
        loaded[column] = true;

        if (hiveColumnIndexes[column] >= value.size()) {
            // this partition may contain fewer fields than what's declared in the schema
            // this happens when additional columns are added to the hive table after a partition has been created
            nulls[column] = true;
        }
        else {
            BytesRefWritable fieldData = value.unCheckedGet(hiveColumnIndexes[column]);

            byte[] bytes;
            try {
                bytes = fieldData.getData();
            }
            catch (IOException e) {
                throw Throwables.propagate(e);
            }

            int start = fieldData.getStart();
            int length = fieldData.getLength();

            parseObjectColumn(column, bytes, start, length);
        }
    }

    private void parseObjectColumn(int column, byte[] bytes, int start, int length)
    {
        boolean wasNull;
        if (length == "\\N".length() && bytes[start] == '\\' && bytes[start + 1] == 'N') {
            wasNull = true;
        }
        else {
            LazyObject<? extends ObjectInspector> lazyObject = LazyFactory.createLazyObject(fieldInspectors[column]);
            ByteArrayRef byteArrayRef = new ByteArrayRef();
            byteArrayRef.setData(bytes);
            lazyObject.init(byteArrayRef, start, length);
            objects[column] = getBlockObject(types[column], lazyObject.getObject(), fieldInspectors[column]);
            wasNull = false;
        }
        nulls[column] = wasNull;
    }

    @Override
    public boolean isNull(int fieldId)
    {
        checkState(!closed, "Cursor is closed");

        if (!loaded[fieldId]) {
            parseColumn(fieldId);
        }
        return nulls[fieldId];
    }

    private void parseColumn(int column)
    {
        Type type = types[column];
        if (type.equals(BOOLEAN)) {
            parseBooleanColumn(column);
        }
        else if (type.equals(BIGINT)) {
            parseLongColumn(column);
        }
        else if (type.equals(INTEGER)) {
            parseLongColumn(column);
        }
        else if (type.equals(SMALLINT)) {
            parseLongColumn(column);
        }
        else if (type.equals(TINYINT)) {
            parseLongColumn(column);
        }
        else if (type.equals(REAL)) {
            parseLongColumn(column);
        }
        else if (type.equals(DOUBLE)) {
            parseDoubleColumn(column);
        }
        else if (isVarcharType(type) || VARBINARY.equals(type) || isCharType(type)) {
            parseStringColumn(column);
        }
        else if (isStructuralType(hiveTypes[column])) {
            parseObjectColumn(column);
        }
        else if (type.equals(DATE)) {
            parseLongColumn(column);
        }
        else if (type.equals(TIMESTAMP)) {
            parseLongColumn(column);
        }
        else if (type instanceof DecimalType) {
            parseDecimalColumn(column);
        }
        else {
            throw new UnsupportedOperationException("Unsupported column type: " + type);
        }
    }

    private void validateType(int fieldId, Class<?> type)
    {
        if (!types[fieldId].getJavaType().equals(type)) {
            // we don't use Preconditions.checkArgument because it requires boxing fieldId, which affects inner loop performance
            throw new IllegalArgumentException(String.format("Expected field to be %s, actual %s (field %s)", type, types[fieldId], fieldId));
        }
    }

    @Override
    public void close()
    {
        // some hive input formats are broken and bad things can happen if you close them multiple times
        if (closed) {
            return;
        }
        closed = true;

        updateCompletedBytes();

        try {
            recordReader.close();
        }
        catch (IOException e) {
            throw Throwables.propagate(e);
        }
    }
}