/**
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.io.parquet.serde;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import com.google.common.base.Preconditions;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.hive.ql.optimizer.FieldNode;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.AbstractSerDe;
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.SerDeSpec;
import org.apache.hadoop.hive.serde2.SerDeStats;
import org.apache.hadoop.hive.serde2.SerDeUtils;
import org.apache.hadoop.hive.serde2.io.ParquetHiveRecord;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
import org.apache.hadoop.io.ArrayWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.parquet.hadoop.ParquetOutputFormat;
/**
* A Parquet SerDe for Hive, built on the deprecated mapred package.
*/
@SerDeSpec(schemaProps = {serdeConstants.LIST_COLUMNS, serdeConstants.LIST_COLUMN_TYPES,
ParquetOutputFormat.COMPRESSION})
public class ParquetHiveSerDe extends AbstractSerDe {
public static final Text MAP_KEY = new Text("key");
public static final Text MAP_VALUE = new Text("value");
public static final Text MAP = new Text("map");
public static final Text ARRAY = new Text("bag");
public static final Text LIST = new Text("list");
// Maps decimal precision to the number of bytes needed for binary conversion.
public static final int[] PRECISION_TO_BYTE_COUNT = new int[38];
static {
for (int prec = 1; prec <= 38; prec++) {
// Estimated number of bytes needed.
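// e.g. precision 10: ceil((log2(10^10 - 1) + 1) / 8) = ceil(4.28) = 5 bytes; the +1 is the sign bit.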
PRECISION_TO_BYTE_COUNT[prec - 1] = (int)
Math.ceil((Math.log(Math.pow(10, prec) - 1) / Math.log(2) + 1) / 8);
}
}
private SerDeStats stats;
private ObjectInspector objInspector;
private enum LAST_OPERATION {
SERIALIZE,
DESERIALIZE,
UNKNOWN
}
private LAST_OPERATION status;
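// Both sizes are field counts of the last row processed (reported as raw data size), not byte counts.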
private long serializedSize;
private long deserializedSize;
private ParquetHiveRecord parquetRow;
public ParquetHiveSerDe() {
parquetRow = new ParquetHiveRecord();
stats = new SerDeStats();
}
@Override
public final void initialize(final Configuration conf, final Properties tbl) throws SerDeException {
final List<String> columnNames;
final List<TypeInfo> columnTypes;
// Get column names and sort order
final String columnNameProperty = tbl.getProperty(serdeConstants.LIST_COLUMNS);
final String columnTypeProperty = tbl.getProperty(serdeConstants.LIST_COLUMN_TYPES);
final String columnNameDelimiter = tbl.containsKey(serdeConstants.COLUMN_NAME_DELIMITER) ? tbl
.getProperty(serdeConstants.COLUMN_NAME_DELIMITER) : String.valueOf(SerDeUtils.COMMA);
if (columnNameProperty == null || columnNameProperty.isEmpty()) {
columnNames = new ArrayList<String>();
} else {
columnNames = Arrays.asList(columnNameProperty.split(columnNameDelimiter));
}
if (columnTypeProperty == null || columnTypeProperty.isEmpty()) {
columnTypes = new ArrayList<TypeInfo>();
} else {
columnTypes = TypeInfoUtils.getTypeInfosFromTypeString(columnTypeProperty);
}
if (columnNames.size() != columnTypes.size()) {
throw new IllegalArgumentException("ParquetHiveSerde initialization failed. The number of " +
"column names and column types differs. columnNames = " + columnNames + ", columnTypes = " +
columnTypes);
}
// Create row related objects
StructTypeInfo completeTypeInfo =
(StructTypeInfo) TypeInfoFactory.getStructTypeInfo(columnNames, columnTypes);
StructTypeInfo prunedTypeInfo = null;
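// If the query selected only some nested fields, their paths (e.g. "s.a.b") are passed in
// through the configuration; build a pruned schema covering just those fields.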
if (conf != null) {
String rawPrunedColumnPaths = conf.get(ColumnProjectionUtils.READ_NESTED_COLUMN_PATH_CONF_STR);
if (rawPrunedColumnPaths != null) {
List<String> prunedColumnPaths = processRawPrunedPaths(rawPrunedColumnPaths);
prunedTypeInfo = pruneFromPaths(completeTypeInfo, prunedColumnPaths);
}
}
this.objInspector = new ArrayWritableObjectInspector(completeTypeInfo, prunedTypeInfo);
// Stats part
serializedSize = 0;
deserializedSize = 0;
status = LAST_OPERATION.UNKNOWN;
}
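/**
* Deserialization is a pass-through: the Parquet record reader is expected to deliver each row
* already materialized as an {@link ArrayWritable}, so only the row's field count is recorded
* for stats; any other Writable deserializes to null.
*/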
@Override
public Object deserialize(final Writable blob) throws SerDeException {
status = LAST_OPERATION.DESERIALIZE;
deserializedSize = 0;
if (blob instanceof ArrayWritable) {
deserializedSize = ((ArrayWritable) blob).get().length;
return blob;
} else {
return null;
}
}
@Override
public ObjectInspector getObjectInspector() throws SerDeException {
return objInspector;
}
@Override
public Class<? extends Writable> getSerializedClass() {
return ParquetHiveRecord.class;
}
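/**
* Serialization wraps rather than copies: the row object and its inspector are stashed in a
* single reused {@link ParquetHiveRecord} for the Parquet writer to unpack downstream.
*/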
@Override
public Writable serialize(final Object obj, final ObjectInspector objInspector)
throws SerDeException {
if (!objInspector.getCategory().equals(Category.STRUCT)) {
throw new SerDeException("Cannot serialize " + objInspector.getCategory() + ". Can only serialize a struct");
}
serializedSize = ((StructObjectInspector)objInspector).getAllStructFieldRefs().size();
status = LAST_OPERATION.SERIALIZE;
parquetRow.value = obj;
parquetRow.inspector = (StructObjectInspector) objInspector;
return parquetRow;
}
@Override
public SerDeStats getSerDeStats() {
// status must have been set by a preceding serialize() or deserialize() call.
assert (status != LAST_OPERATION.UNKNOWN);
if (status == LAST_OPERATION.SERIALIZE) {
stats.setRawDataSize(serializedSize);
} else {
stats.setRawDataSize(deserializedSize);
}
return stats;
}
/**
* @param table the table to check; may be null
* @return true if the table has the Parquet serde defined
*/
public static boolean isParquetTable(Table table) {
return table != null && ParquetHiveSerDe.class.getName().equals(table.getSerializationLib());
}
/**
* Given a list of raw pruned paths separated by ',', return a list of merged pruned paths.
* For instance, if 'prunedPaths' is "s.a,s,s", this returns ["s"].
*/
private static List<String> processRawPrunedPaths(String prunedPaths) {
List<FieldNode> fieldNodes = new ArrayList<>();
for (String p : prunedPaths.split(",")) {
fieldNodes = FieldNode.mergeFieldNodes(fieldNodes, FieldNode.fromPath(p));
}
List<String> prunedPathList = new ArrayList<>();
for (FieldNode fn : fieldNodes) {
prunedPathList.addAll(fn.toPaths());
}
return prunedPathList;
}
/**
* Given a complete struct type info and pruned paths containing selected fields
* from the type info, return a pruned struct type info only with the selected fields.
*
* For instance, if 'originalTypeInfo' is: s:struct<a:struct<b:int, c:boolean>, d:string>
* and 'prunedPaths' is ["s.a.b", "s.d"], then the result will be:
* s:struct<a:struct<b:int>, d:string>
*
* @param originalTypeInfo the complete struct type info
* @param prunedPaths the list of pruned paths; each path uses '.' to separate nested field names
* @return the pruned struct type info
*/
private static StructTypeInfo pruneFromPaths(
StructTypeInfo originalTypeInfo, List<String> prunedPaths) {
PrunedStructTypeInfo prunedTypeInfo = new PrunedStructTypeInfo(originalTypeInfo);
for (String path : prunedPaths) {
pruneFromSinglePath(prunedTypeInfo, path);
}
return prunedTypeInfo.prune();
}
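/**
* Marks the leading component of 'path' as selected in 'prunedInfo', then recurses into the
* matching child mirror with the remainder of the path (components are separated by '.').
*/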
private static void pruneFromSinglePath(PrunedStructTypeInfo prunedInfo, String path) {
Preconditions.checkArgument(prunedInfo != null,
"PrunedStructTypeInfo for path '%s' should not be null", path);
int index = path.indexOf('.');
if (index < 0) {
index = path.length();
}
String fieldName = path.substring(0, index);
prunedInfo.markSelected(fieldName);
if (index < path.length()) {
pruneFromSinglePath(prunedInfo.getChild(fieldName), path.substring(index + 1));
}
}
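/**
* A mutable mirror of a {@link StructTypeInfo} tree that records which fields are selected.
* Struct-typed fields get child mirrors so selection can recurse; {@link #prune()} turns the
* recorded selection back into a pruned StructTypeInfo.
*/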
private static class PrunedStructTypeInfo {
final StructTypeInfo typeInfo;
final Map<String, PrunedStructTypeInfo> children;
final boolean[] selected;
PrunedStructTypeInfo(StructTypeInfo typeInfo) {
this.typeInfo = typeInfo;
this.children = new HashMap<>();
this.selected = new boolean[typeInfo.getAllStructFieldTypeInfos().size()];
for (int i = 0; i < typeInfo.getAllStructFieldTypeInfos().size(); ++i) {
TypeInfo ti = typeInfo.getAllStructFieldTypeInfos().get(i);
if (ti.getCategory() == Category.STRUCT) {
this.children.put(typeInfo.getAllStructFieldNames().get(i).toLowerCase(),
new PrunedStructTypeInfo((StructTypeInfo) ti));
}
}
}
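/** Returns the mirror for the given field if it is struct-typed, or null otherwise. */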
PrunedStructTypeInfo getChild(String fieldName) {
return children.get(fieldName.toLowerCase());
}
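/** Marks the first field whose name matches the given name (case-insensitively) as selected. */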
void markSelected(String fieldName) {
for (int i = 0; i < typeInfo.getAllStructFieldNames().size(); ++i) {
if (typeInfo.getAllStructFieldNames().get(i).equalsIgnoreCase(fieldName)) {
selected[i] = true;
break;
}
}
}
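/** Rebuilds the struct type with only the selected fields, pruning struct-typed children recursively. */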
StructTypeInfo prune() {
List<String> newNames = new ArrayList<>();
List<TypeInfo> newTypes = new ArrayList<>();
List<String> oldNames = typeInfo.getAllStructFieldNames();
List<TypeInfo> oldTypes = typeInfo.getAllStructFieldTypeInfos();
for (int i = 0; i < oldNames.size(); ++i) {
String fn = oldNames.get(i);
if (selected[i]) {
newNames.add(fn);
if (children.containsKey(fn.toLowerCase())) {
newTypes.add(children.get(fn.toLowerCase()).prune());
} else {
newTypes.add(oldTypes.get(i));
}
}
}
return (StructTypeInfo) TypeInfoFactory.getStructTypeInfo(newNames, newTypes);
}
}
}