/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hive.hcatalog.streaming;
import com.google.common.annotations.VisibleForTesting;
import org.apache.hadoop.security.UserGroupInformation;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.MetaStoreUtils;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.Table;
import org.apache.hadoop.hive.serde2.AbstractSerDe;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.SerDeUtils;
import org.apache.hadoop.hive.serde2.lazy.LazySerDeParameters;
import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe;
import org.apache.hadoop.hive.serde2.lazy.objectinspector.LazySimpleStructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.io.BytesWritable;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
/**
 * Streaming Writer handles delimited input (eg. CSV).
 * Delimited input is parsed &amp; reordered to match column order in table.
 * Uses Lazy Simple Serde to process delimited input.
 *
 * <p>Each incoming record is split on the input delimiter, its fields are placed at the
 * position of the table column they were mapped to, and the result is re-joined with the
 * serde separator before being handed to a {@link LazySimpleSerDe}. When the input already
 * uses the serde separator and the fields are already in table-column order, the record
 * bytes are passed through untouched.
 */
public class DelimitedInputWriter extends AbstractRecordWriter {
  private static final Logger LOG = LoggerFactory.getLogger(DelimitedInputWriter.class);

  /** True when records must be split, reordered and re-joined before deserialization. */
  private final boolean reorderingNeeded;
  /** Input field delimiter; handed to {@link String#split(String, int)} as-is. */
  private final String delimiter;
  /** Separator placed between fields of the record that is fed to the serde. */
  private final char serdeSeparator;
  /** For input field i, the index of its table column, or -1 if the field is skipped. */
  private final int[] fieldToColMapping;
  /** Lower-cased names of the table columns, in table order. */
  private final ArrayList<String> tableColumns;
  private final LazySimpleSerDe serde;

  private final LazySimpleStructObjectInspector recordObjInspector;
  private final ObjectInspector[] bucketObjInspectors;
  private final StructField[] bucketStructFields;

  /** Constructor. Uses default separator of the LazySimpleSerde
   * @param colNamesForFields Column name assignment for input fields. nulls or empty
   *                          strings in the array indicates the fields to be skipped
   * @param delimiter input field delimiter
   * @param endPoint Hive endpoint
   * @param conn connection this Writer is to be used with
   * @throws ConnectionError Problem talking to Hive
   * @throws ClassNotFoundException Serde class not found
   * @throws SerializationError Serde initialization/interaction failed
   * @throws StreamingException Problem acquiring file system path for partition
   * @throws InvalidColumn any element in colNamesForFields refers to a non existing column
   */
  public DelimitedInputWriter(String[] colNamesForFields, String delimiter,
                              HiveEndPoint endPoint, StreamingConnection conn)
      throws ClassNotFoundException, ConnectionError, SerializationError,
      InvalidColumn, StreamingException {
    this(colNamesForFields, delimiter, endPoint, null, conn);
  }

  /** Constructor. Uses default separator of the LazySimpleSerde
   * @param colNamesForFields Column name assignment for input fields. nulls or empty
   *                          strings in the array indicates the fields to be skipped
   * @param delimiter input field delimiter
   * @param endPoint Hive endpoint
   * @param conf a Hive conf object. Can be null if not using advanced hive settings.
   * @param conn connection this Writer is to be used with
   * @throws ConnectionError Problem talking to Hive
   * @throws ClassNotFoundException Serde class not found
   * @throws SerializationError Serde initialization/interaction failed
   * @throws StreamingException Problem acquiring file system path for partition
   * @throws InvalidColumn any element in colNamesForFields refers to a non existing column
   */
  public DelimitedInputWriter(String[] colNamesForFields, String delimiter,
                              HiveEndPoint endPoint, HiveConf conf, StreamingConnection conn)
      throws ClassNotFoundException, ConnectionError, SerializationError,
      InvalidColumn, StreamingException {
    this(colNamesForFields, delimiter, endPoint, conf,
        (char) LazySerDeParameters.DefaultSeparators[0], conn);
  }

  /**
   * Constructor. Allows overriding separator of the LazySimpleSerde
   * @param colNamesForFields Column name assignment for input fields. nulls or empty
   *                          strings in the array indicates the fields to be skipped.
   *                          Names are compared exactly against the lower-cased table
   *                          column names
   * @param delimiter input field delimiter
   * @param endPoint Hive endpoint
   * @param conf a Hive conf object. Set to null if not using advanced hive settings.
   * @param serdeSeparator separator used when encoding data that is fed into the
   *                       LazySimpleSerde. Ensure this separator does not occur
   *                       in the field data
   * @param conn connection this Writer is to be used with
   * @throws ConnectionError Problem talking to Hive
   * @throws ClassNotFoundException Serde class not found
   * @throws SerializationError Serde initialization/interaction failed
   * @throws StreamingException Problem acquiring file system path for partition
   * @throws InvalidColumn any element in colNamesForFields refers to a non existing column
   */
  public DelimitedInputWriter(String[] colNamesForFields, String delimiter,
                              HiveEndPoint endPoint, HiveConf conf, char serdeSeparator,
                              StreamingConnection conn)
      throws ClassNotFoundException, ConnectionError, SerializationError,
      InvalidColumn, StreamingException {
    super(endPoint, conf, conn);
    this.tableColumns = getCols(tbl);
    // serdeSeparator and fieldToColMapping must be set before isReorderingNeeded() runs,
    // because that method reads both of them.
    this.serdeSeparator = serdeSeparator;
    this.delimiter = delimiter;
    this.fieldToColMapping = getFieldReordering(colNamesForFields, getTableColumns());
    this.reorderingNeeded = isReorderingNeeded(delimiter, getTableColumns());
    LOG.debug("Field reordering needed = {}, for endpoint {}", this.reorderingNeeded, endPoint);
    this.serde = createSerde(tbl, conf, serdeSeparator);

    // get ObjInspectors for entire record and bucketed cols
    try {
      this.recordObjInspector = (LazySimpleStructObjectInspector) serde.getObjectInspector();
      this.bucketObjInspectors = getObjectInspectorsForBucketedCols(bucketIds, recordObjInspector);
    } catch (SerDeException e) {
      throw new SerializationError("Unable to get ObjectInspector for bucket columns", e);
    }

    // get StructFields for bucketed cols
    bucketStructFields = new StructField[bucketIds.size()];
    List<? extends StructField> allFields = recordObjInspector.getAllStructFieldRefs();
    for (int i = 0; i < bucketIds.size(); i++) {
      bucketStructFields[i] = allFields.get(bucketIds.get(i));
    }
  }

  /**
   * Constructor without a connection; delegates with a null conf and a null connection.
   * @deprecated As of release 1.3/2.1. Replaced by {@link #DelimitedInputWriter(String[], String, HiveEndPoint, StreamingConnection)}
   */
  @Deprecated
  public DelimitedInputWriter(String[] colNamesForFields, String delimiter,
                              HiveEndPoint endPoint)
      throws ClassNotFoundException, ConnectionError, SerializationError,
      InvalidColumn, StreamingException {
    this(colNamesForFields, delimiter, endPoint, null, null);
  }

  /**
   * Constructor without a connection; uses the default LazySimpleSerde separator and
   * delegates with a null connection.
   * @deprecated As of release 1.3/2.1. Replaced by {@link #DelimitedInputWriter(String[], String, HiveEndPoint, HiveConf, StreamingConnection)}
   */
  @Deprecated
  public DelimitedInputWriter(String[] colNamesForFields, String delimiter,
                              HiveEndPoint endPoint, HiveConf conf)
      throws ClassNotFoundException, ConnectionError, SerializationError,
      InvalidColumn, StreamingException {
    this(colNamesForFields, delimiter, endPoint, conf,
        (char) LazySerDeParameters.DefaultSeparators[0], null);
  }

  /**
   * Constructor without a connection; delegates with a null connection.
   * @deprecated As of release 1.3/2.1. Replaced by {@link #DelimitedInputWriter(String[], String, HiveEndPoint, HiveConf, char, StreamingConnection)}
   */
  @Deprecated
  public DelimitedInputWriter(String[] colNamesForFields, String delimiter,
                              HiveEndPoint endPoint, HiveConf conf, char serdeSeparator)
      throws ClassNotFoundException, StreamingException {
    this(colNamesForFields, delimiter, endPoint, conf, serdeSeparator, null);
  }

  /**
   * Decides whether incoming records have to be rewritten before deserialization.
   * Rewriting can be skipped only when the input delimiter is exactly the serde separator,
   * every input field i maps to table column i, and the table has at least as many
   * columns as there are input fields.
   *
   * <p>Reads {@code fieldToColMapping} and the serde separator, so both must already be
   * initialized when this is called.
   *
   * @param delimiter input field delimiter
   * @param tableColumns names of the table columns
   * @return true if records need to be split, reordered and re-joined
   */
  private boolean isReorderingNeeded(String delimiter, ArrayList<String> tableColumns) {
    return !(delimiter.equals(String.valueOf(getSerdeSeparator()))
        && areFieldsInColOrder(fieldToColMapping)
        && tableColumns.size() >= fieldToColMapping.length);
  }

  /**
   * @param fieldToColMapping field-to-column index mapping
   * @return true if every entry equals its own index, i.e. no field is skipped or moved
   */
  private static boolean areFieldsInColOrder(int[] fieldToColMapping) {
    for (int i = 0; i < fieldToColMapping.length; ++i) {
      if (fieldToColMapping[i] != i) {
        return false;
      }
    }
    return true;
  }

  /**
   * Computes, for each input field, the index of the table column it is assigned to.
   *
   * @param colNamesForFields column name for each input field; null, empty or
   *                          whitespace-only entries mark fields to be skipped
   * @param tableColNames names of the table columns; lookups use an exact
   *                      {@link List#indexOf(Object)} match
   * @return array of the same length as {@code colNamesForFields}; entry i is the index
   *         of the matching table column, or -1 if field i is skipped
   * @throws InvalidColumn if a name is not present in {@code tableColNames}, or if the
   *                       number of non-skipped fields exceeds the number of table columns
   */
  @VisibleForTesting
  static int[] getFieldReordering(String[] colNamesForFields, List<String> tableColNames)
      throws InvalidColumn {
    int[] result = new int[colNamesForFields.length];
    int fieldLabelCount = 0;
    for (int i = 0; i < colNamesForFields.length; ++i) {
      result[i] = -1; // skipped unless a column is found below
      String col = colNamesForFields[i];
      if (col == null || col.trim().isEmpty()) {
        continue;
      }
      ++fieldLabelCount;
      int loc = tableColNames.indexOf(col);
      if (loc == -1) {
        // Parenthesized so the 1-based field number is computed arithmetically instead of
        // being appended as the two strings "i" and "1".
        throw new InvalidColumn("Column '" + col + "' not found in table for input field "
            + (i + 1));
      }
      result[i] = loc;
    }
    if (fieldLabelCount > tableColNames.size()) {
      throw new InvalidColumn("Number of field names exceeds the number of columns in table");
    }
    return result;
  }

  /**
   * Reorder fields in record based on the order of columns in the table.
   *
   * <p>Returns {@code record} itself when no reordering is needed. Otherwise the record is
   * decoded, split on the delimiter (trailing empty fields are kept), each mapped field is
   * placed at its table-column position, and the result is joined with the serde separator.
   * Columns that receive no field are left empty; input fields beyond the mapping are dropped.
   *
   * <p>NOTE(review): the delimiter is passed straight to {@link String#split(String, int)}
   * and is therefore interpreted as a regular expression, and both decoding and re-encoding
   * use the platform default charset. Both are kept as-is for compatibility with existing
   * callers.
   *
   * @param record raw delimited record
   * @return record bytes with fields in table-column order
   * @throws UnsupportedEncodingException declared for callers; not raised by the current code
   * @throws ArrayIndexOutOfBoundsException if the record has fewer fields than the position
   *                                        of a mapped (non-skipped) input field
   */
  protected byte[] reorderFields(byte[] record) throws UnsupportedEncodingException {
    if (!reorderingNeeded) {
      return record;
    }
    String[] reorderedFields = new String[getTableColumns().size()];
    String decoded = new String(record);
    String[] fields = decoded.split(delimiter, -1);
    for (int i = 0; i < fieldToColMapping.length; ++i) {
      int newIndex = fieldToColMapping[i];
      if (newIndex != -1) {
        reorderedFields[newIndex] = fields[i];
      }
    }
    return join(reorderedFields, getSerdeSeparator());
  }

  /**
   * Joins items with the given separator; null items contribute an empty field.
   * An empty array yields an empty byte array.
   */
  // TODO: perhaps can be made more efficient by creating a byte[] directly
  private static byte[] join(String[] items, char separator) {
    if (items.length == 0) {
      return "".getBytes();
    }
    StringBuilder buff = new StringBuilder(100);
    for (int i = 0; i < items.length; ++i) {
      if (i > 0) {
        buff.append(separator);
      }
      if (items[i] != null) {
        buff.append(items[i]);
      }
    }
    return buff.toString().getBytes();
  }

  /**
   * @return lower-cased names of the table columns, in table order
   */
  protected ArrayList<String> getTableColumns() {
    return tableColumns;
  }

  /**
   * Reorders the record's fields, deserializes it, determines its bucket and inserts it
   * through that bucket's record updater.
   *
   * @param transactionId transaction the record is written under
   * @param record raw delimited record
   * @throws SerializationError if the record cannot be deserialized
   * @throws StreamingIOFailure if an {@link IOException} occurs while writing
   */
  @Override
  public void write(long transactionId, byte[] record)
      throws SerializationError, StreamingIOFailure {
    try {
      byte[] orderedFields = reorderFields(record);
      Object encodedRow = encode(orderedFields);
      int bucket = getBucket(encodedRow);
      getRecordUpdater(bucket).insert(transactionId, encodedRow);
    } catch (IOException e) {
      throw new StreamingIOFailure("Error writing record in transaction ("
          + transactionId + ")", e);
    }
  }

  /**
   * @return the LazySimpleSerDe created for this writer
   */
  @Override
  public AbstractSerDe getSerde() {
    return serde;
  }

  /**
   * @return object inspector for the whole record, as obtained from the serde
   */
  protected LazySimpleStructObjectInspector getRecordObjectInspector() {
    return recordObjInspector;
  }

  /**
   * @return struct fields of the bucketed columns, in the order of the bucket ids
   */
  @Override
  protected StructField[] getBucketStructFields() {
    return bucketStructFields;
  }

  /**
   * @return object inspectors for the bucketed columns
   */
  protected ObjectInspector[] getBucketObjectInspectors() {
    return bucketObjInspectors;
  }

  /**
   * Deserializes a record whose fields are already separated by the serde separator.
   *
   * @param record record bytes to deserialize
   * @return the object produced by the serde's {@code deserialize}
   * @throws SerializationError if the serde fails to deserialize the record
   */
  @Override
  public Object encode(byte[] record) throws SerializationError {
    try {
      BytesWritable blob = new BytesWritable();
      blob.set(record, 0, record.length);
      return serde.deserialize(blob);
    } catch (SerDeException e) {
      throw new SerializationError("Unable to convert byte[] record into Object", e);
    }
  }

  /**
   * Creates LazySimpleSerde
   * @param tbl table whose metadata supplies the serde properties
   * @param conf Hive conf passed to serde initialization
   * @param serdeSeparator value set as the serde's {@code field.delim} property
   * @return an initialized LazySimpleSerDe
   * @throws SerializationError if serde could not be initialized
   */
  protected static LazySimpleSerDe createSerde(Table tbl, HiveConf conf, char serdeSeparator)
      throws SerializationError {
    try {
      Properties tableProps = MetaStoreUtils.getTableMetadata(tbl);
      tableProps.setProperty("field.delim", String.valueOf(serdeSeparator));
      LazySimpleSerDe serde = new LazySimpleSerDe();
      SerDeUtils.initializeSerDe(serde, conf, tableProps, null);
      return serde;
    } catch (SerDeException e) {
      throw new SerializationError("Error initializing serde", e);
    }
  }

  /**
   * @param table table to read the column list from
   * @return lower-cased names of the columns in the table's storage descriptor, in order
   */
  private static ArrayList<String> getCols(Table table) {
    List<FieldSchema> cols = table.getSd().getCols();
    ArrayList<String> colNames = new ArrayList<String>(cols.size());
    for (FieldSchema col : cols) {
      colNames.add(col.getName().toLowerCase());
    }
    return colNames;
  }

  /**
   * @return separator placed between fields of records fed to the serde
   */
  public char getSerdeSeparator() {
    return serdeSeparator;
  }
}