/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hive.hcatalog.streaming;
import org.apache.hadoop.security.UserGroupInformation;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.JavaUtils;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.IMetaStoreClient;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.MetaException;
import org.apache.hadoop.hive.metastore.api.NoSuchObjectException;
import org.apache.hadoop.hive.metastore.api.Table;
import org.apache.hadoop.hive.ql.io.AcidOutputFormat;
import org.apache.hadoop.hive.ql.io.RecordUpdater;
import org.apache.hadoop.hive.serde2.AbstractSerDe;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.hive.hcatalog.common.HCatUtil;
import org.apache.thrift.TException;
import java.io.IOException;
import java.security.PrivilegedExceptionAction;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Properties;
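/**
 * Base class for the {@link RecordWriter} implementations used by Hive Streaming
 * (for example {@link DelimitedInputWriter}). It resolves the target table and
 * partition location from the metastore, verifies that the table is bucketed, and
 * lazily creates one {@link RecordUpdater} per bucket for each transaction batch.
 *
 * A minimal usage sketch; the metastore URI, column names, and delimiter below are
 * illustrative placeholders, and {@link HiveEndPoint} documents the full API:
 * <pre>
 * HiveEndPoint endPoint = new HiveEndPoint(metaStoreUri, "db", "table", null);
 * StreamingConnection conn = endPoint.newConnection(true);
 * RecordWriter writer = new DelimitedInputWriter(new String[]{"id", "msg"}, ",", endPoint);
 * TransactionBatch batch = conn.fetchTransactionBatch(10, writer);
 * batch.beginNextTransaction();
 * batch.write("1,hello".getBytes());
 * batch.commit();
 * batch.close();
 * conn.close();
 * </pre>
 */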
public abstract class AbstractRecordWriter implements RecordWriter {
private static final Logger LOG = LoggerFactory.getLogger(AbstractRecordWriter.class.getName());
final HiveConf conf;
final HiveEndPoint endPoint;
final Table tbl;
final IMetaStoreClient msClient;
protected final List<Integer> bucketIds;
List<RecordUpdater> updaters = null;
public final int totalBuckets;
private final Path partitionPath;
final AcidOutputFormat<?,?> outf;
private Object[] bucketFieldData; // Pre-allocated in constructor. Updated on each write.
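// Transaction id range of the currently open batch; set in newBatch() and
// stamped onto each RecordUpdater created for that batch.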
private Long curBatchMinTxnId;
private Long curBatchMaxTxnId;
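/**
 * Holder for the table metadata and resolved partition path, so both can be
 * fetched in a single {@code doAs} call when impersonating a user.
 */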
private static final class TableWriterPair {
private final Table tbl;
private final Path partitionPath;
TableWriterPair(Table t, Path p) {
tbl = t;
partitionPath = p;
}
}
/**
* @deprecated As of release 1.3/2.1. Replaced by {@link #AbstractRecordWriter(HiveEndPoint, HiveConf, StreamingConnection)}
*/
protected AbstractRecordWriter(HiveEndPoint endPoint, HiveConf conf)
throws ConnectionError, StreamingException {
this(endPoint, conf, null);
}
protected AbstractRecordWriter(final HiveEndPoint endPoint, HiveConf conf, StreamingConnection conn)
throws StreamingException {
this.endPoint = endPoint;
this.conf = conf != null ? conf
: HiveEndPoint.createHiveConf(DelimitedInputWriter.class, endPoint.metaStoreUri);
try {
msClient = HCatUtil.getHiveMetastoreClient(this.conf);
UserGroupInformation ugi = conn != null ? conn.getUserGroupInformation() : null;
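// When the connection supplies a UGI (secure/impersonation setups), fetch the
// table metadata and partition path as that user; otherwise use the current user.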
if (ugi == null) {
this.tbl = msClient.getTable(endPoint.database, endPoint.table);
this.partitionPath = getPathForEndPoint(msClient, endPoint);
} else {
TableWriterPair twp = ugi.doAs(
new PrivilegedExceptionAction<TableWriterPair>() {
@Override
public TableWriterPair run() throws Exception {
return new TableWriterPair(msClient.getTable(endPoint.database, endPoint.table),
getPathForEndPoint(msClient, endPoint));
}
});
this.tbl = twp.tbl;
this.partitionPath = twp.partitionPath;
}
this.totalBuckets = tbl.getSd().getNumBuckets();
if (totalBuckets <= 0) {
throw new StreamingException("Cannot stream to table that has not been bucketed : "
+ endPoint);
}
this.bucketIds = getBucketColIDs(tbl.getSd().getBucketCols(), tbl.getSd().getCols());
this.bucketFieldData = new Object[bucketIds.size()];
String outFormatName = this.tbl.getSd().getOutputFormat();
outf = (AcidOutputFormat<?, ?>) ReflectionUtils.newInstance(JavaUtils.loadClass(outFormatName), conf);
} catch (InterruptedException e) {
throw new StreamingException(endPoint.toString(), e);
} catch (MetaException | NoSuchObjectException e) {
throw new ConnectionError(endPoint, e);
} catch (TException | ClassNotFoundException | IOException e) {
throw new StreamingException(e.getMessage(), e);
}
}
/**
 * Used to tag error messages to provide some breadcrumbs.
 */
String getWatermark() {
return partitionPath + " txnIds[" + curBatchMinTxnId + "," + curBatchMaxTxnId + "]";
}
// Returns the indexes of the bucketing columns within the table's column list.
private List<Integer> getBucketColIDs(List<String> bucketCols, List<FieldSchema> cols) {
ArrayList<Integer> result = new ArrayList<Integer>(bucketCols.size());
HashSet<String> bucketSet = new HashSet<String>(bucketCols);
for (int i = 0; i < cols.size(); i++) {
if (bucketSet.contains(cols.get(i).getName())) {
result.add(i);
}
}
return result;
}
/**
* Get the SerDe for the Objects created by {@link #encode}. This is public so that test
* frameworks can use it.
* @return serde
* @throws SerializationError
*/
public abstract AbstractSerDe getSerde() throws SerializationError;
/**
* Encode a record as an Object that Hive can read with the ObjectInspector associated with the
* serde returned by {@link #getSerde}. This is public so that test frameworks can use it.
* @param record record to be deserialized
* @return deserialized record as an Object
* @throws SerializationError
*/
public abstract Object encode(byte[] record) throws SerializationError;
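/**
 * @return ObjectInspectors for the bucketing columns, in the same order as
 * {@link #getBucketStructFields()}
 */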
protected abstract ObjectInspector[] getBucketObjectInspectors();
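/**
 * @return the ObjectInspector for the full record produced by {@link #encode}
 */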
protected abstract StructObjectInspector getRecordObjectInspector();
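/**
 * @return StructFields of the bucketing columns, used to extract the bucket
 * values from each record
 */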
protected abstract StructField[] getBucketStructFields();
// Returns the bucket number to which the record belongs.
protected int getBucket(Object row) throws SerializationError {
ObjectInspector[] inspectors = getBucketObjectInspectors();
Object[] bucketFields = getBucketFields(row);
return ObjectInspectorUtils.getBucketNumber(bucketFields, inspectors, totalBuckets);
}
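/**
 * Flushes every RecordUpdater that has been opened for the current batch.
 * @throws StreamingIOFailure if any updater fails to flush
 */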
@Override
public void flush() throws StreamingIOFailure {
try {
for (RecordUpdater updater : updaters) {
if (updater != null) {
updater.flush();
}
}
} catch (IOException e) {
throw new StreamingIOFailure("Unable to flush recordUpdater", e);
}
}
@Override
public void clear() throws StreamingIOFailure {
}
/**
 * Starts a new transaction batch. RecordUpdaters are created lazily, per bucket,
 * on first write.
 * @param minTxnId smallest transaction id in the batch
 * @param maxTxnId largest transaction id in the batch
 * @throws StreamingIOFailure if the batch cannot be set up
*/
@Override
public void newBatch(Long minTxnId, Long maxTxnId)
throws StreamingIOFailure, SerializationError {
curBatchMinTxnId = minTxnId;
curBatchMaxTxnId = maxTxnId;
updaters = new ArrayList<RecordUpdater>(totalBuckets);
for (int bucket = 0; bucket < totalBuckets; bucket++) {
updaters.add(bucket, null);
}
}
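/**
 * Closes all RecordUpdaters opened for the batch. Close errors are logged and
 * aggregated so that every updater gets a chance to close its files.
 * @throws StreamingIOFailure if one or more updaters failed to close
 */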
@Override
public void closeBatch() throws StreamingIOFailure {
boolean haveError = false;
for (RecordUpdater updater : updaters) {
if (updater != null) {
try {
//try not to leave any files open
updater.close(false);
} catch (Exception ex) {
haveError = true;
LOG.error("Unable to close " + updater + " due to: " + ex.getMessage(), ex);
}
}
}
updaters.clear();
if (haveError) {
throw new StreamingIOFailure("Encountered errors while closing (see logs) " + getWatermark());
}
}
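// Resolves the field ObjectInspectors for the given bucket column indexes from
// the record's StructObjectInspector.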
protected static ObjectInspector[] getObjectInspectorsForBucketedCols(List<Integer> bucketIds,
StructObjectInspector recordObjInspector)
throws SerializationError {
ObjectInspector[] result = new ObjectInspector[bucketIds.size()];
for (int i = 0; i < bucketIds.size(); i++) {
int bucketId = bucketIds.get(i);
result[i] =
recordObjInspector.getAllStructFieldRefs().get(bucketId).getFieldObjectInspector();
}
return result;
}
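// Extracts the bucketing column values from the row into the pre-allocated
// bucketFieldData array (reused across writes to avoid per-record allocation).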
private Object[] getBucketFields(Object row) throws SerializationError {
StructObjectInspector recordObjInspector = getRecordObjectInspector();
StructField[] bucketStructFields = getBucketStructFields();
for (int i = 0; i < bucketIds.size(); i++) {
bucketFieldData[i] = recordObjInspector.getStructFieldData(row, bucketStructFields[i]);
}
return bucketFieldData;
}
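// Creates a RecordUpdater that writes records for this batch's transaction id
// range into the given bucket under the partition directory.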
private RecordUpdater createRecordUpdater(int bucketId, Long minTxnId, Long maxTxnId)
throws IOException, SerializationError {
try {
// Initialize table properties from the table parameters. This is required because
// the table may define parameters that affect how data is written, such as
// 'transactional_properties'.
Properties tblProperties = new Properties();
tblProperties.putAll(tbl.getParameters());
return outf.getRecordUpdater(partitionPath,
new AcidOutputFormat.Options(conf)
.inspector(getSerde().getObjectInspector())
.bucket(bucketId)
.tableProperties(tblProperties)
.minimumTransactionId(minTxnId)
.maximumTransactionId(maxTxnId)
.statementId(-1)
.finalDestination(partitionPath));
} catch (SerDeException e) {
throw new SerializationError("Failed to get object inspector from Serde "
+ getSerde().getClass().getName(), e);
}
}
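// Lazily creates and caches one RecordUpdater per bucket for the current batch.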
RecordUpdater getRecordUpdater(int bucketId) throws StreamingIOFailure, SerializationError {
RecordUpdater recordUpdater = updaters.get(bucketId);
if (recordUpdater == null) {
try {
recordUpdater = createRecordUpdater(bucketId, curBatchMinTxnId, curBatchMaxTxnId);
} catch (IOException e) {
String errMsg = "Failed creating RecordUpdater for " + getWatermark();
LOG.error(errMsg, e);
throw new StreamingIOFailure(errMsg, e);
}
updaters.set(bucketId, recordUpdater);
}
return recordUpdater;
}
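// Resolves the filesystem location of the endpoint: the partition directory if
// partition values are given, otherwise the table directory.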
private Path getPathForEndPoint(IMetaStoreClient msClient, HiveEndPoint endPoint)
throws StreamingException {
try {
String location;
if (endPoint.partitionVals == null || endPoint.partitionVals.isEmpty()) {
location = msClient.getTable(endPoint.database, endPoint.table)
.getSd().getLocation();
} else {
location = msClient.getPartition(endPoint.database, endPoint.table,
endPoint.partitionVals).getSd().getLocation();
}
return new Path(location);
} catch (TException e) {
throw new StreamingException(e.getMessage()
+ ". Unable to get path for end point: "
+ endPoint.partitionVals, e);
}
}
}