/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.accumulo.mr;
import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.accumulo.core.client.AccumuloException;
import org.apache.accumulo.core.client.AccumuloSecurityException;
import org.apache.accumulo.core.client.Connector;
import org.apache.accumulo.core.client.Instance;
import org.apache.accumulo.core.client.IteratorSetting;
import org.apache.accumulo.core.client.mapred.AccumuloInputFormat;
import org.apache.accumulo.core.client.mapred.AccumuloRowInputFormat;
import org.apache.accumulo.core.client.mapred.RangeInputSplit;
import org.apache.accumulo.core.client.mapreduce.lib.impl.ConfiguratorBase;
import org.apache.accumulo.core.client.mock.MockInstance;
import org.apache.accumulo.core.client.security.tokens.AuthenticationToken;
import org.apache.accumulo.core.client.security.tokens.PasswordToken;
import org.apache.accumulo.core.data.Key;
import org.apache.accumulo.core.data.Range;
import org.apache.accumulo.core.data.Value;
import org.apache.accumulo.core.security.Authorizations;
import org.apache.accumulo.core.util.Pair;
import org.apache.accumulo.core.util.PeekingIterator;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.accumulo.AccumuloConnectionParameters;
import org.apache.hadoop.hive.accumulo.AccumuloHiveRow;
import org.apache.hadoop.hive.accumulo.HiveAccumuloHelper;
import org.apache.hadoop.hive.accumulo.columns.ColumnMapper;
import org.apache.hadoop.hive.accumulo.columns.ColumnMapping;
import org.apache.hadoop.hive.accumulo.columns.HiveAccumuloColumnMapping;
import org.apache.hadoop.hive.accumulo.columns.HiveAccumuloMapColumnMapping;
import org.apache.hadoop.hive.accumulo.predicate.AccumuloPredicateHandler;
import org.apache.hadoop.hive.accumulo.serde.AccumuloSerDeParameters;
import org.apache.hadoop.hive.accumulo.serde.TooManyAccumuloColumnsException;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
import org.apache.hadoop.hive.shims.ShimLoader;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.security.token.Token;
import org.apache.hadoop.security.token.TokenIdentifier;
import org.apache.hadoop.util.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Wraps the older mapred {@link AccumuloRowInputFormat} for use with Hive.
 *
 * Configures the input scan with the proper ranges, iterators, and columns, based on the serde
 * properties for the Hive table.
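 *
 * <p>
 * A minimal sketch of driving this InputFormat directly (the property values are hypothetical;
 * Hive normally populates them, along with the connection properties and input paths, from the
 * table definition):
 * </p>
 *
 * <pre>
 * JobConf conf = new JobConf();
 * conf.set(serdeConstants.LIST_COLUMNS, "rowid,name");
 * conf.set(serdeConstants.LIST_COLUMN_TYPES, "string,string");
 * conf.set(AccumuloSerDeParameters.COLUMN_MAPPINGS, ":rowid,person:name");
 * // ... Accumulo instance, user, password and table properties, plus an input path ...
 * InputSplit[] splits = new HiveAccumuloTableInputFormat().getSplits(conf, 1);
 * </pre>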
*/
public class HiveAccumuloTableInputFormat implements
org.apache.hadoop.mapred.InputFormat<Text,AccumuloHiveRow> {
private static final Logger log = LoggerFactory.getLogger(HiveAccumuloTableInputFormat.class);
// Visible for testing
protected AccumuloRowInputFormat accumuloInputFormat = new AccumuloRowInputFormat();
protected AccumuloPredicateHandler predicateHandler = AccumuloPredicateHandler.getInstance();
protected HiveAccumuloHelper helper = new HiveAccumuloHelper();
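  /**
   * Computes the splits for the job: configures the wrapped {@link AccumuloRowInputFormat} from
   * the serde properties and connection parameters, delegates to it, and wraps each resulting
   * {@link RangeInputSplit} in a {@link HiveAccumuloSplit}. Returns no splits when predicate
   * pushdown yields an empty set of ranges (which would otherwise scan the entire table).
   */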
@Override
public InputSplit[] getSplits(JobConf jobConf, int numSplits) throws IOException {
final AccumuloConnectionParameters accumuloParams = new AccumuloConnectionParameters(jobConf);
final Instance instance = accumuloParams.getInstance();
final ColumnMapper columnMapper;
try {
columnMapper = getColumnMapper(jobConf);
} catch (TooManyAccumuloColumnsException e) {
throw new IOException(e);
}
JobContext context = ShimLoader.getHadoopShims().newJobContext(Job.getInstance(jobConf));
Path[] tablePaths = FileInputFormat.getInputPaths(context);
try {
UserGroupInformation ugi = UserGroupInformation.getCurrentUser();
final Connector connector;
      // Need a Connector so we can look up the user's authorizations if they are not otherwise specified
if (accumuloParams.useSasl() && !ugi.hasKerberosCredentials()) {
        // In a YARN/Tez task, we no longer have the Kerberos credentials, so use the delegation token
AuthenticationToken token = ConfiguratorBase.getAuthenticationToken(
AccumuloInputFormat.class, jobConf);
// Convert the stub from the configuration back into a normal Token
// More reflection to support 1.6
token = helper.unwrapAuthenticationToken(jobConf, token);
connector = instance.getConnector(accumuloParams.getAccumuloUserName(), token);
} else {
// Still in the local JVM, use the username+password or Kerberos credentials
connector = accumuloParams.getConnector(instance);
}
final List<ColumnMapping> columnMappings = columnMapper.getColumnMappings();
final List<IteratorSetting> iterators = predicateHandler.getIterators(jobConf, columnMapper);
final Collection<Range> ranges = predicateHandler.getRanges(jobConf, columnMapper);
// Setting an empty collection of ranges will, unexpectedly, scan all data
// We don't want that.
if (null != ranges && ranges.isEmpty()) {
return new InputSplit[0];
}
// Set the relevant information in the Configuration for the AccumuloInputFormat
configure(jobConf, instance, connector, accumuloParams, columnMapper, iterators, ranges);
int numColumns = columnMappings.size();
List<Integer> readColIds = ColumnProjectionUtils.getReadColumnIDs(jobConf);
// Sanity check
      if (numColumns < readColIds.size()) {
        throw new IOException("Number of column mappings (" + numColumns
            + ") is less than the number of Hive table columns requested ("
            + readColIds.size() + ")");
      }
// get splits from Accumulo
InputSplit[] splits = accumuloInputFormat.getSplits(jobConf, numSplits);
HiveAccumuloSplit[] hiveSplits = new HiveAccumuloSplit[splits.length];
for (int i = 0; i < splits.length; i++) {
RangeInputSplit ris = (RangeInputSplit) splits[i];
hiveSplits[i] = new HiveAccumuloSplit(ris, tablePaths[0]);
}
return hiveSplits;
} catch (AccumuloException e) {
log.error("Could not configure AccumuloInputFormat", e);
throw new IOException(StringUtils.stringifyException(e));
} catch (AccumuloSecurityException e) {
log.error("Could not configure AccumuloInputFormat", e);
throw new IOException(StringUtils.stringifyException(e));
} catch (SerDeException e) {
log.error("Could not configure AccumuloInputFormat", e);
throw new IOException(StringUtils.stringifyException(e));
}
}
  /**
   * Sets up the Accumulo input format from conf properties and delegates to the final
   * RecordReader from the mapred package.
   *
   * @param inputSplit
   *          The split to read, expected to be a {@link HiveAccumuloSplit}
   * @param jobConf
   *          Job configuration
   * @param reporter
   *          Progress reporter
   * @return A RecordReader over the Accumulo rows in the split
   * @throws IOException
   */
@Override
public RecordReader<Text,AccumuloHiveRow> getRecordReader(InputSplit inputSplit,
final JobConf jobConf, final Reporter reporter) throws IOException {
final ColumnMapper columnMapper;
try {
columnMapper = getColumnMapper(jobConf);
} catch (TooManyAccumuloColumnsException e) {
throw new IOException(e);
}
try {
final List<IteratorSetting> iterators = predicateHandler.getIterators(jobConf, columnMapper);
HiveAccumuloSplit hiveSplit = (HiveAccumuloSplit) inputSplit;
RangeInputSplit rangeSplit = hiveSplit.getSplit();
log.info("Split: " + rangeSplit);
      // The RangeInputSplit *should* have all of the necessary information contained in it,
      // which alleviates us from re-parsing our configuration from the AccumuloStorageHandler
      // and re-setting it into the Configuration (like we did in getSplits(...)). Thus, it should
      // be unnecessary to re-invoke configure(...).
      // ACCUMULO-2962: Iterators weren't getting serialized into the InputSplit, but we can
      // compensate because we still have that info. Should be fixed in Accumulo 1.5.2 and 1.6.1.
if (null == rangeSplit.getIterators()
|| (rangeSplit.getIterators().isEmpty() && !iterators.isEmpty())) {
log.debug("Re-setting iterators on InputSplit due to Accumulo bug.");
rangeSplit.setIterators(iterators);
}
      // ACCUMULO-3015: Like the above, the RangeInputSplit should have the table name in it,
      // but might not. We need it, so just re-set it if it's null.
if (null == getTableName(rangeSplit)) {
final AccumuloConnectionParameters accumuloParams = new AccumuloConnectionParameters(
jobConf);
log.debug("Re-setting table name on InputSplit due to Accumulo bug.");
setTableName(rangeSplit, accumuloParams.getAccumuloTableName());
}
final RecordReader<Text,PeekingIterator<Map.Entry<Key,Value>>> recordReader = accumuloInputFormat
.getRecordReader(rangeSplit, jobConf, reporter);
return new HiveAccumuloRecordReader(recordReader, iterators.size());
} catch (SerDeException e) {
throw new IOException(StringUtils.stringifyException(e));
}
}
protected ColumnMapper getColumnMapper(Configuration conf) throws IOException,
TooManyAccumuloColumnsException {
final String defaultStorageType = conf.get(AccumuloSerDeParameters.DEFAULT_STORAGE_TYPE);
String[] columnNamesArr = conf.getStrings(serdeConstants.LIST_COLUMNS);
if (null == columnNamesArr) {
throw new IOException(
"Hive column names must be provided to InputFormat in the Configuration");
}
List<String> columnNames = Arrays.asList(columnNamesArr);
String serializedTypes = conf.get(serdeConstants.LIST_COLUMN_TYPES);
if (null == serializedTypes) {
throw new IOException(
"Hive column types must be provided to InputFormat in the Configuration");
}
ArrayList<TypeInfo> columnTypes = TypeInfoUtils.getTypeInfosFromTypeString(serializedTypes);
return new ColumnMapper(conf.get(AccumuloSerDeParameters.COLUMN_MAPPINGS), defaultStorageType,
columnNames, columnTypes);
}
/**
   * Configures the underlying AccumuloInputFormat.
*
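   * <p>
   * Roughly equivalent to the following sequence of static calls on the wrapped
   * {@link AccumuloInputFormat} (a sketch of the non-Kerberos path):
   * </p>
   *
   * <pre>
   * AccumuloInputFormat.setZooKeeperInstance(conf, instanceName, zkHosts);
   * AccumuloInputFormat.setConnectorInfo(conf, user, new PasswordToken(password));
   * AccumuloInputFormat.setInputTableName(conf, tableName);
   * AccumuloInputFormat.setScanAuthorizations(conf, auths);
   * // plus addIterator(...), setRanges(...) and fetchColumns(...) from the query predicates
   * </pre>
   *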
* @param conf
* Job configuration
* @param instance
* Accumulo instance
* @param connector
* Accumulo connector
* @param accumuloParams
* Connection information to the Accumulo instance
* @param columnMapper
* Configuration of Hive to Accumulo columns
* @param iterators
* Any iterators to be configured server-side
* @param ranges
   *          Accumulo ranges for the query
* @throws AccumuloSecurityException
* @throws AccumuloException
* @throws SerDeException
*/
protected void configure(JobConf conf, Instance instance, Connector connector,
AccumuloConnectionParameters accumuloParams, ColumnMapper columnMapper,
List<IteratorSetting> iterators, Collection<Range> ranges) throws AccumuloSecurityException,
AccumuloException, SerDeException, IOException {
// Handle implementation of Instance and invoke appropriate InputFormat method
if (instance instanceof MockInstance) {
setMockInstance(conf, instance.getInstanceName());
} else {
setZooKeeperInstance(conf, instance.getInstanceName(), instance.getZooKeepers(),
accumuloParams.useSasl());
}
// Set the username/passwd for the Accumulo connection
if (accumuloParams.useSasl()) {
UserGroupInformation ugi = UserGroupInformation.getCurrentUser();
// If we have Kerberos credentials, we should obtain the delegation token
if (ugi.hasKerberosCredentials()) {
Connector conn = accumuloParams.getConnector();
AuthenticationToken token = helper.getDelegationToken(conn);
// Send the DelegationToken down to the Configuration for Accumulo to use
setConnectorInfo(conf, accumuloParams.getAccumuloUserName(), token);
        // Convert the Accumulo token into a Hadoop token
Token<? extends TokenIdentifier> accumuloToken = helper.getHadoopToken(token);
log.info("Adding Hadoop Token for Accumulo to Job's Credentials");
// Add the Hadoop token to the JobConf
helper.mergeTokenIntoJobConf(conf, accumuloToken);
if (!ugi.addToken(accumuloToken)) {
throw new IOException("Failed to add Accumulo Token to UGI");
}
}
try {
helper.addTokenFromUserToJobConf(ugi, conf);
} catch (IOException e) {
throw new IOException("Current user did not contain necessary delegation Tokens " + ugi, e);
}
} else {
setConnectorInfo(conf, accumuloParams.getAccumuloUserName(),
new PasswordToken(accumuloParams.getAccumuloPassword()));
}
// Read from the given Accumulo table
setInputTableName(conf, accumuloParams.getAccumuloTableName());
// Check Configuration for any user-provided Authorization definition
Authorizations auths = AccumuloSerDeParameters.getAuthorizationsFromConf(conf);
if (null == auths) {
// Default to all of user's authorizations when no configuration is provided
auths = connector.securityOperations().getUserAuthorizations(
accumuloParams.getAccumuloUserName());
}
// Implicitly handles users providing invalid authorizations
setScanAuthorizations(conf, auths);
// restrict with any filters found from WHERE predicates.
addIterators(conf, iterators);
// restrict with any ranges found from WHERE predicates.
// not setting ranges scans the entire table
if (null != ranges) {
log.info("Setting ranges: " + ranges);
setRanges(conf, ranges);
}
// Restrict the set of columns that we want to read from the Accumulo table
HashSet<Pair<Text,Text>> pairs = getPairCollection(columnMapper.getColumnMappings());
if (null != pairs && !pairs.isEmpty()) {
fetchColumns(conf, pairs);
}
}
// Wrap the static AccumuloInputFormat methods with methods that we can
// verify were correctly called via Mockito
protected void setMockInstance(JobConf conf, String instanceName) {
try {
AccumuloInputFormat.setMockInstance(conf, instanceName);
} catch (IllegalStateException e) {
// AccumuloInputFormat complains if you re-set an already set value. We just don't care.
log.debug("Ignoring exception setting mock instance of " + instanceName, e);
}
}
@SuppressWarnings("deprecation")
protected void setZooKeeperInstance(JobConf conf, String instanceName, String zkHosts,
boolean isSasl) throws IOException {
// To support builds against 1.5, we can't use the new 1.6 setZooKeeperInstance which
// takes a ClientConfiguration class that only exists in 1.6
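    // For reference, the newer ClientConfiguration-based variant looks roughly like the
    // following (a sketch; ClientConfiguration is 1.6+, withSasl(...) is 1.7+):
    //   AccumuloInputFormat.setZooKeeperInstance(conf, ClientConfiguration.loadDefault()
    //       .withInstance(instanceName).withZkHosts(zkHosts).withSasl(isSasl));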
try {
if (isSasl) {
        // Reflection to support Accumulo 1.5. Remove when Accumulo 1.5 support is dropped.
        // 1.6 works with the deprecated 1.5 method, but we must use reflection for the SASL
        // support that only exists in 1.7.
helper.setZooKeeperInstance(conf, AccumuloInputFormat.class, zkHosts, instanceName, isSasl);
} else {
AccumuloInputFormat.setZooKeeperInstance(conf, instanceName, zkHosts);
}
} catch (IllegalStateException ise) {
// AccumuloInputFormat complains if you re-set an already set value. We just don't care.
log.debug("Ignoring exception setting ZooKeeper instance of " + instanceName + " at "
+ zkHosts, ise);
}
}
protected void setConnectorInfo(JobConf conf, String user, AuthenticationToken token)
throws AccumuloSecurityException {
try {
AccumuloInputFormat.setConnectorInfo(conf, user, token);
} catch (IllegalStateException e) {
// AccumuloInputFormat complains if you re-set an already set value. We just don't care.
log.debug("Ignoring exception setting Accumulo Connector instance for user " + user, e);
}
}
protected void setInputTableName(JobConf conf, String tableName) {
AccumuloInputFormat.setInputTableName(conf, tableName);
}
protected void setScanAuthorizations(JobConf conf, Authorizations auths) {
AccumuloInputFormat.setScanAuthorizations(conf, auths);
}
protected void addIterators(JobConf conf, List<IteratorSetting> iterators) {
for (IteratorSetting is : iterators) {
AccumuloInputFormat.addIterator(conf, is);
}
}
protected void setRanges(JobConf conf, Collection<Range> ranges) {
AccumuloInputFormat.setRanges(conf, ranges);
}
protected void fetchColumns(JobConf conf, Set<Pair<Text,Text>> cfCqPairs) {
AccumuloInputFormat.fetchColumns(conf, cfCqPairs);
}
/**
   * Creates column family/qualifier pairs from the column mappings, usually derived from the
   * configuration. Ignores the rowID mapping.
*
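   * <p>
   * For example, a hypothetical mapping of {@code :rowid,person:name,person:*} yields the pairs
   * {@code (person, name)} for the plain column and {@code (person, null)} for the map-typed
   * column; the null qualifier causes the entire column family to be fetched.
   * </p>
   *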
* @param columnMappings
* The list of ColumnMappings for the given query
* @return a Set of Pairs of colfams and colquals
*/
protected HashSet<Pair<Text,Text>> getPairCollection(List<ColumnMapping> columnMappings) {
final HashSet<Pair<Text,Text>> pairs = new HashSet<Pair<Text,Text>>();
for (ColumnMapping columnMapping : columnMappings) {
if (columnMapping instanceof HiveAccumuloColumnMapping) {
HiveAccumuloColumnMapping accumuloColumnMapping = (HiveAccumuloColumnMapping) columnMapping;
Text cf = new Text(accumuloColumnMapping.getColumnFamily());
Text cq = null;
        // A null column qualifier in the mapping leaves cq null; Accumulo then fetches
        // the entire column family
if (null != accumuloColumnMapping.getColumnQualifier()) {
cq = new Text(accumuloColumnMapping.getColumnQualifier());
}
pairs.add(new Pair<Text,Text>(cf, cq));
} else if (columnMapping instanceof HiveAccumuloMapColumnMapping) {
HiveAccumuloMapColumnMapping mapMapping = (HiveAccumuloMapColumnMapping) columnMapping;
// Can't fetch prefix on colqual, must pull the entire qualifier
// TODO use an iterator to do the filter, server-side.
pairs.add(new Pair<Text,Text>(new Text(mapMapping.getColumnFamily()), null));
}
}
log.info("Computed columns to fetch (" + pairs + ") from " + columnMappings);
return pairs;
}
/**
   * Reflection to work around Accumulo 1.5 and 1.6 incompatibilities. Throws an
   * {@link IOException} for any reflection-related exceptions.
*
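   * <p>
   * Tries the {@code getTableName()} method first, falling back to {@code getTable()} when it is
   * not available.
   * </p>
   *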
* @param split
* A RangeInputSplit
* @return The name of the table from the split
* @throws IOException
*/
protected String getTableName(RangeInputSplit split) throws IOException {
// ACCUMULO-3017 shenanigans with method names changing without deprecation
Method getTableName = null;
try {
getTableName = RangeInputSplit.class.getMethod("getTableName");
} catch (SecurityException e) {
log.debug("Could not get getTableName method from RangeInputSplit", e);
} catch (NoSuchMethodException e) {
log.debug("Could not get getTableName method from RangeInputSplit", e);
}
if (null != getTableName) {
try {
return (String) getTableName.invoke(split);
} catch (IllegalArgumentException e) {
log.debug("Could not invoke getTableName method from RangeInputSplit", e);
} catch (IllegalAccessException e) {
log.debug("Could not invoke getTableName method from RangeInputSplit", e);
} catch (InvocationTargetException e) {
log.debug("Could not invoke getTableName method from RangeInputSplit", e);
}
}
Method getTable;
try {
getTable = RangeInputSplit.class.getMethod("getTable");
} catch (SecurityException e) {
throw new IOException("Could not get table name from RangeInputSplit", e);
} catch (NoSuchMethodException e) {
throw new IOException("Could not get table name from RangeInputSplit", e);
}
try {
return (String) getTable.invoke(split);
} catch (IllegalArgumentException e) {
throw new IOException("Could not get table name from RangeInputSplit", e);
} catch (IllegalAccessException e) {
throw new IOException("Could not get table name from RangeInputSplit", e);
} catch (InvocationTargetException e) {
throw new IOException("Could not get table name from RangeInputSplit", e);
}
}
/**
   * Sets the table name on a RangeInputSplit, accounting for a change in method name. Any
   * reflection-related exception is wrapped in an {@link IOException}.
*
* @param split
* The RangeInputSplit to operate on
* @param tableName
* The name of the table to set
* @throws IOException
*/
protected void setTableName(RangeInputSplit split, String tableName) throws IOException {
// ACCUMULO-3017 shenanigans with method names changing without deprecation
Method setTableName = null;
try {
setTableName = RangeInputSplit.class.getMethod("setTableName", String.class);
} catch (SecurityException e) {
log.debug("Could not get getTableName method from RangeInputSplit", e);
} catch (NoSuchMethodException e) {
log.debug("Could not get getTableName method from RangeInputSplit", e);
}
if (null != setTableName) {
try {
setTableName.invoke(split, tableName);
return;
} catch (IllegalArgumentException e) {
log.debug("Could not invoke getTableName method from RangeInputSplit", e);
} catch (IllegalAccessException e) {
log.debug("Could not invoke getTableName method from RangeInputSplit", e);
} catch (InvocationTargetException e) {
log.debug("Could not invoke getTableName method from RangeInputSplit", e);
}
}
Method setTable;
try {
setTable = RangeInputSplit.class.getMethod("setTable", String.class);
} catch (SecurityException e) {
throw new IOException("Could not set table name from RangeInputSplit", e);
} catch (NoSuchMethodException e) {
throw new IOException("Could not set table name from RangeInputSplit", e);
}
try {
setTable.invoke(split, tableName);
} catch (IllegalArgumentException e) {
throw new IOException("Could not set table name from RangeInputSplit", e);
} catch (IllegalAccessException e) {
throw new IOException("Could not set table name from RangeInputSplit", e);
} catch (InvocationTargetException e) {
throw new IOException("Could not set table name from RangeInputSplit", e);
}
}
}