/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hive.accumulo.mr; import java.io.IOException; import java.lang.reflect.InvocationTargetException; import java.lang.reflect.Method; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import org.apache.accumulo.core.client.AccumuloException; import org.apache.accumulo.core.client.AccumuloSecurityException; import org.apache.accumulo.core.client.Connector; import org.apache.accumulo.core.client.Instance; import org.apache.accumulo.core.client.IteratorSetting; import org.apache.accumulo.core.client.mapred.AccumuloInputFormat; import org.apache.accumulo.core.client.mapred.AccumuloRowInputFormat; import org.apache.accumulo.core.client.mapred.RangeInputSplit; import org.apache.accumulo.core.client.mapreduce.lib.impl.ConfiguratorBase; import org.apache.accumulo.core.client.mock.MockInstance; import org.apache.accumulo.core.client.security.tokens.AuthenticationToken; import org.apache.accumulo.core.client.security.tokens.PasswordToken; import org.apache.accumulo.core.data.Key; import org.apache.accumulo.core.data.Range; import org.apache.accumulo.core.data.Value; import org.apache.accumulo.core.security.Authorizations; import org.apache.accumulo.core.util.Pair; import org.apache.accumulo.core.util.PeekingIterator; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.accumulo.AccumuloConnectionParameters; import org.apache.hadoop.hive.accumulo.AccumuloHiveRow; import org.apache.hadoop.hive.accumulo.HiveAccumuloHelper; import org.apache.hadoop.hive.accumulo.columns.ColumnMapper; import org.apache.hadoop.hive.accumulo.columns.ColumnMapping; import org.apache.hadoop.hive.accumulo.columns.HiveAccumuloColumnMapping; import org.apache.hadoop.hive.accumulo.columns.HiveAccumuloMapColumnMapping; import org.apache.hadoop.hive.accumulo.predicate.AccumuloPredicateHandler; import org.apache.hadoop.hive.accumulo.serde.AccumuloSerDeParameters; import org.apache.hadoop.hive.accumulo.serde.TooManyAccumuloColumnsException; import org.apache.hadoop.hive.serde.serdeConstants; import org.apache.hadoop.hive.serde2.ColumnProjectionUtils; import org.apache.hadoop.hive.serde2.SerDeException; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; import org.apache.hadoop.hive.shims.ShimLoader; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapred.InputSplit; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.RecordReader; import org.apache.hadoop.mapred.Reporter; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.JobContext; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.security.UserGroupInformation; import org.apache.hadoop.security.token.Token; import org.apache.hadoop.security.token.TokenIdentifier; import org.apache.hadoop.util.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * Wraps older InputFormat for use with Hive. * * Configure input scan with proper ranges, iterators, and columns based on serde properties for * Hive table. */ public class HiveAccumuloTableInputFormat implements org.apache.hadoop.mapred.InputFormat<Text,AccumuloHiveRow> { private static final Logger log = LoggerFactory.getLogger(HiveAccumuloTableInputFormat.class); // Visible for testing protected AccumuloRowInputFormat accumuloInputFormat = new AccumuloRowInputFormat(); protected AccumuloPredicateHandler predicateHandler = AccumuloPredicateHandler.getInstance(); protected HiveAccumuloHelper helper = new HiveAccumuloHelper(); @Override public InputSplit[] getSplits(JobConf jobConf, int numSplits) throws IOException { final AccumuloConnectionParameters accumuloParams = new AccumuloConnectionParameters(jobConf); final Instance instance = accumuloParams.getInstance(); final ColumnMapper columnMapper; try { columnMapper = getColumnMapper(jobConf); } catch (TooManyAccumuloColumnsException e) { throw new IOException(e); } JobContext context = ShimLoader.getHadoopShims().newJobContext(Job.getInstance(jobConf)); Path[] tablePaths = FileInputFormat.getInputPaths(context); try { UserGroupInformation ugi = UserGroupInformation.getCurrentUser(); final Connector connector; // Need to get a Connector so we look up the user's authorizations if not otherwise specified if (accumuloParams.useSasl() && !ugi.hasKerberosCredentials()) { // In a YARN/Tez job, don't have the Kerberos credentials anymore, use the delegation token AuthenticationToken token = ConfiguratorBase.getAuthenticationToken( AccumuloInputFormat.class, jobConf); // Convert the stub from the configuration back into a normal Token // More reflection to support 1.6 token = helper.unwrapAuthenticationToken(jobConf, token); connector = instance.getConnector(accumuloParams.getAccumuloUserName(), token); } else { // Still in the local JVM, use the username+password or Kerberos credentials connector = accumuloParams.getConnector(instance); } final List<ColumnMapping> columnMappings = columnMapper.getColumnMappings(); final List<IteratorSetting> iterators = predicateHandler.getIterators(jobConf, columnMapper); final Collection<Range> ranges = predicateHandler.getRanges(jobConf, columnMapper); // Setting an empty collection of ranges will, unexpectedly, scan all data // We don't want that. if (null != ranges && ranges.isEmpty()) { return new InputSplit[0]; } // Set the relevant information in the Configuration for the AccumuloInputFormat configure(jobConf, instance, connector, accumuloParams, columnMapper, iterators, ranges); int numColumns = columnMappings.size(); List<Integer> readColIds = ColumnProjectionUtils.getReadColumnIDs(jobConf); // Sanity check if (numColumns < readColIds.size()) throw new IOException("Number of column mappings (" + numColumns + ")" + " numbers less than the hive table columns. (" + readColIds.size() + ")"); // get splits from Accumulo InputSplit[] splits = accumuloInputFormat.getSplits(jobConf, numSplits); HiveAccumuloSplit[] hiveSplits = new HiveAccumuloSplit[splits.length]; for (int i = 0; i < splits.length; i++) { RangeInputSplit ris = (RangeInputSplit) splits[i]; hiveSplits[i] = new HiveAccumuloSplit(ris, tablePaths[0]); } return hiveSplits; } catch (AccumuloException e) { log.error("Could not configure AccumuloInputFormat", e); throw new IOException(StringUtils.stringifyException(e)); } catch (AccumuloSecurityException e) { log.error("Could not configure AccumuloInputFormat", e); throw new IOException(StringUtils.stringifyException(e)); } catch (SerDeException e) { log.error("Could not configure AccumuloInputFormat", e); throw new IOException(StringUtils.stringifyException(e)); } } /** * Setup accumulo input format from conf properties. Delegates to final RecordReader from mapred * package. * * @param inputSplit * @param jobConf * @param reporter * @return RecordReader * @throws IOException */ @Override public RecordReader<Text,AccumuloHiveRow> getRecordReader(InputSplit inputSplit, final JobConf jobConf, final Reporter reporter) throws IOException { final ColumnMapper columnMapper; try { columnMapper = getColumnMapper(jobConf); } catch (TooManyAccumuloColumnsException e) { throw new IOException(e); } try { final List<IteratorSetting> iterators = predicateHandler.getIterators(jobConf, columnMapper); HiveAccumuloSplit hiveSplit = (HiveAccumuloSplit) inputSplit; RangeInputSplit rangeSplit = hiveSplit.getSplit(); log.info("Split: " + rangeSplit); // The RangeInputSplit *should* have all of the necesary information contained in it // which alleviates us from re-parsing our configuration from the AccumuloStorageHandler // and re-setting it into the Configuration (like we did in getSplits(...)). Thus, it should // be unnecessary to re-invoke configure(...) // ACCUMULO-2962 Iterators weren't getting serialized into the InputSplit, but we can // compensate because we still have that info. // Should be fixed in Accumulo 1.5.2 and 1.6.1 if (null == rangeSplit.getIterators() || (rangeSplit.getIterators().isEmpty() && !iterators.isEmpty())) { log.debug("Re-setting iterators on InputSplit due to Accumulo bug."); rangeSplit.setIterators(iterators); } // ACCUMULO-3015 Like the above, RangeInputSplit should have the table name // but we want it to, so just re-set it if it's null. if (null == getTableName(rangeSplit)) { final AccumuloConnectionParameters accumuloParams = new AccumuloConnectionParameters( jobConf); log.debug("Re-setting table name on InputSplit due to Accumulo bug."); setTableName(rangeSplit, accumuloParams.getAccumuloTableName()); } final RecordReader<Text,PeekingIterator<Map.Entry<Key,Value>>> recordReader = accumuloInputFormat .getRecordReader(rangeSplit, jobConf, reporter); return new HiveAccumuloRecordReader(recordReader, iterators.size()); } catch (SerDeException e) { throw new IOException(StringUtils.stringifyException(e)); } } protected ColumnMapper getColumnMapper(Configuration conf) throws IOException, TooManyAccumuloColumnsException { final String defaultStorageType = conf.get(AccumuloSerDeParameters.DEFAULT_STORAGE_TYPE); String[] columnNamesArr = conf.getStrings(serdeConstants.LIST_COLUMNS); if (null == columnNamesArr) { throw new IOException( "Hive column names must be provided to InputFormat in the Configuration"); } List<String> columnNames = Arrays.asList(columnNamesArr); String serializedTypes = conf.get(serdeConstants.LIST_COLUMN_TYPES); if (null == serializedTypes) { throw new IOException( "Hive column types must be provided to InputFormat in the Configuration"); } ArrayList<TypeInfo> columnTypes = TypeInfoUtils.getTypeInfosFromTypeString(serializedTypes); return new ColumnMapper(conf.get(AccumuloSerDeParameters.COLUMN_MAPPINGS), defaultStorageType, columnNames, columnTypes); } /** * Configure the underlying AccumuloInputFormat * * @param conf * Job configuration * @param instance * Accumulo instance * @param connector * Accumulo connector * @param accumuloParams * Connection information to the Accumulo instance * @param columnMapper * Configuration of Hive to Accumulo columns * @param iterators * Any iterators to be configured server-side * @param ranges * Accumulo ranges on for the query * @throws AccumuloSecurityException * @throws AccumuloException * @throws SerDeException */ protected void configure(JobConf conf, Instance instance, Connector connector, AccumuloConnectionParameters accumuloParams, ColumnMapper columnMapper, List<IteratorSetting> iterators, Collection<Range> ranges) throws AccumuloSecurityException, AccumuloException, SerDeException, IOException { // Handle implementation of Instance and invoke appropriate InputFormat method if (instance instanceof MockInstance) { setMockInstance(conf, instance.getInstanceName()); } else { setZooKeeperInstance(conf, instance.getInstanceName(), instance.getZooKeepers(), accumuloParams.useSasl()); } // Set the username/passwd for the Accumulo connection if (accumuloParams.useSasl()) { UserGroupInformation ugi = UserGroupInformation.getCurrentUser(); // If we have Kerberos credentials, we should obtain the delegation token if (ugi.hasKerberosCredentials()) { Connector conn = accumuloParams.getConnector(); AuthenticationToken token = helper.getDelegationToken(conn); // Send the DelegationToken down to the Configuration for Accumulo to use setConnectorInfo(conf, accumuloParams.getAccumuloUserName(), token); // Convert the Accumulo token in a Hadoop token Token<? extends TokenIdentifier> accumuloToken = helper.getHadoopToken(token); log.info("Adding Hadoop Token for Accumulo to Job's Credentials"); // Add the Hadoop token to the JobConf helper.mergeTokenIntoJobConf(conf, accumuloToken); if (!ugi.addToken(accumuloToken)) { throw new IOException("Failed to add Accumulo Token to UGI"); } } try { helper.addTokenFromUserToJobConf(ugi, conf); } catch (IOException e) { throw new IOException("Current user did not contain necessary delegation Tokens " + ugi, e); } } else { setConnectorInfo(conf, accumuloParams.getAccumuloUserName(), new PasswordToken(accumuloParams.getAccumuloPassword())); } // Read from the given Accumulo table setInputTableName(conf, accumuloParams.getAccumuloTableName()); // Check Configuration for any user-provided Authorization definition Authorizations auths = AccumuloSerDeParameters.getAuthorizationsFromConf(conf); if (null == auths) { // Default to all of user's authorizations when no configuration is provided auths = connector.securityOperations().getUserAuthorizations( accumuloParams.getAccumuloUserName()); } // Implicitly handles users providing invalid authorizations setScanAuthorizations(conf, auths); // restrict with any filters found from WHERE predicates. addIterators(conf, iterators); // restrict with any ranges found from WHERE predicates. // not setting ranges scans the entire table if (null != ranges) { log.info("Setting ranges: " + ranges); setRanges(conf, ranges); } // Restrict the set of columns that we want to read from the Accumulo table HashSet<Pair<Text,Text>> pairs = getPairCollection(columnMapper.getColumnMappings()); if (null != pairs && !pairs.isEmpty()) { fetchColumns(conf, pairs); } } // Wrap the static AccumuloInputFormat methods with methods that we can // verify were correctly called via Mockito protected void setMockInstance(JobConf conf, String instanceName) { try { AccumuloInputFormat.setMockInstance(conf, instanceName); } catch (IllegalStateException e) { // AccumuloInputFormat complains if you re-set an already set value. We just don't care. log.debug("Ignoring exception setting mock instance of " + instanceName, e); } } @SuppressWarnings("deprecation") protected void setZooKeeperInstance(JobConf conf, String instanceName, String zkHosts, boolean isSasl) throws IOException { // To support builds against 1.5, we can't use the new 1.6 setZooKeeperInstance which // takes a ClientConfiguration class that only exists in 1.6 try { if (isSasl) { // Reflection to support Accumulo 1.5. Remove when Accumulo 1.5 support is dropped // 1.6 works with the deprecated 1.5 method, but must use reflection for 1.7-only SASL support helper.setZooKeeperInstance(conf, AccumuloInputFormat.class, zkHosts, instanceName, isSasl); } else { AccumuloInputFormat.setZooKeeperInstance(conf, instanceName, zkHosts); } } catch (IllegalStateException ise) { // AccumuloInputFormat complains if you re-set an already set value. We just don't care. log.debug("Ignoring exception setting ZooKeeper instance of " + instanceName + " at " + zkHosts, ise); } } protected void setConnectorInfo(JobConf conf, String user, AuthenticationToken token) throws AccumuloSecurityException { try { AccumuloInputFormat.setConnectorInfo(conf, user, token); } catch (IllegalStateException e) { // AccumuloInputFormat complains if you re-set an already set value. We just don't care. log.debug("Ignoring exception setting Accumulo Connector instance for user " + user, e); } } protected void setInputTableName(JobConf conf, String tableName) { AccumuloInputFormat.setInputTableName(conf, tableName); } protected void setScanAuthorizations(JobConf conf, Authorizations auths) { AccumuloInputFormat.setScanAuthorizations(conf, auths); } protected void addIterators(JobConf conf, List<IteratorSetting> iterators) { for (IteratorSetting is : iterators) { AccumuloInputFormat.addIterator(conf, is); } } protected void setRanges(JobConf conf, Collection<Range> ranges) { AccumuloInputFormat.setRanges(conf, ranges); } protected void fetchColumns(JobConf conf, Set<Pair<Text,Text>> cfCqPairs) { AccumuloInputFormat.fetchColumns(conf, cfCqPairs); } /** * Create col fam/qual pairs from pipe separated values, usually from config object. Ignores * rowID. * * @param columnMappings * The list of ColumnMappings for the given query * @return a Set of Pairs of colfams and colquals */ protected HashSet<Pair<Text,Text>> getPairCollection(List<ColumnMapping> columnMappings) { final HashSet<Pair<Text,Text>> pairs = new HashSet<Pair<Text,Text>>(); for (ColumnMapping columnMapping : columnMappings) { if (columnMapping instanceof HiveAccumuloColumnMapping) { HiveAccumuloColumnMapping accumuloColumnMapping = (HiveAccumuloColumnMapping) columnMapping; Text cf = new Text(accumuloColumnMapping.getColumnFamily()); Text cq = null; // A null cq implies an empty column qualifier if (null != accumuloColumnMapping.getColumnQualifier()) { cq = new Text(accumuloColumnMapping.getColumnQualifier()); } pairs.add(new Pair<Text,Text>(cf, cq)); } else if (columnMapping instanceof HiveAccumuloMapColumnMapping) { HiveAccumuloMapColumnMapping mapMapping = (HiveAccumuloMapColumnMapping) columnMapping; // Can't fetch prefix on colqual, must pull the entire qualifier // TODO use an iterator to do the filter, server-side. pairs.add(new Pair<Text,Text>(new Text(mapMapping.getColumnFamily()), null)); } } log.info("Computed columns to fetch (" + pairs + ") from " + columnMappings); return pairs; } /** * Reflection to work around Accumulo 1.5 and 1.6 incompatibilities. Throws an {@link IOException} * for any reflection related exceptions * * @param split * A RangeInputSplit * @return The name of the table from the split * @throws IOException */ protected String getTableName(RangeInputSplit split) throws IOException { // ACCUMULO-3017 shenanigans with method names changing without deprecation Method getTableName = null; try { getTableName = RangeInputSplit.class.getMethod("getTableName"); } catch (SecurityException e) { log.debug("Could not get getTableName method from RangeInputSplit", e); } catch (NoSuchMethodException e) { log.debug("Could not get getTableName method from RangeInputSplit", e); } if (null != getTableName) { try { return (String) getTableName.invoke(split); } catch (IllegalArgumentException e) { log.debug("Could not invoke getTableName method from RangeInputSplit", e); } catch (IllegalAccessException e) { log.debug("Could not invoke getTableName method from RangeInputSplit", e); } catch (InvocationTargetException e) { log.debug("Could not invoke getTableName method from RangeInputSplit", e); } } Method getTable; try { getTable = RangeInputSplit.class.getMethod("getTable"); } catch (SecurityException e) { throw new IOException("Could not get table name from RangeInputSplit", e); } catch (NoSuchMethodException e) { throw new IOException("Could not get table name from RangeInputSplit", e); } try { return (String) getTable.invoke(split); } catch (IllegalArgumentException e) { throw new IOException("Could not get table name from RangeInputSplit", e); } catch (IllegalAccessException e) { throw new IOException("Could not get table name from RangeInputSplit", e); } catch (InvocationTargetException e) { throw new IOException("Could not get table name from RangeInputSplit", e); } } /** * Sets the table name on a RangeInputSplit, accounting for change in method name. Any reflection * related exception is wrapped in an {@link IOException} * * @param split * The RangeInputSplit to operate on * @param tableName * The name of the table to set * @throws IOException */ protected void setTableName(RangeInputSplit split, String tableName) throws IOException { // ACCUMULO-3017 shenanigans with method names changing without deprecation Method setTableName = null; try { setTableName = RangeInputSplit.class.getMethod("setTableName", String.class); } catch (SecurityException e) { log.debug("Could not get getTableName method from RangeInputSplit", e); } catch (NoSuchMethodException e) { log.debug("Could not get getTableName method from RangeInputSplit", e); } if (null != setTableName) { try { setTableName.invoke(split, tableName); return; } catch (IllegalArgumentException e) { log.debug("Could not invoke getTableName method from RangeInputSplit", e); } catch (IllegalAccessException e) { log.debug("Could not invoke getTableName method from RangeInputSplit", e); } catch (InvocationTargetException e) { log.debug("Could not invoke getTableName method from RangeInputSplit", e); } } Method setTable; try { setTable = RangeInputSplit.class.getMethod("setTable", String.class); } catch (SecurityException e) { throw new IOException("Could not set table name from RangeInputSplit", e); } catch (NoSuchMethodException e) { throw new IOException("Could not set table name from RangeInputSplit", e); } try { setTable.invoke(split, tableName); } catch (IllegalArgumentException e) { throw new IOException("Could not set table name from RangeInputSplit", e); } catch (IllegalAccessException e) { throw new IOException("Could not set table name from RangeInputSplit", e); } catch (InvocationTargetException e) { throw new IOException("Could not set table name from RangeInputSplit", e); } } }