/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package gobblin.data.management.copy.hive;

import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.net.URISyntaxException;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.Properties;

import javax.annotation.Nonnull;

import lombok.Data;
import lombok.extern.slf4j.Slf4j;

import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.reflect.ConstructorUtils;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.metastore.IMetaStoreClient;
import org.apache.hadoop.hive.metastore.api.Table;

import com.google.common.base.Function;
import com.google.common.base.Optional;
import com.google.common.base.Preconditions;
import com.google.common.base.Predicate;
import com.google.common.collect.AbstractIterator;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.typesafe.config.Config;
import com.typesafe.config.ConfigFactory;

import gobblin.config.client.ConfigClient;
import gobblin.config.client.ConfigClientCache;
import gobblin.config.client.api.ConfigStoreFactoryDoesNotExistsException;
import gobblin.config.client.api.VersionStabilityPolicy;
import gobblin.config.store.api.ConfigStoreCreationException;
import gobblin.configuration.ConfigurationKeys;
import gobblin.data.management.hive.HiveConfigClientUtils;
import gobblin.dataset.IterableDatasetFinder;
import gobblin.hive.HiveMetastoreClientPool;
import gobblin.metrics.event.EventSubmitter;
import gobblin.metrics.event.sla.SlaEventSubmitter;
import gobblin.util.AutoReturnableObject;
import gobblin.util.ConfigUtils;


/**
 * Finds {@link HiveDataset}s. Looks for tables in a database using a {@link WhitelistBlacklist},
 * and creates a {@link HiveDataset} for each one.
 */
@Slf4j
public class HiveDatasetFinder implements IterableDatasetFinder<HiveDataset> {

  public static final String HIVE_DATASET_PREFIX = "hive.dataset";
  public static final String HIVE_METASTORE_URI_KEY = HIVE_DATASET_PREFIX + ".hive.metastore.uri";
  public static final String DB_KEY = HIVE_DATASET_PREFIX + ".database";
  public static final String TABLE_PATTERN_KEY = HIVE_DATASET_PREFIX + ".table.pattern";
  public static final String DEFAULT_TABLE_PATTERN = "*";

  /*
   * By setting the prefix, only config keys with this prefix will be used to build a HiveDataset.
   * By passing scoped configurations, the same config keys can be used in different contexts.
   *
   * E.g.
   * 1. For CopySource, the prefix is gobblin.dataset.copy
   * 2. For Avro to ORC conversion, the prefix is hive.dataset.conversion.avro.orc
   * 3. For retention, the prefix is gobblin.retention.
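   *
   * A hypothetical illustration (the key destination.path below is made up): with
   * hive.dataset.configPrefix set to gobblin.dataset.copy, a scoped config entry such as
   *
   *   gobblin.dataset.copy.destination.path=/data/output
   *
   * is presented to the HiveDataset as destination.path=/data/output, while a retention
   * job scoped to gobblin.retention sees only its own subtree of the same config.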
   */
  public static final String HIVE_DATASET_CONFIG_PREFIX_KEY = "hive.dataset.configPrefix";
  private static final String DEFAULT_HIVE_DATASET_CONFIG_PREFIX = StringUtils.EMPTY;

  public static final String HIVE_DATASET_IS_BLACKLISTED_KEY = "is.blacklisted";
  private static final boolean DEFAULT_HIVE_DATASET_IS_BLACKLISTED_KEY = false;

  /**
   * An optional key. The fully qualified name of a {@link Function} class that returns the relative URI
   * of a dataset in the config store.
   */
  public static final String CONFIG_STORE_DATASET_URI_BUILDER_CLASS =
      "gobblin.config.management.datasetUriBuilderClass";

  // Event names
  private static final String DATASET_FOUND = "DatasetFound";
  private static final String DATASET_ERROR = "DatasetError";
  private static final String FAILURE_CONTEXT = "FailureContext";

  protected final Properties properties;
  protected final HiveMetastoreClientPool clientPool;
  protected final FileSystem fs;
  private final WhitelistBlacklist whitelistBlacklist;
  private final Optional<EventSubmitter> eventSubmitter;

  protected final Optional<String> configStoreUri;
  protected final Function<Table, String> configStoreDatasetUriBuilder;
  protected final String datasetConfigPrefix;
  protected final ConfigClient configClient;
  private final Config jobConfig;

  public HiveDatasetFinder(FileSystem fs, Properties properties) throws IOException {
    this(fs, properties, createClientPool(properties));
  }

  protected HiveDatasetFinder(FileSystem fs, Properties properties, ConfigClient configClient) throws IOException {
    this(fs, properties, createClientPool(properties), null, configClient);
  }

  public HiveDatasetFinder(FileSystem fs, Properties properties, EventSubmitter eventSubmitter) throws IOException {
    this(fs, properties, createClientPool(properties), eventSubmitter);
  }

  protected HiveDatasetFinder(FileSystem fs, Properties properties, HiveMetastoreClientPool clientPool)
      throws IOException {
    this(fs, properties, clientPool, null);
  }

  protected HiveDatasetFinder(FileSystem fs, Properties properties, HiveMetastoreClientPool clientPool,
      EventSubmitter eventSubmitter) throws IOException {
    this(fs, properties, clientPool, eventSubmitter,
        ConfigClientCache.getClient(VersionStabilityPolicy.STRONG_LOCAL_STABILITY));
  }

  @SuppressWarnings("unchecked")
  // @SuppressWarnings justification: CONFIG_STORE_DATASET_URI_BUILDER_CLASS must be of type Function<Table, String>.
  // It is safe to throw a RuntimeException otherwise.
  protected HiveDatasetFinder(FileSystem fs, Properties properties, HiveMetastoreClientPool clientPool,
      EventSubmitter eventSubmitter, ConfigClient configClient) throws IOException {
    this.properties = properties;
    this.clientPool = clientPool;
    this.fs = fs;

    String whitelistKey = HIVE_DATASET_PREFIX + "." + WhitelistBlacklist.WHITELIST;
    Preconditions.checkArgument(properties.containsKey(DB_KEY) || properties.containsKey(whitelistKey),
        String.format("Must specify %s or %s.", DB_KEY, whitelistKey));

    Config config = ConfigFactory.parseProperties(properties);

    if (properties.containsKey(DB_KEY)) {
      this.whitelistBlacklist = new WhitelistBlacklist(this.properties.getProperty(DB_KEY) + "."
          + this.properties.getProperty(TABLE_PATTERN_KEY, DEFAULT_TABLE_PATTERN), "");
    } else {
      this.whitelistBlacklist = new WhitelistBlacklist(config.getConfig(HIVE_DATASET_PREFIX));
    }
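    // Illustrative, with hypothetical values: hive.dataset.database=mydb together with
    // hive.dataset.table.pattern=events_* is equivalent to a whitelist of "mydb.events_*",
    // so only tables of mydb whose names match events_* are accepted.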
    this.eventSubmitter = Optional.fromNullable(eventSubmitter);

    this.configStoreUri =
        StringUtils.isNotBlank(properties.getProperty(ConfigurationKeys.CONFIG_MANAGEMENT_STORE_URI))
            ? Optional.of(properties.getProperty(ConfigurationKeys.CONFIG_MANAGEMENT_STORE_URI))
            : Optional.<String>absent();
    this.datasetConfigPrefix =
        properties.getProperty(HIVE_DATASET_CONFIG_PREFIX_KEY, DEFAULT_HIVE_DATASET_CONFIG_PREFIX);
    this.configClient = configClient;
    try {
      this.configStoreDatasetUriBuilder =
          properties.containsKey(CONFIG_STORE_DATASET_URI_BUILDER_CLASS)
              ? (Function<Table, String>) ConstructorUtils
                  .invokeConstructor(Class.forName(properties.getProperty(CONFIG_STORE_DATASET_URI_BUILDER_CLASS)))
              : DEFAULT_CONFIG_STORE_DATASET_URI_BUILDER;
    } catch (NoSuchMethodException | IllegalAccessException | InvocationTargetException | InstantiationException
        | ClassNotFoundException e) {
      throw new RuntimeException(e);
    }
    this.jobConfig = ConfigUtils.propertiesToConfig(properties);
  }

  protected static HiveMetastoreClientPool createClientPool(Properties properties) throws IOException {
    return HiveMetastoreClientPool.get(properties,
        Optional.fromNullable(properties.getProperty(HIVE_METASTORE_URI_KEY)));
  }

  /**
   * Gets all tables, in all databases accepted by the {@link WhitelistBlacklist}, that match the table pattern.
   */
  public Collection<DbAndTable> getTables() throws IOException {
    List<DbAndTable> tables = Lists.newArrayList();

    try (AutoReturnableObject<IMetaStoreClient> client = this.clientPool.getClient()) {
      Iterable<String> databases = Iterables.filter(client.get().getAllDatabases(), new Predicate<String>() {
        @Override
        public boolean apply(String db) {
          return HiveDatasetFinder.this.whitelistBlacklist.acceptDb(db);
        }
      });
      for (final String db : databases) {
        Iterable<String> tableNames = Iterables.filter(client.get().getAllTables(db), new Predicate<String>() {
          @Override
          public boolean apply(String table) {
            return HiveDatasetFinder.this.whitelistBlacklist.acceptTable(db, table);
          }
        });
        for (String tableName : tableNames) {
          tables.add(new DbAndTable(db, tableName));
        }
      }
    } catch (Exception exc) {
      throw new IOException(exc);
    }
    return tables;
  }

  @Data
  public static class DbAndTable {
    private final String db;
    private final String table;

    @Override
    public String toString() {
      return String.format("%s.%s", this.db, this.table);
    }
  }

  @Override
  public List<HiveDataset> findDatasets() throws IOException {
    return Lists.newArrayList(getDatasetsIterator());
  }

  @Override
  public Iterator<HiveDataset> getDatasetsIterator() throws IOException {
    return new AbstractIterator<HiveDataset>() {
      private Iterator<DbAndTable> tables = getTables().iterator();

      @Override
      protected HiveDataset computeNext() {
        while (this.tables.hasNext()) {
          DbAndTable dbAndTable = this.tables.next();

          try (AutoReturnableObject<IMetaStoreClient> client = HiveDatasetFinder.this.clientPool.getClient()) {
            Table table = client.get().getTable(dbAndTable.getDb(), dbAndTable.getTable());
            Config datasetConfig = getDatasetConfig(table);
            // Skip tables explicitly blacklisted in their dataset config
            if (ConfigUtils.getBoolean(datasetConfig, HIVE_DATASET_IS_BLACKLISTED_KEY,
                DEFAULT_HIVE_DATASET_IS_BLACKLISTED_KEY)) {
              continue;
            }

            if (HiveDatasetFinder.this.eventSubmitter.isPresent()) {
              SlaEventSubmitter.builder().datasetUrn(dbAndTable.toString())
                  .eventSubmitter(HiveDatasetFinder.this.eventSubmitter.get()).eventName(DATASET_FOUND).build()
                  .submit();
            }

            return createHiveDataset(table, datasetConfig);
          } catch (Throwable t) {
            log.error(String.format("Failed to create HiveDataset for table %s.%s", dbAndTable.getDb(),
                dbAndTable.getTable()), t);

            if (HiveDatasetFinder.this.eventSubmitter.isPresent()) {
              SlaEventSubmitter.builder().datasetUrn(dbAndTable.toString())
                  .eventSubmitter(HiveDatasetFinder.this.eventSubmitter.get()).eventName(DATASET_ERROR)
                  .additionalMetadata(FAILURE_CONTEXT, t.toString()).build().submit();
            }
          }
        }
        return endOfData();
      }
    };
  }
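  /*
   * A minimal usage sketch; the database name and table pattern below are hypothetical,
   * and fs stands for any Hadoop FileSystem the caller already has:
   *
   *   Properties props = new Properties();
   *   props.setProperty(HiveDatasetFinder.DB_KEY, "mydb");
   *   props.setProperty(HiveDatasetFinder.TABLE_PATTERN_KEY, "events_*");
   *   HiveDatasetFinder finder = new HiveDatasetFinder(fs, props);
   *   for (HiveDataset dataset : finder.findDatasets()) {
   *     // one HiveDataset per table accepted by the whitelist/blacklist
   *   }
   */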
  /**
   * @deprecated Use {@link #createHiveDataset(Table, Config)} instead.
   */
  @Deprecated
  protected HiveDataset createHiveDataset(Table table) throws IOException {
    return createHiveDataset(table, ConfigFactory.empty());
  }

  protected HiveDataset createHiveDataset(Table table, Config datasetConfig) throws IOException {
    return new HiveDataset(this.fs, this.clientPool, new org.apache.hadoop.hive.ql.metadata.Table(table),
        this.properties, datasetConfig);
  }

  @Override
  public Path commonDatasetRoot() {
    return new Path("/");
  }

  /**
   * Gets the {@link Config} for this <code>table</code>.
   * Cases:
   * <ul>
   * <li>If {@link #configStoreUri} is available, the dataset config is fetched from the config store at that URI.
   * <li>If {@link #configStoreUri} is not available, the job config is used as the dataset config.
   * <li>If {@link #datasetConfigPrefix} is specified, only configs with this prefix are returned.
   * <li>If {@link #datasetConfigPrefix} is not specified, all configs are returned.
   * </ul>
   * @param table the table of the dataset whose config is being fetched
   * @return the {@link Config} for <code>table</code>
   */
  private Config getDatasetConfig(Table table)
      throws ConfigStoreFactoryDoesNotExistsException, ConfigStoreCreationException, URISyntaxException {

    Config datasetConfig;

    // Config store enabled
    if (this.configStoreUri.isPresent()) {
      datasetConfig = this.configClient.getConfig(this.configStoreUri.get() + Path.SEPARATOR
          + this.configStoreDatasetUriBuilder.apply(table));

    // If config store is not enabled, use the job config
    } else {
      datasetConfig = this.jobConfig;
    }

    return StringUtils.isBlank(this.datasetConfigPrefix) ? datasetConfig
        : ConfigUtils.getConfig(datasetConfig, this.datasetConfigPrefix, ConfigFactory.empty());
  }

  private static final Function<Table, String> DEFAULT_CONFIG_STORE_DATASET_URI_BUILDER =
      new Function<Table, String>() {
        @Override
        public String apply(@Nonnull Table table) {
          return HiveConfigClientUtils.getDatasetUri(table);
        }
      };
}