/*
* Copyright © 2014-2015 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package co.cask.cdap.explore.service;

import co.cask.cdap.common.conf.Constants;
import co.cask.cdap.data2.datafabric.dataset.service.DatasetService;
import co.cask.cdap.data2.util.hbase.HBaseTableUtilFactory;
import co.cask.cdap.explore.guice.ExploreRuntimeModule;
import co.cask.cdap.explore.service.hive.Hive12CDH5ExploreService;
import co.cask.cdap.explore.service.hive.Hive12ExploreService;
import co.cask.cdap.explore.service.hive.Hive13ExploreService;
import co.cask.cdap.explore.service.hive.Hive14ExploreService;
import co.cask.cdap.hive.ExploreUtils;
import co.cask.cdap.internal.app.runtime.spark.SparkUtils;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Joiner;
import com.google.common.base.Splitter;
import com.google.common.base.Throwables;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Sets;
import com.google.common.io.ByteStreams;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.MRJobConfig;
import org.apache.hadoop.util.VersionInfo;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.twill.api.ClassAcceptor;
import org.apache.twill.internal.utils.Dependencies;
import org.objectweb.asm.ClassReader;
import org.objectweb.asm.ClassVisitor;
import org.objectweb.asm.ClassWriter;
import org.objectweb.asm.MethodVisitor;
import org.objectweb.asm.Opcodes;
import org.objectweb.asm.commons.GeneratorAdapter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.file.FileVisitResult;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.PathMatcher;
import java.nio.file.Paths;
import java.nio.file.SimpleFileVisitor;
import java.nio.file.attribute.BasicFileAttributes;
import java.util.Arrays;
import java.util.Enumeration;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.Set;
import java.util.jar.JarEntry;
import java.util.jar.JarFile;
import java.util.jar.JarOutputStream;
import java.util.regex.Pattern;
import javax.annotation.Nullable;

/**
 * Utility class for the Explore service.
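 *
 * <p>A minimal usage sketch (here {@code tmpDir} stands for a hypothetical writable
 * temporary directory supplied by the caller):
 * <pre>{@code
 * Set<File> dependencies = ExploreServiceUtils.traceExploreDependencies(tmpDir);
 * Class<? extends ExploreService> serviceClass = ExploreServiceUtils.getHiveService();
 * }</pre>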
*/
public class ExploreServiceUtils {
private static final Logger LOG = LoggerFactory.getLogger(ExploreServiceUtils.class);
private static final String HIVE_AUTHFACTORY_CLASS_NAME = "org.apache.hive.service.auth.HiveAuthFactory";
  /**
   * The Hive versions supported by Explore, each paired with the
   * {@link ExploreService} implementation to use for it.
   */
public enum HiveSupport {
    // The order of the enum values below is very important: checkHiveSupport matches them
    // top to bottom, so the more specific CDH patterns must come before the generic HIVE_CDH5.
// CDH 5.0 to 5.1 uses Hive 0.12
    HIVE_CDH5_0(Pattern.compile("^.*cdh5\\.0\\..*$"), Hive12CDH5ExploreService.class),
    HIVE_CDH5_1(Pattern.compile("^.*cdh5\\.1\\..*$"), Hive12CDH5ExploreService.class),
    // CDH 5.2.x and 5.3.x use Hive 0.13
    HIVE_CDH5_2(Pattern.compile("^.*cdh5\\.2\\..*$"), Hive13ExploreService.class),
    HIVE_CDH5_3(Pattern.compile("^.*cdh5\\.3\\..*$"), Hive13ExploreService.class),
// CDH > 5.3 uses Hive >= 1.1 (which Hive14ExploreService supports)
HIVE_CDH5(Pattern.compile("^.*cdh5\\..*$"), Hive14ExploreService.class),
HIVE_12(null, Hive12ExploreService.class),
HIVE_13(null, Hive13ExploreService.class),
HIVE_14(null, Hive14ExploreService.class),
HIVE_1_0(null, Hive14ExploreService.class),
HIVE_1_1(null, Hive14ExploreService.class),
HIVE_1_2(null, Hive14ExploreService.class);
private final Pattern hadoopVersionPattern;
private final Class<? extends ExploreService> hiveExploreServiceClass;
HiveSupport(Pattern hadoopVersionPattern, Class<? extends ExploreService> hiveExploreServiceClass) {
this.hadoopVersionPattern = hadoopVersionPattern;
this.hiveExploreServiceClass = hiveExploreServiceClass;
}
public Pattern getHadoopVersionPattern() {
return hadoopVersionPattern;
}
public Class<? extends ExploreService> getHiveExploreServiceClass() {
return hiveExploreServiceClass;
}
}
// Caching the dependencies so that we don't trace them twice
private static Set<File> exploreDependencies = null;
private static final Pattern HIVE_SITE_FILE_PATTERN = Pattern.compile("^.*/hive-site\\.xml$");
private static final Pattern YARN_SITE_FILE_PATTERN = Pattern.compile("^.*/yarn-site\\.xml$");
private static final Pattern MAPRED_SITE_FILE_PATTERN = Pattern.compile("^.*/mapred-site\\.xml$");
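  /**
   * Returns the {@link ExploreService} implementation class to use, based on the
   * Hadoop distribution and the Hive version present on the classpath.
   */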
public static Class<? extends ExploreService> getHiveService() {
HiveSupport hiveVersion = checkHiveSupport(null);
return hiveVersion.getHiveExploreServiceClass();
}
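  /**
   * Checks Hive support using the Explore class loader obtained from {@link ExploreUtils}.
   */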
public static HiveSupport checkHiveSupport() {
return checkHiveSupport(ExploreUtils.getExploreClassloader());
}
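  /**
   * Returns the version of Hive found in the Explore class loader.
   */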
public static String getHiveVersion() {
return getHiveVersion(ExploreUtils.getExploreClassloader());
}
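  /**
   * Returns the Hive version loaded by the given class loader, as reported by
   * {@code HiveVersionInfo.getVersion()}. If {@code hiveClassLoader} is null,
   * the class loader of this class is used instead.
   */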
public static String getHiveVersion(@Nullable ClassLoader hiveClassLoader) {
ClassLoader usingCL = hiveClassLoader;
if (usingCL == null) {
usingCL = ExploreServiceUtils.class.getClassLoader();
}
try {
Class<?> hiveVersionInfoClass = usingCL.loadClass("org.apache.hive.common.util.HiveVersionInfo");
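      // getVersion() is a static method, hence it is invoked with a null receiver.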
return (String) hiveVersionInfoClass.getDeclaredMethod("getVersion").invoke(null);
} catch (Exception e) {
throw Throwables.propagate(e);
}
}
/**
   * Check that Hive is in the classpath, with a supported version.
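   *
   * <p>For example, on a CDH 5.2 cluster the Hadoop version string typically contains
   * {@code cdh5.2}, so {@code HIVE_CDH5_2} is chosen; on plain Apache Hadoop no pattern
   * matches, and the Hive version string (e.g. {@code 1.2.1}) determines the result.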
*/
public static HiveSupport checkHiveSupport(@Nullable ClassLoader hiveClassLoader) {
// First try to figure which hive support is relevant based on Hadoop distribution name
String hadoopVersion = VersionInfo.getVersion();
for (HiveSupport hiveSupport : HiveSupport.values()) {
if (hiveSupport.getHadoopVersionPattern() != null &&
hiveSupport.getHadoopVersionPattern().matcher(hadoopVersion).matches()) {
return hiveSupport;
}
}
String hiveVersion = getHiveVersion(hiveClassLoader);
LOG.info("Client Hive version: {}", hiveVersion);
if (hiveVersion.startsWith("0.12.")) {
return HiveSupport.HIVE_12;
} else if (hiveVersion.startsWith("0.13.")) {
return HiveSupport.HIVE_13;
} else if (hiveVersion.startsWith("0.14.") || hiveVersion.startsWith("1.0.")) {
return HiveSupport.HIVE_14;
} else if (hiveVersion.startsWith("1.1.")) {
return HiveSupport.HIVE_1_1;
    } else if (hiveVersion.startsWith("1.2")) {
return HiveSupport.HIVE_1_2;
}
throw new RuntimeException("Hive distribution not supported. Set the configuration '" +
Constants.Explore.EXPLORE_ENABLED +
"' to false to start up without Explore.");
}
/**
   * Return the absolute and canonical paths of the entries on the JVM boot classpath.
   */
  public static Set<String> getBootstrapClasses() {
ImmutableSet.Builder<String> builder = ImmutableSet.builder();
for (String classpath : Splitter.on(File.pathSeparatorChar).split(System.getProperty("sun.boot.class.path"))) {
File file = new File(classpath);
builder.add(file.getAbsolutePath());
try {
builder.add(file.getCanonicalPath());
} catch (IOException e) {
LOG.warn("Could not add canonical path to aux class path for file {}", file.toString(), e);
}
}
return builder.build();
}
/**
   * Trace the jar dependencies needed by the Explore container. Hive classes are loaded through
   * a separate class loader, built from the explore classpath that is passed to the master as a
   * system property.
*
* @return an ordered set of jar files.
*/
public static Set<File> traceExploreDependencies(File tmpDir) throws IOException {
if (exploreDependencies != null) {
return exploreDependencies;
}
ClassLoader classLoader = ExploreUtils.getExploreClassloader();
Set<File> additionalJars = new HashSet<>();
if (isSparkAvailable()) {
File sparkAssemblyJar = SparkUtils.locateSparkAssemblyJar();
LOG.debug("Adding spark jar to explore dependency {}", sparkAssemblyJar);
additionalJars.add(sparkAssemblyJar);
}
if (isTezAvailable()) {
additionalJars.addAll(getTezJars());
}
return traceExploreDependencies(classLoader, tmpDir, additionalJars);
}
/**
* Trace the jar dependencies needed by the Explore container.
*
   * @param classLoader class loader to use to trace the dependencies.
   *                    If it is null, the class loader of {@link ExploreRuntimeModule} is used.
* @param tmpDir temporary directory for storing rewritten jar files.
* @param additionalJars additional jars that will be added to the end of the returned set.
* @return an ordered set of jar files.
*/
private static Set<File> traceExploreDependencies(ClassLoader classLoader, File tmpDir, Set<File> additionalJars)
throws IOException {
if (exploreDependencies != null) {
return exploreDependencies;
}
ClassLoader usingCL = classLoader;
if (classLoader == null) {
usingCL = ExploreRuntimeModule.class.getClassLoader();
}
    final Set<String> bootstrapClassPaths = getBootstrapClasses();
ClassAcceptor classAcceptor = new ClassAcceptor() {
      /*
       * Exclude any class contained in the bootstrapClassPaths, as well as the Kryo classes.
       * The Kryo dependency must not ship in the Explore container: Spark brings in Kryo 2.21,
       * which would normally be shipped to the container, but Hive requires Kryo 2.22 and gets
       * it from its own jars (hive-exec.jar, to be precise).
       */
@Override
public boolean accept(String className, URL classUrl, URL classPathUrl) {
return !(bootstrapClassPaths.contains(classPathUrl.getFile()) ||
className.startsWith("com.esotericsoftware.kryo"));
}
};
Set<File> hBaseTableDeps = traceDependencies(usingCL, classAcceptor, tmpDir,
HBaseTableUtilFactory.getHBaseTableUtilClass().getName());
// Note the order of dependency jars is important so that HBase jars come first in the classpath order
// LinkedHashSet maintains insertion order while removing duplicate entries.
Set<File> orderedDependencies = new LinkedHashSet<>();
orderedDependencies.addAll(hBaseTableDeps);
orderedDependencies.addAll(traceDependencies(usingCL, classAcceptor, tmpDir,
DatasetService.class.getName(),
// Referred to by string rather than Class.getName()
// because DatasetStorageHandler and StreamStorageHandler
// extend a Hive class, which isn't present in this class loader
"co.cask.cdap.hive.datasets.DatasetStorageHandler",
"co.cask.cdap.hive.stream.StreamStorageHandler",
"org.apache.hadoop.hive.ql.exec.mr.ExecDriver",
"org.apache.hive.service.cli.CLIService",
"org.apache.hadoop.mapred.YarnClientProtocolProvider",
// Needed for - at least - CDH 4.4 integration
"org.apache.hive.builtins.BuiltinUtils",
// Needed for - at least - CDH 5 integration
"org.apache.hadoop.hive.shims.Hadoop23Shims"));
orderedDependencies.addAll(additionalJars);
exploreDependencies = orderedDependencies;
return orderedDependencies;
}
/**
   * Trace the dependency files of the given class names, using the given class loader and
   * including only the classes accepted by the class acceptor.
   *
   * Nothing is returned if the class loader (or, when none is given, the class loader of
   * ExploreRuntimeModule) does not contain the given classes.
*/
public static Set<File> traceDependencies(@Nullable ClassLoader classLoader, final ClassAcceptor classAcceptor,
File tmpDir, String... classNames) throws IOException {
LOG.debug("Tracing dependencies for classes: {}", Arrays.toString(classNames));
ClassLoader usingCL = classLoader;
if (usingCL == null) {
usingCL = ExploreRuntimeModule.class.getClassLoader();
}
final String rewritingClassName = HIVE_AUTHFACTORY_CLASS_NAME;
final Set<File> rewritingFiles = Sets.newHashSet();
final Set<File> jarFiles = Sets.newHashSet();
Dependencies.findClassDependencies(
usingCL,
new ClassAcceptor() {
@Override
public boolean accept(String className, URL classUrl, URL classPathUrl) {
if (!classAcceptor.accept(className, classUrl, classPathUrl)) {
return false;
}
if (rewritingClassName.equals(className)) {
rewritingFiles.add(new File(classPathUrl.getFile()));
}
jarFiles.add(new File(classPathUrl.getFile()));
return true;
}
},
classNames
);
    // Rewrite HiveAuthFactory.loginFromKeytab to be a no-op method.
    // This is needed because we don't want Hive's CLIService to perform its own keytab login:
    // this process already authenticates with delegation tokens.
for (File rewritingFile : rewritingFiles) {
// TODO: this may cause lots of rewrites since we may rewrite the same jar multiple times
File rewrittenJar = rewriteHiveAuthFactory(
rewritingFile, new File(tmpDir, rewritingFile.getName() + "-" + System.currentTimeMillis() + ".jar"));
jarFiles.add(rewrittenJar);
LOG.debug("Rewrote {} to {}", rewritingFile.getAbsolutePath(), rewrittenJar.getAbsolutePath());
}
jarFiles.removeAll(rewritingFiles);
if (LOG.isDebugEnabled()) {
for (File jarFile : jarFiles) {
LOG.debug("Added jar {}", jarFile.getAbsolutePath());
}
}
return jarFiles;
}
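  /**
   * Copies {@code sourceJar} to {@code targetJar}, rewriting the bytecode of
   * {@code HiveAuthFactory.loginFromKeytab} into a no-op method along the way.
   *
   * @return the rewritten jar, i.e. {@code targetJar}
   */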
@VisibleForTesting
static File rewriteHiveAuthFactory(File sourceJar, File targetJar) throws IOException {
try (
JarFile input = new JarFile(sourceJar);
JarOutputStream output = new JarOutputStream(new FileOutputStream(targetJar))
) {
String hiveAuthFactoryPath = HIVE_AUTHFACTORY_CLASS_NAME.replace('.', '/') + ".class";
Enumeration<JarEntry> sourceEntries = input.entries();
while (sourceEntries.hasMoreElements()) {
JarEntry entry = sourceEntries.nextElement();
output.putNextEntry(new JarEntry(entry.getName()));
try (InputStream entryInputStream = input.getInputStream(entry)) {
if (!hiveAuthFactoryPath.equals(entry.getName())) {
ByteStreams.copy(entryInputStream, output);
continue;
}
try {
// Rewrite the bytecode of HiveAuthFactory.loginFromKeytab method to a no-op method
ClassReader cr = new ClassReader(entryInputStream);
ClassWriter cw = new ClassWriter(ClassWriter.COMPUTE_MAXS);
cr.accept(new ClassVisitor(Opcodes.ASM5, cw) {
@Override
public MethodVisitor visitMethod(final int access, final String name, final String desc,
String signature, String[] exceptions) {
MethodVisitor methodVisitor = super.visitMethod(access, name, desc, signature, exceptions);
if (!"loginFromKeytab".equals(name)) {
return methodVisitor;
}
GeneratorAdapter adapter = new GeneratorAdapter(methodVisitor, access, name, desc);
adapter.returnValue();
// VisitMaxs with 0 so that COMPUTE_MAXS from ClassWriter will compute the right values.
adapter.visitMaxs(0, 0);
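                // Returning a detached no-op visitor discards the rest of the original method body.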
return new MethodVisitor(Opcodes.ASM5) { };
}
}, 0);
output.write(cw.toByteArray());
} catch (Exception e) {
throw new IOException("Unable to generate HiveAuthFactory class", e);
}
}
}
return targetJar;
}
}
/**
   * Updates configuration properties in hive-site.xml, mapred-site.xml and yarn-site.xml
   * for Explore. All other conf files are returned without any update.
   *
* @param confFile conf file to update
* @param tempDir temp dir to create files if necessary
* @return the new conf file to use in place of confFile
*/
public static File updateConfFileForExplore(File confFile, File tempDir) {
if (HIVE_SITE_FILE_PATTERN.matcher(confFile.getAbsolutePath()).matches()) {
return updateHiveConfFile(confFile, tempDir);
} else if (YARN_SITE_FILE_PATTERN.matcher(confFile.getAbsolutePath()).matches()) {
return updateYarnConfFile(confFile, tempDir);
} else if (MAPRED_SITE_FILE_PATTERN.matcher(confFile.getAbsolutePath()).matches()) {
return updateMapredConfFile(confFile, tempDir);
} else {
return confFile;
}
}
/**
   * Returns a temporary copy of the given yarn-site.xml file with the options
   * required by Explore added.
*/
private static File updateYarnConfFile(File confFile, File tempDir) {
Configuration conf = new Configuration(false);
try {
conf.addResource(confFile.toURI().toURL());
} catch (MalformedURLException e) {
LOG.error("File {} is malformed.", confFile, e);
throw Throwables.propagate(e);
}
String yarnAppClassPath = conf.get(YarnConfiguration.YARN_APPLICATION_CLASSPATH,
Joiner.on(",").join(YarnConfiguration.DEFAULT_YARN_APPLICATION_CLASSPATH));
    // Add $PWD/* at the beginning of the classpath so that user jars take precedence.
    // Without this change, job.jar would come first on the classpath, and job.jar
    // contains classes from an old Guava version.
yarnAppClassPath = "$PWD/*," + yarnAppClassPath;
conf.set(YarnConfiguration.YARN_APPLICATION_CLASSPATH, yarnAppClassPath);
File newYarnConfFile = new File(tempDir, "yarn-site.xml");
try (FileOutputStream os = new FileOutputStream(newYarnConfFile)) {
conf.writeXml(os);
} catch (IOException e) {
LOG.error("Problem creating and writing to temporary yarn-conf.xml conf file at {}", newYarnConfFile, e);
throw Throwables.propagate(e);
}
return newYarnConfFile;
}
/**
   * Returns a temporary copy of the given mapred-site.xml file with the options
   * required by Explore added.
*/
private static File updateMapredConfFile(File confFile, File tempDir) {
Configuration conf = new Configuration(false);
try {
conf.addResource(confFile.toURI().toURL());
} catch (MalformedURLException e) {
LOG.error("File {} is malformed.", confFile, e);
throw Throwables.propagate(e);
}
String mrAppClassPath = conf.get(MRJobConfig.MAPREDUCE_APPLICATION_CLASSPATH,
MRJobConfig.DEFAULT_MAPREDUCE_APPLICATION_CLASSPATH);
    // Add $PWD/* at the beginning of the classpath. Without this change, old jars from the
    // MR framework classpath would come first.
mrAppClassPath = "$PWD/*," + mrAppClassPath;
conf.set(MRJobConfig.MAPREDUCE_APPLICATION_CLASSPATH, mrAppClassPath);
File newMapredConfFile = new File(tempDir, "mapred-site.xml");
try (FileOutputStream os = new FileOutputStream(newMapredConfFile)) {
conf.writeXml(os);
} catch (IOException e) {
LOG.error("Problem creating and writing to temporary mapred-site.xml conf file at {}", newMapredConfFile, e);
throw Throwables.propagate(e);
}
return newMapredConfFile;
}
/**
   * Returns a temporary copy of the given hive-site.xml file with the options
   * required by Explore added.
*/
private static File updateHiveConfFile(File confFile, File tempDir) {
Configuration conf = new Configuration(false);
try {
conf.addResource(confFile.toURI().toURL());
} catch (MalformedURLException e) {
LOG.error("File {} is malformed.", confFile, e);
throw Throwables.propagate(e);
}
    // We want jars at the container's root directory to be preferred over job.jar.
    // updateYarnConfFile already prepends $PWD/* to the YARN_APPLICATION_CLASSPATH in
    // yarn-site.xml; setting both MAPREDUCE_JOB_CLASSLOADER and
    // MAPREDUCE_JOB_USER_CLASSPATH_FIRST to false then puts the YARN_APPLICATION_CLASSPATH
    // before job.jar on the container's classpath.
conf.setBoolean(Job.MAPREDUCE_JOB_USER_CLASSPATH_FIRST, false);
conf.setBoolean(MRJobConfig.MAPREDUCE_JOB_CLASSLOADER, false);
String sparkHome = System.getenv(Constants.SPARK_HOME);
if (sparkHome != null) {
LOG.debug("Setting spark.home in hive conf to {}", sparkHome);
conf.set("spark.home", sparkHome);
}
File newHiveConfFile = new File(tempDir, "hive-site.xml");
try (FileOutputStream os = new FileOutputStream(newHiveConfFile)) {
conf.writeXml(os);
} catch (IOException e) {
LOG.error("Problem creating temporary hive-site.xml conf file at {}", newHiveConfFile, e);
throw Throwables.propagate(e);
}
return newHiveConfFile;
}
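  /**
   * Returns whether the Spark assembly jar can be located, i.e. whether Spark is installed.
   */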
public static boolean isSparkAvailable() {
try {
// SparkUtils.locateSparkAssemblyJar() throws IllegalStateException if it is not able to locate spark jar
SparkUtils.locateSparkAssemblyJar();
return true;
} catch (IllegalStateException e) {
LOG.debug("Got exception while determining spark availability", e);
return false;
}
}
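  /**
   * Returns whether the given {@link HiveConf} configures Spark as the Hive execution engine.
   */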
public static boolean isSparkEngine(HiveConf hiveConf) {
    // Setting the engine through the session configuration is not supported yet.
String engine = hiveConf.get("hive.execution.engine");
return "spark".equalsIgnoreCase(engine);
}
  // This method determines whether Tez is enabled, based on the TEZ_HOME environment variable.
  // The master prepares the Explore container by adding the jars available under TEZ_HOME.
  // However, this environment variable is not available inside the Explore container itself
  // (BaseHiveService); there, the presence of Tez is checked through the hive.execution.engine
  // config variable.
private static boolean isTezAvailable() {
return System.getenv(Constants.TEZ_HOME) != null;
}
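  /**
   * Recursively collects all {@code *.jar} files under the directory pointed to
   * by the TEZ_HOME environment variable.
   */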
private static Set<File> getTezJars() {
String tezHome = System.getenv(Constants.TEZ_HOME);
Path tezHomeDir = Paths.get(tezHome);
final PathMatcher pathMatcher = tezHomeDir.getFileSystem().getPathMatcher("glob:*.jar");
final Set<File> tezJars = new HashSet<>();
try {
Files.walkFileTree(tezHomeDir, new SimpleFileVisitor<Path>() {
@Override
public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
if (attrs.isRegularFile() && pathMatcher.matches(file.getFileName())) {
tezJars.add(file.toFile());
}
return FileVisitResult.CONTINUE;
}
@Override
public FileVisitResult visitFileFailed(Path file, IOException exc) throws IOException {
// Ignore error
return FileVisitResult.CONTINUE;
}
});
} catch (IOException e) {
LOG.warn("Exception raised while inspecting {}", tezHomeDir, e);
}
return tezJars;
}
}