/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nifi.processors.hadoop;

import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.net.NetUtils;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.nifi.annotation.behavior.RequiresInstanceClassLoading;
import org.apache.nifi.annotation.lifecycle.OnScheduled;
import org.apache.nifi.annotation.lifecycle.OnStopped;
import org.apache.nifi.components.PropertyDescriptor;
import org.apache.nifi.components.ValidationContext;
import org.apache.nifi.components.ValidationResult;
import org.apache.nifi.hadoop.KerberosProperties;
import org.apache.nifi.hadoop.SecurityUtil;
import org.apache.nifi.logging.ComponentLog;
import org.apache.nifi.processor.AbstractProcessor;
import org.apache.nifi.processor.ProcessContext;
import org.apache.nifi.processor.ProcessorInitializationContext;
import org.apache.nifi.processor.exception.ProcessException;
import org.apache.nifi.processor.util.StandardValidators;

import javax.net.SocketFactory;
import java.io.File;
import java.io.IOException;
import java.lang.ref.WeakReference;
import java.net.InetSocketAddress;
import java.net.Socket;
import java.net.URI;
import java.security.PrivilegedExceptionAction;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.WeakHashMap;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicReference;

/**
 * This is a base class that is helpful when building processors interacting with HDFS.
 */
@RequiresInstanceClassLoading(cloneAncestorResources = true)
public abstract class AbstractHadoopProcessor extends AbstractProcessor {

    // properties
    public static final PropertyDescriptor HADOOP_CONFIGURATION_RESOURCES = new PropertyDescriptor.Builder()
            .name("Hadoop Configuration Resources")
            .description("A file or comma separated list of files which contains the Hadoop file system configuration. Without this, Hadoop "
                    + "will search the classpath for a 'core-site.xml' and 'hdfs-site.xml' file or will revert to a default configuration.")
            .required(false)
            .addValidator(HadoopValidators.ONE_OR_MORE_FILE_EXISTS_VALIDATOR)
            .build();
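    // Illustrative note (the paths below are hypothetical): this property is typically pointed at the
    // cluster's client configuration files, e.g. "/etc/hadoop/conf/core-site.xml,/etc/hadoop/conf/hdfs-site.xml".
    // The value is split on commas and each entry is added as a Configuration resource
    // (see getConfigurationFromResources below).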
    public static final PropertyDescriptor DIRECTORY = new PropertyDescriptor.Builder()
            .name("Directory")
            .description("The HDFS directory from which files should be read")
            .required(true)
            .addValidator(StandardValidators.ATTRIBUTE_EXPRESSION_LANGUAGE_VALIDATOR)
            .expressionLanguageSupported(true)
            .build();

    public static final PropertyDescriptor COMPRESSION_CODEC = new PropertyDescriptor.Builder()
            .name("Compression codec")
            .required(true)
            .allowableValues(CompressionType.values())
            .defaultValue(CompressionType.NONE.toString())
            .build();

    public static final PropertyDescriptor KERBEROS_RELOGIN_PERIOD = new PropertyDescriptor.Builder()
            .name("Kerberos Relogin Period")
            .required(false)
            .description("Period of time which should pass before attempting a kerberos relogin")
            .defaultValue("4 hours")
            .addValidator(StandardValidators.TIME_PERIOD_VALIDATOR)
            .addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
            .build();

    public static final PropertyDescriptor ADDITIONAL_CLASSPATH_RESOURCES = new PropertyDescriptor.Builder()
            .name("Additional Classpath Resources")
            .description("A comma-separated list of paths to files and/or directories that will be added to the classpath. When specifying a "
                    + "directory, all files within the directory will be added to the classpath, but further sub-directories will not be included.")
            .required(false)
            .addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
            .dynamicallyModifiesClasspath(true)
            .build();

    public static final String ABSOLUTE_HDFS_PATH_ATTRIBUTE = "absolute.hdfs.path";

    private static final Object RESOURCES_LOCK = new Object();

    private long kerberosReloginThreshold;
    private long lastKerberosReloginTime;
    protected KerberosProperties kerberosProperties;
    protected List<PropertyDescriptor> properties;
    private volatile File kerberosConfigFile = null;

    // variables shared by all threads of this processor
    // Hadoop Configuration, Filesystem, and UserGroupInformation (optional)
    private final AtomicReference<HdfsResources> hdfsResources = new AtomicReference<>();

    // Holder of cached Configuration information so validation does not reload the same config over and over
    private final AtomicReference<ValidationResources> validationResourceHolder = new AtomicReference<>();

    @Override
    protected void init(ProcessorInitializationContext context) {
        hdfsResources.set(new HdfsResources(null, null, null));

        kerberosConfigFile = context.getKerberosConfigurationFile();
        kerberosProperties = getKerberosProperties(kerberosConfigFile);

        List<PropertyDescriptor> props = new ArrayList<>();
        props.add(HADOOP_CONFIGURATION_RESOURCES);
        props.add(kerberosProperties.getKerberosPrincipal());
        props.add(kerberosProperties.getKerberosKeytab());
        props.add(KERBEROS_RELOGIN_PERIOD);
        props.add(ADDITIONAL_CLASSPATH_RESOURCES);
        properties = Collections.unmodifiableList(props);
    }

    protected KerberosProperties getKerberosProperties(File kerberosConfigFile) {
        return new KerberosProperties(kerberosConfigFile);
    }

    @Override
    protected List<PropertyDescriptor> getSupportedPropertyDescriptors() {
        return properties;
    }
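    /*
     * Illustrative sketch (hypothetical subclass, not part of this class): concrete HDFS processors
     * typically build on this base list and append their own descriptors, for example:
     *
     *   @Override
     *   protected List<PropertyDescriptor> getSupportedPropertyDescriptors() {
     *       List<PropertyDescriptor> props = new ArrayList<>(properties);
     *       props.add(DIRECTORY);
     *       props.add(COMPRESSION_CODEC);
     *       return Collections.unmodifiableList(props);
     *   }
     */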
    @Override
    protected Collection<ValidationResult> customValidate(ValidationContext validationContext) {
        final String configResources = validationContext.getProperty(HADOOP_CONFIGURATION_RESOURCES).getValue();
        final String principal = validationContext.getProperty(kerberosProperties.getKerberosPrincipal()).getValue();
        final String keytab = validationContext.getProperty(kerberosProperties.getKerberosKeytab()).getValue();

        final List<ValidationResult> results = new ArrayList<>();

        if (!StringUtils.isBlank(configResources)) {
            try {
                ValidationResources resources = validationResourceHolder.get();

                // if no resources in the holder, or if the holder has different resources loaded,
                // then load the Configuration and set the new resources in the holder
                if (resources == null || !configResources.equals(resources.getConfigResources())) {
                    getLogger().debug("Reloading validation resources");
                    final Configuration config = new ExtendedConfiguration(getLogger());
                    config.setClassLoader(Thread.currentThread().getContextClassLoader());
                    resources = new ValidationResources(configResources, getConfigurationFromResources(config, configResources));
                    validationResourceHolder.set(resources);
                }

                final Configuration conf = resources.getConfiguration();
                results.addAll(KerberosProperties.validatePrincipalAndKeytab(
                        this.getClass().getSimpleName(), conf, principal, keytab, getLogger()));

            } catch (IOException e) {
                results.add(new ValidationResult.Builder()
                        .valid(false)
                        .subject(this.getClass().getSimpleName())
                        .explanation("Could not load Hadoop Configuration resources")
                        .build());
            }
        }

        return results;
    }

    /*
     * If your subclass also has an @OnScheduled annotated method and you need hdfsResources in that method,
     * then be sure to call super.abstractOnScheduled(context)
     */
    @OnScheduled
    public final void abstractOnScheduled(ProcessContext context) throws IOException {
        try {
            // This value will be null when called from ListHDFS, because it overrides all of the default
            // properties this processor sets. TODO: re-work ListHDFS to utilize Kerberos
            if (context.getProperty(KERBEROS_RELOGIN_PERIOD).getValue() != null) {
                kerberosReloginThreshold = context.getProperty(KERBEROS_RELOGIN_PERIOD).asTimePeriod(TimeUnit.SECONDS);
            }

            HdfsResources resources = hdfsResources.get();
            if (resources.getConfiguration() == null) {
                final String configResources = context.getProperty(HADOOP_CONFIGURATION_RESOURCES).getValue();
                resources = resetHDFSResources(configResources, context);
                hdfsResources.set(resources);
            }
        } catch (IOException ex) {
            getLogger().error("HDFS Configuration error - {}", new Object[] { ex });
            hdfsResources.set(new HdfsResources(null, null, null));
            throw ex;
        }
    }

    @OnStopped
    public final void abstractOnStopped() {
        hdfsResources.set(new HdfsResources(null, null, null));
    }
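    /*
     * Illustrative sketch (hypothetical subclass, not part of this class): a subclass that declares its
     * own @OnScheduled method and needs the shared HDFS resources should call
     * super.abstractOnScheduled(context) first, so that the Configuration/FileSystem/UserGroupInformation
     * are initialized before getFileSystem() or getConfiguration() are used:
     *
     *   @OnScheduled
     *   public void onScheduled(final ProcessContext context) throws IOException {
     *       super.abstractOnScheduled(context);
     *       // from here on, getFileSystem() and getConfiguration() return live resources
     *   }
     */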
    private static Configuration getConfigurationFromResources(final Configuration config, String configResources) throws IOException {
        boolean foundResources = false;
        if (null != configResources) {
            String[] resources = configResources.split(",");
            for (String resource : resources) {
                config.addResource(new Path(resource.trim()));
                foundResources = true;
            }
        }

        if (!foundResources) {
            // check that at least 1 non-default resource is available on the classpath
            String configStr = config.toString();
            for (String resource : configStr.substring(configStr.indexOf(":") + 1).split(",")) {
                if (!resource.contains("default") && config.getResource(resource.trim()) != null) {
                    foundResources = true;
                    break;
                }
            }
        }

        if (!foundResources) {
            throw new IOException("Could not find any of the " + HADOOP_CONFIGURATION_RESOURCES.getName() + " on the classpath");
        }
        return config;
    }

    /*
     * Reset Hadoop Configuration and FileSystem based on the supplied configuration resources.
     */
    HdfsResources resetHDFSResources(String configResources, ProcessContext context) throws IOException {
        Configuration config = new ExtendedConfiguration(getLogger());
        config.setClassLoader(Thread.currentThread().getContextClassLoader());

        getConfigurationFromResources(config, configResources);

        // first check for timeout on HDFS connection, because FileSystem has a hard coded 15 minute timeout
        checkHdfsUriForTimeout(config);

        // disable caching of Configuration and FileSystem objects, else we cannot reconfigure the processor
        // without a complete restart
        String disableCacheName = String.format("fs.%s.impl.disable.cache", FileSystem.getDefaultUri(config).getScheme());
        config.set(disableCacheName, "true");

        // If kerberos is enabled, create the file system as the kerberos principal
        // -- use RESOURCES_LOCK to guarantee UserGroupInformation is accessed by only a single thread at a time
        FileSystem fs;
        UserGroupInformation ugi;
        synchronized (RESOURCES_LOCK) {
            if (SecurityUtil.isSecurityEnabled(config)) {
                String principal = context.getProperty(kerberosProperties.getKerberosPrincipal()).getValue();
                String keyTab = context.getProperty(kerberosProperties.getKerberosKeytab()).getValue();
                ugi = SecurityUtil.loginKerberos(config, principal, keyTab);
                fs = getFileSystemAsUser(config, ugi);
                lastKerberosReloginTime = System.currentTimeMillis() / 1000;
            } else {
                config.set("ipc.client.fallback-to-simple-auth-allowed", "true");
                config.set("hadoop.security.authentication", "simple");
                ugi = SecurityUtil.loginSimple(config);
                fs = getFileSystemAsUser(config, ugi);
            }
        }
        getLogger().debug("resetHDFSResources UGI {}", new Object[]{ugi});

        final Path workingDir = fs.getWorkingDirectory();
        getLogger().info("Initialized a new HDFS File System with working dir: {} default block size: {} default replication: {} config: {}",
                new Object[]{workingDir, fs.getDefaultBlockSize(workingDir), fs.getDefaultReplication(workingDir), config.toString()});

        return new HdfsResources(config, fs, ugi);
    }
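    // Illustrative note (hypothetical URI): with a default filesystem URI such as hdfs://namenode:8020, the
    // scheme is "hdfs", so the cache-disabling key built in resetHDFSResources above becomes
    // "fs.hdfs.impl.disable.cache"; this keeps each processor instance on its own FileSystem object instead of
    // Hadoop's JVM-wide FileSystem cache.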
    /**
     * This exists in order to allow unit tests to override it so that they don't take several minutes waiting for UDP packets to be received
     *
     * @param config
     *            the configuration to use
     * @return the FileSystem that is created for the given Configuration
     * @throws IOException
     *             if unable to create the FileSystem
     */
    protected FileSystem getFileSystem(final Configuration config) throws IOException {
        return FileSystem.get(config);
    }

    protected FileSystem getFileSystemAsUser(final Configuration config, UserGroupInformation ugi) throws IOException {
        try {
            return ugi.doAs(new PrivilegedExceptionAction<FileSystem>() {
                @Override
                public FileSystem run() throws Exception {
                    return FileSystem.get(config);
                }
            });
        } catch (InterruptedException e) {
            throw new IOException("Unable to create file system: " + e.getMessage());
        }
    }

    /*
     * Drastically reduce the timeout of a socket connection from the default in FileSystem.get()
     */
    protected void checkHdfsUriForTimeout(Configuration config) throws IOException {
        URI hdfsUri = FileSystem.getDefaultUri(config);
        String address = hdfsUri.getAuthority();
        int port = hdfsUri.getPort();
        if (address == null || address.isEmpty() || port < 0) {
            return;
        }
        InetSocketAddress namenode = NetUtils.createSocketAddr(address, port);
        SocketFactory socketFactory = NetUtils.getDefaultSocketFactory(config);
        Socket socket = null;
        try {
            socket = socketFactory.createSocket();
            NetUtils.connect(socket, namenode, 1000); // 1 second timeout
        } finally {
            IOUtils.closeQuietly(socket);
        }
    }

    /**
     * Returns the configured CompressionCodec, or null if none is configured.
     *
     * @param context
     *            the ProcessContext
     * @param configuration
     *            the Hadoop Configuration
     * @return CompressionCodec or null
     */
    protected org.apache.hadoop.io.compress.CompressionCodec getCompressionCodec(ProcessContext context, Configuration configuration) {
        org.apache.hadoop.io.compress.CompressionCodec codec = null;
        if (context.getProperty(COMPRESSION_CODEC).isSet()) {
            String compressionClassname = CompressionType.valueOf(context.getProperty(COMPRESSION_CODEC).getValue()).toString();
            CompressionCodecFactory ccf = new CompressionCodecFactory(configuration);
            codec = ccf.getCodecByClassName(compressionClassname);
        }

        return codec;
    }

    /**
     * Returns the relative path of the child that does not include the filename or the root path.
     *
     * @param root
     *            the path to relativize from
     * @param child
     *            the path to relativize
     * @return the relative path
     */
    public static String getPathDifference(final Path root, final Path child) {
        final int depthDiff = child.depth() - root.depth();
        if (depthDiff <= 1) {
            return "".intern();
        }
        String lastRoot = root.getName();
        Path childsParent = child.getParent();
        final StringBuilder builder = new StringBuilder();
        builder.append(childsParent.getName());
        for (int i = (depthDiff - 3); i >= 0; i--) {
            childsParent = childsParent.getParent();
            String name = childsParent.getName();
            if (name.equals(lastRoot) && childsParent.toString().endsWith(root.toString())) {
                break;
            }
            builder.insert(0, Path.SEPARATOR).insert(0, name);
        }
        return builder.toString();
    }
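    /*
     * Worked example (hypothetical paths): with root = new Path("/data/in") and
     * child = new Path("/data/in/2017/01/file.txt"), the depth difference is 3, so getPathDifference walks up
     * from the child's parent and returns "2017/01"; with child = new Path("/data/in/file.txt") the depth
     * difference is 1 and the empty string is returned.
     */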
    protected Configuration getConfiguration() {
        return hdfsResources.get().getConfiguration();
    }

    protected FileSystem getFileSystem() {
        // trigger Relogin if necessary
        getUserGroupInformation();
        return hdfsResources.get().getFileSystem();
    }

    protected UserGroupInformation getUserGroupInformation() {
        // if kerberos is enabled, check if the ticket should be renewed before returning
        UserGroupInformation userGroupInformation = hdfsResources.get().getUserGroupInformation();
        if (userGroupInformation != null && isTicketOld()) {
            tryKerberosRelogin(userGroupInformation);
        }
        return userGroupInformation;
    }

    protected void tryKerberosRelogin(UserGroupInformation ugi) {
        try {
            getLogger().info("Kerberos ticket age exceeds threshold [{} seconds] " +
                    "attempting to renew ticket for user {}", new Object[]{kerberosReloginThreshold, ugi.getUserName()});
            ugi.doAs((PrivilegedExceptionAction<Void>) () -> {
                ugi.checkTGTAndReloginFromKeytab();
                return null;
            });
            lastKerberosReloginTime = System.currentTimeMillis() / 1000;
            getLogger().info("Kerberos relogin successful or ticket still valid");
        } catch (IOException e) {
            // Most likely case of this happening is ticket is expired and error getting a new one,
            // meaning dfs operations would fail
            getLogger().error("Kerberos relogin failed", e);
            throw new ProcessException("Unable to renew kerberos ticket", e);
        } catch (InterruptedException e) {
            getLogger().error("Interrupted while attempting Kerberos relogin", e);
            throw new ProcessException("Unable to renew kerberos ticket", e);
        }
    }

    protected boolean isTicketOld() {
        return (System.currentTimeMillis() / 1000 - lastKerberosReloginTime) > kerberosReloginThreshold;
    }

    protected static class HdfsResources {
        private final Configuration configuration;
        private final FileSystem fileSystem;
        private final UserGroupInformation userGroupInformation;

        public HdfsResources(Configuration configuration, FileSystem fileSystem, UserGroupInformation userGroupInformation) {
            this.configuration = configuration;
            this.fileSystem = fileSystem;
            this.userGroupInformation = userGroupInformation;
        }

        public Configuration getConfiguration() {
            return configuration;
        }

        public FileSystem getFileSystem() {
            return fileSystem;
        }

        public UserGroupInformation getUserGroupInformation() {
            return userGroupInformation;
        }
    }

    protected static class ValidationResources {
        private final String configResources;
        private final Configuration configuration;

        public ValidationResources(String configResources, Configuration configuration) {
            this.configResources = configResources;
            this.configuration = configuration;
        }

        public String getConfigResources() {
            return configResources;
        }

        public Configuration getConfiguration() {
            return configuration;
        }
    }

    /**
     * Extending Hadoop Configuration to prevent it from caching classes that can't be found. Since users may be
     * adding additional JARs to the classpath we don't want them to have to restart the JVM to be able to load
     * something that was previously not found, but might now be available.
     *
     * Reference the original getClassByNameOrNull from Configuration.
     */
    static class ExtendedConfiguration extends Configuration {

        private final ComponentLog logger;
        private final Map<ClassLoader, Map<String, WeakReference<Class<?>>>> CACHE_CLASSES = new WeakHashMap<>();

        public ExtendedConfiguration(final ComponentLog logger) {
            this.logger = logger;
        }

        public Class<?> getClassByNameOrNull(String name) {
            final ClassLoader classLoader = getClassLoader();

            Map<String, WeakReference<Class<?>>> map;
            synchronized (CACHE_CLASSES) {
                map = CACHE_CLASSES.get(classLoader);
                if (map == null) {
                    map = Collections.synchronizedMap(new WeakHashMap<>());
                    CACHE_CLASSES.put(classLoader, map);
                }
            }

            Class<?> clazz = null;
            WeakReference<Class<?>> ref = map.get(name);
            if (ref != null) {
                clazz = ref.get();
            }

            if (clazz == null) {
                try {
                    clazz = Class.forName(name, true, classLoader);
                } catch (ClassNotFoundException e) {
                    logger.error(e.getMessage(), e);
                    return null;
                }
                // two putters can race here, but they'll put the same class
                map.put(name, new WeakReference<>(clazz));
                return clazz;
            } else {
                // cache hit
                return clazz;
            }
        }
    }
}