/*******************************************************************************
 *
 * Pentaho Big Data
 *
 * Copyright (C) 2002-2017 by Pentaho : http://www.pentaho.com
 *
 *******************************************************************************
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
 * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations under the License.
 *
 ******************************************************************************/

package org.pentaho.hadoop.shim;

import java.io.IOException;
import java.io.InputStream;
import java.lang.reflect.Field;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Properties;
import java.util.ServiceLoader;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.google.common.annotations.VisibleForTesting;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.vfs2.FileObject;
import org.apache.commons.vfs2.FileSelectInfo;
import org.apache.commons.vfs2.FileSelector;
import org.apache.commons.vfs2.FileSystemException;
import org.apache.commons.vfs2.FileType;
import org.apache.commons.vfs2.impl.DefaultFileSystemManager;
import org.apache.log4j.Logger;
import org.pentaho.di.core.Const;
import org.pentaho.di.i18n.BaseMessages;
import org.pentaho.hadoop.shim.api.ActiveHadoopConfigurationLocator;
import org.pentaho.hadoop.shim.api.Required;
import org.pentaho.hadoop.shim.api.ShimProperties;
import org.pentaho.hadoop.shim.spi.HadoopConfigurationProvider;
import org.pentaho.hadoop.shim.spi.HadoopShim;
import org.pentaho.hadoop.shim.spi.PentahoHadoopShim;
import org.pentaho.hadoop.shim.spi.PigShim;
import org.pentaho.hadoop.shim.spi.SnappyShim;
import org.pentaho.hadoop.shim.spi.SqoopShim;
import org.pentaho.hbase.shim.spi.HBaseShim;
import org.pentaho.oozie.shim.api.OozieClientFactory;

/**
 * A file-based Hadoop configuration provider that knows how to load Hadoop configurations from a VFS file system.
 * This class is not thread-safe.
 */
public class HadoopConfigurationLocator implements HadoopConfigurationProvider {
  private static final String JAR_EXTENSION = ".jar";

  private static final String CONFIG_PROPERTIES_FILE = "config.properties";

  private static final String CONFIG_PROPERTY_IGNORE_CLASSES = "ignore.classes";

  private static final String CONFIG_PROPERTY_EXCLUDE_JARS = "exclude.jars";

  private static final String SHIM_CLASSPATH_IGNORE = "classpath.ignore";

  private static final String CONFIG_PROPERTY_CLASSPATH = "classpath";

  private static final String CONFIG_PROPERTY_LIBRARY_PATH = "library.path";

  private static final String CONFIG_PROPERTY_NAME = "name";

  private static final String PMR_PROPERTIES = "pmr.properties";

  private static final URL[] EMPTY_URL_ARRAY = new URL[ 0 ];

  private static final Class<?> PKG = HadoopConfigurationLocator.class;

  private Logger logger = Logger.getLogger( getClass() );

  /**
   * The set of shim classes to load from each Hadoop configuration. TODO Externalize this list so we may configure it
   * per installation.
   */
  @SuppressWarnings( "unchecked" )
  private static final Class<? extends PentahoHadoopShim>[] SHIM_TYPES = new Class[] {
    HadoopShim.class,
    HBaseShim.class,
    PigShim.class,
    SnappyShim.class,
    SqoopShim.class,
    OozieClientFactory.class
  };

  private static final PentahoHadoopShim[] EMPTY_SHIM_ARRAY = new PentahoHadoopShim[ 0 ];

  /**
   * Currently known shim configurations
   */
  private Map<String, HadoopConfiguration> configurations;

  /**
   * Flag indicating we've been initialized. We require initialization to know where to look for Hadoop configurations
   * on disk.
   */
  private boolean initialized;

  /**
   * Used to determine the active Hadoop configuration at runtime
   */
  private ActiveHadoopConfigurationLocator activeLocator;

  /**
   * The file system manager used to provide shims a way to register their
   * {@link org.apache.commons.vfs2.provider.FileProvider} implementations.
   */
  private HadoopConfigurationFileSystemManager fsm;

  private DefaultFileSystemManager defaultFsm;

  /**
   * Initialize this factory with a directory of where to look for cluster configurations.
   *
   * @param baseDir       Directory to look for Hadoop configurations in
   * @param activeLocator A locator for resolving the current active Hadoop configuration
   * @param fsm           A file system manager to inject VFS file providers into from any loaded Hadoop configuration
   * @throws ConfigurationException Error locating or loading the active Hadoop configuration
   */
  public void init( FileObject baseDir, ActiveHadoopConfigurationLocator activeLocator,
                    DefaultFileSystemManager fsm ) throws ConfigurationException {
    if ( baseDir == null ) {
      throw new NullPointerException( FileObject.class.getSimpleName() + " is required" );
    }
    if ( activeLocator == null ) {
      throw new NullPointerException( ActiveHadoopConfigurationLocator.class.getSimpleName() + " is required" );
    }
    if ( fsm == null ) {
      throw new NullPointerException( DefaultFileSystemManager.class.getSimpleName() + " is required" );
    }
    this.defaultFsm = fsm;
    this.fsm = new HadoopConfigurationFileSystemManager( this, fsm );
    findHadoopConfigurations( baseDir, activeLocator );
    this.activeLocator = activeLocator;
    initialized = true;
  }
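  /*
   * Example usage, as a minimal sketch: the directory layout, the configuration id "hdp25" and the anonymous
   * ActiveHadoopConfigurationLocator are illustrative, not part of this class, and the example assumes the
   * interface exposes only getActiveConfigurationId().
   *
   *   DefaultFileSystemManager fsm = new DefaultFileSystemManager();
   *   fsm.addProvider( "file", new org.apache.commons.vfs2.provider.local.DefaultLocalFileProvider() );
   *   fsm.init();
   *   FileObject baseDir = fsm.resolveFile( "file:///opt/pentaho/hadoop-configurations" );
   *   HadoopConfigurationLocator locator = new HadoopConfigurationLocator();
   *   locator.init( baseDir, new ActiveHadoopConfigurationLocator() {
   *     @Override
   *     public String getActiveConfigurationId() {
   *       return "hdp25"; // must match the basename of a child folder of baseDir (case-insensitive)
   *     }
   *   }, fsm );
   *   HadoopConfiguration active = locator.getActiveConfiguration();
   */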
  /**
   * Attempt to find any Hadoop configuration as a direct descendant of the provided directory.
   *
   * @param baseDir       Directory to look for Hadoop configurations in
   * @param activeLocator A locator for resolving the current active Hadoop configuration
   * @throws ConfigurationException Error loading a configuration or the base directory does not exist
   */
  private void findHadoopConfigurations( FileObject baseDir, ActiveHadoopConfigurationLocator activeLocator )
    throws ConfigurationException {
    configurations = new HashMap<String, HadoopConfiguration>();
    try {
      if ( !baseDir.exists() ) {
        throw new ConfigurationException( BaseMessages.getString( PKG,
          "Error.HadoopConfigurationDirectoryDoesNotExist", baseDir.getURL() ) );
      }
      for ( FileObject f : baseDir.findFiles( new FileSelector() {
        @Override
        public boolean includeFile( FileSelectInfo info ) throws Exception {
          return info.getDepth() == 1 && FileType.FOLDER.equals( info.getFile().getType() );
        }

        @Override
        public boolean traverseDescendents( FileSelectInfo info ) throws Exception {
          return info.getDepth() == 0;
        }
      } ) ) {
        // Only load the active configuration (its id should match the folder's basename; we allow
        // case-insensitive matches)
        if ( f.getName().getBaseName().equalsIgnoreCase( activeLocator.getActiveConfigurationId() ) ) {
          HadoopConfiguration config = loadHadoopConfiguration( f );
          if ( config != null ) {
            configurations.put( config.getIdentifier(), config );
          }
        }
      }
    } catch ( FileSystemException ex ) {
      throw new ConfigurationException( BaseMessages.getString( PKG, "Error.UnableToLoadConfigurations",
        baseDir.getName().getFriendlyURI() ), ex );
    }
  }

  /**
   * Exclude the jars listed in the exclude.jars property of config.properties from the list of URLs.
   *
   * @param urls                 the list of all the URLs to add to the class loader
   * @param excludedJarsProperty exclude.jars property from a config.properties file
   * @return The remaining jars in {@code urls} after excluding the jars listed in {@code excludedJarsProperty}.
   */
  protected List<URL> filterJars( List<URL> urls, String excludedJarsProperty ) {
    if ( excludedJarsProperty == null || excludedJarsProperty.trim().isEmpty() ) {
      return urls;
    }
    for ( String excludedJar : excludedJarsProperty.split( "," ) ) {
      // Match artifacts named like "<excludedJar>-<version>.jar" anywhere on the path
      Pattern pattern = Pattern.compile( ".*/" + excludedJar.toLowerCase() + "-.*\\.jar$" );
      Matcher matcher = pattern.matcher( "" );
      Iterator<URL> iterator = urls.listIterator();
      while ( iterator.hasNext() ) {
        String lowerUrl = iterator.next().toString().toLowerCase();
        if ( lowerUrl.contains( excludedJar.toLowerCase() ) ) {
          if ( excludedJar.endsWith( ".jar" ) || lowerUrl.contains( excludedJar.toLowerCase() + ".jar" ) ) {
            iterator.remove();
          } else if ( matcher.reset( lowerUrl ).matches() ) {
            iterator.remove();
          }
        }
      }
    }
    return urls;
  }

  /**
   * Find all jar files within the path provided.
   *
   * @param path     Path to search for jar files within
   * @param maxdepth Maximum traversal depth (1-based)
   * @param paths    URL suffixes to skip entirely (see the classpath.ignore property)
   * @return All jars found within {@code path} in at most {@code maxdepth} subdirectories.
   * @throws FileSystemException Error traversing the file system
   */
  private List<URL> findJarsIn( FileObject path, final int maxdepth, final Set<String> paths )
    throws FileSystemException {
    FileObject[] jars = path.findFiles( new FileSelector() {
      @Override
      public boolean includeFile( FileSelectInfo info ) throws Exception {
        for ( String path : paths ) {
          if ( info.getFile().getURL().toString().endsWith( path ) ) {
            return false;
          }
        }
        return info.getFile().getName().getBaseName().endsWith( JAR_EXTENSION );
      }

      @Override
      public boolean traverseDescendents( FileSelectInfo info ) throws Exception {
        for ( String path : paths ) {
          if ( info.getFile().getURL().toString().endsWith( path ) ) {
            return false;
          }
        }
        return info.getDepth() <= maxdepth;
      }
    } );
    List<URL> jarUrls = new ArrayList<URL>();
    for ( FileObject jar : jars ) {
      jarUrls.add( jar.getURL() );
    }
    return jarUrls;
  }
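  /*
   * filterJars (above) in action, as a sketch; the jar names are made up. With
   * exclude.jars=guava,jackson-core-2.1.4.jar a URL list of
   *
   *   .../lib/guava-14.0.1.jar       -> removed (matches the "guava-<version>.jar" pattern)
   *   .../lib/jackson-core-2.1.4.jar -> removed (explicit .jar entry)
   *   .../lib/guavalike-1.0.jar      -> kept (contains "guava" but matches neither rule)
   *
   * is filtered down to the last entry only.
   */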
  /**
   * Throws a RuntimeException if this locator has not been initialized via
   * {@link #init(FileObject, ActiveHadoopConfigurationLocator, DefaultFileSystemManager)}.
   */
  private void checkInitialized() {
    if ( !initialized ) {
      throw new RuntimeException( BaseMessages.getString( PKG, "Error.LocatorNotInitialized" ) );
    }
  }

  /**
   * Locates an implementation of {@code service} using the {@link ServiceLoader}.
   *
   * @param cl      Class loader to look for implementations in
   * @param service Service interface to locate an implementation of
   * @return The first implementation found, or null if none could be located.
   */
  protected <T> T locateServiceImpl( ClassLoader cl, Class<T> service ) {
    ServiceLoader<T> loader = ServiceLoader.load( service, cl );
    Iterator<T> iter = loader.iterator();
    if ( iter.hasNext() ) {
      return iter.next();
    }
    return null;
  }

  /**
   * Create a ClassLoader to load resources for a {@code HadoopConfiguration}.
   *
   * @param root                    Configuration root directory
   * @param parent                  Parent class loader to delegate to if resources cannot be found in the
   *                                configuration's directory or provided classpath
   * @param classpathUrls           Additional URLs to add to the class loader. These will be added before any
   *                                internal resources.
   * @param configurationProperties Properties for this configuration (consulted for classpath.ignore and
   *                                exclude.jars)
   * @param ignoredClasses          Classes (or packages) that should not be loaded by the class loader
   * @return A class loader capable of loading a Hadoop configuration located at {@code root}.
   * @throws ConfigurationException Error creating a class loader for the Hadoop configuration located at {@code root}
   */
  protected ClassLoader createConfigurationLoader( FileObject root, ClassLoader parent, List<URL> classpathUrls,
                                                   ShimProperties configurationProperties, String... ignoredClasses )
    throws ConfigurationException {
    try {
      if ( root == null || !FileType.FOLDER.equals( root.getType() ) ) {
        throw new IllegalArgumentException( "root must be a folder: " + root );
      }

      // Find all jar files in the configuration, at most 3 folders deep
      List<URL> jars = findJarsIn( root, 3, configurationProperties.getConfigSet( SHIM_CLASSPATH_IGNORE ) );

      // Add the root of the configuration
      jars.add( 0, new URL( root.getURL().toExternalForm() + "/" ) );

      // Inject any overriding URLs before all other paths
      if ( classpathUrls != null ) {
        jars.addAll( 0, classpathUrls );
      }

      // Exclude jars listed in the exclude.jars property of config.properties from the list of jars
      jars = filterJars( jars, configurationProperties.getProperty( CONFIG_PROPERTY_EXCLUDE_JARS ) );

      return new HadoopConfigurationClassLoader( jars.toArray( EMPTY_URL_ARRAY ), parent, ignoredClasses );
    } catch ( Exception ex ) {
      throw new ConfigurationException( BaseMessages.getString( PKG, "Error.CreatingClassLoader" ), ex );
    }
  }

  private Properties getPmrProperties() {
    Properties properties = new Properties();
    InputStream pmrProperties = getClass().getClassLoader().getResourceAsStream( PMR_PROPERTIES );
    if ( pmrProperties != null ) {
      try {
        properties.load( pmrProperties );
      } catch ( IOException ioe ) {
        // pmr.properties not available
      } finally {
        try {
          pmrProperties.close();
        } catch ( IOException e ) {
          // Ignore errors closing the stream
        }
      }
    }
    return properties;
  }

  @VisibleForTesting
  boolean isRunningOnCluster() {
    Properties pmrProperties = getPmrProperties();
    String isPmr = pmrProperties.getProperty( "isPmr", "false" );
    return "true".equals( isPmr );
  }
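  /*
   * For reference, a config.properties using the keys this class reads might look like the sketch below.
   * The values are illustrative, not taken from any shipped shim; prefix handling for java.system.* is
   * defined by ShimProperties.getPrefixedProperties.
   *
   *   name=My Hadoop Distribution
   *   classpath=conf,extra-lib
   *   ignore.classes=org.apache.log4j,org.slf4j
   *   exclude.jars=guava,slf4j-log4j12
   *   library.path=/opt/cluster/native
   *   classpath.ignore=client
   *   required.classes=org.apache.hadoop.fs.FileSystem
   *   java.system.java.security.krb5.conf=/etc/krb5.conf
   */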
  /**
   * Parse a set of URLs from a comma-separated list of URLs. If a URL points to a directory all jar files within
   * that directory will be returned as well.
   *
   * @param root      Directory to resolve relative paths against
   * @param urlString Comma-separated list of URLs (relative or absolute)
   * @return List of URLs resolved from {@code urlString}
   */
  protected List<URL> parseURLs( FileObject root, String urlString ) {
    if ( urlString == null || urlString.trim().isEmpty() ) {
      return Collections.emptyList();
    }
    String[] paths = urlString.split( "," );
    List<URL> urls = new ArrayList<URL>();
    for ( String path : paths ) {
      try {
        FileObject file = root.resolveFile( path.trim() );
        if ( !file.exists() ) {
          file = defaultFsm.resolveFile( path.trim() );
        }
        if ( FileType.FOLDER.equals( file.getType() ) ) {
          // Add directories with a trailing / so the URLClassLoader interprets them as directories
          urls.add( new URL( file.getURL().toExternalForm() + "/" ) );
          // Also add all jars within this directory
          urls.addAll( findJarsIn( file, 1, new HashSet<String>() ) );
        } else {
          urls.add( file.getURL() );
        }
      } catch ( Exception e ) {
        // Log invalid path
        logger.error( BaseMessages.getString( PKG, "Error.InvalidClasspathEntry", path ) );
      }
    }
    return urls;
  }
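  /*
   * parseURLs (above) example, as a sketch with illustrative paths. Given a configuration rooted at
   * file:///opt/pentaho/hadoop-configurations/hdp25 and classpath=conf,lib/pig.jar this resolves to:
   *
   *   file:///opt/pentaho/hadoop-configurations/hdp25/conf/        (directory, trailing slash added)
   *   file:///opt/pentaho/hadoop-configurations/hdp25/conf/x.jar   (any jars directly inside conf/)
   *   file:///opt/pentaho/hadoop-configurations/hdp25/lib/pig.jar  (plain file entry)
   */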
  /**
   * Attempt to discover a valid Hadoop configuration from the provided folder.
   *
   * @param folder Folder that may represent a Hadoop configuration
   * @return A Hadoop configuration for the folder provided or null if none is found.
   * @throws ConfigurationException Error when loading the Hadoop configuration.
   */
  protected HadoopConfiguration loadHadoopConfiguration( FileObject folder ) throws ConfigurationException {
    ShimProperties configurationProperties = new ShimProperties();
    try {
      FileObject configFile = folder.getChild( CONFIG_PROPERTIES_FILE );
      if ( configFile != null ) {
        configurationProperties.putAll( loadProperties( configFile ) );
      }
    } catch ( Exception ex ) {
      throw new ConfigurationException( BaseMessages.getString( PKG, "Error.UnableToLoadConfigurationProperties",
        CONFIG_PROPERTIES_FILE ) );
    }

    // Apply any Java system properties declared under the java.system prefix
    for ( Entry<String, String> entry : configurationProperties.getPrefixedProperties( "java.system" ).entrySet() ) {
      System.setProperty( entry.getKey(), entry.getValue() );
    }

    try {
      List<URL> classpathElements = null;
      if ( !isRunningOnCluster() ) {
        // Parse all URLs from an optional classpath from the configuration file
        classpathElements = parseURLs( folder, configurationProperties.getProperty( CONFIG_PROPERTY_CLASSPATH ) );
      }

      // Allow external configuration of classes to ignore
      String ignoredClassesProperty = configurationProperties.getProperty( CONFIG_PROPERTY_IGNORE_CLASSES );
      String[] ignoredClasses = null;
      if ( !StringUtils.isEmpty( ignoredClassesProperty ) ) {
        ignoredClasses = ignoredClassesProperty.split( "," );
      }

      // Pass our class loader in as the configuration class loader's parent so it can find the same API classes
      // we're using
      ClassLoader cl = createConfigurationLoader( folder, getClass().getClassLoader(), classpathElements,
        configurationProperties, ignoredClasses );
      verifyClasses( cl, configurationProperties.getProperty( "required.classes" ),
        configurationProperties.getProperty( CONFIG_PROPERTY_NAME ) );

      // Treat the Hadoop shim specially. It is absolutely required for a Hadoop configuration.
      HadoopShim hadoopShim = null;
      List<PentahoHadoopShim> shims = new ArrayList<PentahoHadoopShim>();
      // Attempt to locate a shim within this folder
      for ( Class<? extends PentahoHadoopShim> shimType : SHIM_TYPES ) {
        PentahoHadoopShim s = locateServiceImpl( cl, shimType );
        if ( s == null && shimType.getAnnotation( Required.class ) != null ) {
          logger.warn( BaseMessages.getString( PKG, "Error.MissingRequiredShim", shimType.getSimpleName() ) );
          // Do not continue to load the configuration if we are missing a required shim
          return null;
        }
        if ( HadoopShim.class.isAssignableFrom( shimType ) ) {
          hadoopShim = (HadoopShim) s;
        } else {
          shims.add( s );
        }
      }

      String id = folder.getName().getBaseName();
      String name = configurationProperties.getProperty( CONFIG_PROPERTY_NAME, id );
      HadoopConfiguration config = new HadoopConfiguration( configurationProperties, folder, id, name, hadoopShim,
        shims.toArray( EMPTY_SHIM_ARRAY ) );

      // Register native libraries after everything else has been loaded successfully
      registerNativeLibraryPaths( configurationProperties.getProperty( CONFIG_PROPERTY_LIBRARY_PATH ) );

      hadoopShim.onLoad( config, fsm );
      return config;
    } catch ( Throwable t ) {
      throw new ConfigurationException(
        BaseMessages.getString( PKG, "Error.LoadingConfiguration" ) + " " + t.toString(), t );
    }
  }

  protected void verifyClasses( ClassLoader classLoader, String requiredClasses, String shimName )
    throws ConfigurationException {
    if ( !Const.isEmpty( requiredClasses ) ) {
      for ( String className : requiredClasses.split( "," ) ) {
        try {
          classLoader.loadClass( className );
        } catch ( Throwable e ) {
          throw new ConfigurationException(
            BaseMessages.getString( PKG, "Error.MissingRequiredClasses", className, shimName ) );
        }
      }
    }
  }

  /**
   * Register a comma-separated list of native library paths.
   *
   * @param paths Comma-separated list of library paths
   */
  protected void registerNativeLibraryPaths( String paths ) {
    if ( paths == null ) {
      return;
    }
    for ( String path : paths.split( "," ) ) {
      boolean successful = registerNativeLibraryPath( path );
      if ( !successful ) {
        logger.error( BaseMessages.getString( PKG, "Error.RegisteringLibraryPath", path ) );
      }
    }
  }
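  /*
   * Note on the reflection below: java.library.path is normally fixed at JVM startup, so this appends to the
   * private ClassLoader.usr_paths field instead. That field is an implementation detail of the Oracle/OpenJDK
   * ClassLoader and may not exist on other JVMs, hence the boolean return rather than an exception. A rough
   * launch-time equivalent (illustrative path) would be:
   *
   *   java -Djava.library.path=/opt/cluster/native ...
   */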
  /**
   * Dynamically register a native library path. This relies on a specific implementation detail of ClassLoader: its
   * private usr_paths field.
   *
   * @param path Library path to add
   * @return {@code true} if the library path could be added successfully
   */
  protected boolean registerNativeLibraryPath( String path ) {
    if ( path == null ) {
      throw new NullPointerException();
    }
    path = path.trim();
    try {
      Field f = ClassLoader.class.getDeclaredField( "usr_paths" );
      boolean accessible = f.isAccessible();
      f.setAccessible( true );
      try {
        String[] paths = (String[]) f.get( null );

        // Make sure the path isn't already registered
        for ( String p : paths ) {
          if ( p.equals( path ) ) {
            return true; // Success, it's already there!
          }
        }
        String[] newPaths = new String[ paths.length + 1 ];
        System.arraycopy( paths, 0, newPaths, 0, paths.length );
        newPaths[ paths.length ] = path;
        f.set( null, newPaths );
        // Success!
        return true;
      } finally {
        f.setAccessible( accessible );
      }
    } catch ( Exception ex ) {
      // Something went wrong, definitely not successful
      return false;
    }
  }

  /**
   * Load the properties file located at {@code file}.
   *
   * @param file Location of a properties file to load
   * @return Loaded properties file
   * @throws IOException         Error loading properties from file
   * @throws FileSystemException Error locating input stream for file
   */
  protected Properties loadProperties( FileObject file ) throws FileSystemException, IOException {
    Properties p = new Properties();
    InputStream in = file.getContent().getInputStream();
    try {
      p.load( in );
    } finally {
      in.close();
    }
    return p;
  }

  @Override
  public List<HadoopConfiguration> getConfigurations() {
    checkInitialized();
    return new ArrayList<HadoopConfiguration>( configurations.values() );
  }

  @Override
  public boolean hasConfiguration( String id ) {
    checkInitialized();
    return configurations.containsKey( id );
  }

  @Override
  public HadoopConfiguration getConfiguration( String id ) throws ConfigurationException {
    checkInitialized();
    HadoopConfiguration config = configurations.get( id );
    if ( config == null ) {
      throw new ConfigurationException( BaseMessages.getString( PKG, "Error.UnknownHadoopConfiguration", id ) );
    }
    return config;
  }

  @Override
  public HadoopConfiguration getActiveConfiguration() throws ConfigurationException {
    return getConfiguration( activeLocator.getActiveConfigurationId() );
  }
}