/*
 * Copyright 2011 the original author or authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.springframework.data.hadoop.fs;

import java.io.Closeable;
import java.io.File;
import java.io.IOException;
import java.net.URI;
import java.util.Collections;
import java.util.LinkedHashSet;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.springframework.beans.factory.DisposableBean;
import org.springframework.beans.factory.InitializingBean;
import org.springframework.core.PriorityOrdered;
import org.springframework.core.io.DefaultResourceLoader;
import org.springframework.core.io.Resource;
import org.springframework.core.io.support.PathMatchingResourcePatternResolver;
import org.springframework.core.io.support.ResourcePatternResolver;
import org.springframework.util.AntPathMatcher;
import org.springframework.util.Assert;
import org.springframework.util.PathMatcher;
import org.springframework.util.StringUtils;

/**
 * Spring ResourceLoader over Hadoop FileSystem.
 *
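 * <p>A minimal usage sketch (the URI and paths below are placeholders, not
 * defaults):
 *
 * <pre class="code">
 * Configuration conf = new Configuration();
 * HdfsResourceLoader loader = new HdfsResourceLoader(conf, URI.create("hdfs://namenode:8020"));
 * Resource file = loader.getResource("hdfs:/data/input.txt");
 * Resource[] logs = loader.getResources("hdfs:/data/**&#47;*.log");
 * loader.close();
 * </pre>
 *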
 * @author Costin Leau
 * @author Janne Valkealahti
 *
 */
public class HdfsResourceLoader extends DefaultResourceLoader implements ResourcePatternResolver,
        PriorityOrdered, Closeable, DisposableBean, InitializingBean {

    private static final Log log = LogFactory.getLog(HdfsResourceLoader.class);

    /** Pseudo URL prefix for loading from the hdfs path: "hdfs:" */
    private static final String HDFS_URL_PREFIX = "hdfs:";

    private final FileSystem fs;
    private final PathMatcher pathMatcher = new AntPathMatcher();

    /** Flag telling if fs is created in this class */
    private final boolean internalFS;

    private volatile boolean useCodecs = true;
    private volatile CompressionCodecFactory codecsFactory;

    /** Flag telling if a path without a prefix is routed to hdfs */
    private volatile boolean handleNoprefix = true;

    /** If we're impersonating a user */
    private String impersonatedUser = null;

    /** Needed to fall back to default Spring functionality */
    private ResourcePatternResolver resourcePatternResolver;

    /**
     * Constructs a new <code>HdfsResourceLoader</code> instance.
     *
     * @param config Hadoop configuration to use.
     */
    public HdfsResourceLoader(Configuration config) {
        this(config, null);
    }

    /**
     * Constructs a new <code>HdfsResourceLoader</code> instance.
     *
     * @param config Hadoop configuration to use.
     * @param uri Hadoop file system URI.
     * @param user Hadoop user for accessing the file system.
     */
    @SuppressWarnings("resource")
    public HdfsResourceLoader(Configuration config, URI uri, String user) {
        Assert.notNull(config, "a valid configuration is required");

        impersonatedUser = user;
        internalFS = true;
        FileSystem tempFS = null;
        codecsFactory = new CompressionCodecFactory(config);

        try {
            if (uri == null) {
                uri = FileSystem.getDefaultUri(config);
            }
            tempFS = (StringUtils.hasText(impersonatedUser) ? FileSystem.get(uri, config, impersonatedUser)
                    : FileSystem.get(uri, config));
        } catch (Exception ex) {
            tempFS = null;
            throw new IllegalStateException("Cannot create filesystem", ex);
        } finally {
            fs = tempFS;
        }
    }

    /**
     * Constructs a new <code>HdfsResourceLoader</code> instance.
     *
     * @param config Hadoop configuration to use.
     * @param uri Hadoop file system URI.
     */
    public HdfsResourceLoader(Configuration config, URI uri) {
        this(config, uri, null);
    }

    /**
     * Constructs a new <code>HdfsResourceLoader</code> instance.
     *
     * @param fs Hadoop file system to use.
     */
    public HdfsResourceLoader(FileSystem fs) {
        Assert.notNull(fs, "a non-null file system is required");
        this.fs = fs;
        internalFS = false;
        codecsFactory = new CompressionCodecFactory(fs.getConf());
    }

    @Override
    protected Resource getResourceByPath(String path) {
        if (handleNoprefix) {
            return new HdfsResource(stripLeadingTilde(path), fs, codecs());
        } else {
            return super.getResourceByPath(path);
        }
    }

    @Override
    public Resource getResource(String location) {
        // Spring's DefaultResourceLoader relies on java.net.URL throwing an
        // exception before it falls back to getResourceByPath. That is not
        // reliable here, so explicitly check whether the location starts
        // with 'hdfs' or has no prefix at all.
        if (location.startsWith(HDFS_URL_PREFIX) || (location.indexOf(':') < 0 && handleNoprefix)) {
            return getResourceByPath(location);
        } else {
            return super.getResource(location);
        }
    }
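    /**
     * Resolves the given location pattern into matching resources. Locations with
     * the {@code hdfs:} prefix (or no prefix at all, when {@code handleNoprefix}
     * is enabled) are matched with the Ant-style path matcher; for example (an
     * illustrative path, not a default), {@code "hdfs:/data/**&#47;*.log"} returns
     * every {@code .log} file under {@code /data}. All other locations are
     * delegated to the configured {@link ResourcePatternResolver}.
     */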
    @Override
    public Resource[] getResources(String locationPattern) throws IOException {
        Assert.notNull(locationPattern, "Location pattern must not be null");
        if (locationPattern.startsWith(HDFS_URL_PREFIX) || (locationPattern.indexOf(':') < 0 && handleNoprefix)) {
            // Only look for a pattern after a prefix here
            // (to not get fooled by a pattern symbol in a strange prefix).
            if (pathMatcher.isPattern(stripPrefix(locationPattern))) {
                // a resource pattern
                return findPathMatchingResources(locationPattern);
            } else {
                // a single resource with the given name
                return new Resource[] { getResource(stripPrefix(stripLeadingTilde(locationPattern))) };
            }
        } else {
            return resourcePatternResolver.getResources(locationPattern);
        }
    }

    @Override
    public int getOrder() {
        return PriorityOrdered.HIGHEST_PRECEDENCE;
    }

    @Override
    public void destroy() throws IOException {
        close();
    }

    @Override
    public void close() throws IOException {
        if (fs != null && internalFS) {
            try {
                fs.close();
            } catch (NullPointerException npe) {
                // swallow NPE caused by the FS closing too early - see HADOOP-4829
            }
        }
    }

    @Override
    public void afterPropertiesSet() throws Exception {
        if (resourcePatternResolver == null) {
            resourcePatternResolver = new PathMatchingResourcePatternResolver(this);
        }
    }

    @Override
    public ClassLoader getClassLoader() {
        return fs.getConf().getClassLoader();
    }

    /**
     * Sets whether locations without a prefix are handled by this loader
     * (and thus routed to hdfs) instead of the default Spring resolution.
     *
     * @param handleNoprefix whether to route prefix-less locations to hdfs
     */
    public void setHandleNoprefix(boolean handleNoprefix) {
        this.handleNoprefix = handleNoprefix;
    }

    /**
     * Returns the Hadoop file system used by this resource loader.
     *
     * @return the Hadoop file system in use.
     */
    public FileSystem getFileSystem() {
        return fs;
    }

    /**
     * Indicates whether to use (or not) the codecs found inside the Hadoop
     * configuration. This affects the content of the streams backing this
     * resource - whether the raw content is delivered as is
     * or decompressed on the fly (if the configuration allows it).
     * The latter is the default.
     *
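     * <p>For example (a sketch; the path is a placeholder), with codecs enabled
     * a gzip-compressed file is decompressed transparently:
     *
     * <pre class="code">
     * loader.setUseCodecs(true); // the default
     * Resource gz = loader.getResource("hdfs:/data/events.log.gz");
     * InputStream uncompressed = gz.getInputStream(); // decompressed on the fly
     * </pre>
     *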
     * @param useCodecs whether to use any codecs defined in the Hadoop configuration
     */
    public void setUseCodecs(boolean useCodecs) {
        this.useCodecs = useCodecs;
    }

    /**
     * Sets the resource pattern resolver.
     *
     * @param resourcePatternResolver the new resource pattern resolver
     */
    public void setResourcePatternResolver(ResourcePatternResolver resourcePatternResolver) {
        this.resourcePatternResolver = resourcePatternResolver;
    }

    /**
     * Find all resources that match the given location pattern via the
     * Ant-style PathMatcher.
     *
     * @param locationPattern the location pattern to match
     * @return the result as Resource array
     * @throws IOException in case of I/O errors
     */
    protected Resource[] findPathMatchingResources(String locationPattern) throws IOException {
        String rootDirPath = determineRootDir(locationPattern);
        String subPattern = locationPattern.substring(rootDirPath.length());
        Resource[] rootDirResources = getResources(rootDirPath);

        Set<Resource> result = new LinkedHashSet<Resource>(16);
        for (Resource rootDirResource : rootDirResources) {
            result.addAll(doFindPathMatchingFileResources(rootDirResource, subPattern));
        }

        if (log.isDebugEnabled()) {
            log.debug("Resolved location pattern [" + locationPattern + "] to resources " + result);
        }
        return result.toArray(new Resource[result.size()]);
    }

    /**
     * Find all resources in the hdfs file system that match the given location pattern
     * via the Ant-style PathMatcher.
     *
     * @param rootDirResource the root directory as Resource
     * @param subPattern the sub pattern to match (below the root directory)
     * @return the Set of matching Resource instances
     * @throws IOException in case of I/O errors
     */
    protected Set<Resource> doFindPathMatchingFileResources(Resource rootDirResource, String subPattern)
            throws IOException {
        Path rootDir;
        try {
            rootDir = (rootDirResource instanceof HdfsResource ? ((HdfsResource) rootDirResource).getPath()
                    : new Path(rootDirResource.getURI().toString()));
        } catch (IOException ex) {
            if (log.isWarnEnabled()) {
                log.warn("Cannot search for matching files underneath " + rootDirResource
                        + " because it does not correspond to a directory in the file system", ex);
            }
            return Collections.emptySet();
        }
        return doFindMatchingFileSystemResources(rootDir, subPattern);
    }

    /**
     * Find all resources in the file system that match the given location pattern
     * via the Ant-style PathMatcher.
     *
     * @param rootDir the root directory in the file system
     * @param subPattern the sub pattern to match (below the root directory)
     * @return the Set of matching Resource instances
     * @throws IOException in case of I/O errors
     * @see org.springframework.util.PathMatcher
     */
    protected Set<Resource> doFindMatchingFileSystemResources(Path rootDir, String subPattern) throws IOException {
        if (log.isDebugEnabled()) {
            log.debug("Looking for matching resources in directory tree [" + rootDir.toUri().getPath() + "]");
        }
        Set<Path> matchingFiles = retrieveMatchingFiles(rootDir, subPattern);
        Set<Resource> result = new LinkedHashSet<Resource>(matchingFiles.size());
        for (Path path : matchingFiles) {
            result.add(new HdfsResource(path, fs, codecs()));
        }
        return result;
    }

    /**
     * Retrieve files that match the given path pattern,
     * checking the given directory and its subdirectories.
     *
     * @param rootDir the directory to start from
     * @param pattern the pattern to match against,
     * relative to the root directory
     * @return the Set of matching Path instances
     * @throws IOException if directory contents could not be retrieved
     */
    @SuppressWarnings("deprecation")
    protected Set<Path> retrieveMatchingFiles(Path rootDir, String pattern) throws IOException {
        boolean exists = fs.exists(rootDir);
        if (!exists) {
            // Silently skip non-existing directories.
            if (log.isDebugEnabled()) {
                log.debug("Skipping [" + rootDir.toUri().getPath() + "] because it does not exist");
            }
            return Collections.emptySet();
        }
        // the exists() check above should prevent a FileNotFoundException here
        FileStatus fileStatus = fs.getFileStatus(rootDir);
        if (!fileStatus.isDir()) {
            // Complain louder if it exists but is not a directory.
            if (log.isWarnEnabled()) {
                log.warn("Skipping [" + rootDir.toUri().getPath() + "] because it does not denote a directory");
            }
            return Collections.emptySet();
        }
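        // Join the root directory path and the sub-pattern into one full pattern
        // for the Ant-style matcher, e.g. (illustrative values) rootDir "/data"
        // and pattern "**/*.log" become "/data/**/*.log".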
        String fullPattern = StringUtils.replace(rootDir.toUri().getPath(), File.separator, "/");
        if (!pattern.startsWith("/")) {
            fullPattern += "/";
        }
        fullPattern = fullPattern + StringUtils.replace(pattern, File.separator, "/");

        Set<Path> result = new LinkedHashSet<Path>(8);
        doRetrieveMatchingFiles(fullPattern, rootDir, result);
        return result;
    }

    /**
     * Recursively retrieve files that match the given pattern,
     * adding them to the given result list.
     *
     * @param fullPattern the pattern to match against, with prepended root directory path
     * @param dir the current directory
     * @param result the Set of matching File instances to add to
     * @throws IOException if directory contents could not be retrieved
     */
    @SuppressWarnings("deprecation")
    protected void doRetrieveMatchingFiles(String fullPattern, Path dir, Set<Path> result) throws IOException {
        if (log.isDebugEnabled()) {
            log.debug("Searching directory [" + dir.toUri().getPath() + "] for files matching pattern ["
                    + fullPattern + "]");
        }
        FileStatus[] dirContents = null;
        try {
            dirContents = fs.listStatus(dir);
        } catch (IOException ex) {
            // ignore (likely a security exception)
        }
        if (dirContents == null) {
            if (log.isWarnEnabled()) {
                log.warn("Could not retrieve contents of directory [" + dir.toUri().getPath() + "]");
            }
            return;
        }
        for (FileStatus content : dirContents) {
            String currPath = StringUtils.replace(content.getPath().toUri().getPath(), File.separator, "/");
            if (content.isDir() && pathMatcher.matchStart(fullPattern, currPath + "/")) {
                doRetrieveMatchingFiles(fullPattern, content.getPath(), result);
            }
            if (pathMatcher.match(fullPattern, currPath)) {
                result.add(content.getPath());
            }
        }
    }

    /**
     * Determine the root directory for the given location.
     * <p>Used for determining the starting point for file matching,
     * resolving the root directory location and passing it
     * into {@code doFindPathMatchingFileResources}, with the
     * remainder of the location as pattern.
     * <p>Will return "/dir/" for the pattern "/dir/*.xml",
     * for example.
     *
     * @param location the location to check
     * @return the part of the location that denotes the root directory
     */
    protected String determineRootDir(String location) {
        int rootDirEnd = location.length();
        while (rootDirEnd > 0 && pathMatcher.isPattern(location.substring(0, rootDirEnd))) {
            rootDirEnd = location.lastIndexOf('/', rootDirEnd - 2) + 1;
        }
        return location.substring(0, rootDirEnd);
    }

    /**
     * Removes a leading tilde shortcut if one exists.
     */
    private String stripLeadingTilde(String locationPattern) {
        if (locationPattern.startsWith("~/")) {
            return locationPattern.substring(2);
        }
        return locationPattern;
    }

    private CompressionCodecFactory codecs() {
        return (useCodecs ? codecsFactory : null);
    }

    /**
     * Removes the prefix from a given path so that what's
     * left is a plain 'file' path.
     */
    private static String stripPrefix(String path) {
        String ret = null;
        try {
            ret = new Path(path).toUri().getPath();
        } catch (Exception e) {
            // ignore - fall through to the manual prefix handling below
        }
        if (ret == null && path.startsWith(HDFS_URL_PREFIX) && !path.startsWith("hdfs://")) {
            // path is of the form 'hdfs:myfile.txt' - strip the prefix and colon
            ret = path.substring(5);
        }
        if (ret == null) {
            // fall back to the given path
            ret = path;
        }
        return ret;
    }

}