/**
 * DataCleaner (community edition)
 * Copyright (C) 2014 Neopost - Customer Information Management
 *
 * This copyrighted material is made available to anyone wishing to use, modify,
 * copy, or redistribute it subject to the terms and conditions of the GNU
 * Lesser General Public License, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
 * for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this distribution; if not, write to:
 * Free Software Foundation, Inc.
 * 51 Franklin Street, Fifth Floor
 * Boston, MA 02110-1301 USA
 */
package org.datacleaner.spark.utils;

import java.io.IOException;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.metamodel.util.FileHelper;
import org.apache.metamodel.util.FileResource;
import org.apache.metamodel.util.HdfsResource;
import org.apache.metamodel.util.Resource;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.deploy.SparkHadoopUtil;
import org.datacleaner.util.HadoopResource;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.base.Strings;

/**
 * Helper class for interacting with HDFS.
 */
public class HdfsHelper {

    private static final Logger logger = LoggerFactory.getLogger(HdfsHelper.class);

    private static Configuration _lastKnownConfiguration;

    private final Configuration _hadoopConfiguration;

    public HdfsHelper(final JavaSparkContext sparkContext) {
        this(getHadoopConfigurationIfYarnMode(sparkContext));
    }

    public HdfsHelper(final Configuration configuration) {
        if (configuration == null) {
            logger.warn("Hadoop Configuration is null!", new Throwable());
        } else {
            _lastKnownConfiguration = configuration;
        }
        _hadoopConfiguration = configuration;
    }

    /**
     * Creates a {@link HdfsHelper} without any configuration or context
     * available. This is normally not the recommended way to obtain a
     * {@link HdfsHelper}, but it may be necessary in executor functions where
     * the {@link JavaSparkContext} is not in scope and not made available by
     * Spark (at least not through Spark's end-user API).
     *
     * @return a new {@link HdfsHelper}
     */
    public static HdfsHelper createHelper() {
        Configuration configuration = _lastKnownConfiguration;
        if (configuration == null) {
            try {
                final SparkHadoopUtil sparkHadoopUtil = SparkHadoopUtil.get();
                if (sparkHadoopUtil.isYarnMode()) {
                    configuration = sparkHadoopUtil.conf();
                }
            } catch (final Exception e) {
                // the above is developer API so we don't consider it very stable
            }
        }
        return new HdfsHelper(configuration);
    }

    private static Configuration getHadoopConfigurationIfYarnMode(final JavaSparkContext sparkContext) {
        final String sparkMaster = sparkContext.getConf().get("spark.master");
        if (Strings.isNullOrEmpty(sparkMaster) || "local".equals(sparkMaster)) {
            return null;
        }
        return sparkContext.hadoopConfiguration();
    }

    /**
     * Clears the statically cached reference to a {@link Configuration} object, which is used when an
     * {@link HdfsHelper} is instantiated without an explicit {@link Configuration} object.
     *
     * Note: Only use this if you want to start fresh and make sure no lingering objects are used.
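     *
     * Illustrative sketch of the caching lifecycle (names such as {@code sparkContext} are
     * assumptions for the example, not part of this class):
     * <pre>{@code
     * // Driver side: caches the Hadoop Configuration when running against YARN.
     * HdfsHelper driverHelper = new HdfsHelper(sparkContext);
     *
     * // Executor side: no JavaSparkContext in scope, so fall back to the cached/derived configuration.
     * HdfsHelper executorHelper = HdfsHelper.createHelper();
     *
     * // Start fresh, e.g. between test runs:
     * HdfsHelper.clear();
     * }</pre>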
     */
    public static void clear() {
        _lastKnownConfiguration = null;
    }

    public String readFile(final URI filepath) {
        return readFile(filepath, false);
    }

    public String readFile(final URI filepath, final boolean failOnNoData) {
        final Resource resourceInUse = getResourceToUse(filepath);
        if (failOnNoData && resourceInUse == null) {
            throw new IllegalArgumentException("Could not resolve resource: " + filepath);
        }
        return readResource(resourceInUse);
    }

    public String readResource(final Resource resource) {
        final Resource resourceInUse = getResourceToUse(resource);
        if (resourceInUse == null) {
            return null;
        }
        return resourceInUse.read(in -> {
            return FileHelper.readInputStreamAsString(in, FileHelper.DEFAULT_ENCODING);
        });
    }

    public Resource getResourceToUse(final Resource resource) {
        if (resource == null) {
            return null;
        }
        if (_hadoopConfiguration == null || resource instanceof HadoopResource) {
            return resource;
        }
        if (resource instanceof HdfsResource) {
            // wrap the resource with our known configuration
            return new HadoopResource(resource, _hadoopConfiguration, HadoopResource.DEFAULT_CLUSTERREFERENCE);
        }
        if (resource instanceof FileResource) {
            // this may very well be a path that was mis-interpreted as a local
            // file because no scheme was defined
            if (resource.getQualifiedPath().startsWith("/")) {
                return new HadoopResource(resource, _hadoopConfiguration, HadoopResource.DEFAULT_CLUSTERREFERENCE);
            }
        }
        return resource;
    }

    public Resource getResourceToUse(final URI path) {
        if (path == null) {
            return null;
        }
        if (_hadoopConfiguration == null) {
            if ("hdfs".equals(path.getScheme())) {
                return new HdfsResource(path.toString());
            }
            return new FileResource(path.toString());
        }
        return new HadoopResource(path, _hadoopConfiguration, HadoopResource.DEFAULT_CLUSTERREFERENCE);
    }

    public boolean isDirectory(final URI path) {
        final Resource resource = getResourceToUse(path);
        if (!resource.isExists()) {
            return false;
        }
        if (resource instanceof FileResource) {
            return ((FileResource) resource).getFile().isDirectory();
        }
        if (resource instanceof HdfsResource) {
            final FileSystem fileSystem = ((HdfsResource) resource).getHadoopFileSystem();
            final Path hadoopPath = ((HdfsResource) resource).getHadoopPath();
            try {
                return fileSystem.isDirectory(hadoopPath);
            } catch (final IOException e) {
                throw new IllegalStateException(e);
            }
        }
        // actually we don't know, but most likely it's not a directory
        return false;
    }
}