/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.metamodel.util;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.Serializable;
import java.net.URI;
import java.util.Objects;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.metamodel.MetaModelException;

import com.google.common.base.Strings;

/**
 * A {@link Resource} implementation that connects to Apache Hadoop's HDFS
 * distributed file system.
 */
public class HdfsResource extends AbstractResource implements Serializable {

    private static final long serialVersionUID = 1L;

    public static final String SYSTEM_PROPERTY_HADOOP_CONF_DIR_ENABLED = "metamodel.hadoop.use_hadoop_conf_dir";

    public static final String SCHEME_HDFS = "hdfs";
    public static final String SCHEME_SWIFT = "swift";
    public static final String SCHEME_EMRFS = "emrfs";
    public static final String SCHEME_MAPRFS = "maprfs";
    public static final String SCHEME_S3 = "s3";
    public static final String SCHEME_FTP = "ftp";

    private final String _scheme;
    private final String _hadoopConfDir;
    private final String _hostname;
    private final int _port;
    private final String _filepath;
    private transient Path _path;

    /**
     * Creates a {@link HdfsResource}
     *
     * @param url
     *            a URL of the form: scheme://hostname:port/path/to/file
     */
    public HdfsResource(String url) {
        this(url, null);
    }

    /**
     * Creates a {@link HdfsResource}
     *
     * @param url
     *            a URL of the form: scheme://hostname:port/path/to/file
     * @param hadoopConfDir
     *            the path of a directory containing the Hadoop and HDFS
     *            configuration file(s).
     */
    public HdfsResource(String url, String hadoopConfDir) {
        if (url == null) {
            throw new IllegalArgumentException("Url cannot be null");
        }
        final URI uri = URI.create(url);
        _scheme = uri.getScheme();
        _hostname = uri.getHost();
        _port = uri.getPort();
        _filepath = uri.getPath();
        _hadoopConfDir = hadoopConfDir;
    }

    /**
     * Creates a {@link HdfsResource} using the "hdfs" scheme
     *
     * @param hostname
     *            the HDFS (namenode) hostname
     * @param port
     *            the HDFS (namenode) port number
     * @param filepath
     *            the path on HDFS to the file, starting with slash ('/')
     */
    public HdfsResource(String hostname, int port, String filepath) {
        this(SCHEME_HDFS, hostname, port, filepath, null);
    }

    /**
     * Creates a {@link HdfsResource}
     *
     * @param scheme
     *            the scheme to use (consider using {@link #SCHEME_HDFS} or any
     *            of the other "SCHEME_" constants).
     * @param hostname
     *            the HDFS (namenode) hostname
     * @param port
     *            the HDFS (namenode) port number
     * @param filepath
     *            the path on HDFS to the file, starting with slash ('/')
     * @param hadoopConfDir
     *            the path of a directory containing the Hadoop and HDFS
     *            configuration file(s).
     */
    public HdfsResource(String scheme, String hostname, int port, String filepath, String hadoopConfDir) {
        _scheme = scheme;
        _hostname = hostname;
        _port = port;
        _filepath = filepath;
        _hadoopConfDir = hadoopConfDir;
    }

    public String getScheme() {
        if (_scheme == null) {
            // should only happen for deserialized and old objects before
            // METAMODEL-220 introduced dynamic schemes
            return SCHEME_HDFS;
        }
        return _scheme;
    }

    public String getFilepath() {
        return _filepath;
    }

    public String getHostname() {
        return _hostname;
    }

    public int getPort() {
        return _port;
    }

    public String getHadoopConfDir() {
        return _hadoopConfDir;
    }

    @Override
    public String getName() {
        final int lastSlash = _filepath.lastIndexOf('/');
        if (lastSlash != -1) {
            return _filepath.substring(lastSlash + 1);
        }
        return _filepath;
    }

    @Override
    public String getQualifiedPath() {
        final StringBuilder sb = new StringBuilder();
        sb.append(getScheme());
        sb.append("://");
        if (_hostname != null) {
            sb.append(_hostname);
        }
        if (_port > 0) {
            sb.append(':');
            sb.append(_port);
        }
        sb.append(_filepath);
        return sb.toString();
    }

    @Override
    public boolean isReadOnly() {
        // We assume it is not read-only
        return false;
    }

    @Override
    public boolean isExists() {
        final FileSystem fs = getHadoopFileSystem();
        try {
            return fs.exists(getHadoopPath());
        } catch (Exception e) {
            throw wrapException(e);
        } finally {
            FileHelper.safeClose(fs);
        }
    }

    @Override
    public long getSize() {
        final FileSystem fs = getHadoopFileSystem();
        try {
            if (fs.isFile(getHadoopPath())) {
                return fs.getFileStatus(getHadoopPath()).getLen();
            } else {
                return fs.getContentSummary(getHadoopPath()).getLength();
            }
        } catch (Exception e) {
            throw wrapException(e);
        } finally {
            FileHelper.safeClose(fs);
        }
    }

    @Override
    public long getLastModified() {
        final FileSystem fs = getHadoopFileSystem();
        try {
            return fs.getFileStatus(getHadoopPath()).getModificationTime();
        } catch (Exception e) {
            throw wrapException(e);
        } finally {
            FileHelper.safeClose(fs);
        }
    }

    @Override
    public OutputStream write() throws ResourceException {
        final FileSystem fs = getHadoopFileSystem();
        try {
            final FSDataOutputStream out = fs.create(getHadoopPath(), true);
            return new HdfsFileOutputStream(out, fs);
        } catch (IOException e) {
            // we can close 'fs' in case of an exception
            FileHelper.safeClose(fs);
            throw wrapException(e);
        }
    }

    @Override
    public OutputStream append() throws ResourceException {
        final FileSystem fs = getHadoopFileSystem();
        try {
            final FSDataOutputStream out = fs.append(getHadoopPath());
            return new HdfsFileOutputStream(out, fs);
        } catch (IOException e) {
            // we can close 'fs' in case of an exception
            FileHelper.safeClose(fs);
            throw wrapException(e);
        }
    }

    @Override
    public InputStream read() throws ResourceException {
        final FileSystem fs = getHadoopFileSystem();
        final InputStream in;
        try {
            final Path hadoopPath = getHadoopPath();
            // return a wrapper InputStream which manages the 'fs' closeable
            if (fs.isFile(hadoopPath)) {
                in = fs.open(hadoopPath);
                return new HdfsFileInputStream(in, fs);
            } else {
                return new HdfsDirectoryInputStream(hadoopPath, fs);
            }
        } catch (Exception e) {
            // we can close 'fs' in case of an exception
            FileHelper.safeClose(fs);
            throw wrapException(e);
        }
    }

    private RuntimeException wrapException(Exception e) {
        if (e instanceof RuntimeException) {
            return (RuntimeException) e;
        }
        return new MetaModelException(e);
    }

    public Configuration getHadoopConfiguration() {
        final Configuration conf = new Configuration();
        if (_hostname != null && _port > 0) {
            conf.set("fs.defaultFS", getScheme() + "://" + _hostname + ":" + _port);
        }

        final File hadoopConfigurationDirectory = getHadoopConfigurationDirectoryToUse();
        if (hadoopConfigurationDirectory != null) {
            addResourceIfExists(conf, hadoopConfigurationDirectory, "core-site.xml");
            addResourceIfExists(conf, hadoopConfigurationDirectory, "hdfs-site.xml");
        }

        return conf;
    }

    private void addResourceIfExists(Configuration conf, File hadoopConfigurationDirectory, String filename) {
        final File file = new File(hadoopConfigurationDirectory, filename);
        if (file.exists()) {
            final InputStream inputStream = FileHelper.getInputStream(file);
            conf.addResource(inputStream, filename);
        }
    }

    private File getHadoopConfigurationDirectoryToUse() {
        File candidate = getDirectoryIfExists(null, _hadoopConfDir);
        if ("true".equals(System.getProperty(SYSTEM_PROPERTY_HADOOP_CONF_DIR_ENABLED))) {
            candidate = getDirectoryIfExists(candidate, System.getProperty("YARN_CONF_DIR"));
            candidate = getDirectoryIfExists(candidate, System.getProperty("HADOOP_CONF_DIR"));
            candidate = getDirectoryIfExists(candidate, System.getenv("YARN_CONF_DIR"));
            candidate = getDirectoryIfExists(candidate, System.getenv("HADOOP_CONF_DIR"));
        }
        return candidate;
    }

    /**
     * Gets a candidate directory based on a file path, if it exists, and if
     * another candidate hasn't already been resolved.
     *
     * @param existingCandidate
     *            an existing candidate directory. If this is non-null, it will
     *            be returned immediately.
     * @param path
     *            the path of a directory
     * @return a candidate directory, or null if none was resolved.
     */
    private File getDirectoryIfExists(File existingCandidate, String path) {
        if (existingCandidate != null) {
            return existingCandidate;
        }
        if (!Strings.isNullOrEmpty(path)) {
            final File directory = new File(path);
            if (directory.exists() && directory.isDirectory()) {
                return directory;
            }
        }
        return null;
    }

    public FileSystem getHadoopFileSystem() {
        try {
            return FileSystem.newInstance(getHadoopConfiguration());
        } catch (IOException e) {
            throw new MetaModelException("Could not connect to HDFS: " + e.getMessage(), e);
        }
    }

    public Path getHadoopPath() {
        if (_path == null) {
            _path = new Path(_filepath);
        }
        return _path;
    }

    @Override
    public int hashCode() {
        return Objects.hash(getScheme(), _filepath, _hostname, _port, _hadoopConfDir);
    }

    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (obj instanceof HdfsResource) {
            final HdfsResource other = (HdfsResource) obj;
            return Objects.equals(getScheme(), other.getScheme()) && Objects.equals(_filepath, other._filepath)
                    && Objects.equals(_hostname, other._hostname) && Objects.equals(_port, other._port)
                    && Objects.equals(_hadoopConfDir, other._hadoopConfDir);
        }
        return false;
    }
}
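
// Usage sketch (illustrative, not part of the original class): a minimal example of
// constructing an HdfsResource from a URL and streaming its content. The namenode
// hostname, port and file path are hypothetical placeholders, as is the example class
// name; the URL form scheme://hostname:port/path/to/file follows the constructor Javadoc.
class HdfsResourceUsageExample {

    public static void main(String[] args) throws IOException {
        // Point the resource at a (hypothetical) file on HDFS.
        final HdfsResource resource = new HdfsResource("hdfs://namenode.example.com:8020/data/example.csv");

        // read() returns a wrapper stream; closing it also releases the underlying Hadoop FileSystem.
        try (InputStream in = resource.read()) {
            final java.io.BufferedReader reader = new java.io.BufferedReader(
                    new java.io.InputStreamReader(in, java.nio.charset.StandardCharsets.UTF_8));
            reader.lines().forEach(System.out::println);
        }
    }
}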