/* * Copyright © 2014-2016 Cask Data, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of * the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations under * the License. */ package co.cask.cdap.data2.dataset2.lib.file; import co.cask.cdap.api.data.batch.DatasetOutputCommitter; import co.cask.cdap.api.dataset.DataSetException; import co.cask.cdap.api.dataset.DatasetContext; import co.cask.cdap.api.dataset.DatasetSpecification; import co.cask.cdap.api.dataset.lib.FileSet; import co.cask.cdap.api.dataset.lib.FileSetArguments; import co.cask.cdap.api.dataset.lib.FileSetProperties; import co.cask.cdap.common.conf.CConfiguration; import co.cask.cdap.common.conf.Constants; import co.cask.cdap.common.namespace.NamespacedLocationFactory; import co.cask.cdap.proto.Id; import com.google.common.base.Function; import com.google.common.base.Joiner; import com.google.common.base.Preconditions; import com.google.common.base.Throwables; import com.google.common.collect.ImmutableMap; import com.google.common.collect.Iterables; import com.google.common.collect.Lists; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import org.apache.twill.filesystem.Location; import org.apache.twill.filesystem.LocationFactory; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; import java.util.List; import java.util.Map; import javax.annotation.Nonnull; import javax.annotation.Nullable; /** * Implementation of file dataset. 
*/
public final class FileSetDataset implements FileSet, DatasetOutputCommitter {

  private static final Logger LOG = LoggerFactory.getLogger(FileSetDataset.class);

  // Property that records which FileSet version created this dataset. Legacy datasets
  // (which incorrectly treated absolute base paths as relative) do not carry this property;
  // determineBaseLocation() uses its absence to detect them.
  public static final String FILESET_VERSION_PROPERTY = "fileset.version";
  static final String FILESET_VERSION = "2";

  private final DatasetSpecification spec;
  private final Map<String, String> runtimeArguments;
  // true if the data is external; all output operations are then rejected
  private final boolean isExternal;
  private final Location baseLocation;
  private final List<Location> inputLocations;
  private final Location outputLocation;
  private final String inputFormatClassName;
  private final String outputFormatClassName;

  /**
   * Constructor.
   *
   * @param datasetContext the context for the dataset
   * @param cConf the CDAP configuration
   * @param spec the dataset specification
   * @param absoluteLocationFactory a factory used to resolve absolute base paths from the
   *        root of the file system
   * @param namespacedLocationFactory a factory for namespaced {@link Location}
   * @param runtimeArguments the runtime arguments
   * @throws IOException if the base location cannot be determined
   */
  public FileSetDataset(DatasetContext datasetContext, CConfiguration cConf,
                        DatasetSpecification spec,
                        LocationFactory absoluteLocationFactory,
                        NamespacedLocationFactory namespacedLocationFactory,
                        @Nonnull Map<String, String> runtimeArguments) throws IOException {
    Preconditions.checkNotNull(datasetContext, "Dataset context must not be null");
    Preconditions.checkNotNull(runtimeArguments, "Runtime arguments must not be null");
    this.spec = spec;
    this.runtimeArguments = runtimeArguments;
    this.isExternal = FileSetProperties.isDataExternal(spec.getProperties());
    // NOTE: assignment order matters here - determineOutputLocation() and
    // determineInputLocations() read this.baseLocation and this.runtimeArguments,
    // so those fields must be assigned before the two calls below.
    this.baseLocation = determineBaseLocation(datasetContext, cConf, spec,
                                              absoluteLocationFactory, namespacedLocationFactory);
    this.outputLocation = determineOutputLocation();
    this.inputLocations = determineInputLocations();
    this.inputFormatClassName = FileSetProperties.getInputFormat(spec.getProperties());
    this.outputFormatClassName = FileSetProperties.getOutputFormat(spec.getProperties());
  }

  /**
   * Generate the base location of the file set.
* <ul>
   * <li>If the properties do not contain a base path, generate one from the dataset name;</li>
   * <li>If the base path is absolute, return a location relative to the root of the file system;</li>
   * <li>Otherwise return a location relative to the data directory of the namespace.</li>
   * </ul>
   * This is package visible, because FileSetAdmin needs it, too.
   * TODO: Ideally, this should be done in configure(), but currently it cannot because of CDAP-1721
   */
  static Location determineBaseLocation(DatasetContext datasetContext, CConfiguration cConf,
                                        DatasetSpecification spec, LocationFactory rootLocationFactory,
                                        NamespacedLocationFactory namespacedLocationFactory) throws IOException {

    // older versions of file set incorrectly interpret absolute paths as relative to the namespace's
    // data directory. These file sets do not have the file set version property.
    boolean hasAbsoluteBasePathBug = spec.getProperties().get(FILESET_VERSION_PROPERTY) == null;

    String basePath = FileSetProperties.getBasePath(spec.getProperties());
    if (basePath == null) {
      // no base path configured: derive one from the dataset name
      basePath = spec.getName().replace('.', '/');
    }
    // for absolute paths, get the location from the file system's root.
    if (basePath.startsWith("/")) {
      // but only if it is not a legacy dataset that interprets absolute paths as relative
      if (hasAbsoluteBasePathBug) {
        // legacy dataset: log and fall through to the relative-path handling below
        LOG.info("Dataset {} was created with a version of FileSet that treats absolute path {} as relative. " +
                   "To disable this message, upgrade the dataset properties with a relative path. ",
                 spec.getName(), basePath);
      } else {
        // reject absolute paths that point inside CDAP's own namespace directory
        String topLevelPath = namespacedLocationFactory.getBaseLocation().toURI().getPath();
        topLevelPath = topLevelPath.endsWith("/") ? topLevelPath : topLevelPath + "/";
        Location baseLocation = rootLocationFactory.create(basePath);
        if (baseLocation.toURI().getPath().startsWith(topLevelPath)) {
          throw new DataSetException("Invalid base path '" + basePath + "' for dataset '" + spec.getName() + "'. " +
                                       "It must not be inside the CDAP base path '" + topLevelPath + "'.");
        }
        return baseLocation;
      }
    }
    // relative path (or legacy dataset): resolve under the namespace's data directory
    Id.Namespace namespaceId = Id.Namespace.from(datasetContext.getNamespaceId());
    String dataDir = cConf.get(Constants.Dataset.DATA_DIR, Constants.Dataset.DEFAULT_DATA_DIR);
    return namespacedLocationFactory.get(namespaceId).append(dataDir).append(basePath);
  }

  /**
   * Determine the output location from the runtime arguments; null if no output path is given.
   */
  private Location determineOutputLocation() {
    if (FileSetArguments.isBaseOutputPath(runtimeArguments)) {
      return baseLocation;
    }
    String outputPath = FileSetArguments.getOutputPath(runtimeArguments);
    return outputPath == null ? null : createLocation(outputPath);
  }

  /**
   * Resolve all input paths from the runtime arguments against the base location.
   */
  private List<Location> determineInputLocations() {
    List<Location> locations = Lists.newLinkedList();
    for (String path : FileSetArguments.getInputPaths(runtimeArguments)) {
      locations.add(createLocation(path));
    }
    return locations;
  }

  /**
   * Append a relative path to the base location, wrapping any IOException in a DataSetException.
   */
  private Location createLocation(String relativePath) {
    try {
      return baseLocation.append(relativePath);
    } catch (IOException e) {
      throw new DataSetException("Error constructing path from base '" + baseLocation +
                                   "' and relative path '" + relativePath + "'", e);
    }
  }

  @Override
  public Location getBaseLocation() {
    // TODO: if the file set is external, we could return a ReadOnlyLocation that prevents writing [CDAP-2934]
    return baseLocation;
  }

  @Override
  public List<Location> getInputLocations() {
    // returns a copy so that callers cannot modify the internal list
    // TODO: if the file set is external, we could return a ReadOnlyLocation that prevents writing [CDAP-2934]
    return Lists.newLinkedList(inputLocations);
  }

  @Override
  public Location getOutputLocation() {
    // external file sets are read-only through this dataset
    if (isExternal) {
      throw new UnsupportedOperationException(
        "Output is not supported for external file set '" + spec.getName() + "'");
    }
    return outputLocation;
  }

  @Override
  public Location getLocation(String relativePath) {
    // TODO: if the file set is external, we could return a ReadOnlyLocation that prevents writing [CDAP-2934]
    return createLocation(relativePath);
  }

  @Override
  public void close() throws IOException {
    // no-op - nothing to do
  }
@Override public String getInputFormatClassName() { return inputFormatClassName; } @Override public Map<String, String> getInputFormatConfiguration() { return getInputFormatConfiguration(inputLocations); } @Override public Map<String, String> getInputFormatConfiguration(Iterable<? extends Location> inputLocs) { ImmutableMap.Builder<String, String> config = ImmutableMap.builder(); config.putAll(FileSetProperties.getInputProperties(spec.getProperties())); config.putAll(FileSetProperties.getInputProperties(runtimeArguments)); String inputs = Joiner.on(',').join(Iterables.transform(inputLocs, new Function<Location, String>() { @Override public String apply(@Nullable Location location) { return getFileSystemPath(location); } })); config.put(FileInputFormat.INPUT_DIR, inputs); return config.build(); } @Override public String getOutputFormatClassName() { if (isExternal) { throw new UnsupportedOperationException( "Output is not supported for external file set '" + spec.getName() + "'"); } return outputFormatClassName; } @Override public Map<String, String> getOutputFormatConfiguration() { if (isExternal) { throw new UnsupportedOperationException( "Output is not supported for external file set '" + spec.getName() + "'"); } ImmutableMap.Builder<String, String> builder = ImmutableMap.builder(); builder.putAll(FileSetProperties.getOutputProperties(spec.getProperties())); builder.putAll(FileSetProperties.getOutputProperties(runtimeArguments)); if (outputLocation != null) { builder.put(FileOutputFormat.OUTDIR, getFileSystemPath(outputLocation)); } return builder.build(); } @Override public Map<String, String> getRuntimeArguments() { return runtimeArguments; } private String getFileSystemPath(Location loc) { return loc.toURI().getPath(); } @Override public void onSuccess() throws DataSetException { // nothing needed to do on success } @Override public void onFailure() throws DataSetException { Location outputLocation = getOutputLocation(); // If there is no output path, it is 
either using DynamicPartitioner or the job would have failed. // Either way, we can't do much here. if (outputLocation == null) { return; } try { // Only delete the configured output directory, if it is empty. // On Failure, org.apache.hadoop.mapreduce.lib.output.FileOutputFormat will remove files that it wrote, // but it leaves around the directory that it created. // We don't want to unconditionally delete the output directory on failure, because it may have files written // by a different job. if (outputLocation.isDirectory() && outputLocation.list().isEmpty()) { if (!outputLocation.delete()) { throw new DataSetException(String.format("Error deleting file(s) at path %s.", outputLocation)); } } } catch (IOException ioe) { throw new DataSetException(String.format("Error deleting file(s) at path %s.", outputLocation), ioe); } } }