/*
* Copyright © 2014-2015 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package co.cask.cdap.api.dataset.lib;
import co.cask.cdap.api.data.batch.InputFormatProvider;
import co.cask.cdap.api.data.batch.OutputFormatProvider;
import co.cask.cdap.api.dataset.Dataset;
import org.apache.twill.filesystem.Location;
import java.util.List;
import java.util.Map;
/**
* This dataset represents a collection of files on the file system. The dataset has a base location, under which
* all of its files are located. When instantiated, runtime arguments are required to specify the individual file
* or files being used.
*
* <p>This dataset can be made available for querying with SQL (explore). This is enabled through dataset
* properties when the dataset is created. See {@link co.cask.cdap.api.dataset.lib.FileSetProperties}
* for details. If it is enabled for explore, a Hive external table will be created when the dataset is
* created. The Hive table is not partitioned, and therefore querying will only work if every path in the
* fileset is a file (not a subdirectory).
*/
public interface FileSet extends Dataset, InputFormatProvider, OutputFormatProvider {
/**
* Allows direct interaction with the location of this dataset in the underlying file system.
*
* @return the location of the base directory, under which all files of this dataset are located
*/
Location getBaseLocation();
/**
* Allows direct access to the input files of this dataset in the underlying file system.
*
* @return the list of input locations, as determined from the runtime arguments
*/
List<Location> getInputLocations();
/**
* Allows direct access to files in the output location, in the underlying file system.
*
* @return the output location
*/
Location getOutputLocation();
/**
* Allows direct access to files in this dataset, in the underlying file system.
*
* @param relativePath the path of the file or directory, relative to the base path of this dataset
* @return the full location given by the path, relative to the base path.
*/
Location getLocation(String relativePath);
/**
* Allows direct access to the runtime arguments of this file set.
*
* @return the runtime arguments specified for this file set.
*/
Map<String, String> getRuntimeArguments();
/**
* A variant of {@link co.cask.cdap.api.data.batch.InputFormatProvider#getInputFormatConfiguration}
* that allows passing in the input locations (rather than using the input locations that were
* determined from runtime arguments).
*
* @param inputLocs the input locations to be used
* @return the input format configuration for the given input locations, as a map of string keys
*         to string values
*/
Map<String, String> getInputFormatConfiguration(Iterable<? extends Location> inputLocs);
}