/** * (c) Copyright 2012 WibiData, Inc. * * See the NOTICE file distributed with this work for additional * information regarding copyright ownership. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.kiji.mapreduce.tools.framework; import java.util.Arrays; import java.util.HashMap; import java.util.Map; import org.apache.commons.lang.StringUtils; import org.kiji.annotations.ApiAudience; import org.kiji.annotations.ApiStability; /** * <p>Describes the format and location of the input for a MapReduce job.</p> * * <p>The input for a MapReduce job can be described with two components: format and * location. The format determines how the input data should be read, e.g., is it a text * file or a sequence file? The location may depend on the format. For a file, it is * usually a URL or filesystem path. For a Kiji table, it is simply the name of the * table.</p> * * <p>JobInputSpecs can be constructed if you know the format and location, or they can be * parsed from a String using the {@link JobInputSpec#parse(String)} * method.</p> */ @ApiAudience.Framework @ApiStability.Stable public final class JobInputSpec { /** * The Job input formats supported by Kiji. In the string representation of a * JobInputSpec, this is the part before the first colon, e.g. the "avro" in * "avro:/path/to/avro/container/file." */ public static enum Format { /** Avro container files from a file system. */ AVRO("avro"), /** Avro container files of key/value generic records. */ AVRO_KV("avrokv"), /** An HBase table. */ HTABLE("htable"), /** Sequence files from a file system. */ SEQUENCE("seq"), /** Single files in hdfs get read as one record per file. */ SMALL_TEXT_FILES("small-text-files"), /** Text files from a file system. */ TEXT("text"), /** A Kiji table. */ KIJI("kiji"), /** XML files from a file system. */ XML("xml"); /** The short name of the format. */ private String mName; /** A static map from short names to formats. */ private static Map<String, Format> mNameMap; static { // Initialize the map from names to Formats for quick lookup later. mNameMap = new HashMap<String, Format>(); for (Format format : Format.class.getEnumConstants()) { mNameMap.put(format.getName(), format); } } /** * Constructor for a member of the Format enum. * * @param name The short name of the format. */ private Format(String name) { mName = name; } /** * The short name of the format (used as the prefix in the string * representation of the JobInputSpec). * * @return The name of the format. */ public String getName() { return mName; } /** * Return the format referred to by the name. * * @param name The short name of the format. * @return the Format object. * @throws JobIOSpecParseException If the name does not identify a valid format. */ public static Format parse(String name) { Format format = mNameMap.get(name); if (null == format) { throw new JobIOSpecParseException("Unrecognized format", name); } return format; } } /** The format of the MapReduce job input. */ private Format mFormat; /** * The location of the MapReduce job input, whose meaning may depend * on the format (usually a filesystem path). * May specify multiple inputs as long as they are all of the same type. */ private String[] mLocations; /** * Constructor. The KIJI and HTABLE formats must specify exactly one input location. * All other formats may specify multiple input locations. * * @param format The format of the input data. * @param locations The locations of the input data. */ private JobInputSpec(Format format, String... locations) { if ((Format.KIJI == format || Format.HTABLE == format) && locations.length != 1) { throw new UnsupportedOperationException("Format " + format.toString() + " only supports a single input location." + " You specified: " + Arrays.toString(locations)); } mFormat = format; mLocations = locations; } /** * Creates a new job input specification. When created with formats KIJI or HTABLE only one * input location may be specified. Otherwise multiple input locations may be given. * * @param format is the format of the input data. * @param locations are the locations of the input data. * @return a new job input specification using the specified format and locations. */ public static JobInputSpec create(Format format, String... locations) { return new JobInputSpec(format, locations); } /** @return The format of the input data. */ public Format getFormat() { return mFormat; } /** * Convenience method to return the location iff exactly one is specified. * Throws an IllegalStateException if not. * * @return The location of the input data. */ public String getLocation() { if (null == mLocations || 1 != mLocations.length) { throw new IllegalStateException("getLocation() may only be used if there is exactly" + " one location specified. Locations are: " + Arrays.toString(getLocations())); } return mLocations[0]; } /** @return The locations of the input data. */ public String[] getLocations() { return mLocations.clone(); } /** {@inheritDoc} */ @Override public String toString() { return mFormat.getName() + ":" + Arrays.toString(mLocations); } /** * <p>Parses the string representation of a JobInputSpec.</p> * * <p>JobInputSpecs are of the form {@literal "<format>:<location>"}.</p> * * @param spec The input spec string to parse. * @return The parsed JobInputSpec. * @throws JobIOSpecParseException If is unable to parse. */ public static JobInputSpec parse(String spec) { // Split it on ':'. String[] parts = StringUtils.split(spec, ":", 2); if (parts.length != 2) { throw new JobIOSpecParseException("Should be '<format>:<location>'", spec); } // Parse the format. Format format = Format.parse(parts[0]); // Parse the location. String commaDelimitedLocations = parts[1]; String[] locations = StringUtils.split(commaDelimitedLocations, ","); return new JobInputSpec(format, locations); } }