/**
* (c) Copyright 2012 WibiData, Inc.
*
* See the NOTICE file distributed with this work for additional
* information regarding copyright ownership.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.kiji.mapreduce.tools.framework;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.google.common.base.Preconditions;
import com.google.common.collect.Maps;
import org.kiji.annotations.ApiAudience;
import org.kiji.annotations.ApiStability;
/**
 * Describes the format and location for the output of a MapReduce job.
 *
 * <p>The output for a MapReduce job can be described with three components: format,
 * location, and a number of splits. The format determines how the output data should be
 * written, e.g., as a text file or sequence file. The location may depend on the
 * format. For a file, it is usually a URL or filesystem path. For a Kiji table, it is
 * simply the name of the table. Finally, the number of splits is used to determine the
 * number of reducers. When writing files, the number of sharded output files will
 * be equal to the number of reducers, since each reducer writes one shard.</p>
 *
 * <p>Instances are immutable: all fields are final and set once at construction.</p>
 */
@ApiAudience.Framework
@ApiStability.Stable
public final class JobOutputSpec {
  /**
   * The job output formats supported by Kiji. In the string
   * representation of a JobOutputSpec, this is the part before the
   * first colon, e.g., the "avro" in "avro:/path/to/avro/container/file@8".
   */
  public static enum Format {
    /** A Kiji table. */
    KIJI("kiji"),
    /** Text files in a file system. */
    TEXT("text"),
    /** Sequence files in a file system. */
    SEQUENCE_FILE("seq"),
    /** Map files in a file system. */
    MAP_FILE("map"),
    /** Avro container files in a file system. */
    AVRO("avro"),
    /** Avro container files of key/value generic records. */
    AVRO_KV("avrokv"),
    /** HFiles used in HBase for bulk loading into region servers. */
    HFILE("hfile");

    /** The short name of the format. */
    private final String mName;

    /** A static map from a format name to formats. */
    private static final Map<String, Format> NAME_MAP = Maps.newHashMap();
    static {
      // Initialize the map from names to Formats for quick lookup later.
      // This static block runs after all enum constants have been constructed.
      for (Format format : values()) {
        NAME_MAP.put(format.getName(), format);
      }
    }

    /**
     * Initializes a format enum value.
     *
     * @param name Name of the format.
     */
    private Format(String name) {
      mName = name;
    }

    /**
     * Name of the format.
     *
     * @return the format name.
     */
    public String getName() {
      return mName;
    }

    /**
     * Gets a Format object from its name.
     *
     * <p>The lookup is an exact, case-sensitive match against the short format names.</p>
     *
     * @param name Name of the format.
     * @return the parsed format enum value.
     * @throws JobIOSpecParseException If the name does not identify a valid format.
     */
    public static Format parse(String name) {
      final Format format = NAME_MAP.get(name);
      if (null == format) {
        throw new JobIOSpecParseException("Unrecognized format", name);
      }
      return format;
    }
  }

  /** The format of the job output data. */
  private final Format mFormat;

  /** The location of the job output data, or null if not specified. */
  private final String mLocation;

  /**
   * The number of splits for the output data, which determines the
   * number of reducers and the number of sharded output files if this
   * writes to a file system.
   */
  private final int mSplits;

  /**
   * Constructor.
   *
   * @param format The job output data format. Must not be null.
   * @param location The target location of the output data (or null
   *     if a location is implied by the format).
   * @param splits The number of output splits. Must not be negative.
   * @throws NullPointerException If the format is null.
   * @throws IllegalArgumentException If the number of splits is negative.
   */
  private JobOutputSpec(Format format, String location, int splits) {
    // Validate everything before assigning any field.
    Preconditions.checkNotNull(format, "Job output format must not be null.");
    Preconditions.checkArgument(
        splits >= 0, "Number of output splits must not be negative, got %s.", splits);
    mFormat = format;
    // The location is deliberately not null-checked: it is documented as optional.
    mLocation = location;
    mSplits = splits;
  }

  /**
   * Creates a new job output specification.
   *
   * @param format is the format of the data output by the job. Must not be null.
   * @param location is the target location of the output data (or <code>null</code> if a location
   *     is implied by the format).
   * @param splits is the number of desired output splits. Must not be negative.
   * @return a new job output specification using the specified format, location, and splits.
   * @throws NullPointerException If the format is null.
   * @throws IllegalArgumentException If the number of splits is negative.
   */
  public static JobOutputSpec create(Format format, String location, int splits) {
    return new JobOutputSpec(format, location, splits);
  }

  /** @return The format of the output data. */
  public Format getFormat() {
    return mFormat;
  }

  /** @return The target location of the output data (may be null). */
  public String getLocation() {
    return mLocation;
  }

  /** @return The number of splits in the output data. */
  public int getSplits() {
    return mSplits;
  }

  /**
   * Renders this spec as {@literal "<format>:<location>@<splits>"}.
   *
   * <p>A null location is rendered as an empty string, so that the result is still
   * accepted by {@link #parse(String)} instead of carrying the literal text "null".</p>
   *
   * @return the string representation of this job output spec.
   */
  @Override
  public String toString() {
    final String location = (null == mLocation) ? "" : mLocation;
    return mFormat.getName() + ":" + location + "@" + mSplits;
  }

  /**
   * Regex matching "format:location@split".
   *
   * <p>The format is everything before the first colon. The location group is greedy,
   * so the split count is taken from after the last '@' and the location itself may
   * contain '@' and ':' characters.</p>
   */
  private static final Pattern RE_JOB_OUTPUT_SPEC = Pattern.compile("([^:]+):(.*)@(\\d+)");

  /**
   * Parses the string representation of a JobOutputSpec. The string
   * representation is of the format {@literal "<format>:<location>@<splits>"}.
   *
   * <p>An empty location (e.g. "kiji:@1") is parsed as an empty string, not as null.</p>
   *
   * @param spec The output spec string to parse. Must not be null.
   * @return The parsed JobOutputSpec.
   * @throws JobIOSpecParseException If it is unable to parse, including when the format
   *     name is unknown or the number of splits does not fit in an int.
   * @throws NullPointerException If the spec is null.
   */
  public static JobOutputSpec parse(String spec) {
    Preconditions.checkNotNull(spec, "Job output spec must not be null.");
    final Matcher matcher = RE_JOB_OUTPUT_SPEC.matcher(spec);
    if (!matcher.matches()) {
      throw new JobIOSpecParseException(
          "Invalid job output spec, expecting 'format:location@nsplit'.", spec);
    }
    final Format format = Format.parse(matcher.group(1));
    final String location = matcher.group(2);
    final int nsplits;
    try {
      nsplits = Integer.parseInt(matcher.group(3));
    } catch (NumberFormatException nfe) {
      // The regex admits arbitrarily many digits, so the value may overflow an int.
      // Report it as a spec parse failure, as documented, rather than leaking the
      // NumberFormatException to the caller.
      throw new JobIOSpecParseException(
          "Invalid job output spec, number of splits is out of range.", spec);
    }
    return new JobOutputSpec(format, location, nsplits);
  }
}