/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.mapreduce;

import java.io.IOException;
import java.util.List;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

/**
 * Describes the input specification of a Map-Reduce job.
 *
 * <p>The Map-Reduce framework relies on the job's <code>InputFormat</code>
 * to:</p>
 * <ol>
 *   <li>
 *   Validate the input specification of the job.
 *   </li>
 *   <li>
 *   Split up the input file(s) into logical {@link InputSplit}s, each of
 *   which is then assigned to an individual {@link Mapper}.
 *   </li>
 *   <li>
 *   Provide the {@link RecordReader} implementation used to glean input
 *   records from the logical <code>InputSplit</code> for processing by
 *   the {@link Mapper}.
 *   </li>
 * </ol>
 *
 * <p>File-based {@link InputFormat}s, typically sub-classes of
 * {@link FileInputFormat}, by default split the input into <i>logical</i>
 * {@link InputSplit}s based on the total size, in bytes, of the input files.
 * The {@link FileSystem} blocksize of the input files is treated as an upper
 * bound for input splits. A lower bound on the split size can be set via
 * <a href="{@docRoot}/../mapred-default.html#mapred.min.split.size">
 * mapred.min.split.size</a>.</p>
 *
 * <p>Logical splits based purely on input size are insufficient for many
 * applications, because record boundaries must be respected. In such cases
 * the application also has to implement a {@link RecordReader}, which is
 * then responsible for respecting record boundaries and for presenting a
 * record-oriented view of the logical <code>InputSplit</code> to the
 * individual task.</p>
 *
 * @param <K> first type argument of the {@link RecordReader} created by this
 *            format (conventionally the record key type)
 * @param <V> second type argument of the {@link RecordReader} created by
 *            this format (conventionally the record value type)
 *
 * @see InputSplit
 * @see RecordReader
 * @see FileInputFormat
 */
public abstract class InputFormat<K, V> {

  /**
   * Logically splits the set of input files for the job.
   *
   * <p>Each returned {@link InputSplit} is then assigned to an individual
   * {@link Mapper} for processing.</p>
   *
   * <p><i>Note</i>: the split is a <i>logical</i> split of the inputs; the
   * input files are not physically split into chunks. For example, a split
   * could be an <i>&lt;input-file-path, start, offset&gt;</i> tuple. The
   * InputFormat also creates the {@link RecordReader} to read the
   * {@link InputSplit}.</p>
   *
   * @param context job configuration.
   * @return the list of {@link InputSplit}s for the job.
   * @throws IOException if an implementation hits an I/O failure
   * @throws InterruptedException if an implementation is interrupted
   */
  public abstract List<InputSplit> getSplits(JobContext context)
      throws IOException, InterruptedException;

  /**
   * Creates a record reader for the given split.
   *
   * <p>The framework will call
   * {@link RecordReader#initialize(InputSplit, TaskAttemptContext)} before
   * the split is used.</p>
   *
   * @param split the split to be read
   * @param context the information about the task
   * @return a new record reader
   * @throws IOException if an implementation hits an I/O failure
   * @throws InterruptedException if an implementation is interrupted
   */
  public abstract RecordReader<K, V> createRecordReader(
      InputSplit split, TaskAttemptContext context)
      throws IOException, InterruptedException;

}