/**
* (c) Copyright 2012 WibiData, Inc.
*
* See the NOTICE file distributed with this work for additional
* information regarding copyright ownership.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.kiji.mapreduce.input.impl;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.kiji.annotations.ApiAudience;
/**
* <p>A RecordReader that reads an entire file as a single key-value.
* The key is the qualified path to this file, and the value is the Text contents of the file.
* This is intended for reading small files.
* (Note that the entire file must be able to fit in ram)</p>
*/
@ApiAudience.Private
public final class WholeFileRecordReader extends RecordReader<Text, Text> {
private static final Logger LOG = LoggerFactory.getLogger(WholeFileRecordReader.class);
/** The path to the file to read. */
private final Path mFileToRead;
/** The length of this file. */
private final long mFileLength;
/** The Configuration. */
private final Configuration mConf;
/** Whether this FileSplit has been processed. */
private boolean mProcessed;
/** Single Text to store the file name of the current file. */
private final Text mFileName;
/** Single Text to store the value of this file (the value) when it is read. */
private final Text mFileText;
/**
* Implementation detail: This constructor is built to be called via
* reflection from within CombineFileRecordReader.
*
* @param fileSplit The CombineFileSplit that this will read from.
* @param context The context for this task.
* @param pathToProcess The path index from the CombineFileSplit to process in this record.
*/
public WholeFileRecordReader(CombineFileSplit fileSplit, TaskAttemptContext context,
Integer pathToProcess) {
mProcessed = false;
mFileToRead = fileSplit.getPath(pathToProcess);
mFileLength = fileSplit.getLength(pathToProcess);
mConf = context.getConfiguration();
assert 0 == fileSplit.getOffset(pathToProcess);
if (LOG.isDebugEnabled()) {
LOG.debug("FileToRead is: " + mFileToRead.toString());
LOG.debug("Processing path " + pathToProcess + " out of " + fileSplit.getNumPaths());
try {
FileSystem fs = FileSystem.get(mConf);
assert fs.getFileStatus(mFileToRead).getLen() == mFileLength;
} catch (IOException ioe) {
// oh well, I was just testing.
}
}
mFileName = new Text();
mFileText = new Text();
}
/** {@inheritDoc} */
@Override
public void close() throws IOException {
mFileText.clear();
}
/**
* Returns the absolute path to the current file.
*
* @return The absolute path to the current file.
* @throws IOException never.
* @throws InterruptedException never.
*/
@Override
public Text getCurrentKey() throws IOException, InterruptedException {
return mFileName;
}
/**
* <p>Returns the current value. If the file has been read with a call to NextKeyValue(),
* this returns the contents of the file as a BytesWritable. Otherwise, it returns an
* empty BytesWritable.</p>
*
* <p>Throws an IllegalStateException if initialize() is not called first.</p>
*
* @return A BytesWritable containing the contents of the file to read.
* @throws IOException never.
* @throws InterruptedException never.
*/
@Override
public Text getCurrentValue() throws IOException, InterruptedException {
return mFileText;
}
/**
* Returns whether the file has been processed or not. Since only one record
* will be generated for a file, progress will be 0.0 if it has not been processed,
* and 1.0 if it has.
*
* @return 0.0 if the file has not been processed. 1.0 if it has.
* @throws IOException never.
* @throws InterruptedException never.
*/
@Override
public float getProgress() throws IOException, InterruptedException {
return (mProcessed) ? (float) 1.0 : (float) 0.0;
}
/**
* All of the internal state is already set on instantiation. This is a no-op.
*
* @param split The InputSplit to read. Unused.
* @param context The context for this task. Unused.
* @throws IOException never.
* @throws InterruptedException never.
*/
@Override
public void initialize(InputSplit split, TaskAttemptContext context)
throws IOException, InterruptedException {
// no-op.
}
/**
* <p>If the file has not already been read, this reads it into memory, so that a call
* to getCurrentValue() will return the entire contents of this file as Text,
* and getCurrentKey() will return the qualified path to this file as Text. Then, returns
* true. If it has already been read, then returns false without updating any internal state.</p>
*
* @return Whether the file was read or not.
* @throws IOException if there is an error reading the file.
* @throws InterruptedException if there is an error.
*/
@Override
public boolean nextKeyValue() throws IOException, InterruptedException {
if (!mProcessed) {
if (mFileLength > (long) Integer.MAX_VALUE) {
throw new IOException("File is longer than Integer.MAX_VALUE.");
}
byte[] contents = new byte[(int) mFileLength];
FileSystem fs = mFileToRead.getFileSystem(mConf);
FSDataInputStream in = null;
try {
// Set the contents of this file.
in = fs.open(mFileToRead);
IOUtils.readFully(in, contents, 0, contents.length);
mFileText.set(contents, 0, contents.length);
// Set the name of this file.
String fileName = mFileToRead.makeQualified(fs).toString();
mFileName.set(fileName);
} finally {
IOUtils.closeStream(in);
}
mProcessed = true;
return true;
}
return false;
}
}