package org.apache.pig.impl.io;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URI;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pig.data.Tuple;
import eu.stratosphere.nephele.configuration.Configuration;
import eu.stratosphere.nephele.fs.FSDataInputStream;
import eu.stratosphere.nephele.fs.FileInputSplit;
import eu.stratosphere.nephele.fs.FileStatus;
import eu.stratosphere.nephele.fs.FileSystem;
import eu.stratosphere.nephele.fs.LineReader;
import eu.stratosphere.nephele.fs.Path;
import eu.stratosphere.pact.common.contract.FileDataSource;
import eu.stratosphere.pact.common.io.FileInputFormat;
import eu.stratosphere.pact.common.io.FileInputFormat.FileBaseStatistics;
import eu.stratosphere.pact.common.io.statistics.BaseStatistics;
/**
*
* Copy of DelimitedInputFormat with the only difference that the methods
* readRecord() and nextRecord() have Tuple as targets instead of PactRecord
* @author hinata
*
*/
public abstract class SPigDelimitedInputFormat extends SPigFileInputFormat {
/**
 * The logger used to report configuration and statistics problems.
 */
private static final Log LOG = LogFactory.getLog(SPigDelimitedInputFormat.class);
/**
 * The read buffer size (1024 * 1024 bytes) that open() falls back to when no positive
 * buffer size has been set.
 */
private static final int DEFAULT_READ_BUFFER_SIZE = 1024 * 1024;
/**
 * The number of sample lines that configure() uses for the statistics when the
 * configuration does not specify one (or specifies an unparsable value).
 */
private static final int DEFAULT_NUM_SAMPLES = 10;
// ------------------------------------- Config Keys ------------------------------------------
/**
 * The configuration key to set the record delimiter.
 */
public static final String RECORD_DELIMITER = "delimited-format.delimiter";
/**
 * The configuration key to set the name of the charset used to encode the record delimiter.
 */
private static final String RECORD_DELIMITER_ENCODING = "delimited-format.delimiter-encoding";
/**
 * The configuration key to set the number of samples to take for the statistics.
 */
private static final String NUM_STATISTICS_SAMPLES = "delimited-format.numSamples";
// --------------------------------------------------------------------------------------------
// Buffer that fillBuffer() reads stream data into and that readLine() scans for delimiters.
// Allocated in open(), released in close().
protected byte[] readBuffer;
// Collects the bytes of a record that does not fit into a single fill of readBuffer.
// Grown on demand by readLine().
protected byte[] wrapBuffer;
// Index of the next byte in readBuffer that readLine() will examine.
protected int readPos;
// Number of valid bytes in readBuffer, as returned by the last successful read in fillBuffer().
protected int limit;
// The bytes that terminate a record. Defaults to a single newline; replaced by configure().
protected byte[] delimiter = new byte[] {'\n'};
// Location of the most recently read record, set through setResult():
// the array holding it, the index of its first byte and its length in bytes.
private byte[] currBuffer;
private int currOffset;
private int currLen;
// Set by fillBuffer() once the remaining split length is zero or less, i.e. reading continues
// past the split. readLine() returns false immediately when it is called with this flag set.
protected boolean overLimit;
// The value returned by reachedEnd(). Set by open() when the skipped first record already
// crossed the split limit, and by nextRecord() when no further line could be read.
protected boolean end;
// Requested size of readBuffer. Any value <= 0 makes open() use DEFAULT_READ_BUFFER_SIZE.
protected int bufferSize = -1;
// The number of lines to sample for statistics; set by configure().
protected int numLineSamples;
// --------------------------------------------------------------------------------------------
/**
 * Parses the given byte range, which holds one serialized record, into the target tuple.
 * Implementations are expected to fill <tt>target</tt> with the parsed content. A record with
 * invalid content can be skipped by returning <tt>false</tt>; that value is passed on unchanged
 * as the result of {@link #nextRecord(Tuple)}.
 *
 * @param target The tuple that receives the parsed record.
 * @param bytes The buffer that contains the serialized record.
 * @param offset The index in <tt>bytes</tt> of the first byte of the record.
 * @param numBytes The length of the record in bytes. The terminating delimiter is not included.
 * @return returns whether the record was successfully deserialized
 */
public abstract boolean readRecord(Tuple target, byte[] bytes, int offset, int numBytes);
// --------------------------------------------------------------------------------------------
/**
 * Returns the byte sequence that marks the end of a record.
 * <p>
 * The returned array is the instance's own array, not a copy.
 *
 * @return The record delimiter, as bytes.
 */
public byte[] getDelimiter() {
    final byte[] recordDelimiter = this.delimiter;
    return recordDelimiter;
}
/**
 * Sets the requested size of the buffer that is scanned for record boundaries. The value is
 * picked up when the input format is opened; a value of zero or less selects the default size.
 *
 * @param bufferSize The buffer size to use, in bytes.
 */
public void setBufferSize(int bufferSize) {
    this.bufferSize = bufferSize;
}
/**
 * Returns the size of the read buffer that is currently allocated.
 *
 * @return The length of the read buffer in bytes, or <tt>0</tt> if no buffer is allocated
 *         (before the format has been opened or after it has been closed).
 */
public int getBufferSize() {
    if (this.readBuffer != null) {
        return this.readBuffer.length;
    }
    return 0;
}
// --------------------------------------------------------------------------------------------
/**
 * Configures this input format. After delegating to the super class, this method reads the
 * record delimiter (default: newline), the optional charset name used to encode the delimiter
 * into bytes (default: the platform charset) and the number of lines to sample for the
 * statistics (default: {@link #DEFAULT_NUM_SAMPLES}).
 *
 * @param parameters The configuration object to read the parameters from.
 * @throws IllegalArgumentException Thrown, if the delimiter is null or empty, or if the configured
 *                                  charset is not supported.
 */
@Override
public void configure(Configuration parameters) {
    super.configure(parameters);

    final String delimString = parameters.getString(RECORD_DELIMITER, AbstractConfigBuilder.NEWLINE_DELIMITER);
    if (delimString == null) {
        throw new IllegalArgumentException("The delimiter must not be null.");
    }
    if (delimString.length() == 0) {
        // a zero-length delimiter "matches" without consuming any input, which would make
        // readLine() return empty records forever
        throw new IllegalArgumentException("The delimiter must not be empty.");
    }

    final String charsetName = parameters.getString(RECORD_DELIMITER_ENCODING, null);
    try {
        this.delimiter = charsetName == null ? delimString.getBytes() : delimString.getBytes(charsetName);
    } catch (UnsupportedEncodingException useex) {
        throw new IllegalArgumentException("The charset with the name '" + charsetName +
            "' is not supported on this TaskManager instance.", useex);
    }

    // set the number of samples; an unparsable value keeps the default
    this.numLineSamples = DEFAULT_NUM_SAMPLES;
    final String samplesString = parameters.getString(NUM_STATISTICS_SAMPLES, null);
    if (samplesString != null) {
        try {
            this.numLineSamples = Integer.parseInt(samplesString);
        } catch (NumberFormatException nfex) {
            if (LOG.isWarnEnabled()) {
                LOG.warn("Invalid value for number of samples to take: " + samplesString +
                    ". Using default value of " + DEFAULT_NUM_SAMPLES);
            }
        }
    }
}
// --------------------------------------------------------------------------------------------
/**
 * Determines the base statistics (last modification time, total size and average record width)
 * of the configured input path.
 * <p>
 * If the given cached statistics are {@link FileBaseStatistics}, they are updated in place and
 * returned; they are returned untouched when no file has a newer modification time. The average
 * record width is only sampled when the delimiter is <tt>\n</tt> or <tt>\r\n</tt>. I/O problems
 * are logged and lead to partially filled statistics rather than to an exception.
 *
 * @param cachedStatistics Previously computed statistics, or <tt>null</tt> if none are available.
 * @return The (possibly incomplete) statistics for the input.
 *
 * @see eu.stratosphere.pact.common.io.InputFormat#getStatistics()
 */
@Override
public FileBaseStatistics getStatistics(BaseStatistics cachedStatistics) {
    // start from the cached statistics when they have the right type, otherwise from "unknown"
    final FileBaseStatistics stats;
    if (cachedStatistics instanceof FileBaseStatistics) {
        stats = (FileBaseStatistics) cachedStatistics;
    } else {
        stats = new FileBaseStatistics(-1, BaseStatistics.UNKNOWN, BaseStatistics.UNKNOWN);
    }

    try {
        final Path file = this.filePath;
        final URI uri = file.toUri();

        // get the filesystem
        final FileSystem fs = FileSystem.get(uri);
        final List<FileStatus> files;

        // get the file info and check whether the cached statistics are still valid
        final FileStatus status = fs.getFileStatus(file);
        if (status.isDir()) {
            final FileStatus[] fss = fs.listStatus(file);
            files = new ArrayList<FileStatus>(fss.length);
            boolean unmodified = true;

            for (FileStatus s : fss) {
                if (!s.isDir()) {
                    files.add(s);
                    if (s.getModificationTime() > stats.getLastModificationTime()) {
                        stats.fileModTime = s.getModificationTime();
                        unmodified = false;
                    }
                }
            }

            if (unmodified) {
                return stats;
            }
        } else {
            // check if the statistics are up to date
            final long modTime = status.getModificationTime();
            if (stats.getLastModificationTime() == modTime) {
                return stats;
            }

            stats.fileModTime = modTime;
            files = new ArrayList<FileStatus>(1);
            files.add(status);
        }

        stats.avgBytesPerRecord = -1.0f;
        stats.fileSize = 0;

        // calculate the whole length
        for (FileStatus s : files) {
            stats.fileSize += s.getLen();
        }

        // sanity check
        if (stats.fileSize <= 0) {
            stats.fileSize = BaseStatistics.UNKNOWN;
            return stats;
        }

        // currently, the sampling only works on line separated data
        final byte[] delimiter = getDelimiter();
        if (!((delimiter.length == 1 && delimiter[0] == '\n') ||
                (delimiter.length == 2 && delimiter[0] == '\r' && delimiter[1] == '\n'))) {
            return stats;
        }

        // make the samples small for very small files; the minimum is taken in long arithmetic
        // so that very large inputs cannot overflow the narrowing to int
        int numSamples = (int) Math.min((long) this.numLineSamples, stats.fileSize / 1024);
        if (numSamples < 2) {
            numSamples = 2;
        }

        long offset = 0;
        long bytes = 0; // sum of the sampled line widths, each including one byte for the line-break
        final long stepSize = stats.fileSize / numSamples;

        int fileNum = 0;
        int samplesTaken = 0;

        // take the samples
        for (int sampleNum = 0; sampleNum < numSamples && fileNum < files.size(); sampleNum++) {
            FileStatus currentFile = files.get(fileNum);
            FSDataInputStream inStream = null;

            try {
                inStream = fs.open(currentFile.getPath());
                final LineReader lineReader = new LineReader(inStream, offset, currentFile.getLen() - offset, 1024);
                final byte[] line = lineReader.readLine();
                lineReader.close();

                if (line != null && line.length > 0) {
                    samplesTaken++;
                    bytes += line.length + 1; // one for the linebreak
                }
            } finally {
                // make a best effort to close
                if (inStream != null) {
                    try {
                        inStream.close();
                    } catch (Throwable ignored) {
                        // deliberately ignored: a failure to close must not mask the sampling result
                    }
                }
            }

            offset += stepSize;

            // skip to the next file, if necessary
            while (fileNum < files.size() && offset >= (currentFile = files.get(fileNum)).getLen()) {
                offset -= currentFile.getLen();
                fileNum++;
            }
        }

        // without a single usable sample the width stays "unknown" instead of becoming NaN (0 / 0)
        if (samplesTaken > 0) {
            stats.avgBytesPerRecord = bytes / (float) samplesTaken;
        }
    } catch (IOException ioex) {
        if (LOG.isWarnEnabled()) {
            LOG.warn("Could not determine complete statistics for file '" + filePath + "' due to an io error: "
                + ioex.getMessage());
        }
    } catch (Throwable t) {
        if (LOG.isErrorEnabled()) {
            LOG.error("Unexpected problen while getting the file statistics for file '" + filePath + "': "
                + t.getMessage(), t);
        }
    }

    return stats;
}
/**
 * Opens the given input split. This method lets the super class open the input stream, allocates
 * the read buffers, resets the parsing state and positions the stream at the first record that
 * starts inside the split: if the split does not start at offset 0, the (possibly partial) record
 * at its beginning is read and discarded.
 *
 * @param split The input split to open.
 * @throws IOException Thrown, if the stream cannot be opened, positioned or read.
 *
 * @see eu.stratosphere.pact.common.io.FileInputFormat#open(eu.stratosphere.nephele.fs.FileInputSplit)
 */
@Override
public void open(FileInputSplit split) throws IOException {
    super.open(split);

    this.bufferSize = this.bufferSize <= 0 ? DEFAULT_READ_BUFFER_SIZE : this.bufferSize;
    this.readBuffer = new byte[this.bufferSize];
    this.wrapBuffer = new byte[256];

    this.readPos = 0;
    // the fresh buffer holds no data yet; without this reset a re-opened instance would keep the
    // limit of the previous split and scan that many stale positions of the new buffer
    this.limit = 0;
    this.overLimit = false;
    this.end = false;

    if (this.splitStart != 0) {
        this.stream.seek(this.splitStart);
        readLine();

        // if the first partial record already pushes the stream over the limit of our split, then no
        // record starts within this split
        if (this.overLimit) {
            this.end = true;
        }
    } else {
        fillBuffer();
    }
}
/**
 * Tells whether all records of the current split have been consumed.
 *
 * @return <tt>true</tt> once the end of the split has been reached, <tt>false</tt> otherwise.
 */
@Override
public boolean reachedEnd() {
    final boolean atEnd = this.end;
    return atEnd;
}
/**
 * Reads the next delimited record from the split and hands its bytes to
 * {@link #readRecord(Tuple, byte[], int, int)}. When no further record can be read, the format is
 * marked as having reached its end.
 *
 * @param record The tuple to fill with the next record.
 * @return The result of <tt>readRecord</tt>, or <tt>false</tt> if no further record was available.
 * @throws IOException Thrown, if reading from the stream fails.
 *
 * @see eu.stratosphere.pact.common.generic.io.InputFormat#nextRecord(java.lang.Object)
 */
@Override
public boolean nextRecord(Tuple record) throws IOException {
    final boolean lineAvailable = readLine();
    if (!lineAvailable) {
        // nothing left in this split
        this.end = true;
        return false;
    }
    return readRecord(record, this.currBuffer, this.currOffset, this.currLen);
}
/**
 * Releases the read buffer and the wrap buffer and then lets the super class close the input.
 *
 * @throws IOException Thrown, if the closing of the file stream causes an I/O error.
 */
@Override
public void close() throws IOException {
    // drop the buffers first so they can be reclaimed even if closing the stream fails
    this.readBuffer = null;
    this.wrapBuffer = null;
    super.close();
}
// --------------------------------------------------------------------------------------------
/**
 * Reads the next record, i.e. all bytes up to (and excluding) the next delimiter, and publishes
 * its location through {@link #setResult(byte[], int, int)}. A record that lies completely inside
 * the read buffer is referenced in place; a record that spans several buffer fills is assembled
 * in the wrap buffer. Trailing bytes at the end of the stream that are not terminated by a
 * delimiter are returned as a final record.
 *
 * @return <tt>true</tt> if a record was read, <tt>false</tt> if the stream is closed, the split
 *         limit was already exceeded, or no bytes are left.
 * @throws IOException Thrown, if reading from the stream fails.
 */
private boolean readLine() throws IOException {
    if (this.stream == null || this.overLimit) {
        return false;
    }

    int countInWrapBuffer = 0;

    /* number of leading delimiter bytes matched so far */
    int i = 0;

    while (true) {
        if (this.readPos >= this.limit) {
            if (!fillBuffer()) {
                if (countInWrapBuffer > 0) {
                    // end of stream: hand out what was collected as the last record
                    setResult(this.wrapBuffer, 0, countInWrapBuffer);
                    return true;
                }
                return false;
            }
        }

        final int startPos = this.readPos;

        while (this.readPos < this.limit && i < this.delimiter.length) {
            final byte current = this.readBuffer[this.readPos++];
            if (current == this.delimiter[i]) {
                i++;
            } else if (i > 0) {
                // a partial match broke off; the bytes seen so far may still end with a shorter
                // prefix of the delimiter (e.g. "\r\r\n" for the delimiter "\r\n")
                i = matchedPrefixAfterMismatch(i, current);
            }
        }

        // check why we dropped out
        if (i == this.delimiter.length) {
            // line end. 'count' is negative when the delimiter started in the previous buffer
            // fill, in which case its first bytes are already in the wrap buffer.
            final int count = this.readPos - startPos - this.delimiter.length;

            if (countInWrapBuffer > 0) {
                // check wrap buffer size
                if (this.wrapBuffer.length < countInWrapBuffer + count) {
                    final byte[] nb = new byte[countInWrapBuffer + count];
                    System.arraycopy(this.wrapBuffer, 0, nb, 0, countInWrapBuffer);
                    this.wrapBuffer = nb;
                }
                if (count >= 0) {
                    System.arraycopy(this.readBuffer, startPos, this.wrapBuffer, countInWrapBuffer, count);
                }
                setResult(this.wrapBuffer, 0, countInWrapBuffer + count);
            } else {
                setResult(this.readBuffer, startPos, count);
            }
            return true;
        }

        // buffer exhausted without a complete delimiter: stash the bytes and continue
        final int count = this.limit - startPos;
        while (this.wrapBuffer.length - countInWrapBuffer < count) {
            // reallocate
            final byte[] tmp = new byte[this.wrapBuffer.length * 2];
            System.arraycopy(this.wrapBuffer, 0, tmp, 0, countInWrapBuffer);
            this.wrapBuffer = tmp;
        }
        System.arraycopy(this.readBuffer, startPos, this.wrapBuffer, countInWrapBuffer, count);
        countInWrapBuffer += count;
    }
}

/**
 * Computes the match state after the byte <tt>next</tt> failed to extend a partial delimiter
 * match of length <tt>matched</tt>: the length of the longest delimiter prefix that is a suffix
 * of the first <tt>matched</tt> delimiter bytes followed by <tt>next</tt>. Only the delimiter
 * itself is inspected, so this also works when the partial match began in an earlier buffer fill.
 */
private int matchedPrefixAfterMismatch(int matched, byte next) {
    for (int k = matched; k > 0; k--) {
        if (this.delimiter[k - 1] != next) {
            continue;
        }
        // the first k-1 delimiter bytes must equal the last k-1 bytes of the old partial match
        final int shift = matched - (k - 1);
        int j = 0;
        while (j < k - 1 && this.delimiter[j] == this.delimiter[shift + j]) {
            j++;
        }
        if (j == k - 1) {
            return k;
        }
    }
    return 0;
}
/**
 * Records where the most recently read record is located.
 *
 * @param buffer The array that holds the record.
 * @param offset The index of the record's first byte in <tt>buffer</tt>.
 * @param len The length of the record in bytes.
 */
private final void setResult(byte[] buffer, int offset, int len) {
    this.currLen = len;
    this.currOffset = offset;
    this.currBuffer = buffer;
}
/**
 * Refills the read buffer from the stream. While the split has bytes left, at most that many
 * bytes are requested; once the remaining split length is zero or less, a full buffer is
 * requested and the format is flagged as reading past its split. At the end of the stream, the
 * stream is closed and cleared.
 *
 * @return <tt>true</tt> if the buffer was refilled, <tt>false</tt> if the stream is exhausted.
 * @throws IOException Thrown, if reading from or closing the stream fails.
 */
private final boolean fillBuffer() throws IOException {
    final int capacity = this.readBuffer.length;
    final int toRead;
    if (this.splitLength <= 0) {
        // the split is used up; anything read from here on lies beyond its end
        this.overLimit = true;
        toRead = capacity;
    } else if (this.splitLength > capacity) {
        toRead = capacity;
    } else {
        toRead = (int) this.splitLength;
    }

    final int numRead = this.stream.read(this.readBuffer, 0, toRead);
    if (numRead == -1) {
        this.stream.close();
        this.stream = null;
        return false;
    }

    this.splitLength -= numRead;
    this.readPos = 0;
    this.limit = numRead;
    return true;
}
// ============================================================================================
/**
 * Creates a builder that writes the parameters of this input format into the configuration of
 * the given data source in a fluent fashion.
 *
 * @param target The data source whose parameters are to be set.
 * @return A config builder for setting parameters.
 */
public static ConfigBuilder configureDelimitedFormat(FileDataSource target) {
    final Configuration parameters = target.getParameters();
    return new ConfigBuilder(parameters);
}
/**
 * Base class of the fluent builders that write the parameters of the delimited input format
 * into a configuration.
 *
 * @param <T> The concrete builder type that the setter methods return.
 */
protected static class AbstractConfigBuilder<T> extends SPigFileInputFormat.AbstractConfigBuilder<T> {

    private static final String NEWLINE_DELIMITER = "\n";

    // --------------------------------------------------------------------

    /**
     * Creates a new builder for the given configuration.
     *
     * @param config The configuration into which the parameters will be written.
     */
    protected AbstractConfigBuilder(Configuration config) {
        super(config);
    }

    // --------------------------------------------------------------------

    /**
     * Sets the record delimiter to the given single character. The character is expected to be
     * within the value range <code>0</code> to <code>127</code>; this is not checked here.
     *
     * @param delimiter The delimiter character.
     * @return The builder itself.
     */
    public T recordDelimiter(char delimiter) {
        final String value = delimiter == '\n' ? NEWLINE_DELIMITER : String.valueOf(delimiter);
        this.config.setString(RECORD_DELIMITER, value);
        return asBuilder();
    }

    /**
     * Sets the record delimiter to the given string. Since no charset name is stored along with
     * it, the input format converts the string to bytes using the platform's default charset.
     *
     * @param delimiter The delimiter string.
     * @return The builder itself.
     */
    public T recordDelimiter(String delimiter) {
        this.config.setString(RECORD_DELIMITER, delimiter);
        return asBuilder();
    }

    /**
     * Sets the record delimiter to the given string and stores the name of the charset that the
     * input format uses to convert it to bytes. The charset must be available on the processing
     * nodes, otherwise an exception is raised when the input format is configured.
     *
     * @param delimiter The delimiter string.
     * @param charsetName The name of the encoding character set.
     * @return The builder itself.
     */
    public T recordDelimiter(String delimiter, String charsetName) {
        this.config.setString(RECORD_DELIMITER, delimiter);
        this.config.setString(RECORD_DELIMITER_ENCODING, charsetName);
        return asBuilder();
    }

    /**
     * Sets the number of line samples to take when estimating the base statistics of the input.
     *
     * @param numSamples The number of line samples to take.
     * @return The builder itself.
     */
    public T numSamplesForStatistics(int numSamples) {
        this.config.setInteger(NUM_STATISTICS_SAMPLES, numSamples);
        return asBuilder();
    }

    /**
     * Returns this builder typed as <code>T</code>, so the unchecked cast lives in one place.
     */
    @SuppressWarnings("unchecked")
    private T asBuilder() {
        return (T) this;
    }
}
/**
 * The concrete fluent builder for the parameters of the delimited input format. It adds no
 * parameters of its own; it only fixes the builder type returned by the inherited setters.
 */
public static class ConfigBuilder extends AbstractConfigBuilder<ConfigBuilder> {

    /**
     * Creates a new builder for the given configuration.
     *
     * @param targetConfig The configuration into which the parameters will be written.
     */
    protected ConfigBuilder(Configuration targetConfig) {
        super(targetConfig);
    }
}
}