/***********************************************************************************************************************
* Copyright (C) 2010-2013 by the Stratosphere project (http://stratosphere.eu)
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
* an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
* specific language governing permissions and limitations under the License.
**********************************************************************************************************************/
package eu.stratosphere.api.java.record.io;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import eu.stratosphere.api.java.record.operators.FileDataSink;
import eu.stratosphere.configuration.Configuration;
import eu.stratosphere.types.Record;
/**
 * The base class for output formats that serialize their records into a delimited sequence.
 * <p>
 * Every record is turned into bytes by {@link #serializeRecord(Record, byte[])} and is followed by the configured
 * record delimiter. The bytes are collected in an internal write buffer, which is written to the output stream
 * each time it runs full and once more when the format is closed.
 */
public abstract class DelimitedOutputFormat extends FileOutputFormat {

    private static final long serialVersionUID = 1L;

    /**
     * The configuration key for the entry that defines the record delimiter.
     */
    public static final String RECORD_DELIMITER = "pact.output.delimited.delimiter";

    /**
     * The configuration key to set the record delimiter encoding.
     */
    private static final String RECORD_DELIMITER_ENCODING = "pact.output.delimited.delimiter-encoding";

    /**
     * The configuration key for the entry that defines the write-buffer size.
     */
    public static final String WRITE_BUFFER_SIZE = "pact.output.delimited.buffersize";

    /**
     * The default write-buffer size. 64 KiByte.
     */
    private static final int DEFAULT_WRITE_BUFFER_SIZE = 64 * 1024;

    /**
     * The minimal write-buffer size, 1 KiByte.
     */
    private static final int MIN_WRITE_BUFFER_SIZE = 1024;

    // --------------------------------------------------------------------------------------------

    /** The encoded record delimiter that is appended after every record. */
    private byte[] delimiter;

    /** The write buffer in which serialized records are collected before they go to the stream. */
    private byte[] buffer;

    /** The scratch array handed to {@link #serializeRecord(Record, byte[])}; grown on demand. */
    private byte[] targetArray = new byte[64];

    /** The number of bytes currently held in {@link #buffer}. */
    private int pos;

    /** The configured size of the write buffer, in bytes. */
    private int bufferSize;

    // --------------------------------------------------------------------------------------------

    /**
     * Calls the super classes to configure themselves and reads the config parameters for the delimiter and
     * the write buffer size.
     *
     * @param config The configuration to read the parameters from.
     *
     * @throws IllegalArgumentException If the delimiter is null, if the configured delimiter charset is not
     *                                  supported, or if the write buffer size is below the minimum.
     *
     * @see eu.stratosphere.api.java.record.io.FileOutputFormat#configure(eu.stratosphere.configuration.Configuration)
     */
    @Override
    public void configure(Configuration config) {
        super.configure(config);

        final String delim = config.getString(RECORD_DELIMITER, "\n");
        final String charsetName = config.getString(RECORD_DELIMITER_ENCODING, null);

        if (delim == null) {
            throw new IllegalArgumentException("The delimiter in the DelimitedOutputFormat must not be null.");
        }

        try {
            // without an explicit charset name, the platform's default charset is used for the conversion
            this.delimiter = charsetName == null ? delim.getBytes() : delim.getBytes(charsetName);
        } catch (UnsupportedEncodingException useex) {
            throw new IllegalArgumentException("The charset with the name '" + charsetName +
                "' is not supported on this TaskManager instance.", useex);
        }

        this.bufferSize = config.getInteger(WRITE_BUFFER_SIZE, DEFAULT_WRITE_BUFFER_SIZE);
        if (this.bufferSize < MIN_WRITE_BUFFER_SIZE) {
            throw new IllegalArgumentException("The write buffer size must not be less than " + MIN_WRITE_BUFFER_SIZE
                + " bytes.");
        }
    }

    /**
     * Opens the format via the super class and prepares the write buffer and the serialization scratch array.
     *
     * @param taskNumber The task number, passed on to the super class.
     * @param numTasks The number of tasks, passed on to the super class.
     *
     * @throws IOException Forwarded from the super class' open method.
     */
    @Override
    public void open(int taskNumber, int numTasks) throws IOException {
        super.open(taskNumber, numTasks);

        // (Re-)allocate the buffer when there is none yet, or when the configured size no longer matches the
        // existing array. The write path indexes the buffer by bufferSize, so the two must always agree.
        if (this.buffer == null || this.buffer.length != this.bufferSize) {
            this.buffer = new byte[this.bufferSize];
        }
        if (this.targetArray == null) {
            this.targetArray = new byte[64];
        }
        this.pos = 0;
    }

    // --------------------------------------------------------------------------------------------

    /**
     * Writes the bytes that are still held in the write buffer to the stream and closes the format via the
     * super class. The super class' close method is invoked even if writing the remaining bytes fails.
     *
     * @throws IOException If writing the remaining bytes fails, or forwarded from the super class' close method.
     */
    @Override
    public void close() throws IOException {
        try {
            if (this.stream != null) {
                // reset the position before writing, so that a repeated close() cannot emit the same bytes again
                final int pending = this.pos;
                this.pos = 0;
                this.stream.write(this.buffer, 0, pending);
            }
        } finally {
            // close file stream
            super.close();
        }
    }

    /**
     * This method is called for every record so serialize itself into the given target array. The method should
     * return the number of bytes occupied in the target array. If the target array is not large enough, a negative
     * value should be returned.
     * <p>
     * The absolute value of the returned integer can be given as a hint how large an array is required. The array is
     * resized to the return value's absolute value, if that is larger than the current array size. Otherwise, the
     * array size is simply doubled.
     *
     * @param rec The record to be serialized.
     * @param target The array to serialize the record into.
     * @return The length of the serialized contents, or a negative value, indicating that the array is too small.
     *
     * @throws Exception If the user code produces an exception that prevents processing the record, it should
     *                   throw it such that the engine recognizes the situation as a fault.
     */
    public abstract int serializeRecord(Record rec, byte[] target) throws Exception;

    /**
     * Serializes the given record and appends its bytes, followed by the record delimiter, to the write buffer.
     * Whenever the write buffer is completely filled, its contents are written to the stream.
     *
     * @param record The record to write.
     *
     * @throws IOException If the serialization of the record throws an exception (which is attached as the
     *                     cause), or if writing the buffer to the stream fails.
     */
    @Override
    public void writeRecord(Record record) throws IOException {
        int size;
        try {
            // retry with a larger scratch array for as long as the serializer reports that it is too small
            while ((size = serializeRecord(record, this.targetArray)) < 0) {
                final int requested = -size;
                if (requested > this.targetArray.length) {
                    // the serializer gave a usable size hint
                    this.targetArray = new byte[requested];
                } else {
                    // no usable hint, simply double the array
                    this.targetArray = new byte[this.targetArray.length * 2];
                }
            }
        } catch (Exception ex) {
            throw new IOException("Error while serializing the record to bytes: " + ex.getMessage(), ex);
        }

        copyToWriteBuffer(this.targetArray, size);
        copyToWriteBuffer(this.delimiter, this.delimiter.length);
    }

    /**
     * Appends the first <code>numBytes</code> bytes of the given array to the write buffer. The bytes are copied
     * piecewise if they do not fit into the remaining space; each time the buffer is full, it is written to the
     * stream and filled again from its start.
     *
     * @param source The array to copy the bytes from.
     * @param numBytes The number of bytes to copy, starting at index zero of the source array.
     *
     * @throws IOException If writing the full buffer to the stream fails.
     */
    private void copyToWriteBuffer(byte[] source, int numBytes) throws IOException {
        int off = 0;
        while (off < numBytes) {
            final int toCopy = Math.min(numBytes - off, this.bufferSize - this.pos);
            System.arraycopy(source, off, this.buffer, this.pos, toCopy);
            off += toCopy;
            this.pos += toCopy;

            if (this.pos == this.bufferSize) {
                // the position is reset before the write, so a failed write leaves no bytes marked as pending
                this.pos = 0;
                this.stream.write(this.buffer, 0, this.bufferSize);
            }
        }
    }

    // ============================================================================================

    /**
     * Creates a configuration builder that can be used to set the output format's parameters to the config in a
     * fluent fashion.
     *
     * @param target The data sink whose parameters the builder writes to.
     * @return A config builder for setting parameters.
     */
    public static ConfigBuilder configureDelimitedFormat(FileDataSink target) {
        return new ConfigBuilder(target.getParameters());
    }

    /**
     * A builder used to set parameters to the output format's configuration in a fluent way.
     *
     * @param <T> The concrete builder type, returned by the setters to allow call chaining.
     */
    protected static abstract class AbstractConfigBuilder<T> extends FileOutputFormat.AbstractConfigBuilder<T> {

        // --------------------------------------------------------------------

        /**
         * Creates a new builder for the given configuration.
         *
         * @param config The configuration into which the parameters will be written.
         */
        protected AbstractConfigBuilder(Configuration config) {
            super(config);
        }

        // --------------------------------------------------------------------

        /**
         * Sets the delimiter to be a single character, namely the given one. The character must be within
         * the value range <code>0</code> to <code>127</code>; this method does not validate the range.
         *
         * @param delimiter The delimiter character.
         * @return The builder itself.
         */
        public T recordDelimiter(char delimiter) {
            this.config.setString(RECORD_DELIMITER, String.valueOf(delimiter));

            @SuppressWarnings("unchecked")
            T ret = (T) this;
            return ret;
        }

        /**
         * Sets the delimiter to be the given string. The string is converted to bytes when the output format
         * is configured. The conversion will be done using the platforms default charset.
         *
         * @param delimiter The delimiter string.
         * @return The builder itself.
         */
        public T recordDelimiter(String delimiter) {
            this.config.setString(RECORD_DELIMITER, delimiter);

            @SuppressWarnings("unchecked")
            T ret = (T) this;
            return ret;
        }

        /**
         * Sets the delimiter to be the given string. The string is converted to bytes when the output format
         * is configured. The conversion will be done using the charset with the given name.
         * The charset must be available on the processing nodes, otherwise an exception will be raised at
         * runtime.
         *
         * @param delimiter The delimiter string.
         * @param charsetName The name of the encoding character set.
         * @return The builder itself.
         */
        public T recordDelimiter(String delimiter, String charsetName) {
            this.config.setString(RECORD_DELIMITER, delimiter);
            this.config.setString(RECORD_DELIMITER_ENCODING, charsetName);

            @SuppressWarnings("unchecked")
            T ret = (T) this;
            return ret;
        }

        /**
         * Sets the size of the write buffer. Values below 1024 bytes are rejected when the output format is
         * configured.
         *
         * @param sizeInBytes The size of the write buffer in bytes.
         * @return The builder itself.
         */
        public T writeBufferSize(int sizeInBytes) {
            this.config.setInteger(WRITE_BUFFER_SIZE, sizeInBytes);

            @SuppressWarnings("unchecked")
            T ret = (T) this;
            return ret;
        }
    }

    /**
     * A builder used to set parameters to the output format's configuration in a fluent way.
     */
    public static class ConfigBuilder extends AbstractConfigBuilder<ConfigBuilder> {

        /**
         * Creates a new builder for the given configuration.
         *
         * @param targetConfig The configuration into which the parameters will be written.
         */
        protected ConfigBuilder(Configuration targetConfig) {
            super(targetConfig);
        }
    }
}