/***********************************************************************************************************************
* Copyright (C) 2010-2013 by the Stratosphere project (http://stratosphere.eu)
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
* an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
* specific language governing permissions and limitations under the License.
**********************************************************************************************************************/
package eu.stratosphere.api.java.record.io;
import java.io.IOException;
import eu.stratosphere.configuration.Configuration;
import eu.stratosphere.core.io.GenericInputSplit;
import eu.stratosphere.types.Record;
/**
 * This input format starts an external process and reads its input from the standard out (stdout) of the started process.
 * The input is split into fixed-sized segments from which a {@link Record} is generated.
 * The external process is started outside of the JVM via a provided start command and can be an arbitrary program,
 * e.g., a data generator or a shell script. The input format checks the exit code of the process
 * to validate whether the process terminated correctly. A list of allowed exit codes can be provided.
 * The input format requires {@link ExternalProcessInputSplit} objects that hold the command to execute.
 *
 * <b>Warning:</b> This format does not consume the standard error stream (stderr) of the started process. This might cause deadlocks.
 *
 * @param <T> The type of the input split (must extend ExternalProcessInputSplit)
 */
public abstract class ExternalProcessFixedLengthInputFormat<T extends ExternalProcessInputSplit> extends ExternalProcessInputFormat<T> {

    private static final long serialVersionUID = 1L;

    /**
     * The config parameter which defines the fixed length of a record.
     */
    public static final String RECORDLENGTH_PARAMETER_KEY = "pact.input.recordLength";

    /**
     * The default read buffer size = 1MB.
     */
    private static final int DEFAULT_TARGET_READ_BUFFER_SIZE = 1024 * 1024;

    /**
     * Buffer holding a batch of record bytes read from the stdout stream of the external process.
     * Its length is always a multiple of {@link #recordLength}, so a record is never split across refills.
     */
    private byte[] readBuffer;

    /**
     * Read position within the read buffer (start of the next record to hand out).
     */
    private int readBufferReadPos;

    /**
     * Fill marker within the read buffer (first byte not yet written by a stream read).
     */
    private int readBufferFillPos;

    /**
     * Remaining space within the read buffer (buffer length minus fill marker).
     */
    private int readBufferRemainSpace;

    /**
     * Target size of the read buffer.
     */
    private int targetReadBufferSize = DEFAULT_TARGET_READ_BUFFER_SIZE;

    /**
     * Fixed length of all records in bytes.
     */
    protected int recordLength;

    /**
     * Set once the stdout stream signalled end-of-stream.
     */
    private boolean noMoreStreamInput;

    /**
     * Set once the stream ended and all buffered records were handed out; this is what {@link #reachedEnd()} reports.
     */
    private boolean noMoreRecordBuffers;

    /**
     * Reads a record out of the given buffer. This operation always consumes the standard number of
     * bytes, regardless of whether the produced record was valid.
     *
     * @param target The target Record
     * @param buffer The buffer containing the binary data.
     * @param startPos The start position in the byte array.
     * @return True, if the record is valid, false otherwise.
     */
    public abstract boolean readBytes(Record target, byte[] buffer, int startPos);

    /**
     * Configures the parent format and reads the fixed record length from the configuration.
     *
     * @param parameters The configuration holding {@link #RECORDLENGTH_PARAMETER_KEY}.
     * @throws IllegalArgumentException If the record length is missing or smaller than 1.
     */
    @Override
    public void configure(Configuration parameters) {
        // configure parent
        super.configure(parameters);

        // read own parameters
        this.recordLength = parameters.getInteger(RECORDLENGTH_PARAMETER_KEY, 0);
        if (recordLength < 1) {
            throw new IllegalArgumentException("The record length parameter must be set and larger than 0.");
        }
    }

    /**
     * Sets the target size of the buffer to be used to read from the stdout stream.
     * The actual size depends on the record length since it is chosen such that records are not split.
     * A target smaller than the record length results in a buffer of exactly one record.
     * This method has only an effect, if it is called before the input format is opened.
     *
     * @param targetReadBufferSize The target size of the read buffer.
     */
    public void setTargetReadBufferSize(int targetReadBufferSize) {
        this.targetReadBufferSize = targetReadBufferSize;
    }

    /**
     * Opens the parent format for the given split and allocates and resets the read buffer.
     *
     * @param split The input split to open.
     * @throws IOException Declared for errors raised while opening the parent format.
     * @throws IllegalStateException If the record length has not been set to a value larger than 0.
     */
    @Override
    public void open(GenericInputSplit split) throws IOException {
        // Validate before opening the parent: a non-positive record length would otherwise surface as an
        // ArithmeticException (modulo by zero) or as negative buffer offsets after the parent was already opened.
        if (this.recordLength < 1) {
            throw new IllegalStateException(
                "The record length must be configured and larger than 0 before the input format is opened.");
        }

        super.open(split);

        this.readBuffer = new byte[computeReadBufferSize()];

        // initialize read buffer positions
        this.readBufferReadPos = 0;
        this.readBufferFillPos = 0;
        this.readBufferRemainSpace = this.readBuffer.length;

        // initialize end flags
        this.noMoreStreamInput = false;
        this.noMoreRecordBuffers = false;
    }

    /**
     * Reports whether the stream ended and every buffered record has been handed out.
     *
     * @return True once no further records can be produced, false otherwise.
     */
    @Override
    public boolean reachedEnd() throws IOException {
        return noMoreRecordBuffers;
    }

    /**
     * Produces the next record from the read buffer, refilling the buffer from the stream when it holds
     * less than one complete record.
     *
     * @param reuse The record instance handed to {@link #readBytes(Record, byte[], int)}.
     * @return {@code reuse} if a valid record was read; {@code null} if the record was reported invalid
     *         or if no further record is available.
     * @throws IOException If reading from the stream fails.
     * @throws RuntimeException If the stream ended in the middle of a record.
     */
    @Override
    public Record nextRecord(Record reuse) throws IOException {
        // check if read buffer must be filled (less than one record contained)
        if (this.readBufferFillPos - this.readBufferReadPos < this.recordLength) {
            // try to fill read buffer
            if (!this.fillReadBuffer()) {
                return null;
            }
        }

        // the record bytes are consumed regardless of whether the record turns out to be valid
        final int recordStart = this.readBufferReadPos;
        this.readBufferReadPos += this.recordLength;

        return this.readBytes(reuse, this.readBuffer, recordStart) ? reuse : null;
    }

    /**
     * Computes the read buffer size: the smallest multiple of the record length that is not smaller than
     * the target size, or exactly one record if the record is larger than the target.
     *
     * @return The buffer size in bytes, always a positive multiple of the record length.
     */
    private int computeReadBufferSize() {
        if (this.recordLength > this.targetReadBufferSize) {
            // read buffer is at least as big as record
            return this.recordLength;
        }

        final int remainder = this.targetReadBufferSize % this.recordLength;
        if (remainder == 0) {
            // target read buffer size is a multiple of record length, so it's ok
            return this.targetReadBufferSize;
        }

        // extend the target size such that records are not split; computed in long to detect int overflow
        final long roundedUp = (long) this.targetReadBufferSize + (this.recordLength - remainder);
        if (roundedUp <= Integer.MAX_VALUE) {
            return (int) roundedUp;
        }

        // Rounding up does not fit into an int. Round down instead; the result is still at least one
        // record because the target is not smaller than the record length on this path.
        return this.targetReadBufferSize - remainder;
    }

    /**
     * Fills the read buffer by reading from the stdout stream of the external process.
     * WARNING: We do not read from the error stream. This might cause a deadlock.
     *
     * @return true if at least one complete record is available in the buffer, false if the stream is exhausted.
     * @throws IOException If reading from the stream fails.
     * @throws RuntimeException If the stream ended in the middle of a record.
     */
    private boolean fillReadBuffer() throws IOException {
        // TODO: Add reading from error stream of external process. Otherwise the InputFormat might get deadlocked!

        // stream was completely processed
        if (this.noMoreStreamInput) {
            return finishStream();
        }

        // the buffer was completely filled and processed
        if (this.readBufferReadPos == this.readBuffer.length && this.readBufferRemainSpace == 0) {
            // reset counters and fill again
            this.readBufferFillPos = 0;
            this.readBufferRemainSpace = this.readBuffer.length;
            this.readBufferReadPos = 0;
        }

        // as long as not at least one record is complete
        while (this.readBufferFillPos - this.readBufferReadPos < this.recordLength) {
            // read from stdout
            final int readCnt = super.extProcOutStream.read(this.readBuffer, this.readBufferFillPos, this.readBufferRemainSpace);
            if (readCnt == -1) {
                // There is nothing more to read. Settle the end state right away so that reachedEnd()
                // is accurate (and a truncated record is reported) without requiring another call.
                this.noMoreStreamInput = true;
                return finishStream();
            }

            // update fill position and remain cnt
            this.readBufferFillPos += readCnt;
            this.readBufferRemainSpace -= readCnt;
        }
        return true;
    }

    /**
     * Settles the end-of-stream state: marks the input as finished if no bytes are left over,
     * or fails if the leftover bytes do not form a complete record.
     *
     * @return Always false, signalling that no new content is available.
     * @throws RuntimeException If unconsumed bytes remain in the buffer.
     */
    private boolean finishStream() {
        if (this.readBufferReadPos != this.readBufferFillPos) {
            throw new RuntimeException("External process produced incomplete record");
        }
        this.noMoreRecordBuffers = true;
        return false;
    }
}