/*
* RapidMiner
*
* Copyright (C) 2001-2008 by Rapid-I and the contributors
*
* Complete list of developers available at our web site:
*
* http://rapid-i.com
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see http://www.gnu.org/licenses/.
*/
package com.rapidminer.operator.io;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.List;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.example.table.DataRowFactory;
import com.rapidminer.operator.IOObject;
import com.rapidminer.operator.Operator;
import com.rapidminer.operator.OperatorDescription;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.UserError;
import com.rapidminer.parameter.ParameterType;
import com.rapidminer.parameter.ParameterTypeCategory;
import com.rapidminer.parameter.ParameterTypeFile;
import com.rapidminer.parameter.UndefinedParameterError;
/**
* Superclass for file data source operators which read
* the file byte per byte into a byte array and extract
* the actual data from that array. This class provides
* some methods to extract integer and floating point
* values from such an array.
*
* @author Tobias Malbrecht
* @version $Id: BytewiseExampleSource.java,v 1.3 2008/08/27 16:14:45 tobiasmalbrecht Exp $
*/
public abstract class BytewiseExampleSource extends Operator {
/** The parameter name for "Name of the file to read the data from." */
public static final String PARAMETER_FILENAME = "filename";
/** The parameter name for "Determines, how the data is represented internally." */
public static final String PARAMETER_DATAMANAGEMENT = "datamanagement";
/** A generic wrong file format error message. */
protected static final String GENERIC_ERROR_MESSAGE = "Wrong file format";
/** A even more generic error message. */
protected static final String UNSPECIFIED_ERROR_MESSAGE = "Unspecified error";
/** The length of a byte measured in bytes. */
protected static final int LENGTH_BYTE = 1;
/** The length of an int measured in bytes. */
protected static final int LENGTH_INT_32 = 4;
/** The length of a double measured in bytes. */
protected static final int LENGTH_DOUBLE = 8;
private static final Class[] INPUT_CLASSES = {};
private static final Class[] OUTPUT_CLASSES = { ExampleSet.class };
public BytewiseExampleSource(OperatorDescription description) {
super(description);
}
public IOObject[] apply() throws OperatorException {
File file = getParameterAsFile(PARAMETER_FILENAME);
DataRowFactory dataRowFactory = new DataRowFactory(getParameterAsInt(PARAMETER_DATAMANAGEMENT), '.');
ExampleSet result = null;
// read file and construct example set
try {
result = readFile(file, dataRowFactory);
} catch (IOException e) {
throw new UserError(this, 302, file, e.getMessage());
}
// verify that the result is not null
if (result == null) {
throw new UserError(this, 302, file, UNSPECIFIED_ERROR_MESSAGE);
}
// verify that the resulting example set is not empty
if (result.size() == 0) {
throw new UserError(this, 117);
}
return new IOObject[] { result };
}
/**
* Returns the suffix of the files which should be read
* by the input operator.
*/
protected abstract String getFileSuffix();
/**
* Reads the given file and constructs an example set from the
* read data.
*/
protected abstract ExampleSet readFile(File file, DataRowFactory dataRowFactory) throws IOException, UndefinedParameterError;
/**
* Reads a number (specified by length) of bytes from a given
* file reader into a byte array beginning at index 0.
*/
protected int read(FileInputStream fileReader, byte[] buffer, int length) throws IOException {
final int offset = 0;
return read(fileReader, buffer, offset, length);
}
/**
* Reads a number (specified by length) of bytes from a given
* file reader into a byte array beginning at the given offset.
*/
protected int read(FileInputStream fileReader, byte[] buffer, int offset, int length) throws IOException {
int readLength = fileReader.read(buffer, offset, length);
if (readLength != length)
throw new IOException("wrong byte length");
return readLength;
}
/**
* Reads a number (specified by length) of bytes from a given
* file reader into a byte array beginning at index 0. No read
* length verification is performed.
*/
protected int readWithoutLengthCheck(FileInputStream fileReader, byte[] buffer, int length) throws IOException {
return fileReader.read(buffer, 0, length);
}
/**
* Reads bytes from a given file reader until either a certain
* character is read, the buffer is completely filled or the
* end of file is reached.
*/
protected int read(FileInputStream fileReader, byte[] buffer, char divider) throws IOException {
int index = 0;
do {
byte readByte = (byte) (0x000000FF & fileReader.read());
if (readByte == -1 || readByte == (byte) divider) {
index++;
return index;
}
buffer[index] = readByte;
index++;
} while (index < buffer.length);
return index;
}
/**
* Reads bytes from a given file reader until either a specified
* character sequence is read, the buffer is completely filled or the
* end of file is reached.
*/
protected int read(FileInputStream fileReader, byte[] buffer, char[] divider) throws IOException {
int index = 0;
int dividerIndex = 0;
do {
byte readByte = (byte) (0x000000FF & fileReader.read());
if (readByte == -1) {
index++;
return index;
}
if (readByte == divider[dividerIndex]) {
dividerIndex++;
}
if (dividerIndex == divider.length) {
index -= dividerIndex - 1;
for (int i = index; i < index + dividerIndex; i++) {
if (i >= buffer.length) {
break;
}
buffer[i] = 0;
}
return index;
}
buffer[index] = readByte;
index++;
} while (index < buffer.length);
return index;
}
/**
* Extracts a 2-byte (short) int from a byte array.
*/
protected int extract2ByteInt(byte[] buffer, int offset, boolean reverseEndian) {
int r = 0;
if (reverseEndian) {
r = (buffer[offset + 1] << 8) + (0x000000FF & buffer[offset]);
} else {
r = (buffer[offset] << 8) + (0x000000FF & buffer[offset + 1]);
}
return r;
}
/**
* Extracts an int from a byte array.
*/
protected int extractInt(byte[] buffer, int offset, boolean reverseEndian) {
int r = 0;
if (reverseEndian) {
for (int i = offset + 3; i >= offset; i--) {
r = r << 8;
r += 0x000000FF & buffer[i];
}
} else {
for (int i = offset; i < offset + 4; i++) {
r = r << 8;
r += 0x000000FF & buffer[i];
}
}
return r;
}
/**
* Extracts a float from a byte array.
*/
protected float extractFloat(byte[] value, int offset, boolean reverseEndian) {
int bits = 0;
if (reverseEndian) {
for (int i = offset + 3; i >= offset; i--) {
bits = bits << 8;
bits += 0x000000FF & value[i];
}
} else {
for (int i = offset; i < offset + 4; i++) {
bits = bits << 8;
bits += 0x000000FF & value[i];
}
}
return java.lang.Float.intBitsToFloat(bits);
}
/**
* Extracts a double from a byte array.
*/
protected double extractDouble(byte[] value, int offset, boolean reverseEndian) {
long bits = 0;
if (reverseEndian) {
for (int i = offset + 7; i >= offset; i--) {
bits = bits << 8;
bits += 0x000000FF & value[i];
}
} else {
for (int i = offset; i < offset + 8; i++) {
bits = bits << 8;
bits += 0x000000FF & value[i];
}
}
return java.lang.Double.longBitsToDouble(bits);
}
/**
* Extracts string from byte array.
*/
protected String extractString(byte[] value, int offset, int length) {
/* TODO: Shevek suggests this use a Charset for safety. */
return (new String(value, offset, length)).trim();
}
public Class<?>[] getInputClasses() {
return INPUT_CLASSES;
}
public Class<?>[] getOutputClasses() {
return OUTPUT_CLASSES;
}
public List<ParameterType> getParameterTypes() {
List<ParameterType> types = super.getParameterTypes();
ParameterType type = new ParameterTypeFile(PARAMETER_FILENAME, "Name of the file to read the data from.", getFileSuffix(), false);
type.setExpert(false);
types.add(type);
type = new ParameterTypeCategory(PARAMETER_DATAMANAGEMENT, "Determines, how the data is represented internally.", DataRowFactory.TYPE_NAMES, DataRowFactory.TYPE_DOUBLE_ARRAY);
type.setExpert(true);
types.add(type);
return types;
}
}