package com.rapidminer.operator.io;
import java.io.IOException;
import java.io.StreamTokenizer;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;
import com.rapidminer.example.Attribute;
import com.rapidminer.example.table.AttributeFactory;
import com.rapidminer.example.table.DataRow;
import com.rapidminer.example.table.DataRowFactory;
import com.rapidminer.example.table.ExampleTable;
import com.rapidminer.example.table.MemoryExampleTable;
import com.rapidminer.parameter.UndefinedParameterError;
import com.rapidminer.tools.Ontology;
import com.rapidminer.tools.RandomGenerator;
import com.rapidminer.tools.Tools;
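/**
 * Reads data sets in ARFF format from a {@link StreamTokenizer}: the attribute declarations are
 * parsed first, then the data section is materialized into a {@link MemoryExampleTable}. The
 * sample size, sample ratio, data management, random seed and decimal point character are read
 * as parameters from the owning {@link ArffExampleSource} operator.
 *
 * <p>A minimal usage sketch (the tokenizer is assumed to be positioned at the start of an ARFF
 * stream; the parameter key names below are placeholders, not the operator's actual keys):</p>
 *
 * <pre>
 * ArffReader reader = new ArffReader(tokenizer, arffSource,
 *         "sample_size", "sample_ratio", "datamanagement",
 *         "local_random_seed", "decimal_point_character");
 * ExampleTable table = reader.read();
 * </pre>
 */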
public class ArffReader {
protected StreamTokenizer tokenizer;
protected ArffExampleSource arffES;
protected final String PARAMETER_SAMPLE_SIZE;
protected final String PARAMETER_SAMPLE_RATIO;
protected final String PARAMETER_DATAMANAGEMENT;
protected final String PARAMETER_LOCAL_RANDOM_SEED;
protected final String PARAMETER_DECIMAL_POINT_CHARACTER;
public ArffReader( StreamTokenizer tokenizer,
ArffExampleSource arffES,
String parameter_sample_size,
String parameter_sample_ratio,
String parameter_datamanagement,
String parameter_local_random_seed,
String parameter_decimal_point_character) {
this.tokenizer = tokenizer;
this.arffES = arffES;
PARAMETER_SAMPLE_SIZE = parameter_sample_size;
PARAMETER_SAMPLE_RATIO = parameter_sample_ratio;
PARAMETER_DATAMANAGEMENT = parameter_datamanagement;
PARAMETER_LOCAL_RANDOM_SEED = parameter_local_random_seed;
PARAMETER_DECIMAL_POINT_CHARACTER = parameter_decimal_point_character;
}
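/**
 * Reads the complete ARFF stream: first the attribute declarations (relational attributes are
 * not allowed here), then the data section.
 *
 * @return the example table containing all rows that were kept
 */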
public ExampleTable read() throws IOException, UndefinedParameterError{
List<Attribute> attributes = readAttributes(false);
return readData(attributes);
}
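/**
 * Reads consecutive {@code @attribute} declarations. Parsing stops at the first keyword that is
 * not {@code @attribute} (usually {@code @data}) and fails if the file ends before any
 * declaration is found.
 */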
public List<Attribute> readAttributes(boolean relationalAttributesAllowed) throws IOException {
List<Attribute> attributes = new ArrayList<Attribute>();
// attributes
Tools.getFirstToken(tokenizer);
if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
throw new IOException("unexpected end of file in line " + tokenizer.lineno() + ", attribute description expected...");
}
while ("@attribute".equalsIgnoreCase(tokenizer.sval)) {
Attribute attribute = createAttribute(tokenizer, relationalAttributesAllowed);
attributes.add(attribute);
}
return attributes;
}
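/**
 * Expects the {@code @data} keyword at the current token and fills a {@link MemoryExampleTable}
 * with the following data rows. If the sample size parameter is not -1, at most that many rows
 * are kept; otherwise every row is kept with the probability given by the sample ratio parameter.
 */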
public MemoryExampleTable readData(List<Attribute> attributes) throws IOException, UndefinedParameterError{
// expect data declaration
if (!"@data".equalsIgnoreCase(tokenizer.sval)) {
throw new IOException("expected keyword '@data' in line " + tokenizer.lineno());
}
// check attribute number
if (attributes.size() == 0) {
throw new IOException("no attributes were declared in the ARFF file, please declare attributes with the '@attribute' keyword.");
}
// fill data table
MemoryExampleTable table = new MemoryExampleTable(attributes);
Attribute[] attributeArray = table.getAttributes();
DataRowFactory factory = new DataRowFactory(arffES.getParameterAsInt(PARAMETER_DATAMANAGEMENT), arffES.getParameterAsString(PARAMETER_DECIMAL_POINT_CHARACTER).charAt(0));
int maxRows = arffES.getParameterAsInt(PARAMETER_SAMPLE_SIZE);
double sampleProb = arffES.getParameterAsDouble(PARAMETER_SAMPLE_RATIO);
Random random = RandomGenerator.getRandomGenerator(arffES.getParameterAsInt(PARAMETER_LOCAL_RANDOM_SEED));
DataRow dataRow = null;
int counter = 0;
while ((dataRow = createDataRow(tokenizer, true, factory, attributeArray)) != null) {
if ((maxRows > -1) && (counter >= maxRows))
break;
counter++;
if (maxRows == -1) {
if (random.nextDouble() > sampleProb)
continue;
}
table.addDataRow(dataRow);
}
return table;
}
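/**
 * Parses a single {@code @attribute} declaration. The value types {@code real}, {@code integer},
 * {@code numeric}, {@code string}, {@code date} and {@code file} (mapped to string) are created
 * directly, {@code relational} is delegated to {@link #readRelationalAttribute}, and a value list
 * in braces is parsed as a nominal attribute. Illustrative declarations (not taken from a
 * concrete file):
 *
 * <pre>
 * &#64;attribute temperature real
 * &#64;attribute outlook {sunny, overcast, rainy}
 * </pre>
 */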
protected Attribute createAttribute(StreamTokenizer tokenizer,boolean relAttAllowed) throws IOException {
Attribute attribute = null;
// name
Tools.getNextToken(tokenizer);
String attributeName = tokenizer.sval;
// determine value type
Tools.getNextToken(tokenizer);
if (tokenizer.ttype == StreamTokenizer.TT_WORD) {
// numerical or string value type
if (tokenizer.sval.equalsIgnoreCase("real")) {
attribute = AttributeFactory.createAttribute(attributeName, Ontology.REAL);
} else if (tokenizer.sval.equalsIgnoreCase("integer")) {
attribute = AttributeFactory.createAttribute(attributeName, Ontology.INTEGER);
} else if (tokenizer.sval.equalsIgnoreCase("numeric")) {
attribute = AttributeFactory.createAttribute(attributeName, Ontology.NUMERICAL);
} else if (tokenizer.sval.equalsIgnoreCase("string")) {
attribute = AttributeFactory.createAttribute(attributeName, Ontology.STRING);
} else if (tokenizer.sval.equalsIgnoreCase("date")) {
attribute = AttributeFactory.createAttribute(attributeName, Ontology.DATE);
} else if (tokenizer.sval.equalsIgnoreCase("file")) {
attribute = AttributeFactory.createAttribute(attributeName, Ontology.STRING);
} else if (tokenizer.sval.equalsIgnoreCase("relational")) {
attribute = readRelationalAttribute(tokenizer, attributeName,relAttAllowed);
}
Tools.waitForEOL(tokenizer);
} else {
// nominal attribute
attribute = AttributeFactory.createAttribute(attributeName, Ontology.NOMINAL);
tokenizer.pushBack();
// check if nominal value definition starts
if (tokenizer.nextToken() != '{') {
throw new IOException("{ expected at beginning of nominal values definition in line " + tokenizer.lineno());
}
// read all nominal values until the end of the definition
while (tokenizer.nextToken() != '}') {
if (tokenizer.ttype == StreamTokenizer.TT_EOL) {
throw new IOException("} expected at end of the nominal values definition in line " + tokenizer.lineno());
} else {
attribute.getMapping().mapString(tokenizer.sval);
}
}
if (attribute.getMapping().size() == 0) {
throw new IOException("empty definition of nominal values is not suggested in line " + tokenizer.lineno());
}
}
Tools.getLastToken(tokenizer, false);
Tools.getFirstToken(tokenizer);
if (tokenizer.ttype == StreamTokenizer.TT_EOF)
throw new IOException("unexpected end of file before data section in line " + tokenizer.lineno());
return attribute;
}
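/**
 * Hook for relational attribute declarations. The simple ARFF reader does not support them and
 * always throws an {@link IOException}; subclasses may override this method to add support.
 */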
protected Attribute readRelationalAttribute(StreamTokenizer tokenizer, String attributeName, boolean depAttribute) throws IOException{
throw new IOException("arff file contains attribute type "+ tokenizer.sval + ", which are not supported by Rapidminer in simple arff format");
}
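/**
 * Hook for the value types of the inner attributes of a relational declaration. Not supported by
 * the simple ARFF reader; subclasses may override this method.
 */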
protected Attribute checkInnerAttributeTypes(StreamTokenizer tokenizer, String attributeName) throws IOException{
throw new IOException("arff file contains attribute type "+ tokenizer.sval +", which are not supported by Rapidminer in simple arff format");
}
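/**
 * Reads the next data row, or returns {@code null} at the end of the file or when
 * {@link #AnnotationFound()} signals an annotation. A row starting with '{' is parsed in sparse
 * format, every other row in dense format.
 */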
protected DataRow createDataRow(StreamTokenizer tokenizer, boolean checkForCarriageReturn, DataRowFactory factory, Attribute[] allAttributes) throws IOException {
// return null at the end of file
Tools.getFirstToken(tokenizer);
if(!AnnotationFound()){
if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
return null;
}
// create datarow from either dense or sparse format
if (tokenizer.ttype == '{') {
return createDataRowFromSparse(tokenizer, checkForCarriageReturn, factory, allAttributes);
} else {
return createDataRowFromDense(tokenizer, checkForCarriageReturn, factory, allAttributes);
}
}
else return null;
}
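/**
 * Hook that lets subclasses detect annotation rows after the first token of a row has been read;
 * the default implementation always returns {@code false}.
 */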
protected boolean AnnotationFound(){
return false;
}
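/**
 * Parses a dense data row containing one value per declared attribute, with '?' marking a
 * missing value. An illustrative row for three declared attributes:
 *
 * <pre>
 * sunny, 85, ?
 * </pre>
 */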
private DataRow createDataRowFromDense(StreamTokenizer tokenizer, boolean checkForCarriageReturn, DataRowFactory factory, Attribute[] allAttributes) throws IOException {
String[] tokens = new String[allAttributes.length];
// fetch all values
for (int i = 0; i < allAttributes.length; i++) {
if (i > 0) {
Tools.getNextToken(tokenizer);
}
// check for missing value
if (tokenizer.ttype == '?') {
tokens[i] = "?";
} else {
if (tokenizer.ttype != StreamTokenizer.TT_WORD) {
throw new IOException("not a valid value '" + tokenizer.sval + "' in line " + tokenizer.lineno());
}
tokens[i] = tokenizer.sval;
}
}
if (checkForCarriageReturn) {
findDataDefinitionEnd();
}
// Add instance to dataset
return factory.create(tokens, allAttributes);
}
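/**
 * Parses the remainder of a sparse data row (the opening '{' has already been consumed):
 * index/value pairs up to the closing '}'. Attributes that are not listed default to "0" and
 * '?' marks a missing value. An illustrative row setting attributes 0 and 3 out of five
 * declared attributes:
 *
 * <pre>
 * {0 sunny, 3 85}
 * </pre>
 */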
private DataRow createDataRowFromSparse(StreamTokenizer tokenizer, boolean checkForCarriageReturn, DataRowFactory factory, Attribute[] allAttributes) throws IOException {
String[] tokens = new String[allAttributes.length];
for (int t = 0; t < tokens.length; t++)
tokens[t] = "0";
// Get values
do {
if (tokenizer.nextToken() == StreamTokenizer.TT_EOL) {
throw new IOException("unexpedted end of line " + tokenizer.lineno());
}
if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
throw new IOException("unexpedted end of file in line " + tokenizer.lineno());
}
if (tokenizer.ttype == '}') {
break;
}
// determine index
int index = Integer.parseInt(tokenizer.sval);
// determine value
Tools.getNextToken(tokenizer);
// Check if value is missing.
if (tokenizer.ttype == '?') {
tokens[index] = "?";
} else {
if (tokenizer.ttype != StreamTokenizer.TT_WORD) {
throw new IOException("not a valid value '" + tokenizer.sval + "' in line " + tokenizer.lineno());
}
tokens[index] = tokenizer.sval;
}
} while (true);
if (checkForCarriageReturn) {
findDataDefinitionEnd();
}
// Add instance to dataset
return factory.create(tokens, allAttributes);
}
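/**
 * Consumes the token that terminates the current data row and checks that it is an end of line;
 * reaching the end of the file is accepted as well.
 */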
protected void findDataDefinitionEnd() throws IOException {
Tools.getLastToken(tokenizer, true);
}
}