package de.tud.inf.operator.io;
import java.io.IOException;
import java.io.StreamTokenizer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import com.rapidminer.example.Attribute;
import com.rapidminer.example.table.AttributeFactory;
import com.rapidminer.example.table.DataRow;
import com.rapidminer.example.table.DataRowFactory;
import com.rapidminer.example.table.DataRowReader;
import com.rapidminer.example.table.ExampleTable;
import com.rapidminer.example.table.MemoryExampleTable;
import com.rapidminer.operator.io.ArffExampleSource;
import com.rapidminer.operator.io.ArffReader;
import com.rapidminer.parameter.UndefinedParameterError;
import com.rapidminer.tools.Ontology;
import com.rapidminer.tools.Tools;
import de.tud.inf.example.table.ComplexAttributeDescription;
import de.tud.inf.example.table.ComplexAttributeFactory;
import de.tud.inf.example.table.ComplexExampleTable;
import de.tud.inf.example.table.RelationalAttribute;
/**
 * ARFF reader for "complex" ARFF files, i.e. files that carry a dependency
 * section (attribute declarations plus data rows describing complex
 * attributes) in front of the regular relation section.
 *
 * <p>{@link #read()} parses both sections into separate example tables and
 * combines them into a {@link ComplexExampleTable}.</p>
 */
public class ComplexArffReader extends ArffReader {

    /**
     * Creates a reader; all arguments are passed unchanged to the
     * {@link ArffReader} constructor.
     */
    public ComplexArffReader(StreamTokenizer tokenizer,
            ArffExampleSource arffES, String parameter_sample_size,
            String parameter_sample_ratio, String parameter_datamanagement,
            String parameter_local_random_seed,
            String parameter_decimal_point_character) {
        super(tokenizer, arffES, parameter_sample_size, parameter_sample_ratio,
                parameter_datamanagement, parameter_local_random_seed,
                parameter_decimal_point_character);
    }

    /**
     * Builds two example tables, one with dependency information and one with
     * data information, and merges them to create a complex example table.
     *
     * @return the complex example table built from both sections
     * @throws IOException if the dependency section declares no attributes,
     *         declares an attribute whose name is not allowed there, or if any
     *         of the later parsing/validation steps fails
     * @throws UndefinedParameterError propagated from reading the data section
     */
    @Override
    public ComplexExampleTable read() throws IOException, UndefinedParameterError {
        Tools.getFirstToken(tokenizer);
        // skip the dependency annotation header if the file starts with it
        if (ComplexArffDescription.depAnnotation.equalsIgnoreCase(tokenizer.sval)) {
            Tools.getNextToken(tokenizer);
            Tools.getLastToken(tokenizer, false);
        }

        // extract dependency information from the following attributes
        List<Attribute> depAttributes = readAttributes(true);
        if (depAttributes.isEmpty()) {
            // the original code built this exception but never threw it
            throw new IOException("no attribute defintion for dependency relation found");
        }

        // only a fixed set of attribute names is allowed in the dependency
        // part; this catches misspelled attribute names early
        List<String> allowedNames = Arrays.asList(
                ComplexArffDescription.depAttName,
                ComplexArffDescription.depParamName,
                ComplexArffDescription.depClassName,
                ComplexArffDescription.depInnerAttributesName,
                ComplexArffDescription.depHintName);
        for (Attribute a : depAttributes) {
            if (!allowedNames.contains(a.getName())) {
                throw new IOException("attribute name '" + a.getName() + "' is not allowed in dependency section of complex arff file");
            }
        }

        // the @data part of the dependency section is terminated by a
        // @relation annotation, which is why it has its own read method
        ExampleTable depEt = readDependencyData(depAttributes);
        if ("@relation".equalsIgnoreCase(tokenizer.sval)) {
            Tools.getNextToken(tokenizer);
            Tools.getLastToken(tokenizer, false);
        }

        List<Attribute> attributes = readAttributes(false);

        // References from dependency rows to attribute names of the relation
        // section are validated in createValidDependencyList, which raises an
        // IOException for every name that is not part of the data set.
        return buildTable(attributes, depEt);
    }

    /**
     * Reads the data section of an ARFF file and combines it with the
     * dependency information.
     *
     * @param attributes attribute information of the attribute section in the
     *        ARFF file, which is already read
     * @param depEt dependency information example table
     * @return ComplexExampleTable which contains attributes + dependency information
     * @throws UndefinedParameterError propagated from {@code readData}
     * @throws IOException if reading fails, the dependency information is
     *         invalid, or a constraint check raises a {@link RuntimeException}
     *         (which is attached as the cause)
     */
    protected ComplexExampleTable buildTable(List<Attribute> attributes, ExampleTable depEt) throws UndefinedParameterError, IOException {
        // read the "real" data set
        ExampleTable et = readData(attributes);
        List<ComplexAttributeDescription> depList = createValidDependencyList(depEt, et);
        try {
            for (ComplexAttributeDescription desc : depList) {
                desc.checkConstraints(et);
            }
        } catch (RuntimeException e) {
            // keep the original failure as cause so its stack trace is not lost
            IOException wrapped = new IOException(e.getMessage());
            wrapped.initCause(e);
            throw wrapped;
        }
        return new ComplexExampleTable(et, depList);
    }

    /**
     * Reads the definition of a relational attribute including its inner
     * attributes.
     *
     * @param tokenizer tokenizer positioned at the relational attribute definition
     * @param attributeName name of the relational attribute
     * @param depAttribute is true if the relational attribute appears in the
     *        dependency information of the .arff file (not evaluated here)
     * @return relational attribute with wrapped inner attributes
     * @throws IOException on unexpected end of file, a malformed {@code @end}
     *         line, or if no inner attribute is declared
     */
    @Override
    protected Attribute readRelationalAttribute(StreamTokenizer tokenizer, String attributeName, boolean depAttribute) throws IOException {
        Tools.getFirstToken(tokenizer);
        if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
            throw new IOException("unexpected end of file in line " + tokenizer.lineno() + ", attribute description expected...");
        }
        ArrayList<Attribute> innerAttributes = createInnerAttributes(tokenizer, attributeName);
        if (innerAttributes == null || innerAttributes.isEmpty()) {
            throw new IOException("relational attributes should contain at least one inner attribute");
        }
        RelationalAttribute attribute = (RelationalAttribute) AttributeFactory.createAttribute(attributeName, Ontology.RELATIONAL);
        attribute.setInnerAttributes(innerAttributes);
        return attribute;
    }

    /**
     * Creates the list of inner attributes of a relational attribute. Inner
     * attributes are collected as long as the current token is
     * {@code @attribute}; afterwards {@code @end <attributeName>} is expected.
     *
     * @param tokenizer tokenizer positioned at the first inner attribute
     * @param attributeName name of the relational attribute
     * @return the inner attributes (possibly empty)
     * @throws IOException if the definition is not closed by
     *         {@code @end <attributeName>}
     */
    private ArrayList<Attribute> createInnerAttributes(StreamTokenizer tokenizer, String attributeName) throws IOException {
        ArrayList<Attribute> attributes = new ArrayList<Attribute>();
        // NOTE(review): relies on createAttribute advancing the tokenizer to
        // the next annotation - confirm against ArffReader
        while ("@attribute".equalsIgnoreCase(tokenizer.sval)) {
            attributes.add(createAttribute(tokenizer, false));
        }
        if (!"@end".equalsIgnoreCase(tokenizer.sval)) {
            throw new IOException("relational attribute end definition is false, should be '@end " + attributeName + "'");
        }
        tokenizer.nextToken();
        if (!attributeName.equalsIgnoreCase(tokenizer.sval)) {
            throw new IOException("relational attribute definition is false, should be '@end " + attributeName + "'");
        }
        return attributes;
    }

    /**
     * Reads the data rows of the dependency section into a memory example
     * table. Rows are read until {@code createDataRow} returns {@code null}.
     *
     * @param depAttributes attributes declared in the dependency section
     * @return example table holding the dependency rows
     * @throws IOException if the current token is not {@code @data} or no
     *         attributes were declared
     */
    private ExampleTable readDependencyData(List<Attribute> depAttributes) throws IOException {
        if (!"@data".equalsIgnoreCase(tokenizer.sval)) {
            throw new IOException("expected keyword '@data' in line " + tokenizer.lineno());
        }
        // check attribute number
        if (depAttributes.size() == 0) {
            throw new IOException("no attributes were declared in the ARFF file, please declare attributes with the '@attribute' keyword.");
        }
        // fill data table
        MemoryExampleTable table = new MemoryExampleTable(depAttributes);
        Attribute[] attributeArray = table.getAttributes();
        DataRowFactory factory = new DataRowFactory(DataRowFactory.TYPE_INT_ARRAY, '.');
        // NOTE(review): the 'true' flag presumably marks dependency rows so
        // that reading stops at the @relation annotation - confirm in ArffReader
        DataRow dataRow;
        while ((dataRow = createDataRow(tokenizer, true, factory, attributeArray)) != null) {
            table.addDataRow(dataRow);
        }
        return table;
    }

    /**
     * Creates the list of information about valid complex attributes from the
     * two example tables. Every data row of {@code depEt} yields one
     * description.
     *
     * @param depEt created from the description (dependency) section of the ARFF file
     * @param et created from the relation section of the ARFF file
     * @return one complex attribute description per dependency row
     * @throws IOException if a relational dependency attribute does not have
     *         exactly one nominal inner attribute, if a referenced attribute
     *         name does not exist in {@code et}, or if a row defines no
     *         correlating attributes
     */
    private List<ComplexAttributeDescription> createValidDependencyList(ExampleTable depEt, ExampleTable et) throws IOException {
        // map attribute name -> table index for all attributes of et
        Map<String, Integer> nameIndexMap = new HashMap<String, Integer>();
        for (int i = 0; i < et.getNumberOfAttributes(); i++) {
            Attribute candidate = et.getAttribute(i);
            if (candidate != null) {
                nameIndexMap.put(candidate.getName(), candidate.getTableIndex());
            }
        }

        List<ComplexAttributeDescription> etDependencies = new ArrayList<ComplexAttributeDescription>();
        DataRowReader reader = depEt.getDataRowReader();
        // 1-based number of the dependency row, used in error messages
        int count = 0;
        while (reader.hasNext()) {
            DataRow row = reader.next();
            count++;
            // class name, hint, attribute name, parameter table indexes and
            // attribute table indexes together form one description
            String symbol = null;
            String hint = null;
            String attName = null;
            int[] params = null;
            int[] attributes = null;

            for (Attribute a1 : depEt.getAttributes()) {
                String name = a1.getName();
                if (name.equals(ComplexArffDescription.depClassName) && a1.isNominal()) {
                    symbol = a1.getMapping().mapIndex((int) a1.getValue(row));
                } else if (name.equals(ComplexArffDescription.depHintName)) {
                    hint = a1.getMapping().mapIndex((int) a1.getValue(row));
                } else if (name.equals(ComplexArffDescription.depAttName)) {
                    attName = a1.getMapping().mapIndex((int) a1.getValue(row));
                } else if (a1.isRelational()
                        && (name.equals(ComplexArffDescription.depParamName)
                                || name.equals(ComplexArffDescription.depInnerAttributesName))) {
                    // The relational check must guard BOTH names. Without the
                    // parentheses '&&' bound tighter than '||', so a
                    // non-relational inner-attributes column reached the cast
                    // below and failed with a ClassCastException.
                    double[][] relValues = row.getRelativeValuesFor(a1.getTableIndex());
                    RelationalAttribute relational = (RelationalAttribute) a1;
                    if (relational.getInnerAttributeCount() != 1) {
                        throw new IOException("relational attribute '" + a1.getName() + "' must contain exactly one inner attribute");
                    }
                    // the first (and only) inner attribute has to be nominal;
                    // its values name the referenced attributes/parameters
                    Attribute innerA = relational.getInnerAttributes().get(0);
                    if (!innerA.isNominal()) {
                        throw new IOException("inner attribute " + innerA.getName() + " of attribute " + name + " must be nominal");
                    }
                    boolean hasValues = (relValues != null) && relValues.length > 0;
                    if (name.equals(ComplexArffDescription.depParamName)) {
                        // parameters are optional
                        if (hasValues) {
                            params = resolveTableIndexes(relValues, innerA, nameIndexMap, "parameter");
                        }
                    } else if (hasValues) {
                        attributes = resolveTableIndexes(relValues, innerA, nameIndexMap, "dependency");
                    } else {
                        throw new IOException("dependency data in row " + count + " must contain at least one inner attribute");
                    }
                }
            }

            // there has to be at least information about correlating attributes
            if (attributes == null) {
                throw new IOException("no correlating attributes defined for complex attribute " + attName);
            }
            etDependencies.add(ComplexAttributeFactory.createAttributeDescription(attributes, params, symbol, attName, hint));
        }
        return etDependencies;
    }

    /**
     * Translates the nominal values stored in a relational value matrix into
     * table indexes of the attributes they name.
     *
     * @param relValues relational values; only column 0 of every row is read
     * @param innerA nominal inner attribute whose mapping turns a value into a name
     * @param nameIndexMap attribute name -> table index of the data set
     * @param kind leading word of the error message ("parameter" or "dependency")
     * @return table indexes in row order
     * @throws IOException if a referenced name is missing in {@code nameIndexMap}
     */
    private static int[] resolveTableIndexes(double[][] relValues, Attribute innerA,
            Map<String, Integer> nameIndexMap, String kind) throws IOException {
        int[] indexes = new int[relValues.length];
        for (int i = 0; i < relValues.length; i++) {
            // the relational attribute has exactly ONE inner attribute
            int valueId = (int) relValues[i][0];
            String referencedName = innerA.getMapping().mapIndex(valueId);
            Integer tableIndex = nameIndexMap.get(referencedName);
            if (tableIndex == null) {
                throw new IOException(kind + " attribute " + referencedName + " does not exist in the dataset");
            }
            indexes[i] = tableIndex.intValue();
        }
        return indexes;
    }

    /**
     * Finishes the data definition line by calling
     * {@code Tools.getLastToken(tokenizer, false)}.
     */
    @Override
    protected void findDataDefinitionEnd() throws IOException {
        // maybe true is also working
        Tools.getLastToken(tokenizer, false);
    }

    /**
     * Tells whether the tokenizer's current string token is the
     * {@code @relation} annotation (case-insensitive).
     *
     * @return true if the current token equals "@relation"
     */
    protected boolean AnnotationFound() {
        return "@relation".equalsIgnoreCase(tokenizer.sval);
    }
}