/*
* Copyright 2004-2010 Information & Software Engineering Group (188/1)
* Institute of Software Technology and Interactive Systems
* Vienna University of Technology, Austria
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.ifs.tuwien.ac.at/dm/somtoolbox/license.html
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package at.tuwien.ifs.somtoolbox.data;
import java.io.BufferedReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
/**
 * Reads input data from a <a href="http://www.cs.waikato.ac.nz/~ml/weka/arff.html">WEKA ARFF File Format</a>.<br>
 * <p>
 * Attributes are mapped by their declared column position:
 * <ul>
 * <li>{@code numeric}, {@code integer} and {@code real} attributes become vector components,</li>
 * <li>a {@code string} attribute is used as the instance label (the last one declared wins),</li>
 * <li>a nominal attribute (<code>{a, b, ...}</code>) is used as the class (the last one declared wins),</li>
 * <li>attributes of any other type are skipped.</li>
 * </ul>
 * Keywords are matched case-insensitively; blank lines and {@code %} comment lines are ignored. The sparse ARFF data
 * notation (<code>{index value, ...}</code>) is not supported.
 *
 * @author Rudolf Mayer
 * @version $Id: ARFFFormatInputData.java 3587 2010-05-21 10:35:33Z mayer $
 */
public class ARFFFormatInputData extends SOMLibSparseInputData {

    /** Header keyword introducing the relation name. */
    public static final String RELATION = "@RELATION";

    /** Header keyword introducing an attribute declaration. */
    public static final String ATTRIBUTE = "@ATTRIBUTE";

    /** Keyword marking the start of the data section. */
    public static final String DATA = "@DATA";

    /** Attribute type keyword, treated as numeric. */
    public static final String INTEGER_TYPE = "integer";

    /** Attribute type keyword for numeric attributes. */
    public static final String NUMERIC_TYPE = "NUMERIC";

    /** Attribute type keyword, treated as numeric. */
    public static final String REAL_TYPE = "real";

    /** Attribute type keyword for string attributes; such an attribute is used as instance label. */
    public static final String STRING_TYPE = "string";

    /** Marker for "currently not inside a quoted value". */
    private static final char NULLCHAR = '\u0000';

    /** Lines starting with this character are comments. */
    private static final char COMMENT_CHAR = '%';

    // NOTE(review): readVectorFile is presumably invoked from the superclass constructor, i.e. before instance field
    // initialisers of this class would run - keep all state used by it static or local.
    private static final Logger LOGGER = Logger.getLogger("at.tuwien.ifs.somtoolbox");

    /**
     * Creates the input data from the given file, using the default values for all other settings.
     *
     * @param arffFileName name of the ARFF file to read
     */
    public ARFFFormatInputData(String arffFileName) {
        this(arffFileName, DEFAULT_SPARSE, DEFAULT_NORMALISED, DEFAULT_NUM_CACHE_BLOCKS, DEFAULT_RANDOM_SEED);
    }

    /**
     * Creates the input data from the given file; all arguments are passed on unchanged to the superclass constructor.
     *
     * @param arffFileName name of the ARFF file to read
     * @param sparse passed on to the superclass
     * @param norm passed on to the superclass
     * @param numCacheBlocks passed on to the superclass
     * @param seed passed on to the superclass
     */
    public ARFFFormatInputData(String arffFileName, boolean sparse, boolean norm, int numCacheBlocks, long seed) {
        super(arffFileName, sparse, norm, numCacheBlocks, seed);
    }

    /**
     * Parses the ARFF file: reads the attribute declarations, collects all data lines (their number is not known in
     * advance), initialises the data structures and finally fills in the values, labels and class assignments.
     *
     * @param arffFileName name of the ARFF file to read
     * @param sparse passed on to {@code initDataStructures}
     * @throws IllegalArgumentException if the file cannot be read or parsed; the original exception is attached as
     *             cause
     */
    @Override
    protected void readVectorFile(String arffFileName, boolean sparse) {
        classInfo = new SOMLibClassInformation();
        BufferedReader br = openFile(arffFileName);
        try {
            ArrayList<String> components = new ArrayList<String>();
            // column position (among ALL declared attributes) of each numeric component
            List<Integer> numericColumns = new ArrayList<Integer>();
            int labelColumn = -1;
            int classColumn = -1;
            int attributeCount = 0;

            // header: handle every attribute declaration up to the data marker; anything else (relation name,
            // comments, blank lines) is skipped
            String line = br.readLine();
            while (line != null && !startsWithIgnoreCase(line.trim(), DATA)) {
                String headerLine = line.trim();
                if (startsWithIgnoreCase(headerLine, ATTRIBUTE)) {
                    String[] declaration = splitAttributeDeclaration(headerLine.substring(ATTRIBUTE.length()));
                    String name = declaration[0];
                    String type = declaration[1];
                    if (INTEGER_TYPE.equalsIgnoreCase(type) || NUMERIC_TYPE.equalsIgnoreCase(type)
                            || REAL_TYPE.equalsIgnoreCase(type)) {
                        // numerical attribute => treat as vector element
                        components.add(name);
                        numericColumns.add(Integer.valueOf(attributeCount));
                    } else if (STRING_TYPE.equalsIgnoreCase(type)) {
                        // string attribute => treat as instance/label name
                        LOGGER.info("Assuming String-type attribute '" + name + "' as instance name.");
                        labelColumn = attributeCount;
                    } else if (type.contains("{") && type.contains("}")) {
                        // categorical feature => treat as class index
                        LOGGER.info("Assuming categorical attribute '" + name + "' as class index.");
                        classColumn = attributeCount;
                    } else {
                        LOGGER.warning("Ignoring attribute '" + name + "' of unsupported type '" + type + "'.");
                    }
                    // every declaration occupies a data column, whether we use it or not
                    attributeCount++;
                }
                line = br.readLine();
            }

            // data: 'line' now holds the data marker (or null); we don't know the number of vectors in advance, thus
            // we first collect all non-empty, non-comment lines in a list
            List<String> lines = new ArrayList<String>();
            while ((line = br.readLine()) != null) {
                String dataLine = line.trim();
                if (dataLine.length() > 0 && dataLine.charAt(0) != COMMENT_CHAR) {
                    lines.add(dataLine);
                }
            }

            // now we can compute the number of vectors, and initialise our data structures accordingly.
            numVectors = lines.size();
            dim = components.size();
            templateVector = new SOMLibTemplateVector(numVectors, components.toArray(new String[components.size()]), 2);
            initDataStructures(sparse);

            int index = 0;
            for (String lineData : lines) {
                List<String> values = splitDataLine(lineData);
                if (values.size() < attributeCount) {
                    throw new IllegalArgumentException("Data line " + (index + 1) + " contains " + values.size()
                            + " values, but " + attributeCount + " attributes are declared: " + lineData);
                }
                // without a string attribute, instances are labelled by their 1-based position
                String label = labelColumn >= 0 ? values.get(labelColumn) : String.valueOf(index + 1);
                for (int ve = 0; ve < dim; ve++) {
                    int column = numericColumns.get(ve).intValue();
                    setMatrixValue(index, ve, parseDouble(values.get(column)));
                }
                addInstance(index, label);
                if (classColumn >= 0) {
                    // if we have a class info
                    classInfo.addItem(label, values.get(classColumn));
                }
                index++;
            }
        } catch (Exception e) {
            LOGGER.log(Level.SEVERE, ERROR_MESSAGE_FILE_FORMAT_CORRUPT, e);
            throw new IllegalArgumentException(e.getMessage(), e);
        } finally {
            closeQuietly(br);
        }
        classInfo.processItems(false);
        LOGGER.info("ARFF vector file seems to be correct. Riding on ...");
    }

    /** Checks, ignoring case and independent of the default locale, whether <code>text</code> starts with <code>prefix</code>. */
    private static boolean startsWithIgnoreCase(String text, String prefix) {
        return text.regionMatches(true, 0, prefix, 0, prefix.length());
    }

    /**
     * Splits the part of an attribute declaration following the {@link #ATTRIBUTE} keyword into name and type.
     * <p>
     * A name enclosed in single or double quotes ends at the matching quote. Otherwise, the type starts at the first
     * <code>{</code> (nominal specification, which may itself contain blanks), or, if there is none, after the last
     * whitespace character.
     *
     * @param declaration the text following the keyword
     * @return a two-element array holding the name (without enclosing quotes) and the trimmed type
     * @throws IllegalArgumentException if no name/type pair can be found
     */
    private static String[] splitAttributeDeclaration(String declaration) {
        String decl = declaration.trim();
        if (decl.length() == 0) {
            throw new IllegalArgumentException("Attribute declaration without name and type.");
        }
        String name;
        int typeStart;
        char first = decl.charAt(0);
        if (first == '"' || first == '\'') {
            int closing = decl.indexOf(first, 1);
            if (closing < 0) {
                throw new IllegalArgumentException("Unterminated quote in attribute declaration: " + decl);
            }
            name = decl.substring(1, closing);
            typeStart = closing + 1;
        } else {
            typeStart = decl.indexOf('{');
            if (typeStart < 0) {
                typeStart = decl.length() - 1;
                while (typeStart >= 0 && !Character.isWhitespace(decl.charAt(typeStart))) {
                    typeStart--;
                }
                if (typeStart < 0) {
                    throw new IllegalArgumentException("Attribute declaration without type: " + decl);
                }
            }
            name = decl.substring(0, typeStart).trim();
        }
        return new String[] { name, decl.substring(typeStart).trim() };
    }

    /**
     * Splits a data line at the commas that are not enclosed in single or double quotes.
     * <p>
     * The enclosing quotes are removed, the quoted content is kept as it is. Whitespace around a value is dropped.
     * The text after the last comma always yields a value, even if it is empty.
     *
     * @param lineData the (non-empty) data line
     * @return the values of the line, in column order
     */
    private static List<String> splitDataLine(String lineData) {
        List<String> values = new ArrayList<String>();
        StringBuilder buffer = new StringBuilder();
        char quotCharacter = NULLCHAR;
        boolean quoted = false; // whether the current value contained a quoted part
        for (int c = 0; c < lineData.length(); c++) {
            char ch = lineData.charAt(c);
            if (quotCharacter != NULLCHAR) {
                if (ch == quotCharacter) {
                    quotCharacter = NULLCHAR;
                } else {
                    buffer.append(ch);
                }
            } else if (ch == '"' || ch == '\'') {
                quotCharacter = ch;
                quoted = true;
            } else if (ch == ',') {
                values.add(finishValue(buffer, quoted));
                buffer.setLength(0);
                quoted = false;
            } else if (Character.isWhitespace(ch) && (quoted || buffer.length() == 0)) {
                // padding before an unquoted value, or around a quoted one => skip
                continue;
            } else {
                buffer.append(ch);
            }
        }
        values.add(finishValue(buffer, quoted));
        return values;
    }

    /** Returns the buffered value; trailing padding of an unquoted value is removed. */
    private static String finishValue(StringBuilder buffer, boolean quoted) {
        String value = buffer.toString();
        return quoted ? value : value.trim();
    }

    /** Closes the reader, if there is one; a failure to close is logged, but not propagated. */
    private static void closeQuietly(BufferedReader reader) {
        if (reader == null) {
            return;
        }
        try {
            reader.close();
        } catch (IOException e) {
            LOGGER.log(Level.WARNING, "Could not close the ARFF file reader.", e);
        }
    }

    /**
     * Returns the attribute name <code>"class"</code>. The value is not used within this class.
     *
     * @return the constant <code>"class"</code>
     */
    protected String getClassAttributeName() {
        return "class";
    }

    /**
     * @return the name of this format, <code>"ARFF"</code>
     */
    public static String getFormatName() {
        return "ARFF";
    }

    /**
     * @return the file name suffix of this format, <code>".arff"</code>
     */
    public static String getFileNameSuffix() {
        return ".arff";
    }
}