/*
* Copyright 2004-2010 Information & Software Engineering Group (188/1)
* Institute of Software Technology and Interactive Systems
* Vienna University of Technology, Austria
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.ifs.tuwien.ac.at/dm/somtoolbox/license.html
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package at.tuwien.ifs.somtoolbox.data;
import java.io.BufferedReader;
import java.util.logging.Logger;
import cern.colt.matrix.DoubleMatrix1D;
import at.tuwien.ifs.somtoolbox.util.FileUtils;
/**
* A reader for the Marsyas 0.2 ARFF format, which has the label name as a comment in front of the vector file name.
*
* @author Rudolf Mayer
* @version $Id: MarsyasARFFInputData.java 3589 2010-05-21 10:42:01Z mayer $
*/
public class MarsyasARFFInputData extends ARFFFormatInputData {
public MarsyasARFFInputData(String arffFileName) {
super(arffFileName);
}
@Override
protected void readVectorFile(String arffFileName, boolean sparse) {
super.readVectorFile(arffFileName, sparse);
// after reading the marsyas file, we parse the instance names
SOMLibClassInformation newClassInfo = new SOMLibClassInformation();
try {
final BufferedReader reader = FileUtils.openFile("ARFF Input Vector File", arffFileName);
String line;
do {
line = reader.readLine();
} while (line != null && !line.trim().equals("@data"));
// now we expect blocks of two lines, first a comment with the instance name, then the actual data
int index = 0;
while ((line = reader.readLine()) != null) {
String instanceName = line.replaceFirst("%", "").trim(); // replace the first (and only first!) %
// write new class
newClassInfo.addItem(instanceName, classInfo.getClassName(dataNames[index]));
dataNames[index] = instanceName;
// do some checking if the data equals the data vector read before..
String dataString = reader.readLine();
final String[] splits = dataString.split(",");
if (splits.length - 1 != dim()) {
Logger.getLogger("at.tuwien.ifs.somtoolbox").severe(
"Mismatch in WEKA ARFF reader and manual reader, for line '" + instanceName
+ "', expected dimensionality " + dim() + " found " + (splits.length - 1) + ".");
System.exit(-1);
}
final DoubleMatrix1D vector = data.viewRow(index);
for (int i = 0; i < splits.length - 1; i++) {
splits[i] = splits[i].trim();
if (Double.parseDouble(splits[i]) != vector.get(i)) {
Logger.getLogger("at.tuwien.ifs.somtoolbox").severe(
"Mismatch in WEKA ARFF reader and manual reader, for line '" + instanceName
+ "', element " + i + ", expected " + vector.get(i) + ", found " + splits[i]
+ ".");
System.exit(-1);
}
}
vector.toArray();
index++;
}
// check if we read all the instances
if (index != dataNames.length) {
Logger.getLogger("at.tuwien.ifs.somtoolbox").severe(
"Read " + index + " new instance names, expected " + dataNames.length);
}
newClassInfo.processItems(false);
this.classInfo = newClassInfo;
} catch (Exception e) {
Logger.getLogger("at.tuwien.ifs.somtoolbox").severe(ERROR_MESSAGE_FILE_FORMAT_CORRUPT);
e.printStackTrace();
}
}
public static String getFormatName() {
return "Marsyas0.2ARFF";
}
public static void main(String[] args) {
new MarsyasARFFInputData("/tmp/collection_600_allmp3s_manual_complete_absPaths.arff");
}
@Override
protected String getClassAttributeName() {
return "output"; // well, this is the name that Marsyas uses..
}
}