package shared.reader;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;
import shared.DataSet;
import shared.DataSetDescription;
import shared.Instance;
/**
* Class to read in data from an ARFF file
* @author Jarvis Johnson <https://github.com/Magicjarvis>
* @author Alex Linton <https://github.com/lexlinton>
* @date 2013-03-05
*/
public class ArffDataSetReader extends DataSetReader {
    /** Tag that ends the header and starts the data section. */
    private static final String DATA_TAG = "@data";
    /** Tag that introduces an attribute declaration in the header. */
    private static final String ATTRIBUTE_TAG = "@attribute";
    /** An attribute line is split into at most: tag, name, type/value list. */
    private static final int SPLIT_LIMIT = 3;
    /** Separates the tokens of a header line. */
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");
    /** Characters removed from an attribute's value list before it is split on commas. */
    private static final Pattern VALUE_LIST_NOISE = Pattern.compile(" |\\{|\\}|'");
    /** Separates the values of a data row (commas and/or whitespace). */
    private static final Pattern VALUE_SEPARATOR = Pattern.compile("[\\s,]+");

    /**
     * Creates a reader for the given ARFF file.
     * @param file the file to read; passed unchanged to the superclass
     */
    public ArffDataSetReader(String file) {
        super(file);
    }

    /**
     * Reads the whole file: first the header (attribute declarations), then
     * every data row, and wraps the rows in a data set whose description is
     * a {@link DataSetDescription} constructed from that set.
     * @return the data set holding one instance per data row
     * @throws Exception if the file cannot be opened or read, or if the
     *         header contains a malformed attribute declaration
     */
    @Override
    public DataSet read() throws Exception {
        BufferedReader in = new BufferedReader(new FileReader(file));
        try {
            List<Map<String, Double>> attributes = processAttributes(in);
            Instance[] instances = processInstances(in, attributes);
            DataSet set = new DataSet(instances);
            set.setDescription(new DataSetDescription(set));
            return set;
        } finally {
            // Always release the file handle, even when parsing fails.
            in.close();
        }
    }

    /**
     * Parses the header, consuming lines up to and including the
     * {@code @data} line (or end of input if there is none).
     * <p>
     * Each {@code @attribute} line yields one map, in declaration order,
     * from value name to its zero-based index within the declaration. For a
     * non-nominal declaration the type text itself becomes the only key;
     * numeric data never consults the map because it parses as a number
     * first (see {@link #parseValue}).
     * <p>
     * Blank lines, {@code %} comment lines and other header lines are
     * skipped. Attribute names containing whitespace are not supported,
     * because the line is tokenized on whitespace.
     * @param in buffer positioned at the start of the file
     * @return one value-to-index map per declared attribute
     * @throws IOException if reading fails or an attribute declaration has
     *         no type/value list
     */
    private List<Map<String, Double>> processAttributes(BufferedReader in)
            throws IOException {
        List<Map<String, Double>> attributes
                = new ArrayList<Map<String, Double>>();
        String line;
        while ((line = in.readLine()) != null) {
            // Trim first so indented comments/tags are recognised too.
            String trimmed = line.trim();
            if (trimmed.isEmpty() || trimmed.charAt(0) == '%') {
                continue;
            }
            // The tag must start the line; "@data" inside some other line
            // does not end the header.
            if (startsWithIgnoreCase(trimmed, DATA_TAG)) {
                break;
            }
            String[] parts = WHITESPACE.split(trimmed, SPLIT_LIMIT);
            if (!parts[0].equalsIgnoreCase(ATTRIBUTE_TAG)) {
                continue;
            }
            if (parts.length < SPLIT_LIMIT) {
                throw new IOException(
                        "Malformed attribute declaration: " + trimmed);
            }
            String[] values
                    = VALUE_LIST_NOISE.matcher(parts[2]).replaceAll("").split(",");
            double id = 0.0;
            Map<String, Double> valMap = new HashMap<String, Double>();
            for (String value : values) {
                valMap.put(value.trim(), id++);
            }
            attributes.add(valMap);
        }
        return attributes;
    }

    /**
     * Parses every remaining line of the buffer as a data row. Blank lines
     * and {@code %} comment lines are skipped; every other line becomes one
     * instance with one double per separated token.
     * @param in buffer positioned just after the {@code @data} line
     * @param valueMaps per-column nominal value maps from the header
     * @return the parsed instances, in file order
     * @throws IOException if reading fails
     */
    private Instance[] processInstances(BufferedReader in,
            List<Map<String, Double>> valueMaps) throws IOException {
        List<Instance> instances = new ArrayList<Instance>();
        String line;
        while ((line = in.readLine()) != null) {
            String trimmed = line.trim();
            // Whitespace-only lines must not turn into one-value instances.
            if (trimmed.isEmpty() || trimmed.charAt(0) == '%') {
                continue;
            }
            String[] tokens = VALUE_SEPARATOR.split(trimmed);
            double[] row = new double[tokens.length];
            for (int col = 0; col < tokens.length; col++) {
                // A row may have more columns than declared attributes;
                // such columns simply have no nominal map.
                Map<String, Double> nominalIds
                        = col < valueMaps.size() ? valueMaps.get(col) : null;
                row[col] = parseValue(tokens[col], nominalIds);
            }
            instances.add(new Instance(row));
        }
        return instances.toArray(new Instance[instances.size()]);
    }

    /**
     * Converts one data token to a double. Numeric text is parsed directly;
     * anything else is looked up in the column's nominal map, first as
     * written and then with apostrophes removed (the header strips them
     * from declared values). Unknown values default to 0.
     * @param token the raw token from the data row
     * @param nominalIds the column's value-to-index map, or null if the
     *        column has no declared attribute
     * @return the numeric value for the token
     */
    private static double parseValue(String token,
            Map<String, Double> nominalIds) {
        try {
            return Double.parseDouble(token);
        } catch (NumberFormatException notNumeric) {
            if (nominalIds == null) {
                return 0;
            }
            Double id = nominalIds.get(token);
            if (id == null) {
                id = nominalIds.get(token.replace("'", ""));
            }
            return id != null ? id.doubleValue() : 0;
        }
    }

    /**
     * Locale-independent, case-insensitive prefix test.
     * @param text the text to inspect
     * @param prefix the prefix to look for
     * @return true if text begins with prefix, ignoring case
     */
    private static boolean startsWithIgnoreCase(String text, String prefix) {
        return text.regionMatches(true, 0, prefix, 0, prefix.length());
    }
}