/*
* Apache License
* Version 2.0, January 2004
* http://www.apache.org/licenses/
*
* Copyright 2013 Aurelian Tutuianu
* Copyright 2014 Aurelian Tutuianu
* Copyright 2015 Aurelian Tutuianu
* Copyright 2016 Aurelian Tutuianu
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package rapaio.io;
import rapaio.data.*;
import java.io.*;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* Class for loading ARFF files. ARFF is a human readable file format used by
* Weka.
* <a href="http://www.cs.waikato.ac.nz/ml/weka/arff.html">About Weka</a>
*
* @author Aurelian Tutuianu
*/
@Deprecated
public class ArffPersistence {
public final Frame read(String fileName) throws IOException {
return read(new File(fileName));
}
/**
* Uses the given file path to load a data set from an ARFF file.
*
* @param file the path to the ARFF file to load
* @return the data set from the ARFF file, or null if the file could not be
* loaded.
* @throws java.io.IOException
*/
public final Frame read(File file) throws IOException {
return read(new FileInputStream(file));
}
public final Frame read(InputStream stream) throws IOException {
try (BufferedReader br = new BufferedReader(new InputStreamReader(stream))) {
String line;
ArrayList<Var> vars = new ArrayList<>();
ArrayList<String> names = new ArrayList<>();
HashMap<String, List<String>> nomValueMap = new HashMap<>();
ArrayList<String> data = new ArrayList<>();
boolean ondata = false;
while ((line = br.readLine()) != null) {
line = line.trim();
if (line.startsWith("%") || line.trim().isEmpty()) {
continue;
}
if (line.startsWith("@") && !ondata) {
if (line.toLowerCase().startsWith("@relation")) {
continue;
}
if (line.toLowerCase().startsWith("@data")) {
// process column definitions
ondata = true;
continue;
}
if (line.toLowerCase().startsWith("@attribute")) {
line = line.substring("@attribute".length()).trim();//Remove the space, it could be multiple spaces
String variableName;
line = line.replace("\t", " ");
if (line.startsWith("'")) {
Pattern p = Pattern.compile("'.+?'");
Matcher m = p.matcher(line);
m.find();
variableName = fullTrim(m.group());
line = line.replaceFirst("'.+?'", "placeHolder");
} else {
variableName = fullTrim(line.trim().replaceAll("\\s+.*", ""));
}
names.add(variableName);
String[] tmp = line.split("\\s+", 2);
if (tmp[1].trim().equalsIgnoreCase("real") || tmp[1].trim().equals("isNumeric") || tmp[1].trim().startsWith("integer")) {
vars.add(Numeric.empty());
} else//Not correct, but we aren't supporting anything other than real and categorical right now
{
String cats = tmp[1].replace("{", "").replace("}", "").trim();
if (cats.endsWith(",")) {
cats = cats.substring(0, cats.length() - 1);
}
String[] catValsRaw = cats.split(",");
List<String> tempMap = new ArrayList<>();
for (String catVal : catValsRaw) {
tempMap.add(fullTrim(catVal));
}
nomValueMap.put(variableName, tempMap);
vars.add(Nominal.empty(0, tempMap));
}
continue;
}
}
data.add(line.trim());
}
List<Var> newvectors = new ArrayList<>();
for (int i = 0; i < vars.size(); i++) {
if (vars.get(i) instanceof Numeric) {
newvectors.add(Numeric.empty(data.size()));
}
if (vars.get(i) instanceof Nominal) {
newvectors.add(Nominal.empty(data.size(), nomValueMap.get(names.get(i))));
}
}
for (int i = 0; i < newvectors.size(); i++) {
newvectors.get(i).withName(names.get(i));
}
Frame df = SolidFrame.byVars(data.size(), newvectors);
// process data
for (int i = 0; i < data.size(); i++) {
String[] tmp = data.get(i).split(",");
for (int j = 0; j < tmp.length; j++) {
if ("?".equals(tmp[j])) {
continue;
}
if (df.var(j).type().isNumeric()) {
df.var(j).setValue(i, Double.parseDouble(tmp[j]));
}
if (df.var(j).type().isNominal()) {
df.var(j).setLabel(i, fullTrim(tmp[j]));
}
}
}
return df;
}
}
/**
* Removes the quotes at the end and front of a string if there are any, as
* well as spaces at the front and end
*
* @param in
* @return
*/
private String fullTrim(String in) {
in = in.trim();
if (in.startsWith("'") || in.startsWith("\"")) {
in = in.substring(1);
}
if (in.endsWith("'") || in.startsWith("\"")) {
in = in.substring(0, in.length() - 1);
}
return in.trim();
}
}