/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.mahout.ga.watchmaker.cd;
import java.io.IOException;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.Scanner;
import com.google.common.base.Preconditions;
import com.google.common.base.Splitter;
import com.google.common.collect.Lists;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
/**
* Initializes a DataSet using a special format file.<br>
* The file contains for each attribute one of the following:<br>
* <ul>
* <li>{@code IGNORED}<br>
* if the attribute is ignored</li>
* <li>{@code LABEL, val1, val2, ...}<br>
* if the attribute is the label, and its possible values</li>
* <li>{@code CATEGORICAL, val1, val2, ...}<br>
* if the attribute is nominal, and its possible values</li>
* <li>{@code NUMERICAL, min, max}<br>
* if the attribute is numerical, and its min and max values</li>
* </ul>
*/
public final class FileInfoParser {

  /** First token of a line describing an attribute that must be skipped. */
  public static final String IGNORED_TOKEN = "IGNORED";
  /** First token of the line describing the label attribute. */
  public static final String LABEL_TOKEN = "LABEL";
  /** First token of a line describing a nominal (categorical) attribute. */
  public static final String NOMINAL_TOKEN = "CATEGORICAL";
  /** First token of a line describing a numerical attribute. */
  public static final String NUMERICAL_TOKEN = "NUMERICAL";

  /** Splits an info line on commas and trims whitespace around every token. */
  private static final Splitter COMMA = Splitter.on(',').trimResults();

  private FileInfoParser() { }

  /**
   * Initializes a dataset using an info file.
   * <p>
   * Each line of the info file describes one attribute; the 0-based line number is used as the
   * attribute index when recording ignored attributes and the label.
   *
   * @param fs
   *          file system
   * @param inpath
   *          input directory; the info file is located through {@link #getInfoFile(FileSystem, Path)}
   * @return Initialized Dataset
   * @throws IOException
   *           if the info file cannot be opened or read
   * @throws IllegalArgumentException
   *           if a line starts with an unknown token, a numerical bound is not a number,
   *           a numerical attribute has {@code min > max}, or the paths are invalid
   * @throws IllegalStateException
   *           if the info file contains no {@code LABEL} line
   */
  public static DataSet parseFile(FileSystem fs, Path inpath) throws IOException {
    Path info = getInfoFile(fs, inpath);

    FSDataInputStream input = fs.open(info);
    // Decode explicitly as UTF-8 so parsing does not depend on the JVM's default charset.
    Scanner reader = new Scanner(input, "UTF-8");

    List<Integer> ignored = Lists.newArrayList();
    List<Attribute> attributes = Lists.newArrayList();
    int labelIndex = -1;

    try {
      int index = 0;
      while (reader.hasNextLine()) {
        String line = reader.nextLine();
        Iterator<String> tokens = COMMA.split(line).iterator();
        // split() always yields at least one (possibly empty) token, so next() is safe here.
        String token = tokens.next();

        if (IGNORED_TOKEN.equals(token)) {
          ignored.add(index);
        } else if (LABEL_TOKEN.equals(token)) {
          labelIndex = index;
          attributes.add(parseNominal(tokens));
        } else if (NOMINAL_TOKEN.equals(token)) {
          attributes.add(parseNominal(tokens));
        } else if (NUMERICAL_TOKEN.equals(token)) {
          attributes.add(parseNumerical(tokens));
        } else {
          throw new IllegalArgumentException("Unknown token (" + token
              + ") encountered while parsing the info file");
        }

        // One line describes one attribute: advance so that ignored/label indices are distinct.
        index++;
      }

      // Scanner swallows I/O failures of the underlying stream and merely stops iterating;
      // surface them instead of returning a silently truncated dataset.
      IOException readError = reader.ioException();
      if (readError != null) {
        throw readError;
      }
    } finally {
      // Closing the scanner also closes the wrapped input stream, even when parsing fails.
      reader.close();
    }

    if (labelIndex == -1) {
      throw new IllegalStateException("Info file does not contain a LABEL");
    }

    return new DataSet(attributes, ignored, labelIndex);
  }

  /**
   * Prepares the path for the info file corresponding to the input path.
   * <p>
   * The info file is the sibling of {@code inpath} whose name is the directory name followed by
   * {@code .infos}.
   *
   * @param fs
   *          file system
   * @param inpath
   *          input path; must be an existing directory
   * @return path of the existing info file
   * @throws IOException
   *           if the file system cannot be queried
   * @throws IllegalArgumentException
   *           if {@code inpath} is null, missing or not a directory, or if the info file does not exist
   */
  public static Path getInfoFile(FileSystem fs, Path inpath) throws IOException {
    Preconditions.checkArgument(inpath != null && fs.exists(inpath) && fs.getFileStatus(inpath).isDir(),
        "Input path should be a directory", inpath);
    Path infoPath = new Path(inpath.getParent(), inpath.getName() + ".infos");
    Preconditions.checkArgument(fs.exists(infoPath), "Info file does not exist", infoPath);
    return infoPath;
  }

  /**
   * Parse a nominal attribute.
   *
   * @param tokens
   *          remaining tokens of the line; each one is a possible value of the attribute
   * @return nominal attribute holding all remaining tokens, in order
   */
  private static NominalAttr parseNominal(Iterator<String> tokens) {
    List<String> values = Lists.newArrayList(tokens);
    return new NominalAttr(values.toArray(new String[values.size()]));
  }

  /**
   * Parse a numerical attribute.
   *
   * @param tokens
   *          remaining tokens of the line; the next two are read as {@code min} then {@code max}
   * @return numerical attribute with the parsed bounds
   * @throws IllegalArgumentException
   *           if a bound is not a valid double or if {@code min > max}
   * @throws java.util.NoSuchElementException
   *           if fewer than two tokens remain
   */
  private static NumericalAttr parseNumerical(Iterator<String> tokens) {
    double min = Double.parseDouble(tokens.next());
    double max = Double.parseDouble(tokens.next());
    Preconditions.checkArgument(min <= max, "min > max");
    return new NumericalAttr(min, max);
  }
}