/*******************************************************************************
* Copyright (c) 2010 Haifeng Li
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
package smile.data.parser;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URI;
import java.text.ParseException;
import smile.data.Attribute;
import smile.data.AttributeDataset;
import smile.data.Datum;
import smile.data.NumericAttribute;
/**
 * The delimited text file parser. By default, the parser expects a
 * white-space-separated-values file. Each line in the file corresponds
 * to a row in the table. Within a line, fields are separated by white spaces,
 * each field belonging to one table column. This class can also be
 * used to read other tabular text files by setting the delimiter,
 * such as ','. The file may contain comment lines (starting with '%')
 * and missing values (indicated by placeholder '?'), both of which can be
 * parameterized.
 *
 * @author Haifeng Li
 */
public class DelimitedTextParser {
    /**
     * The delimiter to separate columns. It is handed to
     * {@link String#split(String, int)} and is therefore a regular expression.
     */
    private String delimiter = "\\s+";
    /**
     * The start of comments.
     */
    private String comment = "%";
    /**
     * The placeholder of missing values in the data.
     */
    private String missing = "?";
    /**
     * The dataset has column names at first row.
     */
    private boolean hasColumnNames = false;
    /**
     * The dataset has row names at first column.
     */
    private boolean hasRowNames = false;
    /**
     * The attribute of dependent/response variable.
     */
    private Attribute response = null;
    /**
     * The column index of dependent/response variable. A negative value
     * means that the data has no response column.
     */
    private int responseIndex = -1;

    /**
     * Constructor with default delimiter of white space, comment line
     * starting with '%', missing value placeholder "?", no column names,
     * no row names.
     */
    public DelimitedTextParser() {
    }

    /**
     * Returns the delimiter character/string.
     */
    public String getDelimiter() {
        return delimiter;
    }

    /**
     * Sets the delimiter character/string.
     * @param delimiter the delimiter, interpreted as a regular expression
     *                  when splitting a line into fields.
     * @return this parser.
     */
    public DelimitedTextParser setDelimiter(String delimiter) {
        this.delimiter = delimiter;
        return this;
    }

    /**
     * Returns the character/string that starts a comment line.
     */
    public String getCommentStartWith() {
        return comment;
    }

    /**
     * Sets the character/string that starts a comment line.
     * @param comment the comment prefix. If null, no line is treated as a comment.
     * @return this parser.
     */
    public DelimitedTextParser setCommentStartWith(String comment) {
        this.comment = comment;
        return this;
    }

    /**
     * Returns the missing value placeholder.
     */
    public String getMissingValuePlaceholder() {
        return missing;
    }

    /**
     * Sets the missing value placeholder.
     * @param missing the placeholder, compared case-insensitively against
     *                each field. If null, no field is treated as missing.
     * @return this parser.
     */
    public DelimitedTextParser setMissingValuePlaceholder(String missing) {
        this.missing = missing;
        return this;
    }

    /**
     * Sets the attribute and column index (starting at 0) of dependent/response variable.
     * @param response the response attribute, which must be nominal or numeric.
     * @param index the column index of the response variable in the file.
     * @return this parser.
     * @throws IllegalArgumentException if the response attribute is neither nominal nor numeric.
     */
    public DelimitedTextParser setResponseIndex(Attribute response, int index) {
        if (response.getType() != Attribute.Type.NOMINAL && response.getType() != Attribute.Type.NUMERIC) {
            throw new IllegalArgumentException("The response variable is not numeric or nominal.");
        }

        this.response = response;
        this.responseIndex = index;
        return this;
    }

    /**
     * Returns if the dataset has row names (at column 0).
     */
    public boolean hasRowNames() {
        return hasRowNames;
    }

    /**
     * Sets if the dataset has row names (at column 0).
     * @return this parser.
     */
    public DelimitedTextParser setRowNames(boolean hasRowNames) {
        this.hasRowNames = hasRowNames;
        return this;
    }

    /**
     * Returns if the dataset has column names (at row 0).
     */
    public boolean hasColumnNames() {
        return hasColumnNames;
    }

    /**
     * Sets if the dataset has column names (at row 0).
     * @return this parser.
     */
    public DelimitedTextParser setColumnNames(boolean hasColNames) {
        this.hasColumnNames = hasColNames;
        return this;
    }

    /**
     * Parse a dataset from given URI. The file path is used as the dataset name.
     * @param uri the URI of data source. It is converted with {@link File#File(URI)}.
     * @throws IOException if the file cannot be opened or read, or has no data line.
     * @throws ParseException if a line is malformed.
     */
    public AttributeDataset parse(URI uri) throws IOException, ParseException {
        return parse(new File(uri));
    }

    /**
     * Parse a dataset from given URI.
     * @param name the name of dataset.
     * @param attributes the list attributes of data in proper order.
     * @param uri the URI of data source. It is converted with {@link File#File(URI)}.
     * @throws IOException if the file cannot be opened or read, or has no data line.
     * @throws ParseException if a line is malformed.
     */
    public AttributeDataset parse(String name, Attribute[] attributes, URI uri) throws IOException, ParseException {
        return parse(name, attributes, new File(uri));
    }

    /**
     * Parse a dataset from given file. The file path is used as the dataset name.
     * @param path the file path of data source.
     * @throws IOException if the file cannot be opened or read, or has no data line.
     * @throws ParseException if a line is malformed.
     */
    public AttributeDataset parse(String path) throws IOException, ParseException {
        return parse(new File(path));
    }

    /**
     * Parse a dataset from given file.
     * @param name the name of dataset.
     * @param attributes the list attributes of data in proper order.
     * @param path the file path of data source.
     * @throws IOException if the file cannot be opened or read, or has no data line.
     * @throws ParseException if a line is malformed.
     */
    public AttributeDataset parse(String name, Attribute[] attributes, String path) throws IOException, ParseException {
        return parse(name, attributes, new File(path));
    }

    /**
     * Parse a dataset from given file. The file path is used as the dataset name.
     * @param file the file of data source.
     * @throws IOException if the file cannot be opened or read, or has no data line.
     * @throws ParseException if a line is malformed.
     */
    public AttributeDataset parse(File file) throws IOException, ParseException {
        return parse(file.getPath(), file);
    }

    /**
     * Parse a dataset from given file. All columns (except the row name and
     * response columns, if any) are treated as numeric attributes.
     * @param name the name of dataset.
     * @param file the file of data source.
     * @throws IOException if the file cannot be opened or read, or has no data line.
     * @throws ParseException if a line is malformed.
     */
    public AttributeDataset parse(String name, File file) throws IOException, ParseException {
        // Own the stream here so that it is released on every exit path.
        try (InputStream stream = new FileInputStream(file)) {
            return parse(name, stream);
        }
    }

    /**
     * Parse a dataset from given file. The file path is used as the dataset name.
     * @param attributes the list attributes of data in proper order.
     * @param file the file of data source.
     * @throws IOException if the file cannot be opened or read, or has no data line.
     * @throws ParseException if a line is malformed.
     */
    public AttributeDataset parse(Attribute[] attributes, File file) throws IOException, ParseException {
        return parse(file.getPath(), attributes, file);
    }

    /**
     * Parse a dataset from given file.
     * @param name the name of dataset.
     * @param attributes the list attributes of data in proper order.
     * @param file the file of data source.
     * @throws IOException if the file cannot be opened or read, or has no data line.
     * @throws ParseException if a line is malformed.
     */
    public AttributeDataset parse(String name, Attribute[] attributes, File file) throws IOException, ParseException {
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file)))) {
            return parse(name, attributes, reader);
        }
    }

    /**
     * Parse a dataset from an input stream. All columns (except the row name
     * and response columns, if any) are treated as numeric attributes.
     * The stream is closed when this method returns.
     * @param name the name of dataset.
     * @param stream the input stream of data.
     * @throws IOException if the stream cannot be read or has no data line.
     * @throws ParseException if a line is malformed.
     */
    public AttributeDataset parse(String name, InputStream stream) throws IOException, ParseException {
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(stream))) {
            return parse(name, null, reader);
        }
    }

    /**
     * Parse a dataset from a buffered reader.
     * @param name the name of dataset.
     * @param attributes the list attributes of data in proper order. If null,
     *                   numeric attributes named V1, V2, ... are created from
     *                   the number of fields in the first non-skipped line.
     * @param reader the buffered reader for data.
     * @throws IOException if reading fails or there is no line besides empty and comment lines.
     * @throws ParseException if the response index does not address a data column,
     *                        if a line has an unexpected number of fields,
     *                        or if a field cannot be converted by its attribute.
     */
    private AttributeDataset parse(String name, Attribute[] attributes, BufferedReader reader) throws IOException, ParseException {
        // Find the first line carrying content; it fixes the expected column count.
        String line = reader.readLine();
        while (line != null && isSkipped(line)) {
            line = reader.readLine();
        }

        if (line == null) {
            throw new IOException("Empty data source.");
        }

        String[] s = line.split(delimiter, 0);
        int startColumn = hasRowNames ? 1 : 0;
        int responseColumns = responseIndex >= 0 ? 1 : 0;

        // The response column must be one of the data columns regardless of whether
        // the attributes were supplied. Otherwise the loops below never meet it and
        // run past the end of the attribute array.
        if (responseIndex >= s.length || (responseIndex >= 0 && responseIndex < startColumn)) {
            throw new ParseException("Invalid response variable index: " + responseIndex, responseIndex);
        }

        if (attributes == null) {
            int p = s.length - startColumn - responseColumns;
            if (p < 0) {
                throw new ParseException(String.format("%d columns, expected at least %d", s.length, startColumn + responseColumns), s.length);
            }

            attributes = new Attribute[p];
            for (int i = 0; i < p; i++) {
                attributes[i] = new NumericAttribute("V" + (i + 1));
            }
        }

        int ncols = attributes.length + startColumn + responseColumns;
        if (ncols != s.length) {
            throw new ParseException(String.format("%d columns, expected %d", s.length, ncols), s.length);
        }

        AttributeDataset data = new AttributeDataset(name, attributes, response);

        if (hasColumnNames) {
            // The first line is a header: rename the attributes (and the response) in place.
            for (int i = startColumn, k = 0; i < s.length; i++) {
                if (i != responseIndex) {
                    attributes[k++].setName(s[i]);
                } else {
                    response.setName(s[i]);
                }
            }
        } else {
            Datum<double[]> datum = parseRow(s, attributes, startColumn);
            data.add(datum);
        }

        while ((line = reader.readLine()) != null) {
            if (isSkipped(line)) {
                continue;
            }

            s = line.split(delimiter, 0);
            if (s.length != ncols) {
                throw new ParseException(String.format("%d columns, expected %d", s.length, ncols), s.length);
            }

            Datum<double[]> datum = parseRow(s, attributes, startColumn);
            data.add(datum);
        }

        return data;
    }

    /**
     * Returns true if the line is empty or starts with the comment prefix.
     */
    private boolean isSkipped(String line) {
        return line.isEmpty() || (comment != null && line.startsWith(comment));
    }

    /**
     * Converts the fields of one line to a datum. Fields equal to the missing
     * value placeholder (ignoring case) become NaN; the response value is NaN
     * when there is no response column.
     * @param fields the fields of the line, already checked for the expected length.
     * @param attributes the attributes of the non-response columns in order.
     * @param startColumn the index of the first data column (1 if the line has a row name).
     */
    private Datum<double[]> parseRow(String[] fields, Attribute[] attributes, int startColumn) throws ParseException {
        double[] x = new double[attributes.length];
        double y = Double.NaN;

        for (int i = startColumn, k = 0; i < fields.length; i++) {
            if (i == responseIndex) {
                y = response.valueOf(fields[i]);
            } else if (missing != null && missing.equalsIgnoreCase(fields[i])) {
                x[k++] = Double.NaN;
            } else {
                x[k] = attributes[k].valueOf(fields[i]);
                k++;
            }
        }

        Datum<double[]> datum = new Datum<>(x, y);
        datum.name = hasRowNames ? fields[0] : null;
        return datum;
    }
}