DelimitedTextParser.java example

Explorer
smile-master
/*******************************************************************************
 * Copyright (c) 2010 Haifeng Li
 *   
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *  
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *******************************************************************************/

package smile.data.parser;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URI;
import java.text.ParseException;

import smile.data.Attribute;
import smile.data.AttributeDataset;
import smile.data.Datum;
import smile.data.NumericAttribute;

/**
 * The delimited text file parser. By default, the parser expects a
 * white-space-separated-values file. Each line in the file corresponds
 * to a row in the table. Within a line, fields are separated by white spaces,
 * each field belonging to one table column. This class can also be
 * used to read other text tabular files by setting delimiter character
 * such as ','. The file may contain comment lines (starting with '%')
 * and missing values (indicated by placeholder '?'), which both can be
 * parameterized.
 *
 * @author Haifeng Li
 */
public class DelimitedTextParser {

    /**
     * The delimiter separating columns. It is handed to
     * {@link String#split(String, int)} and is therefore treated as a
     * regular expression. The default matches one or more white spaces.
     */
    private String delimiter = "\\s+";
    /**
     * The prefix that marks a comment line.
     */
    private String comment = "%";
    /**
     * The placeholder of missing values in the data.
     */
    private String missing = "?";
    /**
     * True if the first data row holds the column names.
     */
    private boolean hasColumnNames = false;
    /**
     * True if the first column holds the row names.
     */
    private boolean hasRowNames = false;
    /**
     * The attribute of dependent/response variable, or null if none was set.
     */
    private Attribute response = null;
    /**
     * The column index of dependent/response variable. A negative value
     * means that the data has no response column.
     */
    private int responseIndex = -1;

    /**
     * Constructor with default delimiter of white space, comment line
     * starting with '%', missing value placeholder "?", no column names,
     * no row names.
     */
    public DelimitedTextParser() {
    }

    /**
     * Returns the delimiter character/string.
     */
    public String getDelimiter() {
        return delimiter;
    }

    /**
     * Sets the delimiter character/string. The value is used as a regular
     * expression when splitting each line.
     * @param delimiter the column delimiter.
     * @return this parser, to allow call chaining.
     */
    public DelimitedTextParser setDelimiter(String delimiter) {
        this.delimiter = delimiter;
        return this;
    }

    /**
     * Returns the character/string that starts a comment line.
     */
    public String getCommentStartWith() {
        return comment;
    }

    /**
     * Sets the character/string that starts a comment line.
     * @param comment the comment prefix. A null or empty value disables
     * comment detection.
     * @return this parser, to allow call chaining.
     */
    public DelimitedTextParser setCommentStartWith(String comment) {
        this.comment = comment;
        return this;
    }

    /**
     * Returns the missing value placeholder.
     */
    public String getMissingValuePlaceholder() {
        return missing;
    }

    /**
     * Sets the missing value placeholder. The placeholder is matched
     * case-insensitively against each non-response field.
     * @param missing the placeholder, or null to disable missing value detection.
     * @return this parser, to allow call chaining.
     */
    public DelimitedTextParser setMissingValuePlaceholder(String missing) {
        this.missing = missing;
        return this;
    }

    /**
     * Sets the attribute and column index (starting at 0) of dependent/response variable.
     * @param response the response attribute; must be nominal or numeric.
     * @param index the column index of the response variable in the data.
     * @return this parser, to allow call chaining.
     * @throws IllegalArgumentException if the attribute is neither nominal nor numeric.
     */
    public DelimitedTextParser setResponseIndex(Attribute response, int index) {
        if (response.getType() != Attribute.Type.NOMINAL && response.getType() != Attribute.Type.NUMERIC) {
            throw new IllegalArgumentException("The response variable is not numeric or nominal.");
        }

        this.response = response;
        this.responseIndex = index;
        return this;
    }

    /**
     * Returns if the dataset has row names (at column 0).
     */
    public boolean hasRowNames() {
        return hasRowNames;
    }

    /**
     * Sets if the dataset has row names (at column 0).
     * @return this parser, to allow call chaining.
     */
    public DelimitedTextParser setRowNames(boolean hasRowNames) {
        this.hasRowNames = hasRowNames;
        return this;
    }

    /**
     * Returns if the dataset has column names (at row 0).
     */
    public boolean hasColumnNames() {
        return hasColumnNames;
    }

    /**
     * Sets if the dataset has column names (at row 0).
     * @return this parser, to allow call chaining.
     */
    public DelimitedTextParser setColumnNames(boolean hasColNames) {
        this.hasColumnNames = hasColNames;
        return this;
    }

    /**
     * Parse a dataset from given URI. The dataset is named after the file path.
     * @param uri the URI of data source; it is converted with {@link File#File(URI)}.
     * @throws java.io.IOException if the source cannot be opened or read, or is empty.
     * @throws java.text.ParseException if a line does not have the expected layout.
     */
    public AttributeDataset parse(URI uri) throws IOException, ParseException {
        return parse(new File(uri));
    }

    /**
     * Parse a dataset from given URI.
     * @param name the name of dataset.
     * @param attributes the list attributes of data in proper order.
     * @param uri the URI of data source; it is converted with {@link File#File(URI)}.
     * @throws java.io.IOException if the source cannot be opened or read, or is empty.
     * @throws java.text.ParseException if a line does not have the expected layout.
     */
    public AttributeDataset parse(String name, Attribute[] attributes, URI uri) throws IOException, ParseException {
        return parse(name, attributes, new File(uri));
    }

    /**
     * Parse a dataset from given file. The dataset is named after the file path.
     * @param path the file path of data source.
     * @throws java.io.IOException if the source cannot be opened or read, or is empty.
     * @throws java.text.ParseException if a line does not have the expected layout.
     */
    public AttributeDataset parse(String path) throws IOException, ParseException {
        return parse(new File(path));
    }

    /**
     * Parse a dataset from given file.
     * @param name the name of dataset.
     * @param attributes the list attributes of data in proper order.
     * @param path the file path of data source.
     * @throws java.io.IOException if the source cannot be opened or read, or is empty.
     * @throws java.text.ParseException if a line does not have the expected layout.
     */
    public AttributeDataset parse(String name, Attribute[] attributes, String path) throws IOException, ParseException {
        return parse(name, attributes, new File(path));
    }

    /**
     * Parse a dataset from given file. The dataset is named after the file path.
     * @param file the file of data source.
     * @throws java.io.IOException if the source cannot be opened or read, or is empty.
     * @throws java.text.ParseException if a line does not have the expected layout.
     */
    public AttributeDataset parse(File file) throws IOException, ParseException {
        return parse(file.getPath(), file);
    }

    /**
     * Parse a dataset from given file.
     * @param name the name of dataset.
     * @param file the file of data source.
     * @throws java.io.IOException if the source cannot be opened or read, or is empty.
     * @throws java.text.ParseException if a line does not have the expected layout.
     */
    public AttributeDataset parse(String name, File file) throws IOException, ParseException {
        // The stream is closed by parse(String, InputStream).
        return parse(name, new FileInputStream(file));
    }

    /**
     * Parse a dataset from given file. The dataset is named after the file path.
     * @param attributes the list attributes of data in proper order.
     * @param file the file of data source.
     * @throws java.io.IOException if the source cannot be opened or read, or is empty.
     * @throws java.text.ParseException if a line does not have the expected layout.
     */
    public AttributeDataset parse(Attribute[] attributes, File file) throws IOException, ParseException {
        return parse(file.getPath(), attributes, file);
    }

    /**
     * Parse a dataset from given file.
     * @param name the name of dataset.
     * @param attributes the list attributes of data in proper order.
     * @param file the file of data source.
     * @throws java.io.IOException if the source cannot be opened or read, or is empty.
     * @throws java.text.ParseException if a line does not have the expected layout.
     */
    public AttributeDataset parse(String name, Attribute[] attributes, File file) throws IOException, ParseException {
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file)))) {
            return parse(name, attributes, reader);
        }
    }

    /**
     * Parse a dataset from an input stream. All columns other than the row
     * names and the response are read as numeric attributes. The stream is
     * closed when this method returns.
     * @param name the name of dataset.
     * @param stream the input stream of data.
     * @throws java.io.IOException if the source cannot be read, or is empty.
     * @throws java.text.ParseException if a line does not have the expected layout.
     */
    public AttributeDataset parse(String name, InputStream stream) throws IOException, ParseException {
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(stream))) {
            return parse(name, null, reader);
        }
    }

    /**
     * Parse a dataset from a buffered reader.
     * @param name the name of dataset.
     * @param attributes the list attributes of data in proper order. If null,
     * numeric attributes named V1, V2, ... are created from the width of the
     * first data line.
     * @param reader the buffered reader for data.
     * @throws java.io.IOException if reading fails or the source has no data line.
     * @throws java.text.ParseException if the response index does not address
     * a data column, or if a line has an unexpected number of columns.
     */
    private AttributeDataset parse(String name, Attribute[] attributes, BufferedReader reader) throws IOException, ParseException {
        String line = nextDataLine(reader);
        if (line == null) {
            throw new IOException("Empty data source.");
        }

        String[] s = line.split(delimiter, 0);

        // Validate the response column no matter whether the attributes were
        // supplied by the caller. An index beyond the last column, or one that
        // collides with the row name column, would never be visited by the
        // row loop and would overrun the attribute array instead.
        if (responseIndex >= s.length || (hasRowNames && responseIndex == 0)) {
            throw new ParseException("Invalid response variable index: " + responseIndex, responseIndex);
        }

        if (attributes == null) {
            int p = s.length;
            if (hasRowNames) {
                p--;
            }

            if (responseIndex >= 0) {
                p--;
            }

            attributes = new Attribute[p];
            for (int i = 0; i < p; i++) {
                attributes[i] = new NumericAttribute("V" + (i + 1));
            }
        }

        // Expected width of every line: attributes plus the optional
        // row name and response columns.
        int ncols = attributes.length;
        int startColumn = 0;

        if (hasRowNames) {
            ncols++;
            startColumn = 1;
        }

        if (responseIndex >= 0) {
            ncols++;
        }

        if (ncols != s.length) {
            throw new ParseException(String.format("%d columns, expected %d", s.length, ncols), s.length);
        }

        AttributeDataset data = new AttributeDataset(name, attributes, response);

        if (hasColumnNames) {
            // The first line is a header: rename the attributes in column order.
            for (int i = startColumn, k = 0; i < s.length; i++) {
                if (i != responseIndex) {
                    attributes[k++].setName(s[i]);
                } else {
                    response.setName(s[i]);
                }
            }
        } else {
            // The first line is already data.
            data.add(parseRow(s, attributes, startColumn));
        }

        while ((line = nextDataLine(reader)) != null) {
            s = line.split(delimiter, 0);
            if (s.length != ncols) {
                throw new ParseException(String.format("%d columns, expected %d", s.length, ncols), s.length);
            }

            data.add(parseRow(s, attributes, startColumn));
        }

        return data;
    }

    /**
     * Returns the next line that is neither empty nor a comment, or null
     * at the end of input.
     */
    private String nextDataLine(BufferedReader reader) throws IOException {
        // A null or empty marker would either fail in startsWith or match
        // every line, so treat it as "no comments".
        boolean detectComments = comment != null && !comment.isEmpty();

        String line;
        while ((line = reader.readLine()) != null) {
            if (line.isEmpty() || (detectComments && line.startsWith(comment))) {
                continue;
            }
            return line;
        }
        return null;
    }

    /**
     * Converts the fields of one line into a datum. Fields equal to the
     * missing value placeholder (ignoring case) become NaN; the field at the
     * response index becomes the response value; the first field becomes the
     * datum name when row names are enabled.
     */
    private Datum<double[]> parseRow(String[] s, Attribute[] attributes, int startColumn) throws ParseException {
        double[] x = new double[attributes.length];
        double y = Double.NaN;

        for (int i = startColumn, k = 0; i < s.length; i++) {
            if (i == responseIndex) {
                y = response.valueOf(s[i]);
            } else if (missing != null && missing.equalsIgnoreCase(s[i])) {
                x[k++] = Double.NaN;
            } else {
                x[k] = attributes[k].valueOf(s[i]);
                k++;
            }
        }

        Datum<double[]> datum = new Datum<>(x, y);
        datum.name = hasRowNames ? s[0] : null;
        return datum;
    }
}