// CsvParser.java example
//
// Explorer
// diirt-master
/**
 * Copyright (C) 2010-14 diirt developers. See COPYRIGHT.TXT
 * All rights reserved. Use is subject to license terms. See LICENSE.TXT
 */
package org.diirt.util.text;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.util.AbstractList;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.diirt.util.array.ArrayDouble;
import org.diirt.util.array.ListDouble;
import static org.diirt.util.text.StringUtil.DOUBLE_REGEX_WITH_NAN;

/**
 * Utility class to parse CSV text. The parser is thread safe: it includes an
 * immutable set of parameters and the state for each parsing is kept separate.
 * A change in the parser parameters will create a new parser, so to create
 * your configuration take the closest matching as a template and apply the
 * difference.
 * <p>
 * Since there is no CSV strict format, this parser honors as best it
 * can the suggestions found in <a href="http://tools.ietf.org/html/rfc4180">RFC4180</a>,
 * in the <a href="http://en.wikipedia.org/wiki/Comma-separated_values">CSV wikipedia article</a>
 * and other sources.
 * <p>
 * The parser can try multiple separators, so that it can auto-detect the
 * likely correct one. It does so by trying them one by one, checking
 * that it finds more than one column and that all the rows have the same
 * number of columns. If not, proceeds to the next separator.
 * <p>
 * Typical use of the parser:
 * <blockquote><pre>
 * CsvParserResult result = CsvParser.AUTOMATIC
 *   .withHeader(CsvParser.Header.NONE)
 *   .parse(new FileReader("table.csv"));</pre></blockquote>
 * <p>
 * The parsing of each line is based on code and insights found in
 * <a href="http://regex.info/book.html"> Mastering Regular Expressions</a>.
 *
 * @author carcassi
 */
public class CsvParser {

    // Configuration: both fields are assigned once in the constructor and
    // never reassigned.
    // Candidate separators; parse() tries them one character at a time, in order.
    private final String separators;
    // How the first line of the input is treated (see Header).
    private final Header header;

    /**
     * How the parser treats the first line of the input.
     */
    public enum Header {

        /**
         * Decides automatically whether the first line is a header or data.
         * <p>
         * The first line is taken as data only when that is unambiguous:
         * at least one column must hold numbers, and the first-line value of
         * every numeric column must itself parse as a number. When all the
         * columns are strings, or when a numeric column starts with a value
         * that is not a number, the first line is taken as the header.
         */
        AUTO,

        /**
         * Always uses the first line as the header.
         */
        FIRST_LINE,

        /**
         * Treats every line, the first included, as data.
         * <p>
         * Column names are generated with the convention used for
         * spreadsheet columns: A, B, ..., Y, Z, AA, AB, ..., AZ, BA, and so on.
         */
        NONE
    }

    private class State {
        // Parser state
        private int nColumns;
        private boolean columnMismatch = false;
        private List<String> columnNames;
        private List<Boolean> columnNumberParsable;
        private List<Boolean> columnTimestampParsable;
        private List<List<String>> columnTokens;
        private String currentSeparator;

        // Regex object used for parsing
        private Matcher mLineTokens;
        private final Matcher mQuote = pQuote.matcher("");
        private final Matcher mDouble = pDouble.matcher("");

        // Keep data on best matched separator
        private String bestSeparator;
        private int bestNLines = -1;
    }


    // Matches two consecutive double quotes, i.e. an escaped quote inside a
    // quoted field; the parsing code replaces each match with a single quote.
    private static final Pattern pQuote = Pattern.compile("\"\"");
    // Pattern used to decide whether an unquoted token is treated as a number;
    // the expression itself is defined by StringUtil.DOUBLE_REGEX_WITH_NAN.
    private static final Pattern pDouble = Pattern.compile(DOUBLE_REGEX_WITH_NAN);

    /**
     * Automatic parser: auto-detects whether the first line is a header or not
     * and tries the most common separators (i.e. ',' ';' 'TAB' 'SPACE'), in
     * that order.
     */
    public static final CsvParser AUTOMATIC = new CsvParser(",;\t ", Header.AUTO);

    /**
     * Creates a parser with the given configuration. The constructor is
     * private: parsers are obtained from {@link #AUTOMATIC} and from the
     * {@code with...} methods.
     *
     * @param separators the separators to try, one character each
     * @param header how the first line is handled
     */
    private CsvParser(String separators, Header header) {
        this.header = header;
        this.separators = separators;
    }

    /**
     * The separators this parser tries, in order, while parsing. Each
     * character of the returned string is one candidate separator.
     *
     * @return a string with all the possible separators
     */
    public String getSeparators() {
        return this.separators;
    }

    /**
     * Returns a copy of this parser that tries the given separators and
     * keeps the current header handling. This parser is not modified.
     * <p>
     * Each character of the string is one separator; they are tried in
     * order until the parsing is successful.
     *
     * @param separators the new list of separators
     * @return a new parser
     */
    public CsvParser withSeparators(String separators) {
        final CsvParser copy = new CsvParser(separators, this.header);
        return copy;
    }

    /**
     * Tells how this parser treats the first line of the csv text.
     *
     * @return the header configuration of the parser
     */
    public Header getHeader() {
        return this.header;
    }

    /**
     * Returns a copy of this parser that uses the given header handling and
     * keeps the current separators. This parser is not modified.
     *
     * @param header the header configuration for the parser
     * @return a new parser
     */
    public CsvParser withHeader(Header header) {
        final CsvParser copy = new CsvParser(this.separators, header);
        return copy;
    }


    /**
     * Parses the text provided by the reader with the format defined in this
     * parser. This method is thread-safe: all the parsing state is local to
     * the call.
     * <p>
     * If the parsing fails, this method does not throw an exception but
     * reports the failure in the result. The idea is that, in the future,
     * the parser can provide multiple reasons as to why the parsing failed or
     * even incomplete results. The failures reported this way are:
     * <ul>
     *   <li>the reader provided no lines;</li>
     *   <li>no separator split the first line into more than one column;</li>
     *   <li>no separator gave the same number of columns on every line.</li>
     * </ul>
     * An error while reading from the reader is not reported in the result:
     * it surfaces as the {@link RuntimeException} thrown by the line splitting.
     *
     * @param reader a reader
     * @return the parsed information
     */
    public CsvParserResult parse(Reader reader) {
        // State used for parsing. Since each call has its own state,
        // the parsing is thread safe.
        State state = new State();

        // Divide into lines.
        // Note that means we are going to keep in memory the whole file.
        // This is not very memory efficient. But since we have to do multiple
        // passes to find the right separator, we don't have much choice.
        // Also: the actual parsed result will need to stay in memory anyway.
        List<String> lines = csvLines(reader);

        // Without a first line there is nothing to detect the columns from:
        // report the failure instead of failing on lines.get(0)
        if (lines.isEmpty()) {
            return new CsvParserResult(null, null, null, 0, false, "Parsing failed: no data found");
        }

        // Becomes true only when one separator parses every line consistently
        boolean parsed = false;

        // Try each separator
        separatorLoop:
        for(int nSeparator = 0; nSeparator < getSeparators().length(); nSeparator++) {
            state.currentSeparator = getSeparators().substring(nSeparator, nSeparator+1);

            // The separator must be matched literally, even when it is a
            // regex metacharacter such as '|', '.' or '^'
            String quotedSeparator = Pattern.quote(state.currentSeparator);

            // Taken from Mastering Regular Expressions
            // Disabled comments so that space could work as possible separator
            String regex = // puts a doublequoted field in group(1) and an unquoted field into group(2)
                    // Start with beginning of line or separator
                    "\\G(?:^|" + quotedSeparator + ")" +
                    // Match a quoted string
                    "(?:" +
                    "\"" +
                    "((?:[^\"]++|\"\")*+)" +
                    "\"" +
                    // Or match a string without the separator
                    "|" +
                    "([^\"" + quotedSeparator + "]*)" +
                    ")";
            // Compile the matcher once for all the parsing
            state.mLineTokens = Pattern.compile(regex).matcher("");

            // Try to parse the first line (the titles)
            // If only one column is found, proceed to next separator
            state.columnNames = parseTitles(state, lines.get(0));
            state.nColumns = state.columnNames.size();
            if (state.nColumns <= 1) {
                continue;
            }

            // Prepare the data structures to hold column data while parsing
            state.columnMismatch = false;
            state.columnNumberParsable = new ArrayList<>(state.nColumns);
            state.columnTimestampParsable = new ArrayList<>(state.nColumns);
            state.columnTokens = new ArrayList<>();
            for (int i = 0; i < state.nColumns; i++) {
                state.columnNumberParsable.add(true);
                state.columnTimestampParsable.add(false);
                state.columnTokens.add(new ArrayList<String>());
            }

            // Parse each line
            // If one line does not match the number of columns found in the first
            // line, pass to the next separator
            for (int i = 1; i < lines.size(); i++) {
                parseLine(state, lines.get(i));
                if (state.columnMismatch) {
                    // Remember the separator that got furthest, for the error message
                    if (i > state.bestNLines) {
                        state.bestSeparator =  state.currentSeparator;
                        state.bestNLines = i;
                    }
                    continue separatorLoop;
                }
            }

            // The parsing succeeded! No need to try other separator
            parsed = true;
            break;

        }

        // We are out of the loop: did we end because we parsed correctly,
        // or because no separator worked?
        if (!parsed) {
            if (state.columnMismatch) {
                return new CsvParserResult(null, null, null, 0, false, "Parsing failed: number of columns not constant. Using separator '"
                        + state.bestSeparator + "', line " + (state.bestNLines + 1));
            }
            // No separator got past the first line, so the per-column lists
            // were never created: stop here instead of dereferencing them
            return new CsvParserResult(null, null, null, 0, false, "Parsing failed: no separator produced more than one column");
        }

        // Parsing was successful.
        // Should the first line be used as data?
        if (header == Header.NONE || (header == Header.AUTO && isFirstLineData(state, state.columnNames))) {
            for (int i = 0; i < state.nColumns; i++) {
                state.columnTokens.set(i, joinList(state.columnNames.get(i), state.columnTokens.get(i)));
                state.columnNames.set(i, alphabeticName(i));
            }
        }

        // Now it's time to convert the tokens to the actual type.
        List<Object> columnValues = new ArrayList<>(state.nColumns);
        List<Class<?>> columnTypes = new ArrayList<>(state.nColumns);
        for (int i = 0; i < state.nColumns; i++) {
            if (state.columnNumberParsable.get(i)) {
                columnValues.add(convertToListDouble(state.columnTokens.get(i)));
                columnTypes.add(double.class);
            } else {
                columnValues.add(state.columnTokens.get(i));
                columnTypes.add(String.class);
            }
        }

        // Prepare the result. The number of rows is the length of the first
        // column; the state itself is local and is simply dropped on return.
        CsvParserResult result = new CsvParserResult(state.columnNames, columnValues, columnTypes, state.columnTokens.get(0).size(), true, null);
        return result;
    }

    /**
     * Converts the tokens of a numeric column to numbers. An empty token
     * becomes {@code NaN}; every other token goes through
     * {@link Double#parseDouble(String)}.
     *
     * @param tokens the tokens to be converted
     * @return the number list
     */
    private ListDouble convertToListDouble(List<String> tokens) {
        final int count = tokens.size();
        final double[] numbers = new double[count];
        int index = 0;
        while (index < count) {
            final String token = tokens.get(index);
            numbers[index] = token.isEmpty() ? Double.NaN : Double.parseDouble(token);
            index++;
        }
        return new ArrayDouble(numbers);
    }

    /**
     * Divides the whole text into csv lines.
     * <p>
     * A csv line is usually one line of text, but a quoted field may contain
     * line breaks: in that case the lines of text are joined with a
     * {@code '\n'} until the quotes are balanced. If the text ends while a
     * quote is still open, what was accumulated is returned as the last
     * line. The reader is read to the end and is not closed.
     *
     * @param reader the source of text
     * @return the lines
     * @throws RuntimeException if reading from the reader fails
     */
    static List<String> csvLines(Reader reader) {
        try {
            BufferedReader br = new BufferedReader(reader);
            List<String> lines = new ArrayList<>();
            // Text of a csv line whose quoted field is still open, or null if
            // the previous csv line was complete. Whenever it is not null it
            // contains an odd number of quotes.
            StringBuilder pending = null;
            // The current line read from the Reader
            String line;
            while ((line = br.readLine()) != null) {
                // Only the new line is scanned: rescanning (and recopying) the
                // whole accumulated text for every line would be quadratic
                // for long multi-line fields or an unbalanced quote.
                boolean oddQuotes = false;
                for (int i = 0; i < line.length(); i++) {
                    if (line.charAt(i) == '"') {
                        oddQuotes = !oddQuotes;
                    }
                }

                if (pending == null) {
                    if (oddQuotes) {
                        // A quoted field opens here and continues on the next line
                        pending = new StringBuilder(line);
                    } else {
                        lines.add(line);
                    }
                } else {
                    pending.append('\n').append(line);
                    // Odd + odd = even: this line closes the open field
                    if (oddQuotes) {
                        lines.add(pending.toString());
                        pending = null;
                    }
                }
            }
            // If there is text leftover, the line was not closed properly.
            // XXX: we need to figure out how to handle errors like this
            if (pending != null) {
                lines.add(pending.toString());
            }
            return lines;
        } catch(IOException ex) {
            throw new RuntimeException("Couldn't process data", ex);
        }
    }

    /**
     * Tells whether the double quote character occurs an even number of
     * times in the string. Zero occurrences count as even.
     *
     * @param string the given string
     * @return true if contains even number of '"'
     */
    static boolean isEvenQuotes(String string) {
        // A regex could do this too, but the quotes are simply counted by
        // jumping from one occurrence to the next.
        int quotes = 0;
        int position = string.indexOf('"');
        while (position >= 0) {
            quotes++;
            position = string.indexOf('"', position + 1);
        }
        return (quotes & 1) == 0;
    }

    /**
     * Splits the first line into tokens, using the tokenizer currently
     * stored in the state, and returns them as the column names.
     * <p>
     * Unquoted tokens are returned as they are; in quoted tokens each
     * doubled quote is replaced by a single one.
     *
     * @param state the state of the parser
     * @param line the text line
     * @return the column names
     */
    private List<String> parseTitles(State state, String line) {
        final Matcher tokenizer = state.mLineTokens.reset(line);
        final List<String> names = new ArrayList<>();
        while (tokenizer.find()) {
            // Group 2 takes part in the match only for unquoted tokens
            final boolean unquoted = tokenizer.start(2) >= 0;
            if (unquoted) {
                names.add(tokenizer.group(2));
                continue;
            }
            // Quoted: the content is group 1, with the quotes still doubled
            names.add(state.mQuote.reset(tokenizer.group(1)).replaceAll("\""));
        }
        return names;
    }

    /**
     * Parses a line, saving the tokens, and determines the type match.
     * <p>
     * Each token is appended to the list of its column. A column stops being
     * considered numeric as soon as it receives a quoted token or an
     * unquoted token that does not look like a number. If the line does not
     * have exactly the expected number of tokens, the column mismatch flag
     * of the state is set; tokens stored before the mismatch was detected
     * are left in place.
     *
     * @param state the state of the parser
     * @param line a new line
     */
    private void parseLine(State state, String line) {
        // If the first token is blank the tokenizer stops after it: the first
        // match has zero length, so the next find() resumes one character
        // further, where \G (the end of the previous match) cannot match.
        // Workaround: put a placeholder in front of the line, and remember
        // to turn the first token back into the empty string.
        // The placeholder must not be the separator itself, or the first
        // match would again be empty (this used to happen with space).
        boolean firstEmpty = false;
        if (line.startsWith(state.currentSeparator)) {
            String placeholder = " ".equals(state.currentSeparator) ? "_" : " ";
            line = placeholder + line;
            firstEmpty = true;
        }

        // Match using the parser
        state.mLineTokens.reset(line);
        int nColumn = 0;
        while (state.mLineTokens.find()) {
            // Does this line have more columns than expected?
            if (nColumn == state.nColumns) {
                state.columnMismatch = true;
                return;
            }

            String token;
            if (state.mLineTokens.start(2) >= 0) {
                // The token was unquoted. Check if it could be a number.
                token = state.mLineTokens.group(2);
                if (firstEmpty) {
                    // This token is the placeholder added above
                    token = "";
                    firstEmpty = false;
                }
                if (!isTokenNumberParsable(state, token)) {
                    state.columnNumberParsable.set(nColumn, false);
                }
            } else {
                // If quoted, always use string
                token = state.mQuote.reset(state.mLineTokens.group(1)).replaceAll("\"");
                state.columnNumberParsable.set(nColumn, false);
            }
            state.columnTokens.get(nColumn).add(token);
            nColumn++;
        }
        // Does this line have fewer columns than expected?
        if (nColumn != state.nColumns) {
            state.columnMismatch = true;
        }
    }

    /**
     * Tells whether a token is acceptable in a numeric column: either it is
     * empty, or the whole token matches the number pattern.
     *
     * @param state the state of the parser
     * @param token the token
     * @return true if token is empty or matches a double
     */
    private boolean isTokenNumberParsable(State state, String token) {
        return token.isEmpty() || state.mDouble.reset(token).matches();
    }

    /**
     * Decides, for the automatic header detection, whether the first line
     * can be safely taken as data.
     * <p>
     * Only the columns that look numeric are informative. If there is none,
     * header and data cannot be told apart and a header is assumed. If the
     * first-line token of any numeric column is not a number, the first line
     * is a header. Otherwise the first line has the same types as the rest
     * and is taken as data.
     *
     * @param state the state of the parser
     * @param headerTokens the header
     * @return true if header should be handled as data
     */
    private boolean isFirstLineData(State state, List<String> headerTokens) {
        boolean anyNumericColumn = false;
        for (int column = 0; column < state.nColumns; column++) {
            if (!state.columnNumberParsable.get(column)) {
                // String column: tells nothing about the first line
                continue;
            }
            anyNumericColumn = true;
            if (!isTokenNumberParsable(state, headerTokens.get(column))) {
                // Text on top of a numeric column: it must be a title
                return false;
            }
        }
        return anyNumericColumn;
    }

    /**
     * Returns a list made of the given element followed by the elements of
     * the given list.
     * <p>
     * The result is a read-only view: nothing is copied, and every access is
     * forwarded to {@code tail}.
     *
     * @param head the first element
     * @param tail the rest of the elements
     * @return a list with all elements
     */
    private List<String> joinList(final String head, final List<String> tail) {
        return new AbstractList<String>() {

            @Override
            public int size() {
                return 1 + tail.size();
            }

            @Override
            public String get(int index) {
                // Position 0 is the extra element, the others are shifted by one
                return index == 0 ? head : tail.get(index - 1);
            }
        };
    }

    /**
     * Returns the name of the i-th column (starting from 0) following the
     * convention used for spreadsheet columns: A, B, ..., Y, Z, AA, AB, ...,
     * AZ, BA, and so on.
     *
     * @param i the zero-based index of the column
     * @return the name of the column
     */
    static String alphabeticName(int i) {
        // Spreadsheet names are not plain base 26: there is no "zero" letter,
        // so after Z comes AA and not BA. Each step therefore takes one
        // letter and then subtracts one from what is left, which makes the
        // next block of names start again from 'A'.
        StringBuilder reversedName = new StringBuilder();
        int remaining = i;
        do {
            reversedName.append((char) ('A' + remaining % 26));
            remaining = remaining / 26 - 1;
        } while (remaining >= 0);
        // Letters were produced from the last to the first
        return reversedName.reverse().toString();
    }

    /**
     * Parses a line of text representing comma separated values and returns
     * the values themselves.
     * <p>
     * An unquoted field that matches the number pattern is returned as a
     * {@link Double}; any other unquoted field is returned as a
     * {@link String}. A quoted field is always returned as a {@link String},
     * with each doubled quote replaced by a single one.
     *
     * @param line the line to parse
     * @param separatorChar the regular expression for the separator; it is
     * inserted as is in the tokenizer expression, both as an alternative and
     * inside a character class, so special characters must be escaped by
     * the caller
     * @return the list of values
     */
    public static List<Object> parseCSVLine(String line, String separatorChar) {
        String regex = // puts a doublequoted field in group(1) and an unquoted field into group(2)
                "\\G(?:^|" + separatorChar + ")" +
                "(?:" +
                "\"" +
                "((?:[^\"]++|\"\")*+)" +
                "\"" +
                "|" +
                "([^\"" + separatorChar + "]*)" +
                ")";
        // Only the tokenizer depends on the arguments: the other two
        // expressions are constant and use the patterns compiled once by the class
        Matcher mMain = Pattern.compile(regex).matcher(line);
        Matcher mQuote = pQuote.matcher("");
        Matcher mDouble = pDouble.matcher("");

        List<Object> values = new ArrayList<>();
        while (mMain.find()) {
            Object value;
            if (mMain.start(2) >= 0) {
                // Unquoted field: number if it looks like one, text otherwise
                String field = mMain.group(2);
                if (mDouble.reset(field).matches()) {
                    value = Double.parseDouble(field);
                } else {
                    value = field;
                }
            } else {
                // If quoted, always use string
                value = mQuote.reset(mMain.group(1)).replaceAll("\"");
            }
            values.add(value);
        }
        return values;
    }
}