/** * Copyright (C) 2010-14 diirt developers. See COPYRIGHT.TXT * All rights reserved. Use is subject to license terms. See LICENSE.TXT */ package org.diirt.util.text; import java.io.BufferedReader; import java.io.IOException; import java.io.Reader; import java.util.AbstractList; import java.util.ArrayList; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.diirt.util.array.ArrayDouble; import org.diirt.util.array.ListDouble; import static org.diirt.util.text.StringUtil.DOUBLE_REGEX_WITH_NAN; /** * Utility class to parse CSV text. The parser is thread safe: it includes an * immutable set of parameters and the state for each parsing is kept separate. * A change in the parser parameters will create a new parser, so to create * your configuration take the closest matching as a template and apply the * difference. * <p> * Since there is no CSV strict format, this parser honors as best it * can the suggestions found in <a href="http://tools.ietf.org/html/rfc4180">RFC4180</a>, * in the <a href="http://en.wikipedia.org/wiki/Comma-separated_values">CSV wikipedia article</a> * and other sources. * <p> * The parser can try multiple separators, so that it can auto-detect the * likely correct one. It does so by trying them one by one, checking * that it finds more than one column and that all the rows have the same * number of columns. If not, proceeds to the next separator. * <p> * Typical use of the parser: * <blockquote><pre> * CsvParserResult result = CsvParser.AUTOMATIC * .withHeader(CsvParser.Header.NONE) * .parse(new FileReader("table.csv"));</pre></blockquote> * <p> * The parsing of each line is based on code and insights found in * <a href="http://regex.info/book.html"> Mastering Regular Expressions</a>. * * @author carcassi */ public class CsvParser { // Configuration private final String separators; private final Header header; /** * The configuration options for the header. */ public enum Header { /** * Auto detects whether the first line is a header. * <p> * The first line is interpreted as data only if it can be safely * distinguished. If all columns contain strings, then the first * line is always interpreted as a header. If the types in the * first line do not match the column (e.g. first line string, rest are * numbers) then it is interpreted as header. If the types match, * and one of them is not a string (e.g. number) then the first * line is interpreted as data. */ AUTO, /** * The first line is the header. */ FIRST_LINE, /** * The data contains no header, and the first line is data. * <p> * A header is automatically generated with the convention given by * spreadsheets columns: A, B, ..., Y, Z, AA, AB, ..., AZ, BA, and so on. */ NONE}; private class State { // Parser state private int nColumns; private boolean columnMismatch = false; private List<String> columnNames; private List<Boolean> columnNumberParsable; private List<Boolean> columnTimestampParsable; private List<List<String>> columnTokens; private String currentSeparator; // Regex object used for parsing private Matcher mLineTokens; private final Matcher mQuote = pQuote.matcher(""); private final Matcher mDouble = pDouble.matcher(""); // Keep data on best matched separator private String bestSeparator; private int bestNLines = -1; } private static final Pattern pQuote = Pattern.compile("\"\""); private static final Pattern pDouble = Pattern.compile(DOUBLE_REGEX_WITH_NAN); /** * Automatic parser: auto-detects whether the first line is a header or not * and tries the most common separators (i.e. ',' ';' 'TAB' 'SPACE'). */ public static final CsvParser AUTOMATIC = new CsvParser(",;\t ", Header.AUTO); private CsvParser(String separators, Header header) { this.separators = separators; this.header = header; } /** * Returns the list of separators that are going to be tried while parsing. * * @return a string with all the possible separators */ public String getSeparators() { return separators; } /** * Creates a new parser that uses the given separators. * <p> * Each character of the string is tried until the parsing is * successful. * * @param separators the new list of separators * @return a new parser */ public CsvParser withSeparators(String separators) { return new CsvParser(separators, header); } /** * Returns the way that the parser handles the header (the first line of * the csv file). * * @return the header configuration of the parser */ public Header getHeader() { return header; } /** * Creates a new parser with the given header handling. * * @param header the header configuration for the parser * @return a new parser */ public CsvParser withHeader(Header header) { return new CsvParser(separators, header); } /** * Parser the text provided by the reader with the format defined in this * parser. This method is thread-safe. * <p> * If the parsing fails, this method does not throw an exception but * will have information in the result. The idea is that, in the future, * the parser can provide multiple reasons as why the parsing failed or * event incomplete results. * * @param reader a reader * @return the parsed information */ public CsvParserResult parse(Reader reader) { // State used for parsing. Since each call has its own state, // the parsing is thread safe. State state = new State(); // Divide into lines. // Note that means we are going to keep in memory the whole file. // This is not very memory efficient. But since we have to do multiple // passes to find the right separator, we don't have much choice. // Also: the actual parsed result will need to stay in memory anyway. List<String> lines = csvLines(reader); // Try each seaparater separatorLoop: for(int nSeparator = 0; nSeparator < getSeparators().length(); nSeparator++) { state.currentSeparator = getSeparators().substring(nSeparator, nSeparator+1); // Taken from Mastering Regular Exceptions // Disabled comments so that space could work as possible separator String regex = // puts a doublequoted field in group(1) and an unquoted field into group(2) // Start with beginning of line or separator "\\G(?:^|" + state.currentSeparator + ")" + // Match a quoted string "(?:" + "\"" + "((?:[^\"]++|\"\")*+)" + "\"" + // Or match a string without the separator "|" + "([^\"" + state.currentSeparator + "]*)" + ")"; // Compile the matcher once for all the parsing state.mLineTokens = Pattern.compile(regex).matcher(""); // Try to parse the first line (the titles) // If only one columns is found, proceed to next separator state.columnNames = parseTitles(state, lines.get(0)); state.nColumns = state.columnNames.size(); if (state.nColumns == 1) { continue; } // Prepare the data structures to hold column data while parsing state.columnMismatch = false; state.columnNumberParsable = new ArrayList<>(state.nColumns); state.columnTimestampParsable = new ArrayList<>(state.nColumns); state.columnTokens = new ArrayList<>(); for (int i = 0; i < state.nColumns; i++) { state.columnNumberParsable.add(true); state.columnTimestampParsable.add(false); state.columnTokens.add(new ArrayList<String>()); } // Parse each line // If one line does not match the number of columns found in the first // line, pass to the next separator for (int i = 1; i < lines.size(); i++) { parseLine(state, lines.get(i)); if (state.columnMismatch) { if (i > state.bestNLines) { state.bestSeparator = state.currentSeparator; state.bestNLines = i; } continue separatorLoop; } } // The parsing succeeded! No need to try other separator break; } // We are out of the loop: did we end because we parsed correctly, // or because even the last separator was a mismatch? if (state.columnMismatch) { return new CsvParserResult(null, null, null, 0, false, "Parsing failed: number of columns not constant. Using separator '" + state.bestSeparator + "', line " + (state.bestNLines + 1)); } // Parsing was successful. // Should the first line be used as data? if (header == Header.NONE || (header == Header.AUTO && isFirstLineData(state, state.columnNames))) { for (int i = 0; i < state.nColumns; i++) { state.columnTokens.set(i, joinList(state.columnNames.get(i), state.columnTokens.get(i))); state.columnNames.set(i, alphabeticName(i)); } } // Now it's time to convert the tokens to the actual type. List<Object> columnValues = new ArrayList<>(state.nColumns); List<Class<?>> columnTypes = new ArrayList<>(state.nColumns); for (int i = 0; i < state.nColumns; i++) { if (state.columnNumberParsable.get(i)) { columnValues.add(convertToListDouble(state.columnTokens.get(i))); columnTypes.add(double.class); } else { columnValues.add(state.columnTokens.get(i)); columnTypes.add(String.class); } } // Prepare result, and remember to clear the state, so // we don't keep references to junk CsvParserResult result = new CsvParserResult(state.columnNames, columnValues, columnTypes, state.columnTokens.get(0).size(), true, null); return result; } /** * Given a list of tokens, convert them to a list of numbers. * * @param tokens the tokens to be converted * @return the number list */ private ListDouble convertToListDouble(List<String> tokens) { double[] values = new double[tokens.size()]; for (int i = 0; i < values.length; i++) { if (tokens.get(i).isEmpty()) { values[i] = Double.NaN; } else { values[i] = Double.parseDouble(tokens.get(i)); } } return new ArrayDouble(values); } /** * Divides the whole text into lines. * * @param reader the source of text * @return the lines */ static List<String> csvLines(Reader reader) { // This needs to handle quoted text that spans multiple lines, // so we divide the full text into chunks that correspond to // a single csv line try { BufferedReader br = new BufferedReader(reader); List<String> lines = new ArrayList<>(); // The current line read from the Reader String line; // The full csv line that may span multiple lines String longLine = null; while ((line = br.readLine()) != null) { // If we have a line from the previous iteration, // we concatenate it if (longLine == null) { longLine = line; } else { longLine = longLine.concat("\n").concat(line); } // Count the number of quotes: if it's even, the csv line // must end here. If not, it will continue to the next if (isEvenQuotes(longLine)) { lines.add(longLine); longLine = null; } } // If there is text leftover, the line was not closed propertly. // XXX: we need to figure out how to handle errors like this if (longLine != null) { lines.add(longLine); } return lines; } catch(IOException ex) { throw new RuntimeException("Couldn't process data", ex); } } /** * Determines whether the string contains an even number of double quote * characters. * * @param string the given string * @return true if contains even number of '"' */ static boolean isEvenQuotes(String string) { // In principle, we could use the regex given by: // Pattern pEvenQuotes = Pattern.compile("([^\"]*\\\"[^\"]*\\\")*[^\"]*"); // We assume just counting the instances of double quotes is more efficient // but we haven't really tested that assumption. boolean even = true; for (int i = 0; i < string.length(); i++) { if (string.charAt(i) == '\"') { even = !even; } } return even; } /** * Parses the first line to get the column names. * * @param line the text line * @return the column names */ private List<String> parseTitles(State state, String line) { // Match using the parser List<String> titles = new ArrayList<>(); state.mLineTokens.reset(line); while (state.mLineTokens.find()) { String value; if (state.mLineTokens.start(2) >= 0) { value = state.mLineTokens.group(2); } else { // If quoted, always use string value = state.mQuote.reset(state.mLineTokens.group(1)).replaceAll("\""); } titles.add(value); } return titles; } /** * Parses a line, saving the tokens, and determines the type match. * * @param line a new line */ private void parseLine(State state, String line) { // XXX The regex does not work if the first token is blank, and I // don't understand why. Workaround: if it's blank, add a space, // and remember I added a space. boolean firstEmpty = false; if (line.startsWith(state.currentSeparator)) { line = " " + line; firstEmpty = true; } // Match using the parser state.mLineTokens.reset(line); int nColumn = 0; while (state.mLineTokens.find()) { // Does this line have more columns than expected? if (nColumn == state.nColumns) { state.columnMismatch = true; return; } String token; if (state.mLineTokens.start(2) >= 0) { // The token was unquoted. Check if it could be a number. token = state.mLineTokens.group(2); if (firstEmpty) { token = ""; firstEmpty = false; } if (!isTokenNumberParsable(state, token)) { state.columnNumberParsable.set(nColumn, false); } } else { // If quoted, always use string token = state.mQuote.reset(state.mLineTokens.group(1)).replaceAll("\""); state.columnNumberParsable.set(nColumn, false); } state.columnTokens.get(nColumn).add(token); nColumn++; } // Does this line have fewer columns than expected? if (nColumn != state.nColumns) { state.columnMismatch = true; } } /** * Check whether the token can be parsed to a number. * * @param state the state of the parser * @param token the token * @return true if token matches a double */ private boolean isTokenNumberParsable(State state, String token) { if (token.isEmpty()) { return true; } return state.mDouble.reset(token).matches(); } /** * Checks whether the header can be safely interpreted as data. * This is used for the auto header detection. * * @param state the state of the parser * @param headerTokens the header * @return true if header should be handled as data */ private boolean isFirstLineData(State state, List<String> headerTokens) { // Check whether the type of the header match the type of the following data boolean headerCompatible = true; // Check whether if all types where strings boolean allStrings = true; for (int i = 0; i < state.nColumns; i++) { if (state.columnNumberParsable.get(i)) { allStrings = false; if (!isTokenNumberParsable(state, headerTokens.get(i))) { headerCompatible = false; } } } // If all columns are strings, it's impossible to tell whether we have // a header or not: assume we have a header. // If the column types matches (e.g. the header for a number column is also // a number) then we'll assume the header is actually data. return !allStrings && headerCompatible; } /** * Takes an elements and a list and returns a new list with both. * * @param head the first element * @param tail the rest of the elements * @return a list with all elements */ private List<String> joinList(final String head, final List<String> tail) { return new AbstractList<String>() { @Override public String get(int index) { if (index == 0) { return head; } else { return tail.get(index - 1); } } @Override public int size() { return tail.size()+1; } }; } static String alphabeticName(int i) { String name = ""; while (true) { int offset = i % 26; i = i / 26; char character = (char) ('A' + offset); name = name + character; if (i == 0) { return name; } } } /** * Parses a line of text representing comma separated values and returns * the values themselves. * * @param line the line to parse * @param separatorChar the regular expression for the separator * @return the list of values */ public static List<Object> parseCSVLine(String line, String separatorChar) { String regex = // puts a doublequoted field in group(1) and an unquoted field into group(2) "\\G(?:^|" + separatorChar + ")" + "(?:" + "\"" + "((?:[^\"]++|\"\")*+)" + "\"" + "|" + "([^\"" + separatorChar + "]*)" + ")"; Matcher mMain = Pattern.compile(regex).matcher(""); Matcher mQuote = Pattern.compile("\"\"").matcher(""); Matcher mDouble = Pattern.compile(DOUBLE_REGEX_WITH_NAN).matcher(""); List<Object> values = new ArrayList<>(); mMain.reset(line); while (mMain.find()) { Object value; if (mMain.start(2) >= 0) { String field = mMain.group(2); if (mDouble.reset(field).matches()) { value = Double.parseDouble(field); } else { value = field; } } else { // If quoted, always use string value = mQuote.reset(mMain.group(1)).replaceAll("\""); } values.add(value); } return values; } }