/**
* Copyright (C) 2015 - present by OpenGamma Inc. and the OpenGamma group of companies
*
* Please see distribution for license.
*/
package com.opengamma.strata.collect.io;
import static com.opengamma.strata.collect.Guavate.toImmutableList;
import java.io.Reader;
import java.io.UncheckedIOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.io.CharSource;
import com.google.common.io.CharStreams;
import com.opengamma.strata.collect.ArgChecker;
import com.opengamma.strata.collect.Unchecked;
/**
* A CSV file.
* <p>
* Represents a CSV file together with the ability to parse it from a {@link CharSource}.
* The separator may be specified, allowing TSV files (tab-separated) and other similar formats to be parsed.
* <p>
* This class loads the entire CSV file into memory.
* To process the CSV file row-by-row, use {@link CsvIterator}.
* <p>
* The CSV file format is a general-purpose comma-separated value format.
* The format is parsed line-by-line, with lines separated by CR, LF or CRLF.
* Each line can contain one or more fields.
* Each field is separated by a comma character ({@literal ,}) or tab.
* Any field may be quoted using a double quote at the start and end.
* The content of a quoted field may include commas and additional double quotes.
* Two adjacent double quotes in a quoted field will be replaced by a single double quote.
* Quoted fields are not trimmed. Non-quoted fields are trimmed.
* <p>
* The first line may be treated as a header row.
* The header row is accessed separately from the data rows.
* <p>
* Blank lines are ignored.
* Lines may be commented out by starting them with a hash '#' or a semicolon ';'.
*/
public final class CsvFile {
/**
* The header row, ordered as the headers appear in the file.
*/
private final ImmutableList<String> headers;
/**
* The header map, transformed for case-insensitive searching.
*/
private final ImmutableMap<String, Integer> searchHeaders;
/**
* The data rows in the CSV file.
*/
private final ImmutableList<CsvRow> rows;
//------------------------------------------------------------------------
/**
* Parses the specified source as a CSV file, using a comma as the separator.
* <p>
* This is a convenience overload that delegates to
* {@link #of(CharSource, boolean, char)} with {@code ','} as the separator.
*
* @param source the CSV file resource
* @param headerRow whether the source has a header row, an empty source must still contain the header
* @return the CSV file
* @throws UncheckedIOException if an IO exception occurs
* @throws IllegalArgumentException if the file cannot be parsed
*/
public static CsvFile of(CharSource source, boolean headerRow) {
return of(source, headerRow, ',');
}
/**
 * Parses the specified source as a CSV file using a caller-supplied separator.
 * <p>
 * Use this overload when the fields are not separated by commas.
 * A tab-separated file, for example, differs from a CSV file only in its separator.
 * All lines are read from the source before any parsing takes place.
 *
 * @param source  the file resource
 * @param headerRow  whether the source has a header row, an empty source must still contain the header
 * @param separator  the separator used to separate each field, typically a comma, but a tab is sometimes used
 * @return the CSV file
 * @throws UncheckedIOException if an IO exception occurs
 * @throws IllegalArgumentException if the file cannot be parsed
 */
public static CsvFile of(CharSource source, boolean headerRow, char separator) {
  ArgChecker.notNull(source, "source");
  // checked IO failures are converted to unchecked ones by the wrapper
  List<String> allLines = Unchecked.wrap(() -> source.readLines());
  return create(allLines, headerRow, separator);
}
/**
* Parses the specified reader as a CSV file, using a comma as the separator.
* <p>
* This factory method takes a {@link Reader}.
* Callers are encouraged to use {@link CharSource} instead of {@code Reader}
* as it allows the resource to be safely managed.
* <p>
* This is a convenience overload that delegates to
* {@link #of(Reader, boolean, char)} with {@code ','} as the separator.
*
* @param reader the file resource
* @param headerRow whether the source has a header row, an empty source must still contain the header
* @return the CSV file
* @throws UncheckedIOException if an IO exception occurs
* @throws IllegalArgumentException if the file cannot be parsed
*/
public static CsvFile of(Reader reader, boolean headerRow) {
return of(reader, headerRow, ',');
}
/**
 * Parses the specified reader as a CSV file where the separator is specified and might not be a comma.
 * <p>
 * This factory method takes a {@link Reader}.
 * Callers are encouraged to use {@link CharSource} instead of {@code Reader}
 * as it allows the resource to be safely managed.
 * This method reads all lines from the reader but contains no call that closes it,
 * so the caller remains responsible for closing the reader.
 * <p>
 * This factory method allows the separator to be controlled.
 * For example, a tab-separated file is very similar to a CSV file, the only difference is the separator.
 *
 * @param reader  the file resource
 * @param headerRow  whether the source has a header row, an empty source must still contain the header
 * @param separator  the separator used to separate each field, typically a comma, but a tab is sometimes used
 * @return the CSV file
 * @throws UncheckedIOException if an IO exception occurs
 * @throws IllegalArgumentException if the file cannot be parsed
 */
public static CsvFile of(Reader reader, boolean headerRow, char separator) {
  // report the actual parameter name so a null-argument failure points at the right argument
  ArgChecker.notNull(reader, "reader");
  List<String> lines = Unchecked.wrap(() -> CharStreams.readLines(reader));
  return create(lines, headerRow, separator);
}
// builds the file from raw lines, splitting off the first parsed line as the header when requested
private static CsvFile create(List<String> lines, boolean headerRow, char separator) {
  ArrayList<ImmutableList<String>> parsedLines = parseAll(lines, separator);
  if (headerRow) {
    // a header was requested, so at least one non-blank, non-comment line must exist
    if (parsedLines.isEmpty()) {
      throw new IllegalArgumentException("Could not read header row from empty CSV file");
    }
    // removing the first entry leaves only the data rows behind
    ImmutableList<String> headerFields = parsedLines.remove(0);
    ImmutableMap<String, Integer> headerLookup = buildSearchHeaders(headerFields);
    return new CsvFile(headerFields, headerLookup, ImmutableList.copyOf(parsedLines));
  }
  // no header row: every parsed line is data
  return new CsvFile(ImmutableList.of(), ImmutableMap.of(), ImmutableList.copyOf(parsedLines));
}
//------------------------------------------------------------------------
/**
 * Obtains an instance from a list of headers and rows.
 * <p>
 * The headers may be an empty list.
 * Every row must have the same number of columns.
 * When headers are present that number is the header count,
 * otherwise it is the size of the first row.
 * The headers and rows are copied into immutable lists.
 *
 * @param headers  the headers, empty if no headers
 * @param rows  the data rows
 * @return the CSV file
 * @throws IllegalArgumentException if the rows do not match the headers
 */
public static CsvFile of(List<String> headers, List<? extends List<String>> rows) {
  ArgChecker.notNull(headers, "headers");
  ArgChecker.notNull(rows, "rows");
  // without headers, the first row defines the expected column count
  int expectedColumns = headers.size();
  if (headers.isEmpty() && !rows.isEmpty()) {
    expectedColumns = rows.get(0).size();
  }
  // validate every row before copying anything
  for (List<String> row : rows) {
    if (row.size() != expectedColumns) {
      throw new IllegalArgumentException("Invalid data rows, each row must have same columns as header row");
    }
  }
  ImmutableList<String> headerCopy = ImmutableList.copyOf(headers);
  ImmutableList.Builder<ImmutableList<String>> rowCopies = ImmutableList.builder();
  for (List<String> row : rows) {
    rowCopies.add(ImmutableList.copyOf(row));
  }
  return new CsvFile(headerCopy, buildSearchHeaders(headerCopy), rowCopies.build());
}
//------------------------------------------------------------------------
// parses every line, keeping only those that produced at least one field
// (blank lines, comment lines and separator-only lines parse to an empty list)
private static ArrayList<ImmutableList<String>> parseAll(List<String> lines, char separator) {
  ArrayList<ImmutableList<String>> result = new ArrayList<>();
  for (String rawLine : lines) {
    ImmutableList<String> fields = parseLine(rawLine, separator);
    if (fields.isEmpty()) {
      continue;
    }
    result.add(fields);
  }
  return result;
}
// parses one line into its fields
// returns an empty list for an empty line, a line starting with '#' or ';',
// and a line whose fields are all blank
static ImmutableList<String> parseLine(String line, char separator) {
  boolean blankOrComment = line.isEmpty() || line.startsWith("#") || line.startsWith(";");
  if (blankOrComment) {
    return ImmutableList.of();
  }
  // appending a separator means every field, including the last, ends at a separator
  String text = line + separator;
  ImmutableList.Builder<String> collected = ImmutableList.builder();
  int fieldStart = 0;
  int fieldEnd = text.indexOf(separator, fieldStart);
  while (fieldEnd >= 0) {
    String field = text.substring(fieldStart, fieldEnd).trim();
    if (field.startsWith("\"")) {
      // a quoted field is complete once it ends with a quote that is not part of an escaped pair;
      // until then the separator just found was inside the quotes, so extend to the next one
      while (!field.substring(1).replace("\"\"", "").endsWith("\"")) {
        fieldEnd = text.indexOf(separator, fieldEnd + 1);
        if (fieldEnd < 0) {
          throw new IllegalArgumentException("Mismatched quotes on line: " + line);
        }
        field = text.substring(fieldStart, fieldEnd).trim();
      }
      // drop the surrounding quotes and collapse each escaped pair into a single quote
      field = field.substring(1, field.length() - 1).replace("\"\"", "\"");
    }
    collected.add(field);
    fieldStart = fieldEnd + 1;
    fieldEnd = text.indexOf(separator, fieldStart);
  }
  ImmutableList<String> parsed = collected.build();
  if (hasContent(parsed)) {
    return parsed;
  }
  return ImmutableList.of();
}
// reports whether at least one field is non-blank after trimming
// this lets lines made up only of separators be treated as having no content
private static boolean hasContent(ImmutableList<String> fields) {
  return fields.stream().anyMatch(field -> !field.trim().isEmpty());
}
// builds the lookup from lower-cased header name to column index
static ImmutableMap<String, Integer> buildSearchHeaders(ImmutableList<String> headers) {
  // duplicate headers are permitted; only the first occurrence of each name is recorded
  Map<String, Integer> firstIndexByName = new HashMap<>();
  int columnIndex = 0;
  for (String header : headers) {
    String key = header.toLowerCase(Locale.ENGLISH);
    if (!firstIndexByName.containsKey(key)) {
      firstIndexByName.put(key, columnIndex);
    }
    columnIndex++;
  }
  return ImmutableMap.copyOf(firstIndexByName);
}
//------------------------------------------------------------------------
/**
* Restricted constructor.
*
* @param headers the header row
* @param rows the data rows
*/
private CsvFile(
ImmutableList<String> headers,
ImmutableMap<String, Integer> searchHeaders,
ImmutableList<ImmutableList<String>> rows) {
this.headers = headers;
this.searchHeaders = searchHeaders;
this.rows = rows.stream()
.map(cols -> new CsvRow(headers, this.searchHeaders, cols))
.collect(toImmutableList());
}
//------------------------------------------------------------------------
/**
* Gets the header row.
* <p>
* If there is no header row, an empty list is returned.
* The returned list is immutable.
*
* @return the header row, ordered as the headers appear in the file
*/
public ImmutableList<String> headers() {
return headers;
}
/**
* Gets all data rows in the file.
* <p>
* The header row, if any, is not included; it is available from {@link #headers()}.
* The returned list is immutable.
*
* @return the data rows
*/
public ImmutableList<CsvRow> rows() {
return rows;
}
/**
* Gets the number of data rows.
* <p>
* This is the size of the list returned by {@link #rows()}.
*
* @return the number of data rows
*/
public int rowCount() {
return rows.size();
}
/**
* Gets a single row.
* <p>
* The index refers to the list returned by {@link #rows()}.
*
* @param index the row index, zero-based
* @return the row
* @throws IndexOutOfBoundsException if the index is out of range
*/
public CsvRow row(int index) {
return rows.get(index);
}
//-------------------------------------------------------------------------
/**
 * Checks if this CSV file equals another.
 * <p>
 * Two files are equal when both their headers and their data rows are equal.
 *
 * @param obj  the other file, null returns false
 * @return true if equal
 */
@Override
public boolean equals(Object obj) {
  if (this == obj) {
    return true;
  }
  // instanceof is false for null, so this also rejects a null argument
  if (!(obj instanceof CsvFile)) {
    return false;
  }
  CsvFile that = (CsvFile) obj;
  return headers.equals(that.headers) && rows.equals(that.rows);
}
/**
 * Returns a suitable hash code for the CSV file.
 * <p>
 * The hash code combines the hash codes of the headers and the data rows,
 * the same two properties compared by {@code equals}.
 *
 * @return the hash code
 */
@Override
public int hashCode() {
  int headerHash = headers.hashCode();
  int rowHash = rows.hashCode();
  return headerHash ^ rowHash;
}
/**
 * Returns a string describing the CSV file.
 * <p>
 * The description consists of the text {@code CsvFile} followed by the headers;
 * the data rows are not included.
 *
 * @return the descriptive string
 */
@Override
public String toString() {
  String headerText = headers.toString();
  return "CsvFile".concat(headerText);
}
}