/*
* Joinery -- Data frames for Java
* Copyright (c) 2014, 2015 IBM Corp.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package joinery.impl;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.math.BigInteger;
import java.net.URL;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Calendar;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import joinery.DataFrame;
import joinery.DataFrame.NumberDefault;
import org.apache.poi.hssf.usermodel.HSSFDataFormat;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.CellStyle;
import org.apache.poi.ss.usermodel.DateUtil;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.supercsv.cellprocessor.ConvertNullTo;
import org.supercsv.cellprocessor.FmtDate;
import org.supercsv.cellprocessor.ift.CellProcessor;
import org.supercsv.io.CsvListReader;
import org.supercsv.io.CsvListWriter;
import org.supercsv.prefs.CsvPreference;
public class Serialization {
private static final String EMPTY_DF_STRING = "[empty data frame]";
private static final String ELLIPSES = "...";
private static final String NEWLINE = "\n";
private static final String DELIMITER = "\t";
private static final Object INDEX_KEY = new Object();
private static final int MAX_COLUMN_WIDTH = 20;
public static String toString(final DataFrame<?> df, final int limit) {
final int len = df.length();
if (len == 0) {
return EMPTY_DF_STRING;
}
final StringBuilder sb = new StringBuilder();
final Map<Object, Integer> width = new HashMap<>();
final List<Class<?>> types = df.types();
final List<Object> columns = new ArrayList<>(df.columns());
// determine index width
width.put(INDEX_KEY, 0);
for (final Object row : df.index()) {
Class<? extends Object> rowClass = row == null ? null : row.getClass();
width.put(INDEX_KEY, clamp(
width.get(INDEX_KEY),
MAX_COLUMN_WIDTH,
fmt(rowClass, row).length()));
}
// determine column widths
for (int c = 0; c < columns.size(); c++) {
final Object column = columns.get(c);
width.put(column, String.valueOf(column).length());
for (int r = 0; r < df.length(); r++) {
width.put(column, clamp(
width.get(column),
MAX_COLUMN_WIDTH,
fmt(types.get(c), df.get(r, c)).length()));
}
}
// output column names
sb.append(lpad("", width.get(INDEX_KEY)));
for (int c = 0; c < columns.size(); c++) {
sb.append(DELIMITER);
final Object column = columns.get(c);
sb.append(lpad(column, width.get(column)));
}
sb.append(NEWLINE);
// output rows
final Iterator<Object> names = df.index().iterator();
for (int r = 0; r < len; r++) {
// output row name
int w = width.get(INDEX_KEY);
final Object row = names.hasNext() ? names.next() : r;
Class<? extends Object> rowClass = row == null ? null : row.getClass();
sb.append(truncate(lpad(fmt(rowClass, row), w), w));
// output rows
for (int c = 0; c < df.size(); c++) {
sb.append(DELIMITER);
final Class<?> cls = types.get(c);
w = width.get(columns.get(c));
if (Number.class.isAssignableFrom(cls)) {
sb.append(lpad(fmt(cls, df.get(r, c)), w));
} else {
sb.append(truncate(rpad(fmt(cls, df.get(r, c)), w), w));
}
}
sb.append(NEWLINE);
// skip rows if necessary to limit output
if (limit - 3 < r && r < (limit << 1) && r < len - 4) {
sb.append(NEWLINE).append(ELLIPSES)
.append(" ").append(len - limit)
.append(" rows skipped ").append(ELLIPSES)
.append(NEWLINE).append(NEWLINE);
while (r < len - 2) {
if (names.hasNext()) {
names.next();
}
r++;
}
}
}
return sb.toString();
}
private static final int clamp(final int lower, final int upper, final int value) {
return Math.max(lower, Math.min(upper, value));
}
private static final String lpad(final Object o, final int w) {
final StringBuilder sb = new StringBuilder();
final String value = String.valueOf(o);
for (int i = value.length(); i < w; i++) {
sb.append(' ');
}
sb.append(value);
return sb.toString();
}
private static final String rpad(final Object o, final int w) {
final StringBuilder sb = new StringBuilder();
final String value = String.valueOf(o);
sb.append(value);
for (int i = value.length(); i < w; i++) {
sb.append(' ');
}
return sb.toString();
}
private static final String truncate(final Object o, final int w) {
final String value = String.valueOf(o);
return value.length() - ELLIPSES.length() > w ? value.substring(0, w - ELLIPSES.length()) + ELLIPSES : value;
}
private static final String fmt(final Class<?> cls, final Object o) {
if(cls==null) return "null";
String s;
if (o instanceof Number) {
if (Short.class.equals(cls) || Integer.class.equals(cls) ||
Long.class.equals(cls) || BigInteger.class.equals(cls)) {
s = String.format("% d", Number.class.cast(o).longValue());
} else {
s = String.format("% .8f", Number.class.cast(o).doubleValue());
}
} else if (o instanceof Date) {
final Date dt = Date.class.cast(o);
final Calendar cal = Calendar.getInstance();
cal.setTime(dt);
final DateFormat fmt = new SimpleDateFormat(
cal.get(Calendar.HOUR_OF_DAY) == 0 &&
cal.get(Calendar.MINUTE) == 0 &&
cal.get(Calendar.SECOND) == 0 ?
"yyyy-MM-dd" : "yyyy-MM-dd'T'HH:mm:ssXXX"
);
s = fmt.format(dt);
} else {
s = o != null ? String.valueOf(o) : "";
}
return s;
}
public static DataFrame<Object> readCsv(final String file)
throws IOException {
return readCsv(file.contains("://") ?
new URL(file).openStream() : new FileInputStream(file), ",", NumberDefault.LONG_DEFAULT, null);
}
public static DataFrame<Object> readCsv(final String file, final String separator, NumberDefault numDefault)
throws IOException {
return readCsv(file.contains("://") ?
new URL(file).openStream() : new FileInputStream(file), separator, numDefault, null);
}
public static DataFrame<Object> readCsv(final String file, final String separator, NumberDefault numDefault, final String naString)
throws IOException {
return readCsv(file.contains("://") ?
new URL(file).openStream() : new FileInputStream(file), separator, numDefault, naString);
}
public static DataFrame<Object> readCsv(final String file, final String separator, NumberDefault numDefault, final String naString, boolean hasHeader)
throws IOException {
return readCsv(file.contains("://") ?
new URL(file).openStream() : new FileInputStream(file), separator, numDefault, naString, hasHeader);
}
public static DataFrame<Object> readCsv(final InputStream input)
throws IOException {
return readCsv(input, ",", NumberDefault.LONG_DEFAULT, null);
}
public static DataFrame<Object> readCsv(final InputStream input, String separator, NumberDefault numDefault, String naString)
throws IOException {
return readCsv(input,separator, numDefault,naString, true);
}
public static DataFrame<Object> readCsv(final InputStream input, String separator, NumberDefault numDefault, String naString, boolean hasHeader)
throws IOException {
CsvPreference csvPreference;
switch (separator) {
case "\\t":
csvPreference = CsvPreference.TAB_PREFERENCE;
break;
case ",":
csvPreference = CsvPreference.STANDARD_PREFERENCE;
break;
case ";":
csvPreference = CsvPreference.EXCEL_NORTH_EUROPE_PREFERENCE;
break;
default:
throw new IllegalArgumentException("Separator: " + separator + " is not currently supported");
}
try (CsvListReader reader = new CsvListReader(new InputStreamReader(input), csvPreference)) {
final List<String> header;
final DataFrame<Object> df;
final CellProcessor[] procs;
if(hasHeader) {
header = Arrays.asList(reader.getHeader(true));
procs = new CellProcessor[header.size()];
df = new DataFrame<>(header);
} else {
// Read the first row to figure out how many columns we have
reader.read();
header = new ArrayList<String>();
for (int i = 0; i < reader.length(); i++) {
header.add("V"+i);
}
procs = new CellProcessor[header.size()];
df = new DataFrame<>(header);
// The following line executes the procs on the previously read row again
df.append(reader.executeProcessors(procs));
}
for (List<Object> row = reader.read(procs); row != null; row = reader.read(procs)) {
df.append(new ArrayList<>(row));
}
return df.convert(numDefault, naString);
}
}
public static <V> void writeCsv(final DataFrame<V> df, final String output)
throws IOException {
writeCsv(df, new FileOutputStream(output));
}
public static <V> void writeCsv(final DataFrame<V> df, final OutputStream output)
throws IOException {
try (CsvListWriter writer = new CsvListWriter(new OutputStreamWriter(output), CsvPreference.STANDARD_PREFERENCE)) {
final String[] header = new String[df.size()];
final Iterator<Object> it = df.columns().iterator();
for (int c = 0; c < df.size(); c++) {
header[c] = String.valueOf(it.hasNext() ? it.next() : c);
}
writer.writeHeader(header);
final CellProcessor[] procs = new CellProcessor[df.size()];
final List<Class<?>> types = df.types();
for (int c = 0; c < df.size(); c++) {
final Class<?> cls = types.get(c);
if (Date.class.isAssignableFrom(cls)) {
procs[c] = new ConvertNullTo("", new FmtDate("yyyy-MM-dd'T'HH:mm:ssXXX"));
} else {
procs[c] = new ConvertNullTo("");
}
}
for (final List<V> row : df) {
writer.write(row, procs);
}
}
}
public static DataFrame<Object> readXls(final String file)
throws IOException {
return readXls(file.contains("://") ?
new URL(file).openStream() : new FileInputStream(file));
}
public static DataFrame<Object> readXls(final InputStream input)
throws IOException {
final Workbook wb = new HSSFWorkbook(input);
final Sheet sheet = wb.getSheetAt(0);
final List<Object> columns = new ArrayList<>();
final List<List<Object>> data = new ArrayList<>();
for (final Row row : sheet) {
if (row.getRowNum() == 0) {
// read header
for (final Cell cell : row) {
columns.add(readCell(cell));
}
} else {
// read data values
final List<Object> values = new ArrayList<>();
for (final Cell cell : row) {
values.add(readCell(cell));
}
data.add(values);
}
}
// create data frame
final DataFrame<Object> df = new DataFrame<>(columns);
for (final List<Object> row : data) {
df.append(row);
}
return df.convert();
}
public static <V> void writeXls(final DataFrame<V> df, final String output)
throws IOException {
writeXls(df, new FileOutputStream(output));
}
public static <V> void writeXls(final DataFrame<V> df, final OutputStream output)
throws IOException {
final Workbook wb = new HSSFWorkbook();
final Sheet sheet = wb.createSheet();
// add header
Row row = sheet.createRow(0);
final Iterator<Object> it = df.columns().iterator();
for (int c = 0; c < df.size(); c++) {
final Cell cell = row.createCell(c);
writeCell(cell, it.hasNext() ? it.next() : c);
}
// add data values
for (int r = 0; r < df.length(); r++) {
row = sheet.createRow(r + 1);
for (int c = 0; c < df.size(); c++) {
final Cell cell = row.createCell(c);
writeCell(cell, df.get(r, c));
}
}
// write to stream
wb.write(output);
output.close();
}
private static final Object readCell(final Cell cell) {
switch (cell.getCellType()) {
case Cell.CELL_TYPE_NUMERIC:
if (DateUtil.isCellDateFormatted(cell)) {
return DateUtil.getJavaDate(cell.getNumericCellValue());
}
return cell.getNumericCellValue();
case Cell.CELL_TYPE_BOOLEAN:
return cell.getBooleanCellValue();
default:
return cell.getStringCellValue();
}
}
private static final void writeCell(final Cell cell, final Object value) {
if (value instanceof Number) {
cell.setCellType(Cell.CELL_TYPE_NUMERIC);
cell.setCellValue(Number.class.cast(value).doubleValue());
} else if (value instanceof Date) {
final CellStyle style = cell.getSheet().getWorkbook().createCellStyle();
style.setDataFormat(HSSFDataFormat.getBuiltinFormat("m/d/yy h:mm"));
cell.setCellStyle(style);
cell.setCellType(Cell.CELL_TYPE_NUMERIC);
cell.setCellValue(Date.class.cast(value));
} else if (value instanceof Boolean) {
cell.setCellType(Cell.CELL_TYPE_BOOLEAN);
} else {
cell.setCellType(Cell.CELL_TYPE_STRING);
cell.setCellValue(value != null ? String.valueOf(value) : "");
}
}
}