/**
* Copyright (C) 2001-2017 by RapidMiner and the contributors
*
* Complete list of developers available at our web site:
*
* http://rapidminer.com
*
* This program is free software: you can redistribute it and/or modify it under the terms of the
* GNU Affero General Public License as published by the Free Software Foundation, either version 3
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
* even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License along with this program.
* If not, see http://www.gnu.org/licenses/.
*/
package com.rapidminer.operator.io;
import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.charset.Charset;
import java.util.Date;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.CellStyle;
import org.apache.poi.ss.usermodel.CreationHelper;
import org.apache.poi.ss.usermodel.Font;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.util.WorkbookUtil;
import org.apache.poi.xssf.streaming.SXSSFWorkbook;
import com.rapidminer.example.Attribute;
import com.rapidminer.example.Example;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.operator.Operator;
import com.rapidminer.operator.OperatorDescription;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.ProcessStoppedException;
import com.rapidminer.operator.UserError;
import com.rapidminer.parameter.ParameterType;
import com.rapidminer.parameter.ParameterTypeCategory;
import com.rapidminer.parameter.ParameterTypeDateFormat;
import com.rapidminer.parameter.ParameterTypeString;
import com.rapidminer.parameter.conditions.EqualTypeCondition;
import com.rapidminer.tools.DateParser;
import com.rapidminer.tools.I18N;
import com.rapidminer.tools.Ontology;
import com.rapidminer.tools.io.Encoding;
import jxl.Workbook;
import jxl.WorkbookSettings;
import jxl.write.DateFormat;
import jxl.write.DateTime;
import jxl.write.Label;
import jxl.write.Number;
import jxl.write.NumberFormat;
import jxl.write.WritableCellFormat;
import jxl.write.WritableFont;
import jxl.write.WritableSheet;
import jxl.write.WritableWorkbook;
import jxl.write.WriteException;
/**
* <p>
* This operator can be used to write data into Microsoft Excel spreadsheets. This operator creates
* Excel files readable by Excel 95, 97, 2000, XP, 2003 and newer. Missing data values are indicated
* by empty cells.
* </p>
*
* @author Ingo Mierswa, Nils Woehler
*/
public class ExcelExampleSetWriter extends AbstractStreamWriter {
private static final String RAPID_MINER_DATA = "RapidMiner Data";
/** The parameter name for "The Excel spreadsheet file which should be written." */
public static final String PARAMETER_EXCEL_FILE = "excel_file";
public static final String FILE_FORMAT_XLS = "xls";
public static final String FILE_FORMAT_XLSX = "xlsx";
public static final String[] FILE_FORMAT_CATEGORIES = new String[] { FILE_FORMAT_XLS, FILE_FORMAT_XLSX };
public static final int FILE_FORMAT_XLS_INDEX = 0;
public static final int FILE_FORMAT_XLSX_INDEX = 1;
public static final String PARAMETER_FILE_FORMAT = "file_format";
public static final String PARAMETER_DATE_FORMAT = "date_format";
public static final String PARAMETER_NUMBER_FORMAT = "number_format";
public static final String PARAMETER_SHEET_NAME = "sheet_name";
public static final String DEFAULT_DATE_FORMAT = "yyyy-MM-dd HH:mm:ss";
public static final String DEFAULT_NUMBER_FORMAT = "#.0";
/**
* the limit of an excel cell, see the <a href=
* "https://support.office.com/en-gb/article/Excel-specifications-and-limits-16c69c74-3d6a-4aaf-ba35-e6eb276e8eaa">
* Microsoft limit documentation</a> for more information.
*/
private static final int CHARACTER_CELL_LIMIT = 32_767;
private static final String CROP_INDICATOR = "[...]";
public ExcelExampleSetWriter(OperatorDescription description) {
super(description);
}
/**
* Writes the example set into a excel file with XLS format. If you want to write it in XLSX
* format use {@link #writeXLSX(ExampleSet, String, String, String, OutputStream)}
*
* @param exampleSet
* the exampleSet to write.
* @param encoding
* the Charset to use for the file.
* @param out
* the stream to use.
*
* @deprecated please use
* {@link ExcelExampleSetWriter#write(ExampleSet, Charset, OutputStream, Operator)}
* to support checkForStop.
*/
@Deprecated
public static void write(ExampleSet exampleSet, Charset encoding, OutputStream out) throws IOException, WriteException {
try {
write(exampleSet, encoding, out, null);
} catch (ProcessStoppedException e) {
// can not happen because we do not deliver an Operator
}
}
/**
* Writes the example set into a excel file with XLS format. If you want to write it in XLSX
* format use {@link #writeXLSX(ExampleSet, String, String, String, OutputStream)}
*
* @param exampleSet
* the exampleSet to write.
* @param encoding
* the Charset to use for the file.
* @param out
* the stream to use.
* @param op
* will be used to provide checkForStop.
*/
public static void write(ExampleSet exampleSet, Charset encoding, OutputStream out, Operator op)
throws IOException, WriteException, ProcessStoppedException {
try {
// .xls files can only store up to 256 columns, so throw error in case of more
if (exampleSet.getAttributes().allSize() > 256) {
throw new IllegalArgumentException(
I18N.getMessage(I18N.getErrorBundle(), "export.excel.excel_xls_file_exceeds_column_limit"));
}
WorkbookSettings ws = new WorkbookSettings();
ws.setEncoding(encoding.name());
ws.setLocale(Locale.US);
WritableWorkbook workbook = Workbook.createWorkbook(out, ws);
WritableSheet s = workbook.createSheet(RAPID_MINER_DATA, 0);
writeDataSheet(s, exampleSet, op);
workbook.write();
workbook.close();
} finally {
try {
out.close();
} catch (Exception e) {
// silent. exception will trigger warning anyway
}
}
}
/**
* Writes the provided {@link ExampleSet} to the {@link WritableSheet}.
*
* @param s
* the DataSheet to be filled
* @param exampleSet
* the data to write
* @param op
* an {@link Operator} of the executing operator to checkForStop
* @throws WriteException
* @throws ProcessStoppedException
*/
private static void writeDataSheet(WritableSheet s, ExampleSet exampleSet, Operator op)
throws WriteException, ProcessStoppedException {
// Format the Font
WritableFont wf = new WritableFont(WritableFont.ARIAL, 10, WritableFont.BOLD);
WritableCellFormat cf = new WritableCellFormat(wf);
Iterator<Attribute> a = exampleSet.getAttributes().allAttributes();
int counter = 0;
while (a.hasNext()) {
Attribute attribute = a.next();
s.addCell(new Label(counter++, 0, attribute.getName(), cf));
}
NumberFormat nf = new NumberFormat(DEFAULT_NUMBER_FORMAT);
WritableCellFormat nfCell = new WritableCellFormat(nf);
WritableFont wf2 = new WritableFont(WritableFont.ARIAL, 10, WritableFont.NO_BOLD);
WritableCellFormat cf2 = new WritableCellFormat(wf2);
DateFormat df = new DateFormat(DateParser.DEFAULT_DATE_TIME_FORMAT);
WritableCellFormat dfCell = new WritableCellFormat(df);
int rowCounter = 1;
for (Example example : exampleSet) {
a = exampleSet.getAttributes().allAttributes();
int columnCounter = 0;
while (a.hasNext()) {
Attribute attribute = a.next();
if (!Double.isNaN(example.getValue(attribute))) {
if (Ontology.ATTRIBUTE_VALUE_TYPE.isA(attribute.getValueType(), Ontology.NOMINAL)) {
s.addCell(new Label(columnCounter, rowCounter,
stripIfNecessary(replaceForbiddenChars(example.getValueAsString(attribute))), cf2));
} else if (Ontology.ATTRIBUTE_VALUE_TYPE.isA(attribute.getValueType(), Ontology.DATE_TIME)) {
DateTime dateTime = new DateTime(columnCounter, rowCounter,
new Date((long) example.getValue(attribute)), dfCell);
s.addCell(dateTime);
} else if (Ontology.ATTRIBUTE_VALUE_TYPE.isA(attribute.getValueType(), Ontology.NUMERICAL)) {
Number number = new Number(columnCounter, rowCounter, example.getValue(attribute), nfCell);
s.addCell(number);
} else {
// default: write as a String
s.addCell(new Label(columnCounter, rowCounter,
stripIfNecessary(replaceForbiddenChars(example.getValueAsString(attribute))), cf2));
}
}
columnCounter++;
}
rowCounter++;
// checkForStop every 100 examples
if (op != null && rowCounter % 100 == 0) {
op.checkForStop();
}
}
}
private static String replaceForbiddenChars(String originalValue) {
return originalValue.replace((char) 0, ' ');
}
@Override
public List<ParameterType> getParameterTypes() {
List<ParameterType> types = super.getParameterTypes();
types.add(makeFileParameterType());
types.add(new ParameterTypeCategory(PARAMETER_FILE_FORMAT,
"Defines the file format the excel file should be saved as.", FILE_FORMAT_CATEGORIES, FILE_FORMAT_XLSX_INDEX,
false));
List<ParameterType> encodingTypes = Encoding.getParameterTypes(this);
for (ParameterType type : encodingTypes) {
type.registerDependencyCondition(new EqualTypeCondition(this, PARAMETER_FILE_FORMAT, FILE_FORMAT_CATEGORIES,
false, new int[] { FILE_FORMAT_XLS_INDEX }));
}
types.addAll(encodingTypes);
List<ParameterType> xlsxTypes = new LinkedList<ParameterType>();
ParameterTypeString sheetName = new ParameterTypeString(PARAMETER_SHEET_NAME,
"The name of the created sheet. Note that sheet name is Excel must not exceed 31 characters.",
RAPID_MINER_DATA);
sheetName.setExpert(false);
xlsxTypes.add(sheetName);
xlsxTypes.add(new ParameterTypeDateFormat(PARAMETER_DATE_FORMAT,
"The parse format of the date values. Default: for example \"yyyy-MM-dd HH:mm:ss\".", DEFAULT_DATE_FORMAT,
true));
xlsxTypes.add(new ParameterTypeString(PARAMETER_NUMBER_FORMAT,
"Specifies the number format for date entries. Default: \"#.0\"", DEFAULT_NUMBER_FORMAT, true));
for (ParameterType type : xlsxTypes) {
type.registerDependencyCondition(new EqualTypeCondition(this, PARAMETER_FILE_FORMAT, FILE_FORMAT_CATEGORIES,
false, new int[] { FILE_FORMAT_XLSX_INDEX }));
}
types.addAll(xlsxTypes);
return types;
}
@Override
protected String[] getFileExtensions() {
return new String[] { FILE_FORMAT_XLSX, FILE_FORMAT_XLS };
}
@Override
protected String getFileParameterName() {
return PARAMETER_EXCEL_FILE;
}
@Override
protected void writeStream(ExampleSet exampleSet, OutputStream outputStream) throws OperatorException {
File file = getParameterAsFile(PARAMETER_EXCEL_FILE, true);
if (getParameterAsString(PARAMETER_FILE_FORMAT).equals(FILE_FORMAT_XLSX)) {
String dateFormat = isParameterSet(PARAMETER_DATE_FORMAT) ? getParameterAsString(PARAMETER_DATE_FORMAT) : null;
String numberFormat = isParameterSet(PARAMETER_NUMBER_FORMAT) ? getParameterAsString(PARAMETER_NUMBER_FORMAT)
: null;
String sheetName = getParameterAsString(PARAMETER_SHEET_NAME);
if (sheetName.length() > 31) {
throw new UserError(this, "excel_sheet_name_too_long", sheetName, sheetName.length());
}
try {
writeXLSX(exampleSet, sheetName, dateFormat, numberFormat, outputStream, this);
} catch (WriteException | IOException e) {
throw new UserError(this, 303, file.getName(), e.getMessage());
}
} else {
WorkbookSettings ws = new WorkbookSettings();
Charset encoding = Encoding.getEncoding(this);
ws.setEncoding(encoding.name());
ws.setLocale(Locale.US);
try {
write(exampleSet, encoding, outputStream, this);
} catch (WriteException | IOException e) {
throw new UserError(this, 303, file.getName(), e.getMessage());
}
}
}
/**
* Writes the example set into a excel file with XLSX format. If you want to write it in XLS
* format use {@link #write(ExampleSet, Charset, OutputStream)}.
*
* @param exampleSet
* the exampleSet to write
* @param sheetName
* name of the excel sheet which will be created.
* @param dateFormat
* a string which describes the format used for dates.
* @param numberFormat
* a string which describes the format used for numbers.
* @param outputStream
* the stream to write the file to
*
* @deprecated please use
* {@link ExcelExampleSetWriter#writeXLSX(ExampleSet, String, String, String, OutputStream, Operator)}
* to support checkForStop.
*/
@Deprecated
public static void writeXLSX(ExampleSet exampleSet, String sheetName, String dateFormat, String numberFormat,
OutputStream outputStream) throws WriteException, IOException {
try {
writeXLSX(exampleSet, sheetName, dateFormat, numberFormat, outputStream, null);
} catch (ProcessStoppedException e) {
// can not happen because we provide no Operator
}
}
/**
* Writes the example set into a excel file with XLSX format. If you want to write it in XLS
* format use {@link #write(ExampleSet, Charset, OutputStream)}.
*
* @param exampleSet
* the exampleSet to write
* @param sheetName
* name of the excel sheet which will be created.
* @param dateFormat
* a string which describes the format used for dates.
* @param numberFormat
* a string which describes the format used for numbers.
* @param outputStream
* the stream to write the file to
* @param op
* needed for checkForStop
*/
public static void writeXLSX(ExampleSet exampleSet, String sheetName, String dateFormat, String numberFormat,
OutputStream outputStream, Operator op) throws WriteException, IOException, ProcessStoppedException {
// .xlsx files can only store up to 16384 columns, so throw error in case of more
if (exampleSet.getAttributes().allSize() > 16384) {
throw new IllegalArgumentException(
I18N.getMessage(I18N.getErrorBundle(), "export.excel.excel_xlsx_file_exceeds_column_limit"));
}
try (SXSSFWorkbook workbook = new SXSSFWorkbook(null, SXSSFWorkbook.DEFAULT_WINDOW_SIZE, false, true)) {
Sheet sheet = workbook.createSheet(WorkbookUtil.createSafeSheetName(sheetName));
dateFormat = dateFormat == null ? DEFAULT_DATE_FORMAT : dateFormat;
numberFormat = numberFormat == null ? "#.0" : numberFormat;
writeXLSXDataSheet(workbook, sheet, dateFormat, numberFormat, exampleSet, op);
workbook.write(outputStream);
} finally {
outputStream.flush();
outputStream.close();
}
}
/**
* Writes the provided {@link ExampleSet} to a XLSX formatted data sheet.
*
* @param wb
* the workbook to use
* @param sheet
* the excel sheet to write to.
* @param dateFormat
* a string which describes the format used for dates.
* @param numberFormat
* a string which describes the format used for numbers.
* @param exampleSet
* the exampleSet to write
* @param op
* needed for checkForStop
* @throws ProcessStoppedException
* if the process was stopped by the user.
* @throws WriteException
*/
private static void writeXLSXDataSheet(SXSSFWorkbook wb, Sheet sheet, String dateFormat, String numberFormat,
ExampleSet exampleSet, Operator op) throws WriteException, ProcessStoppedException {
Font headerFont = wb.createFont();
headerFont.setBoldweight(Font.BOLDWEIGHT_BOLD);
CellStyle headerStyle = wb.createCellStyle();
headerStyle.setFont(headerFont);
// create the header
Iterator<Attribute> a = exampleSet.getAttributes().allAttributes();
int columnCounter = 0;
int rowCounter = 0;
Row headerRow = sheet.createRow(rowCounter);
while (a.hasNext()) {
Attribute attribute = a.next();
Cell headerCell = headerRow.createCell(columnCounter);
headerCell.setCellValue(attribute.getName());
headerCell.setCellStyle(headerStyle);
columnCounter++;
}
rowCounter++;
// body font
Font bodyFont = wb.createFont();
bodyFont.setBoldweight(Font.BOLDWEIGHT_NORMAL);
CreationHelper createHelper = wb.getCreationHelper();
// number format
CellStyle numericalStyle = wb.createCellStyle();
numericalStyle.setDataFormat(createHelper.createDataFormat().getFormat(numberFormat));
numericalStyle.setFont(bodyFont);
// date format
CellStyle dateStyle = wb.createCellStyle();
dateStyle.setDataFormat(createHelper.createDataFormat().getFormat(dateFormat));
dateStyle.setFont(bodyFont);
// create nominal cell style
CellStyle nominalStyle = wb.createCellStyle();
nominalStyle.setFont(bodyFont);
// fill body
for (Example example : exampleSet) {
// create new row
Row bodyRow = sheet.createRow(rowCounter);
// iterate over attributes and save examples
a = exampleSet.getAttributes().allAttributes();
columnCounter = 0;
while (a.hasNext()) {
Attribute attribute = a.next();
Cell currentCell = bodyRow.createCell(columnCounter);
if (!Double.isNaN(example.getValue(attribute))) {
if (Ontology.ATTRIBUTE_VALUE_TYPE.isA(attribute.getValueType(), Ontology.DATE_TIME)) {
Date dateValue = example.getDateValue(attribute);
currentCell.setCellValue(dateValue);
currentCell.setCellStyle(dateStyle);
} else if (Ontology.ATTRIBUTE_VALUE_TYPE.isA(attribute.getValueType(), Ontology.NUMERICAL)) {
double numericalValue = example.getNumericalValue(attribute);
currentCell.setCellValue(numericalValue);
currentCell.setCellStyle(numericalStyle);
} else {
currentCell
.setCellValue(stripIfNecessary(replaceForbiddenChars(example.getValueAsString(attribute))));
currentCell.setCellStyle(nominalStyle);
}
}
columnCounter++;
}
rowCounter++;
// checkForStop every 100 examples
if (op != null && rowCounter % 100 == 0) {
op.checkForStop();
}
}
}
/**
* Checks if the given value length is greater than the allowed Excel cell limit (
* {@value #CHARACTER_CELL_LIMIT}). If it exceeds the limit the string will be stripped.
*
* @param value
* the string value which should be checked
* @return the original string if the character limit is not exceeded, otherwise a stripped one
*/
private static String stripIfNecessary(String value) {
if (value.length() > CHARACTER_CELL_LIMIT) {
return value.substring(0, CHARACTER_CELL_LIMIT - CROP_INDICATOR.length()) + CROP_INDICATOR;
} else {
return value;
}
}
}