/**
* Copyright (C) 2001-2017 by RapidMiner and the contributors
*
* Complete list of developers available at our web site:
*
* http://rapidminer.com
*
* This program is free software: you can redistribute it and/or modify it under the terms of the
* GNU Affero General Public License as published by the Free Software Foundation, either version 3
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
* even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License along with this program.
* If not, see http://www.gnu.org/licenses/.
*/
package com.rapidminer.operator.io;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.sql.Date;
import java.text.DateFormat;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import com.rapidminer.example.Attribute;
import com.rapidminer.example.Example;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.operator.OperatorDescription;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.OperatorProgress;
import com.rapidminer.operator.ProcessStoppedException;
import com.rapidminer.operator.ports.Port;
import com.rapidminer.parameter.ParameterType;
import com.rapidminer.parameter.ParameterTypeBoolean;
import com.rapidminer.parameter.ParameterTypeString;
import com.rapidminer.parameter.PortProvider;
import com.rapidminer.parameter.conditions.PortConnectedCondition;
import com.rapidminer.tools.Ontology;
import com.rapidminer.tools.io.Encoding;
/**
* <p>
* This operator can be used to write data into CSV files (Comma Separated Values). The values and
* columns are separated by ";". Missing data values are indicated by empty cells.
* </p>
*
* @author Ingo Mierswa
*/
public class CSVExampleSetWriter extends AbstractStreamWriter {
/** The parameter name for "The CSV file which should be written." */
public static final String PARAMETER_CSV_FILE = "csv_file";
/** The parameter name for the column separator parameter. */
public static final String PARAMETER_COLUMN_SEPARATOR = "column_separator";
/** Indicates if the attribute names should be written as first row. */
public static final String PARAMETER_WRITE_ATTRIBUTE_NAMES = "write_attribute_names";
/**
* Indicates if nominal values should be quoted with double quotes. Quotes inside of nominal
* values will be escaped by a backslash.
*/
public static final String PARAMETER_QUOTE_NOMINAL_VALUES = "quote_nominal_values";
public static final String PARAMETER_APPEND_FILE = "append_to_file";
/**
* Indicates if date attributes are written as a formated string or as milliseconds past since
* January 1, 1970, 00:00:00 GMT
*/
// TODO introduce parameter which allows to determine the written format see
// Nominal2Date operator
public static final String PARAMETER_FORMAT_DATE = "format_date_attributes";
public CSVExampleSetWriter(OperatorDescription description) {
super(description);
}
/**
* Writes the exampleSet with the {@link PrintWriter} out, using colSeparator as column
* separator.
*
* @param exampleSet
* the example set to write
* @param out
* the {@link PrintWriter}
* @param colSeparator
* the column separator
* @param quoteNomValues
* if {@code true} nominal values are quoted
* @param writeAttribNames
* if {@code true} the attribute names are written into the first row
* @param formatDate
* if {@code true} dates are formatted to "M/d/yy h:mm a", otherwise milliseconds
* since the epoch are used
*
* @deprecated please use
* {@link CSVExampleSetWriter#writeCSV(ExampleSet, PrintWriter, String, boolean, boolean, boolean, OperatorProgress)}
* instead to support operator progress.
*/
@Deprecated
public static void writeCSV(ExampleSet exampleSet, PrintWriter out, String colSeparator, boolean quoteNomValues,
boolean writeAttribNames, boolean formatDate) {
try {
writeCSV(exampleSet, out, colSeparator, quoteNomValues, writeAttribNames, formatDate, null, null);
} catch (ProcessStoppedException e) {
// can not happen because we provide no OperatorProgressListener
}
}
/**
* Writes the exampleSet with the {@link PrintWriter} out, using colSeparator as column
* separator.
*
* @param exampleSet
* the example set to write
* @param out
* the {@link PrintWriter}
* @param colSeparator
* the column separator
* @param quoteNomValues
* if {@code true} nominal values are quoted
* @param writeAttribNames
* if {@code true} the attribute names are written into the first row
* @param formatDate
* if {@code true} dates are formatted to "M/d/yy h:mm a", otherwise milliseconds
* since the epoch are used
* @param opProg
* the {@link OperatorProgress} is used to provide a more detailed progress. Within
* this method the progress will be increased by number of examples times the number
* of attributes. If you do not want the operator progress, just provide <code> null
* <code>.
*/
public static void writeCSV(ExampleSet exampleSet, PrintWriter out, String colSeparator, boolean quoteNomValues,
boolean writeAttribNames, boolean formatDate, OperatorProgress operatorProgress) throws ProcessStoppedException {
writeCSV(exampleSet, out, colSeparator, quoteNomValues, writeAttribNames, formatDate, null, operatorProgress);
}
/**
* Writes the exampleSet with the {@link PrintWriter} out, using colSeparator as column
* separator and infinitySybol to denote infinite values.
*
* @param exampleSet
* the example set to write
* @param out
* the {@link PrintWriter}
* @param colSeparator
* the column separator
* @param quoteNomValues
* if {@code true} nominal values are quoted
* @param writeAttribNames
* if {@code true} the attribute names are written into the first row
* @param formatDate
* if {@code true} dates are formatted to "M/d/yy h:mm a", otherwise milliseconds
* since the epoch are used
* @param infinitySymbol
* the symbol to use for infinite values; if {@code null} the default symbol
* "Infinity" is used
*
* @deprecated please use
* {@link CSVExampleSetWriter#writeCSV(ExampleSet, PrintWriter, String, boolean, boolean, boolean, String, OperatorProgress)}
* to support operator progress.
*/
@Deprecated
public static void writeCSV(ExampleSet exampleSet, PrintWriter out, String colSeparator, boolean quoteNomValues,
boolean writeAttribNames, boolean formatDate, String infinitySymbol) {
try {
writeCSV(exampleSet, out, colSeparator, quoteNomValues, writeAttribNames, formatDate, infinitySymbol, null);
} catch (ProcessStoppedException e) {
// can not happen because we provide no OperatorProcessListener
}
}
/**
* Writes the exampleSet with the {@link PrintWriter} out, using colSeparator as column
* separator and infinitySybol to denote infinite values.
*
* @param exampleSet
* the example set to write
* @param out
* the {@link PrintWriter}
* @param colSeparator
* the column separator
* @param quoteNomValues
* if {@code true} nominal values are quoted
* @param writeAttribNames
* if {@code true} the attribute names are written into the first row
* @param formatDate
* if {@code true} dates are formatted to "M/d/yy h:mm a", otherwise milliseconds
* since the epoch are used
* @param infinitySymbol
* the symbol to use for infinite values; if {@code null} the default symbol
* "Infinity" is used
* @param opProg
* the {@link OperatorProgress} is used to provide a more detailed progress. Within
* this method the progress will be increased by number of examples times the number
* of attributes. If you do not want the operator progress, just provide <code> null
* <code>.
*/
public static void writeCSV(ExampleSet exampleSet, PrintWriter out, String colSeparator, boolean quoteNomValues,
boolean writeAttribNames, boolean formatDate, String infinitySymbol, OperatorProgress opProg)
throws ProcessStoppedException {
String negativeInfinitySymbol = null;
if (infinitySymbol != null) {
negativeInfinitySymbol = "-" + infinitySymbol;
}
String columnSeparator = colSeparator;
boolean quoteNominalValues = quoteNomValues;
// write column names
if (writeAttribNames) {
Iterator<Attribute> a = exampleSet.getAttributes().allAttributes();
boolean first = true;
while (a.hasNext()) {
if (!first) {
out.print(columnSeparator);
}
Attribute attribute = a.next();
String name = attribute.getName();
if (quoteNominalValues) {
name = name.replaceAll("\"", "'");
name = "\"" + name + "\"";
}
out.print(name);
first = false;
}
out.println();
}
// write data
int progressCounter = 0;
for (Example example : exampleSet) {
Iterator<Attribute> a = exampleSet.getAttributes().allAttributes();
boolean first = true;
while (a.hasNext()) {
Attribute attribute = a.next();
if (!first) {
out.print(columnSeparator);
}
if (!Double.isNaN(example.getValue(attribute))) {
if (attribute.isNominal()) {
String stringValue = example.getValueAsString(attribute);
if (quoteNominalValues) {
stringValue = stringValue.replaceAll("\"", "'");
stringValue = "\"" + stringValue + "\"";
}
out.print(stringValue);
} else {
Double value = example.getValue(attribute);
if (Ontology.ATTRIBUTE_VALUE_TYPE.isA(attribute.getValueType(), Ontology.DATE_TIME)) {
if (formatDate) {
Date date = new Date(value.longValue());
String s = DateFormat.getInstance().format(date);
out.print(s);
} else {
out.print(value);
}
} else {
if (value.isInfinite() && infinitySymbol != null) {
if (Double.POSITIVE_INFINITY == value) {
out.print(infinitySymbol);
} else {
out.print(negativeInfinitySymbol);
}
} else {
out.print(value);
}
}
}
}
first = false;
}
out.println();
// trigger operator progress every 100 examples
if (opProg != null) {
++progressCounter;
if (progressCounter % 100 == 0) {
opProg.step(100);
progressCounter = 0;
}
}
}
}
@Override
public void writeStream(ExampleSet exampleSet, java.io.OutputStream outputStream) throws OperatorException {
String columnSeparator = getParameterAsString(PARAMETER_COLUMN_SEPARATOR);
boolean quoteNominalValues = getParameterAsBoolean(PARAMETER_QUOTE_NOMINAL_VALUES);
boolean writeAttribNames = getParameterAsBoolean(PARAMETER_WRITE_ATTRIBUTE_NAMES);
boolean formatDate = getParameterAsBoolean(PARAMETER_FORMAT_DATE);
try (PrintWriter out = new PrintWriter(new OutputStreamWriter(outputStream, Encoding.getEncoding(this)))) {
// init operator progress
getProgress().setTotal(exampleSet.size());
writeCSV(exampleSet, out, columnSeparator, quoteNominalValues, writeAttribNames, formatDate, getProgress());
getProgress().complete();
}
}
@Override
protected boolean supportsEncoding() {
return true;
}
@Override
protected boolean shouldAppend() {
return getParameterAsBoolean(PARAMETER_APPEND_FILE);
}
@Override
public List<ParameterType> getParameterTypes() {
List<ParameterType> types = new LinkedList<ParameterType>();
types.add(makeFileParameterType());
// types.add(new ParameterTypeFile(PARAMETER_CSV_FILE,
// "The CSV file which should be written.", "csv", false));
types.add(new ParameterTypeString(PARAMETER_COLUMN_SEPARATOR, "The column separator.", ";", false));
types.add(new ParameterTypeBoolean(PARAMETER_WRITE_ATTRIBUTE_NAMES,
"Indicates if the attribute names should be written as first row.", true, false));
types.add(new ParameterTypeBoolean(PARAMETER_QUOTE_NOMINAL_VALUES,
"Indicates if nominal values should be quoted with double quotes.", true, false));
types.add(new ParameterTypeBoolean(PARAMETER_FORMAT_DATE,
"Indicates if date attributes are written as a formated string or as milliseconds past since January 1, 1970, 00:00:00 GMT",
true, true));
ParameterType type = new ParameterTypeBoolean(PARAMETER_APPEND_FILE,
"Indicates if new content should be appended to the file or if the pre-existing file content should be overwritten.",
false, false);
type.registerDependencyCondition(new PortConnectedCondition(this, new PortProvider() {
@Override
public Port getPort() {
return fileOutputPort;
}
}, true, false));
types.add(type);
types.addAll(super.getParameterTypes());
return types;
}
@Override
protected String getFileParameterName() {
return PARAMETER_CSV_FILE;
}
@Override
protected String[] getFileExtensions() {
return new String[] { "csv" };
}
}