/*
* RapidMiner
*
* Copyright (C) 2001-2008 by Rapid-I and the contributors
*
* Complete list of developers available at our web site:
*
* http://rapid-i.com
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see http://www.gnu.org/licenses/.
*/
package com.rapidminer.operator.io;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.nio.charset.Charset;
import java.util.Iterator;
import java.util.List;
import java.util.zip.GZIPOutputStream;
import com.rapidminer.example.Example;
import com.rapidminer.example.ExampleFormatter;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.example.FormatterException;
import com.rapidminer.example.table.NumericalAttribute;
import com.rapidminer.example.table.SparseFormatDataRowReader;
import com.rapidminer.operator.IOObject;
import com.rapidminer.operator.Operator;
import com.rapidminer.operator.OperatorDescription;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.UserError;
import com.rapidminer.parameter.ParameterType;
import com.rapidminer.parameter.ParameterTypeBoolean;
import com.rapidminer.parameter.ParameterTypeCategory;
import com.rapidminer.parameter.ParameterTypeFile;
import com.rapidminer.parameter.ParameterTypeInt;
import com.rapidminer.parameter.ParameterTypeString;
/**
* Writes values of all examples in an {@link ExampleSet} to a file. Dense,
* sparse, and user defined formats (specified by the parameter 'format') can be
* used. Attribute description files may be generated for dense and sparse
* format as well. These formats can be read using the {@link ExampleSource} and
* {@link SparseFormatExampleSource} operators.
*
* <dl>
* <dt>dense:</dt>
* <dd> Each line of the generated data file is of the form<br/> <center>
*
* <pre>
* regular attributes &lt;special attributes&gt;
* </pre>
*
* </center> For example, each line could have the form <center>
*
* <pre>
* value1 value2 ... valueN &lt;id&gt; &lt;label&gt; &lt;prediction&gt; ... &lt;confidences&gt;
* </pre>
*
* </center> Values in angle brackets are optional and are only printed if they
* are available. The confidences are only given for nominal predictions. Other
* special attributes might be the example weight or the cluster number. </dd>
* <dt>sparse:</dt>
* <dd>Only non-zero values are written to the file, prefixed by a column index.
* See the description of {@link SparseFormatExampleSource} for details. </dd>
* <dt>special:</dt>
* <dd>Using the parameter 'special_format', the user can specify the exact
* format. The $ sign has a special meaning and introduces a command (the
* following character). Additional arguments to this command may be supplied
* by enclosing them in square brackets.
* <dl>
* <dt>$a:</dt>
* <dd> All attributes separated by the default separator</dd>
* <dt>$a[separator]:</dt>
* <dd> All attributes separated by separator</dd>
* <dt>$s[separator][indexSeparator]:</dt>
* <dd> Sparse format. For all non zero attributes the following strings are
* concatenated: the column index, the value of indexSeparator, the attribute
* value. Attributes are separated by separator.</dd>
* <dt>$v[name]:</dt>
* <dd> The value of the attribute with the given name (both regular and special
* attributes)</dd>
* <dt>$k[index]:</dt>
* <dd> The value of the attribute with the given index</dd>
* <dt>$l:</dt>
* <dd> The label</dd>
* <dt>$p:</dt>
* <dd> The predicted label</dd>
* <dt>$d:</dt>
* <dd> All prediction confidences for all classes in the form conf(class)=value</dd>
* <dt>$d[class]:</dt>
* <dd> The prediction confidence for the defined class as a simple number</dd>
* <dt>$i:</dt>
* <dd> The id</dd>
* <dt>$w:</dt>
* <dd> The weight</dd>
* <dt>$b:</dt>
* <dd> The batch number</dd>
* <dt>$n:</dt>
* <dd> The newline character</dd>
* <dt>$t:</dt>
* <dd> The tabulator character</dd>
* <dt>$$:</dt>
* <dd> The dollar sign</dd>
* <dt>$[:</dt>
* <dd> The '[' character</dd>
* <dt>$]:</dt>
* <dd> The ']' character</dd>
* </dl>
* Make sure the format string ends with $n if you want examples to be separated
* by newlines!</dd>
* </dl>
*
* @see com.rapidminer.example.ExampleSet
*
* @author Simon Fischer, Ingo Mierswa
* @version $Id: ExampleSetWriter.java,v 1.15 2006/03/27 13:22:00 ingomierswa
* Exp $
*/
public class ExampleSetWriter extends Operator {

    /** The parameter name for "File to save the example set to." */
    public static final String PARAMETER_EXAMPLE_SET_FILE = "example_set_file";

    /** The parameter name for "File to save the attribute descriptions to." */
    public static final String PARAMETER_ATTRIBUTE_DESCRIPTION_FILE = "attribute_description_file";

    /** The parameter name for "Format to use for output." */
    public static final String PARAMETER_FORMAT = "format";

    /** The parameter name for "Format string to use for output." */
    public static final String PARAMETER_SPECIAL_FORMAT = "special_format";

    /** The parameter name for "The number of fraction digits in the output file (-1: all possible digits)." */
    public static final String PARAMETER_FRACTION_DIGITS = "fraction_digits";

    /** The parameter name for "Indicates if nominal values containing whitespace characters should be quoted with double quotes." */
    public static final String PARAMETER_QUOTE_WHITESPACE = "quote_whitespace";

    /** The parameter name for "Indicates if the data file content should be zipped." */
    public static final String PARAMETER_ZIPPED = "zipped";

    /** The parameter name for "Indicates if an existing table should be overwritten." */
    public static final String PARAMETER_OVERWRITE_MODE = "overwrite_mode";

    /**
     * Display names of the overwrite modes. The array index of each name equals
     * the corresponding <code>OVERWRITE_MODE_*</code> constant.
     */
    public static final String[] OVERWRITE_MODES = new String[] {
        "none",
        "overwrite first, append then",
        "overwrite",
        "append"
    };

    /** Overwrite mode: fail with a user error if the data file already exists. */
    public static final int OVERWRITE_MODE_NONE = 0;

    /** Overwrite mode: overwrite on the first application of this operator, append afterwards. */
    public static final int OVERWRITE_MODE_OVERWRITE_FIRST = 1;

    /** Overwrite mode: always overwrite an existing data file. */
    public static final int OVERWRITE_MODE_OVERWRITE = 2;

    /** Overwrite mode: always append to an existing data file. */
    public static final int OVERWRITE_MODE_APPEND = 3;

    /**
     * Names of all selectable output formats: index 0 is "dense", followed by
     * one "sparse_*" entry per sparse format name, and "special_format" last.
     */
    private static final String[] formatNames;

    /** Index of the dense format in {@link #formatNames}. */
    private static final int DENSE_FORMAT = 0;

    static {
        String[] sparseNames = SparseFormatDataRowReader.FORMAT_NAMES;
        formatNames = new String[sparseNames.length + 2];
        formatNames[0] = "dense";
        for (int i = 0; i < sparseNames.length; i++) {
            formatNames[i + 1] = "sparse_" + sparseNames[i];
        }
        formatNames[formatNames.length - 1] = "special_format";
    }

    public ExampleSetWriter(OperatorDescription description) {
        super(description);
    }

    /**
     * Writes the input example set to the configured data file in the selected
     * format and, for the dense and sparse formats, optionally writes an
     * attribute description file as well.
     *
     * @return an array containing the (unchanged) input example set
     * @throws OperatorException
     *             if the data file exists and must not be overwritten, or if
     *             writing one of the files fails
     */
    public IOObject[] apply() throws OperatorException {
        ExampleSet eSet = getInput(ExampleSet.class);

        boolean zipped = getParameterAsBoolean(PARAMETER_ZIPPED);
        File dataFile = getParameterAsFile(PARAMETER_EXAMPLE_SET_FILE);
        if (zipped) {
            dataFile = new File(dataFile.getAbsolutePath() + ".gz");
        }
        File attFile = getParameterAsFile(PARAMETER_ATTRIBUTE_DESCRIPTION_FILE);
        boolean quoteWhitespace = getParameterAsBoolean(PARAMETER_QUOTE_WHITESPACE);

        boolean append = determineAppend(getParameterAsInt(PARAMETER_OVERWRITE_MODE), dataFile);

        int fractionDigits = getParameterAsInt(PARAMETER_FRACTION_DIGITS);
        if (fractionDigits < 0) {
            fractionDigits = NumericalAttribute.UNLIMITED_NUMBER_OF_DIGITS;
        }
        Charset encoding = getEncoding();

        try {
            // write example set
            int format = getParameterAsInt(PARAMETER_FORMAT);
            log("Writing example set in format '" + formatNames[format] + "'.");
            if (format == DENSE_FORMAT) { // dense
                eSet.writeDataFile(dataFile, fractionDigits, quoteWhitespace, zipped, append, encoding);
                if (attFile != null) {
                    // use the encoding already resolved above, like the other branches do
                    eSet.writeAttributeFile(attFile, dataFile, encoding);
                }
            } else if (format == formatNames.length - 1) { // special format
                if (attFile != null) {
                    logError("special_format used. Ignoring attribute description file.");
                }
                writeSpecialFormat(eSet, dataFile, fractionDigits, quoteWhitespace, zipped, append, encoding);
            } else { // sparse
                // formatNames is shifted by one relative to the sparse format names ("dense" is index 0)
                int sparseFormat = format - 1;
                eSet.writeSparseDataFile(dataFile, sparseFormat, fractionDigits, quoteWhitespace, zipped, append, encoding);
                if (attFile != null) {
                    eSet.writeSparseAttributeFile(attFile, dataFile, sparseFormat, encoding);
                }
            }
        } catch (IOException e) {
            throw new UserError(this, e, 303, new Object[] { dataFile + " / " + attFile, e.getMessage() });
        }
        return new IOObject[] { eSet };
    }

    /**
     * Decides from the overwrite mode whether output must be appended to the
     * data file, and logs a message if the file already exists.
     *
     * @param overwriteMode
     *            one of the <code>OVERWRITE_MODE_*</code> constants; unknown
     *            values are treated like {@link #OVERWRITE_MODE_OVERWRITE_FIRST}
     * @param dataFile
     *            the data file that is going to be written
     * @return <code>true</code> if data should be appended, <code>false</code>
     *         if the file should be (over)written from scratch
     * @throws OperatorException
     *             if the mode is {@link #OVERWRITE_MODE_NONE} and the file
     *             already exists
     */
    private boolean determineAppend(int overwriteMode, File dataFile) throws OperatorException {
        boolean exists = dataFile.exists();
        switch (overwriteMode) {
            case OVERWRITE_MODE_NONE:
                if (exists) {
                    throw new UserError(this, 100);
                }
                return false;
            case OVERWRITE_MODE_OVERWRITE:
                if (exists) {
                    log("File " + dataFile + " already exists. Overwriting...");
                }
                return false;
            case OVERWRITE_MODE_APPEND:
                if (exists) {
                    log("File " + dataFile + " already exists. Appending...");
                }
                return true;
            case OVERWRITE_MODE_OVERWRITE_FIRST:
            default:
                if (getApplyCount() == 0) { // first time
                    if (exists) {
                        log("File " + dataFile + " already exists. Overwriting this time...");
                    }
                    return false;
                }
                if (exists) {
                    log("File " + dataFile + " already exists. Appending...");
                }
                return true;
        }
    }

    /**
     * Writes every example of the example set to the data file, one formatted
     * line per example, using the format string given by the parameter
     * 'special_format'.
     *
     * @param exampleSet
     *            the examples to write
     * @param dataFile
     *            the target file
     * @param fractionDigits
     *            number of fraction digits passed to the formatter
     * @param quoteWhitespace
     *            quoting flag passed to the formatter
     * @param zipped
     *            if <code>true</code>, the output is GZIP compressed
     * @param append
     *            if <code>true</code>, output is appended to an existing file
     * @param encoding
     *            the character encoding used for the output
     * @throws OperatorException
     *             if no format string is set, the format string cannot be
     *             compiled, or writing the file fails
     */
    private void writeSpecialFormat(ExampleSet exampleSet, File dataFile, int fractionDigits, boolean quoteWhitespace, boolean zipped, boolean append, Charset encoding) throws OperatorException {
        String format = getParameterAsString(PARAMETER_SPECIAL_FORMAT);
        if (format == null) {
            throw new UserError(this, 201, new Object[] { "special_format", "format", "special_format" });
        }

        ExampleFormatter formatter;
        try {
            formatter = ExampleFormatter.compile(format, exampleSet, fractionDigits, quoteWhitespace);
        } catch (FormatterException e) {
            throw new UserError(this, 901, format, e.getMessage());
        }

        OutputStream out = null;
        PrintWriter writer = null;
        try {
            // Open the file stream first so that it is closed in the finally
            // block even if wrapping it in a GZIP stream fails.
            out = new FileOutputStream(dataFile, append);
            if (zipped) {
                out = new GZIPOutputStream(out);
            }
            writer = new PrintWriter(new OutputStreamWriter(out, encoding));

            Iterator<Example> reader = exampleSet.iterator();
            while (reader.hasNext()) {
                writer.println(formatter.format(reader.next()));
            }

            // PrintWriter never throws IOException; it only records that an
            // error happened. Close first (which flushes and finishes the
            // stream chain) and then ask whether any write or close failed, so
            // a truncated file is reported instead of silently accepted.
            writer.close();
            if (writer.checkError()) {
                throw new UserError(this, 303, dataFile, "An I/O error occurred while writing the example set.");
            }
        } catch (IOException e) {
            // keep the cause, consistent with the error handling in apply()
            throw new UserError(this, e, 303, new Object[] { dataFile, e.getMessage() });
        } finally {
            if (writer != null) {
                writer.close();
            }
            if (out != null) {
                try {
                    out.close();
                } catch (IOException e) {
                    logError("Cannot close stream to file " + dataFile);
                }
            }
        }
    }

    /** Returns the input classes of this operator: a single {@link ExampleSet}. */
    public Class<?>[] getInputClasses() {
        return new Class[] { ExampleSet.class };
    }

    /** Returns the output classes of this operator: a single {@link ExampleSet}. */
    public Class<?>[] getOutputClasses() {
        return new Class[] { ExampleSet.class };
    }

    /**
     * Returns the parameter types of the super class extended by the
     * parameters of this operator (files, format, fraction digits, quoting,
     * zipping and overwrite mode).
     */
    public List<ParameterType> getParameterTypes() {
        List<ParameterType> types = super.getParameterTypes();
        types.add(new ParameterTypeFile(PARAMETER_EXAMPLE_SET_FILE, "File to save the example set to.", "dat", false));
        types.add(new ParameterTypeFile(PARAMETER_ATTRIBUTE_DESCRIPTION_FILE, "File to save the attribute descriptions to.", "aml", true));
        types.add(new ParameterTypeCategory(PARAMETER_FORMAT, "Format to use for output.", formatNames, 0));
        types.add(new ParameterTypeString(PARAMETER_SPECIAL_FORMAT, "Format string to use for output.", true));
        types.add(new ParameterTypeInt(PARAMETER_FRACTION_DIGITS, "The number of fraction digits in the output file (-1: all possible digits).", -1, Integer.MAX_VALUE, -1));
        types.add(new ParameterTypeBoolean(PARAMETER_QUOTE_WHITESPACE, "Indicates if nominal values containing whitespace characters should be quoted with double quotes.", true));
        types.add(new ParameterTypeBoolean(PARAMETER_ZIPPED, "Indicates if the data file content should be zipped.", false));
        types.add(new ParameterTypeCategory(PARAMETER_OVERWRITE_MODE, "Indicates if an existing table should be overwritten or if data should be appended.", OVERWRITE_MODES, OVERWRITE_MODE_OVERWRITE_FIRST));
        return types;
    }
}