/*
 *  RapidMiner
 *
 *  Copyright (C) 2001-2011 by Rapid-I and the contributors
 *
 *  Complete list of developers available at our web site:
 *
 *       http://rapid-i.com
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Affero General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Affero General Public License for more details.
 *
 *  You should have received a copy of the GNU Affero General Public License
 *  along with this program.  If not, see http://www.gnu.org/licenses/.
 */
package com.rapidminer.operator.io;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.nio.charset.Charset;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.logging.Level;
import java.util.zip.GZIPOutputStream;

import com.rapidminer.example.Example;
import com.rapidminer.example.ExampleFormatter;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.example.FormatterException;
import com.rapidminer.example.table.NumericalAttribute;
import com.rapidminer.operator.OperatorDescription;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.UserError;
import com.rapidminer.parameter.ParameterType;
import com.rapidminer.parameter.ParameterTypeBoolean;
import com.rapidminer.parameter.ParameterTypeCategory;
import com.rapidminer.parameter.ParameterTypeFile;
import com.rapidminer.parameter.ParameterTypeInt;
import com.rapidminer.parameter.ParameterTypeString;
import com.rapidminer.tools.io.Encoding;

/**
 * Writes an example set to a file using a special, user defined format.
 *
 * Using the parameter 'special_format', the user can specify the exact
 * format. The $ sign has a special meaning and introduces a command (the
 * following character). Additional arguments to this command may be supplied
 * enclosing it in square brackets.
 * <dl>
 * <dt>$a:</dt>
 * <dd> All attributes separated by the default separator</dd>
 * <dt>$a[separator]:</dt>
 * <dd> All attributes separated by separator</dd>
 * <dt>$s[separator][indexSeparator]:</dt>
 * <dd> Sparse format. For all non zero attributes the following strings are
 * concatenated: the column index, the value of indexSeparator, the attribute
 * value. Attributes are separated by separator.</dd>
 * <dt>$v[name]:</dt>
 * <dd> The value of the attribute with the given name (both regular and special
 * attributes)</dd>
 * <dt>$k[index]:</dt>
 * <dd> The value of the attribute with the given index</dd>
 * <dt>$l:</dt>
 * <dd> The label</dd>
 * <dt>$p:</dt>
 * <dd> The predicted label</dd>
 * <dt>$d:</dt>
 * <dd> All prediction confidences for all classes in the form conf(class)=value</dd>
 * <dt>$d[class]:</dt>
 * <dd> The prediction confidence for the defined class as a simple number</dd>
 * <dt>$i:</dt>
 * <dd> The id</dd>
 * <dt>$w:</dt>
 * <dd> The weight</dd>
 * <dt>$b:</dt>
 * <dd> The batch number</dd>
 * <dt>$n:</dt>
 * <dd> The newline character</dd>
 * <dt>$t:</dt>
 * <dd> The tabulator character</dd>
 * <dt>$$:</dt>
 * <dd> The dollar sign</dd>
 * <dt>$[:</dt>
 * <dd> The '[' character</dd>
 * <dt>$]:</dt>
 * <dd> The ']' character</dd>
 * </dl>
 * Make sure the format string ends with $n if you want examples to be separated
 * by newlines!
 *
 * Up to Version 5.0, the functionality of this operator was covered by the regular
 * {@link ExampleSetWriter}.
 *
 * @author Simon Fischer
 */
public class SpecialFormatExampleSetWriter extends AppendingExampleSetWriter {

	/** The parameter name for "If checked, each example is followed by a line break automatically". */
	public static final String PARAMETER_ADD_NEW_LINE = "add_line_separator";

	/** The parameter name for "Format string to use for output." */
	public static final String PARAMETER_SPECIAL_FORMAT = "special_format";

	/** The parameter name for "File to save the example set to." */
	public static final String PARAMETER_EXAMPLE_SET_FILE = "example_set_file";

	/** The parameter name for "The number of fraction digits in the output file (-1: all possible digits)." */
	public static final String PARAMETER_FRACTION_DIGITS = "fraction_digits";

	/** Indicates if nominal values should be quoted with double quotes. Quotes inside of nominal values will be escaped by a backslash. */
	public static final String PARAMETER_QUOTE_NOMINAL_VALUES = "quote_nominal_values";

	/** The parameter name for "Indicates if the data file content should be zipped." */
	public static final String PARAMETER_ZIPPED = "zipped";

	/** The parameter name for "Indicates if an existing table should be overwritten." */
	public static final String PARAMETER_OVERWRITE_MODE = "overwrite_mode";

	public SpecialFormatExampleSetWriter(OperatorDescription description) {
		super(description);
	}

	/**
	 * Reads the operator parameters and writes the given example set to the
	 * configured file in the user defined format.
	 *
	 * If the 'zipped' parameter is set, ".gz" is appended to the absolute path of
	 * the configured file. A negative 'fraction_digits' value is mapped to
	 * {@link NumericalAttribute#UNLIMITED_NUMBER_OF_DIGITS}.
	 *
	 * @param ioobject the example set to write
	 * @return the example set that was passed in
	 * @throws OperatorException if the format cannot be compiled or the file cannot be written
	 */
	@Override
	public ExampleSet write(ExampleSet ioobject) throws OperatorException {
		boolean zipped = getParameterAsBoolean(PARAMETER_ZIPPED);
		File dataFile = getParameterAsFile(PARAMETER_EXAMPLE_SET_FILE, true);
		if (zipped) {
			dataFile = new File(dataFile.getAbsolutePath() + ".gz");
		}

		boolean quoteNominal = getParameterAsBoolean(PARAMETER_QUOTE_NOMINAL_VALUES);
		int fractionDigits = getParameterAsInt(PARAMETER_FRACTION_DIGITS);
		if (fractionDigits < 0) {
			fractionDigits = NumericalAttribute.UNLIMITED_NUMBER_OF_DIGITS;
		}
		Charset encoding = Encoding.getEncoding(this);

		writeSpecialFormat(ioobject, dataFile, fractionDigits, getParameterAsBoolean(PARAMETER_ADD_NEW_LINE),
				quoteNominal, zipped, shouldAppend(dataFile), encoding);
		return ioobject;
	}

	@Override
	protected boolean supportsEncoding() {
		return true;
	}

	/**
	 * Formats every example of the example set with the format given by the
	 * 'special_format' parameter and writes the result to the data file.
	 *
	 * @param exampleSet the examples to write
	 * @param dataFile the target file
	 * @param fractionDigits the number of fraction digits handed to the formatter
	 * @param automaticLineBreak if true, a line separator is written after each example
	 * @param quoteNominal handed to the formatter; indicates if nominal values should be quoted
	 * @param zipped if true, the output is GZIP compressed
	 * @param append if true, the data is appended to the file instead of overwriting it
	 * @param encoding the character encoding used for the output
	 * @throws OperatorException if the format parameter is missing, the format cannot be
	 *         compiled, or opening, writing or closing the file fails
	 */
	private void writeSpecialFormat(ExampleSet exampleSet, File dataFile, int fractionDigits, boolean automaticLineBreak,
			boolean quoteNominal, boolean zipped, boolean append, Charset encoding) throws OperatorException {
		String format = getParameterAsString(PARAMETER_SPECIAL_FORMAT);
		if (format == null) {
			throw new UserError(this, 201, new Object[] { "special_format", "format", "special_format" });
		}

		ExampleFormatter formatter;
		try {
			formatter = ExampleFormatter.compile(format, exampleSet, fractionDigits, quoteNominal);
		} catch (FormatterException e) {
			throw new UserError(this, 901, format, e.getMessage());
		}

		OutputStream out = null;
		PrintWriter writer = null;
		try {
			// Assign the file stream before wrapping it: the GZIPOutputStream constructor
			// writes the GZIP header and may throw, in which case the finally block must
			// still be able to close the already opened file stream.
			out = new FileOutputStream(dataFile, append);
			if (zipped) {
				out = new GZIPOutputStream(out);
			}
			writer = new PrintWriter(new OutputStreamWriter(out, encoding));

			Iterator<Example> reader = exampleSet.iterator();
			while (reader.hasNext()) {
				String line = formatter.format(reader.next());
				if (automaticLineBreak) {
					writer.println(line);
				} else {
					writer.print(line);
				}
			}

			// PrintWriter never throws IOException but only records that an error occurred.
			// Close first so that failures while flushing (e.g. writing the GZIP trailer)
			// are recorded as well, then turn a recorded failure into a user error
			// instead of silently leaving a truncated file behind.
			writer.close();
			if (writer.checkError()) {
				throw new UserError(this, 303, dataFile, "an error occurred while writing the formatted examples");
			}
		} catch (IOException e) {
			throw new UserError(this, 303, dataFile, e.getMessage());
		} finally {
			// Closing twice is harmless; this covers the paths on which an exception
			// was thrown before the regular close above was reached.
			if (writer != null) {
				writer.close();
			}
			if (out != null) {
				try {
					out.close();
				} catch (IOException e) {
					getLogger().log(Level.WARNING, "Cannot close stream to file " + dataFile, e);
				}
			}
		}
	}

	/**
	 * Returns the parameters of this operator: the parameters declared by this
	 * class followed by all parameters of the super class.
	 */
	@Override
	public List<ParameterType> getParameterTypes() {
		List<ParameterType> types = new LinkedList<ParameterType>();
		types.add(new ParameterTypeFile(PARAMETER_EXAMPLE_SET_FILE, "File to save the example set to.", "dat", false));
		types.add(new ParameterTypeString(PARAMETER_SPECIAL_FORMAT, "Format string to use for output.", false));
		types.add(new ParameterTypeInt(PARAMETER_FRACTION_DIGITS, "The number of fraction digits in the output file (-1: all possible digits).", -1, Integer.MAX_VALUE, -1));
		types.add(new ParameterTypeBoolean(PARAMETER_ADD_NEW_LINE, "If checked, each example is followed by a line break automatically", true));
		types.add(new ParameterTypeBoolean(PARAMETER_QUOTE_NOMINAL_VALUES, "Indicates if nominal values should be quoted with double quotes.", true));
		types.add(new ParameterTypeBoolean(PARAMETER_ZIPPED, "Indicates if the data file content should be zipped.", false));
		types.add(new ParameterTypeCategory(PARAMETER_OVERWRITE_MODE, "Indicates if an existing table should be overwritten or if data should be appended.", OVERWRITE_MODES, OVERWRITE_MODE_OVERWRITE_FIRST));
		types.addAll(super.getParameterTypes());
		return types;
	}
}