/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 *    CSVSaver.java
 *    Copyright (C) 2004-2012 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.core.converters;

import java.io.IOException;
import java.io.PrintWriter;
import java.util.Enumeration;
import java.util.Vector;

import weka.core.AbstractInstance;
import weka.core.Capabilities;
import weka.core.Capabilities.Capability;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.RevisionUtils;
import weka.core.SparseInstance;
import weka.core.Utils;

/**
 <!-- globalinfo-start -->
 * Writes to a destination that is in CSV (comma-separated values) format. The column separator can be chosen (default is ',') as well as the value representing missing values (default is '?').
 * <p/>
 <!-- globalinfo-end -->
 *
 <!-- options-start -->
 * Valid options are: <p/>
 *
 * <pre> -F &lt;separator&gt;
 *  The field separator to be used.
 *  '\t' can be used as well.
 *  (default: ',')</pre>
 *
 * <pre> -M &lt;str&gt;
 *  The string representing a missing value.
 *  (default: ?)</pre>
 *
 * <pre> -N
 *  Don't write a header row.</pre>
 *
 * <pre> -decimal &lt;num&gt;
 *  The maximum number of digits to print after the decimal
 *  place for numeric values (default: 6)</pre>
 *
 * <pre> -i &lt;the input file&gt;
 *  The input file</pre>
 *
 * <pre> -o &lt;the output file&gt;
 *  The output file</pre>
 *
 <!-- options-end -->
 *
 * @author Stefan Mutter (mutter@cs.waikato.ac.nz)
 * @version $Revision: 9870 $
 * @see Saver
 */
public class CSVSaver extends AbstractFileSaver implements BatchConverter,
  IncrementalConverter, FileSourcedConverter {

  /** for serialization. */
  static final long serialVersionUID = 476636654410701807L;

  /** the field separator (stored unescaped, e.g. a real TAB character). */
  protected String m_FieldSeparator = ",";

  /** The placeholder written in place of missing values. */
  protected String m_MissingValue = "?";

  /** Max number of decimal places for numeric values. */
  protected int m_MaxDecimalPlaces = AbstractInstance.s_numericAfterDecimalPoint;

  /** Set to true to not write the header row. */
  protected boolean m_noHeaderRow = false;

  /**
   * Constructor. Resets the saver's options, which also sets the ".csv" file
   * extension (see {@link #resetOptions()}).
   */
  public CSVSaver() {
    resetOptions();
  }

  /**
   * Returns a string describing this Saver.
   *
   * @return a description of the Saver suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String globalInfo() {
    return "Writes to a destination that is in CSV (comma-separated values) format. "
      + "The column separator can be chosen (default is ',') as well as the value "
      + "representing missing values (default is '?').";
  }

  /**
   * Returns an enumeration describing the available options: the options of
   * this saver followed by those of the superclass.
   *
   * @return an enumeration of all the available options.
   */
  @Override
  public Enumeration listOptions() {
    Vector<Option> result = new Vector<Option>();

    result.addElement(new Option("\tThe field separator to be used.\n"
      + "\t'\\t' can be used as well.\n" + "\t(default: ',')", "F", 1,
      "-F <separator>"));

    result.addElement(new Option("\tThe string representing a missing value.\n"
      + "\t(default: ?)", "M", 1, "-M <str>"));

    result.addElement(new Option("\tDon't write a header row.", "N", 0, "-N"));

    result.addElement(new Option(
      "\tThe maximum number of digits to print after the decimal\n"
        + "\tplace for numeric values (default: 6)", "decimal", 1,
      "-decimal <num>"));

    // the superclass still exposes a raw Enumeration, hence the cast
    Enumeration en = super.listOptions();
    while (en.hasMoreElements()) {
      result.addElement((Option) en.nextElement());
    }

    return result.elements();
  }

  /**
   * Parses a given list of options.
   * <p/>
   *
   <!-- options-start -->
   * Valid options are: <p/>
   *
   * <pre> -F &lt;separator&gt;
   *  The field separator to be used.
   *  '\t' can be used as well.
   *  (default: ',')</pre>
   *
   * <pre> -M &lt;str&gt;
   *  The string representing a missing value.
   *  (default: ?)</pre>
   *
   * <pre> -N
   *  Don't write a header row.</pre>
   *
   * <pre> -decimal &lt;num&gt;
   *  The maximum number of digits to print after the decimal
   *  place for numeric values (default: 6)</pre>
   *
   * <pre> -i &lt;the input file&gt;
   *  The input file</pre>
   *
   * <pre> -o &lt;the output file&gt;
   *  The output file</pre>
   *
   <!-- options-end -->
   *
   * @param options the list of options as an array of strings
   * @throws Exception if an option is not supported
   */
  @Override
  public void setOptions(String[] options) throws Exception {
    String tmpStr;

    // -F and -M fall back to their defaults when not supplied
    tmpStr = Utils.getOption('F', options);
    if (tmpStr.length() != 0) {
      setFieldSeparator(tmpStr);
    } else {
      setFieldSeparator(",");
    }

    tmpStr = Utils.getOption('M', options);
    if (tmpStr.length() != 0) {
      setMissingValue(tmpStr);
    } else {
      setMissingValue("?");
    }

    setNoHeaderRow(Utils.getFlag('N', options));

    // -decimal keeps its current value when not supplied
    tmpStr = Utils.getOption("decimal", options);
    if (tmpStr.length() > 0) {
      setMaxDecimalPlaces(Integer.parseInt(tmpStr));
    }

    super.setOptions(options);
  }

  /**
   * Gets the current settings of the Saver.
   *
   * @return an array of strings suitable for passing to setOptions
   */
  @Override
  public String[] getOptions() {
    Vector<String> result = new Vector<String>();

    result.add("-F");
    result.add(getFieldSeparator());

    result.add("-M");
    result.add(getMissingValue());

    if (getNoHeaderRow()) {
      result.add("-N");
    }

    result.add("-decimal");
    result.add("" + getMaxDecimalPlaces());

    String[] options = super.getOptions();
    for (int i = 0; i < options.length; i++) {
      result.add(options[i]);
    }

    return result.toArray(new String[result.size()]);
  }

  /**
   * Returns the tip text for this property.
   *
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String noHeaderRowTipText() {
    return "If true then the header row is not written";
  }

  /**
   * Set whether to not write the header row.
   *
   * @param b true if no header row is to be written
   */
  public void setNoHeaderRow(boolean b) {
    m_noHeaderRow = b;
  }

  /**
   * Get whether to not write the header row.
   *
   * @return true if no header row is to be written
   */
  public boolean getNoHeaderRow() {
    return m_noHeaderRow;
  }

  /**
   * Set the maximum number of decimal places to print.
   *
   * @param maxDecimal the maximum number of decimal places to print
   */
  public void setMaxDecimalPlaces(int maxDecimal) {
    m_MaxDecimalPlaces = maxDecimal;
  }

  /**
   * Get the maximum number of decimal places to print.
   *
   * @return the maximum number of decimal places to print
   */
  public int getMaxDecimalPlaces() {
    return m_MaxDecimalPlaces;
  }

  /**
   * Returns the tip text for this property.
   *
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String maxDecimalPlacesTipText() {
    return "The maximum number of digits to print after the decimal "
      + "point for numeric values";
  }

  /**
   * Sets the string used as column separator. Back-quoted sequences in the
   * value are unescaped via {@link Utils#unbackQuoteChars(String)} before
   * being stored.
   *
   * @param value the separator to use
   */
  public void setFieldSeparator(String value) {
    // Separators longer than one character are accepted as-is; note that
    // instanceToString() only quotes fields for single-character separators.
    m_FieldSeparator = Utils.unbackQuoteChars(value);
  }

  /**
   * Returns the string used as column separator, back-quoted via
   * {@link Utils#backQuoteChars(String)}.
   *
   * @return the separator in use
   */
  public String getFieldSeparator() {
    return Utils.backQuoteChars(m_FieldSeparator);
  }

  /**
   * Returns the tip text for this property.
   *
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String fieldSeparatorTipText() {
    return "The character to use as separator for the columns/fields (use '\\t' for TAB).";
  }

  /**
   * Sets the placeholder for missing values.
   *
   * @param value the placeholder
   */
  public void setMissingValue(String value) {
    m_MissingValue = value;
  }

  /**
   * Returns the current placeholder for missing values.
   *
   * @return the placeholder
   */
  public String getMissingValue() {
    return m_MissingValue;
  }

  /**
   * Returns the tip text for this property.
   *
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String missingValueTipText() {
    return "The placeholder for missing values, default is '?'.";
  }

  /**
   * Returns a description of the file type.
   *
   * @return a short file description
   */
  @Override
  public String getFileDescription() {
    return "CSV file: comma separated files";
  }

  /**
   * Resets the Saver and sets the file extension to ".csv".
   */
  @Override
  public void resetOptions() {
    super.resetOptions();
    setFileExtension(".csv");
  }

  /**
   * Returns the Capabilities of this saver.
   *
   * @return the capabilities of this object
   * @see Capabilities
   */
  @Override
  public Capabilities getCapabilities() {
    Capabilities result = super.getCapabilities();

    // attributes
    result.enable(Capability.NOMINAL_ATTRIBUTES);
    result.enable(Capability.NUMERIC_ATTRIBUTES);
    result.enable(Capability.DATE_ATTRIBUTES);
    result.enable(Capability.STRING_ATTRIBUTES);
    result.enable(Capability.MISSING_VALUES);

    // class
    result.enable(Capability.NOMINAL_CLASS);
    result.enable(Capability.NUMERIC_CLASS);
    result.enable(Capability.DATE_CLASS);
    result.enable(Capability.STRING_CLASS);
    result.enable(Capability.MISSING_CLASS_VALUES);
    result.enable(Capability.NO_CLASS);

    return result;
  }

  /**
   * Saves an instance incrementally. Structure has to be set by using the
   * setStructure() method or setInstances() method. Passing {@code null} as
   * instance signals the end of the data: the writer (if any) is flushed and
   * closed and the saver's structure and writer are reset.
   * <p/>
   * Output goes to standard out when neither a file nor a writer is
   * available. Data rows are formatted with
   * {@link #instanceToString(Instance)} for both destinations, so the
   * configured separator, missing value placeholder and decimal places apply
   * to standard out as well.
   *
   * @param inst the instance to save, or null to finish writing
   * @throws IOException if an instance cannot be saved incrementally.
   */
  @Override
  public void writeIncremental(Instance inst) throws IOException {
    int writeMode = getWriteMode();
    Instances structure = getInstances();
    PrintWriter outW = null;

    if (getRetrieval() == BATCH || getRetrieval() == NONE) {
      throw new IOException("Batch and incremental saving cannot be mixed.");
    }
    if (getWriter() != null) {
      outW = new PrintWriter(getWriter());
    }

    if (writeMode == WAIT) {
      if (structure == null) {
        setWriteMode(CANCEL);
        if (inst != null) {
          System.err
            .println("Structure(Header Information) has to be set in advance");
        }
      } else {
        setWriteMode(STRUCTURE_READY);
      }
      writeMode = getWriteMode();
    }

    if (writeMode == CANCEL) {
      if (outW != null) {
        outW.close();
      }
      cancel();
    }

    if (writeMode == STRUCTURE_READY) {
      setWriteMode(WRITE);
      // write header (attribute names as first row)
      if (!getNoHeaderRow()) {
        if (retrieveFile() == null && outW == null) {
          System.out.print(headerRow(structure, false));
        } else {
          outW.print(headerRow(structure, false));
          outW.flush();
        }
      }
      writeMode = getWriteMode();
    }

    if (writeMode == WRITE) {
      if (structure == null) {
        throw new IOException("No instances information available.");
      }
      if (inst != null) {
        // write instance
        if (retrieveFile() == null && outW == null) {
          // format exactly like the writer path so that separator, missing
          // value placeholder and decimal places are honoured on stdout too
          System.out.println(instanceToString(inst));
        } else {
          outW.println(instanceToString(inst));
          // flush once the counter exceeds 100 buffered instances
          m_incrementalCounter++;
          if (m_incrementalCounter > 100) {
            m_incrementalCounter = 0;
            outW.flush();
          }
        }
      } else {
        // end of data: close the writer and reset the saver
        if (outW != null) {
          outW.flush();
          outW.close();
        }
        m_incrementalCounter = 0;
        resetStructure();
        resetWriter();
      }
    }
  }

  /**
   * Writes a Batch of instances. Output goes to standard out when neither a
   * file nor a writer is available; otherwise it goes to the writer, which is
   * closed afterwards. The header row is omitted for both destinations when
   * {@link #getNoHeaderRow()} is true.
   *
   * @throws IOException if saving in batch mode is not possible
   */
  @Override
  public void writeBatch() throws IOException {
    if (getInstances() == null) {
      throw new IOException("No instances to save");
    }
    if (getRetrieval() == INCREMENTAL) {
      throw new IOException("Batch and incremental saving cannot be mixed.");
    }

    setRetrieval(BATCH);
    setWriteMode(WRITE);

    Instances data = getInstances();

    if (retrieveFile() == null && getWriter() == null) {
      if (!getNoHeaderRow()) {
        // attribute names as first row (unquoted on standard out)
        System.out.print(headerRow(data, false));
      }
      for (int i = 0; i < data.numInstances(); i++) {
        System.out.println(instanceToString(data.instance(i)));
      }
      setWriteMode(WAIT);
      return;
    }

    PrintWriter outW = new PrintWriter(getWriter());
    if (!getNoHeaderRow()) {
      // attribute names as first row, quoted where necessary
      outW.print(headerRow(data, true));
    }
    for (int i = 0; i < data.numInstances(); i++) {
      outW.println(instanceToString(data.instance(i)));
    }
    outW.flush();
    outW.close();

    setWriteMode(WAIT);
    resetWriter();
    setWriteMode(CANCEL);
  }

  /**
   * Builds the header row: the attribute names joined by the field separator
   * and terminated by the platform line separator. For a dataset without
   * attributes the empty string is returned, so nothing gets printed.
   *
   * @param data the dataset (or structure) providing the attribute names
   * @param quoteNames whether to pass each name through
   *          {@link Utils#quote(String)}
   * @return the header row including the trailing line separator
   */
  private String headerRow(Instances data, boolean quoteNames) {
    StringBuilder row = new StringBuilder();
    int last = data.numAttributes() - 1;

    for (int i = 0; i <= last; i++) {
      String name = data.attribute(i).name();
      row.append(quoteNames ? Utils.quote(name) : name);
      if (i < last) {
        row.append(m_FieldSeparator);
      } else {
        row.append(System.getProperty("line.separator"));
      }
    }

    return row.toString();
  }

  /**
   * Turns an instance into a string. Takes care of sparse instances as well
   * by converting them to a dense instance first.
   *
   * @param inst the instance to turn into a string
   * @return the generated string
   */
  protected String instanceToString(Instance inst) {
    Instance outInst;

    if (inst instanceof SparseInstance) {
      outInst = new DenseInstance(inst.weight(), inst.toDoubleArray());
      outInst.setDataset(inst.dataset());
    } else {
      outInst = inst;
    }

    StringBuilder result = new StringBuilder();
    for (int i = 0; i < outInst.numAttributes(); i++) {
      if (i > 0) {
        result.append(m_FieldSeparator);
      }

      String field;
      if (outInst.isMissing(i)) {
        field = m_MissingValue;
      } else {
        field = outInst.toString(i, m_MaxDecimalPlaces);
      }

      // make sure that custom field separators, like ";" get quoted correctly
      // as well (but only for single character field separators)
      if (m_FieldSeparator.length() == 1
        && (field.indexOf(m_FieldSeparator) > -1) && !field.startsWith("'")
        && !field.endsWith("'")) {
        field = "'" + field + "'";
      }

      result.append(field);
    }

    return result.toString();
  }

  /**
   * Returns the revision string.
   *
   * @return the revision
   */
  @Override
  public String getRevision() {
    return RevisionUtils.extract("$Revision: 9870 $");
  }

  /**
   * Main method.
   *
   * @param args should contain the options of a Saver.
   */
  public static void main(String[] args) {
    runFileSaver(new CSVSaver(), args);
  }
}