/* * RapidMiner * * Copyright (C) 2001-2011 by Rapid-I and the contributors * * Complete list of developers available at our web site: * * http://rapid-i.com * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see http://www.gnu.org/licenses/. */ package com.rapidminer.example.set; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStream; import java.io.OutputStreamWriter; import java.io.PrintWriter; import java.nio.charset.Charset; import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.zip.GZIPOutputStream; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; import org.w3c.dom.Document; import org.w3c.dom.Element; import com.rapidminer.datatable.DataTable; import com.rapidminer.datatable.DataTableExampleSetAdapter; import com.rapidminer.example.Attribute; import com.rapidminer.example.AttributeRole; import com.rapidminer.example.AttributeWeights; import com.rapidminer.example.Attributes; import com.rapidminer.example.Example; import com.rapidminer.example.ExampleSet; import com.rapidminer.example.Statistics; import com.rapidminer.example.table.SparseFormatDataRowReader; import com.rapidminer.io.process.XMLTools; import com.rapidminer.operator.IOContainer; import com.rapidminer.operator.IOObject; import com.rapidminer.operator.MissingIOObjectException; import com.rapidminer.operator.ResultObjectAdapter; import com.rapidminer.tools.Ontology; import com.rapidminer.tools.Tools; import com.rapidminer.tools.XMLException; /** * Implements wrapper methods of abstract example set. Implements all * ResultObject methods.<br> * * Apart from the interface methods the implementing classes must have a public * single argument clone constructor. This constructor is invoked by reflection * from the clone method. Do not forget to call the superclass method. * * @author Ingo Mierswa, Simon Fischer */ public abstract class AbstractExampleSet extends ResultObjectAdapter implements ExampleSet { private static final long serialVersionUID = 8596141056047402798L; /** Maps attribute names to list of statistics objects. */ private final Map<String, List<Statistics>> statisticsMap = new HashMap<String, List<Statistics>>(); /** Maps the id values on the line index in the example table. */ private Map<Double, int[]> idMap = new HashMap<Double, int[]>(); /** This method overrides the implementation of ResultObjectAdapter and returns "ExampleSet". */ @Override public String getName() { return "ExampleSet"; } public Example getExampleFromId(double id) { int[] indices = idMap.get(id); if (indices != null && indices.length > 0) { return getExample(indices[0]); } else { return null; } } public int[] getExampleIndicesFromId(double id) { return idMap.get(id); } // --- Visualisation and toString() methods --- @Override public String toString() { StringBuffer str = new StringBuffer(this.getClass().getSimpleName() + ":" + Tools.getLineSeparator()); str.append(size() + " examples," + Tools.getLineSeparator()); str.append(getAttributes().size() + " regular attributes," + Tools.getLineSeparator()); boolean first = true; Iterator<AttributeRole> s = getAttributes().specialAttributes(); while (s.hasNext()) { if (first) { str.append("special attributes = {" + Tools.getLineSeparator()); first = false; } AttributeRole special = s.next(); str.append(" " + special.getSpecialName() + " = " + special.getAttribute() + Tools.getLineSeparator()); } if (!first) { str.append("}"); } else { str.append("no special attributes" + Tools.getLineSeparator()); } return str.toString(); } /** This method is used to create a {@link DataTable} from this example set. The default implementation * returns an instance of {@link DataTableExampleSetAdapter}. The given IOContainer is used to check if * there are compatible attribute weights which would used as column weights of the returned table. * Subclasses might want to override this method in order to allow for other data tables. */ public DataTable createDataTable(IOContainer container) { AttributeWeights weights = null; if (container != null) { try { weights = container.get(AttributeWeights.class); for (Attribute attribute : getAttributes()) { double weight = weights.getWeight(attribute.getName()); if (Double.isNaN(weight)) { // not compatible weights = null; break; } } } catch (MissingIOObjectException e) {} } return new DataTableExampleSetAdapter(this, weights); } // -------------------- File Writing -------------------- public void writeDataFile(File dataFile, int fractionDigits, boolean quoteNominal, boolean zipped, boolean append, Charset encoding) throws IOException { PrintWriter out = null; OutputStream outStream = null; try { if (zipped) { outStream = new GZIPOutputStream(new FileOutputStream(dataFile, append)); } else { outStream = new FileOutputStream(dataFile, append); } out = new PrintWriter(new OutputStreamWriter(outStream, encoding)); Iterator<Example> reader = iterator(); while (reader.hasNext()) { out.println(reader.next().toDenseString(fractionDigits, quoteNominal)); } } catch (IOException e) { throw e; } finally { if (out != null) { out.close(); } if (outStream != null) { outStream.close(); } } } /** Writes the data into a sparse file format. */ public void writeSparseDataFile(File dataFile, int format, int fractionDigits, boolean quoteNominal, boolean zipped, boolean append, Charset encoding) throws IOException { PrintWriter out = null; OutputStream outStream = null; try { if (zipped) { outStream = new GZIPOutputStream(new FileOutputStream(dataFile, append)); } else { outStream = new FileOutputStream(dataFile, append); } out = new PrintWriter(new OutputStreamWriter(outStream, encoding)); Iterator<Example> reader = iterator(); while (reader.hasNext()) { out.println(reader.next().toSparseString(format, fractionDigits, quoteNominal)); } } catch (IOException e) { throw e; } finally { if (out != null) { out.close(); } if (outStream != null) { outStream.close(); } } } /** * Writes the attribute descriptions for all examples. Writes first all * regular attributes and then the special attributes (just like the data * write format of {@link Example#toString()}. Please note that the given * data file will only be used to determine the relative position. */ public void writeAttributeFile(File attFile, File dataFile, Charset encoding) throws IOException { // determine relative path if (dataFile == null) throw new IOException("ExampleSet writing: cannot determine path to data file: data file was not given!"); String relativePath = Tools.getRelativePath(dataFile, attFile); try { // building DOM Document document = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument(); Element root = document.createElement("attributeset"); root.setAttribute("default_source", relativePath); root.setAttribute("encoding", encoding.name()); document.appendChild(root); int sourcecol = 1; Iterator<AttributeRole> i = getAttributes().allAttributeRoles(); while (i.hasNext()) { root.appendChild(writeAttributeMetaData(i.next(), sourcecol, document, false)); sourcecol++; } // writing XML from DOM PrintWriter writer = new PrintWriter(new OutputStreamWriter(new FileOutputStream(attFile), encoding)); writer.print(XMLTools.toString(document, encoding)); writer.close(); } catch (ParserConfigurationException e) { throw new IOException("Cannot create XML document builder: "+e, e); } catch (XMLException e) { throw new IOException("Could not format XML document:" + e, e); } } /** * Writes the attribute descriptions for all examples. Writes only the * special attributes which are supported by the sparse format of the method * {@link Example#toSparseString(int, int, boolean)}. Please note that the given data * file is only be used to determine the relative position. */ public void writeSparseAttributeFile(File attFile, File dataFile, int format, Charset encoding) throws IOException { if (dataFile == null) throw new IOException("ExampleSet sparse writing: cannot determine path to data file: data file was not given!"); String relativePath = Tools.getRelativePath(dataFile, attFile); try { // building DOM Document document = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument(); Element root = document.createElement("attributeset"); root.setAttribute("default_source", relativePath); root.setAttribute("encoding", encoding.name()); document.appendChild(root); // special attributes AttributeRole labelRole = getAttributes().getRole(Attributes.LABEL_NAME); if ((labelRole != null) && (format != SparseFormatDataRowReader.FORMAT_NO_LABEL)) root.appendChild(writeAttributeMetaData(labelRole, 0, document, true)); AttributeRole idRole = getAttributes().getRole(Attributes.ID_NAME); if (idRole != null) root.appendChild(writeAttributeMetaData(idRole, 0, document, true)); AttributeRole weightRole = getAttributes().getRole(Attributes.WEIGHT_NAME); if (weightRole != null) root.appendChild(writeAttributeMetaData(weightRole, 0, document, true)); // regular attributes int sourcecol = 1; for (Attribute attribute: getAttributes()) { root.appendChild(writeAttributeMetaData("attribute", attribute, sourcecol, document, false)); sourcecol++; } // writing XML from DOM PrintWriter writer = new PrintWriter(new OutputStreamWriter(new FileOutputStream(attFile), encoding)); writer.print(XMLTools.toString(document, encoding)); writer.close(); } catch (ParserConfigurationException e) { throw new IOException("Cannot create XML document builder: "+e, e); } catch (XMLException e) { throw new IOException("Could not format XML document:" + e, e); } } /** Writes the data of this attribute in the given stream. */ private Element writeAttributeMetaData(AttributeRole attributeRole, int sourcecol, Document document, boolean sparse) { String tag = "attribute"; if (attributeRole.isSpecial()) tag = attributeRole.getSpecialName(); Attribute attribute = attributeRole.getAttribute(); return writeAttributeMetaData(tag, attribute, sourcecol, document, sparse); } /** Writes the data of this attribute in the given stream. */ private Element writeAttributeMetaData(String tag, Attribute attribute, int sourcecol, Document document, boolean sparse) { Element attributeElement = document.createElement(tag); attributeElement.setAttribute("name", attribute.getName()); if (!sparse || tag.equals("attribute")) { attributeElement.setAttribute("sourcecol", sourcecol + ""); } attributeElement.setAttribute("valuetype", Ontology.ATTRIBUTE_VALUE_TYPE.mapIndex(attribute.getValueType())); if (!Ontology.ATTRIBUTE_BLOCK_TYPE.isA(attribute.getBlockType(), Ontology.SINGLE_VALUE)) attributeElement.setAttribute("blocktype", Ontology.ATTRIBUTE_BLOCK_TYPE.mapIndex(attribute.getBlockType())); // nominal values if ((Ontology.ATTRIBUTE_VALUE_TYPE.isA(attribute.getValueType(), Ontology.NOMINAL)) && (!tag.equals(Attributes.KNOWN_ATTRIBUTE_TYPES[Attributes.TYPE_ID]))) { for (String nominalValue: attribute.getMapping().getValues()) { Element valueElement = document.createElement("value"); valueElement.setTextContent(nominalValue); attributeElement.appendChild(valueElement); } } return attributeElement; } public String getExtension() { return "aml"; } public String getFileDescription() { return "attribute description file"; } /** * Returns true, if all attributes including labels and other special * attributes are equal. */ @Override public boolean equals(Object o) { if (!(o instanceof ExampleSet)) { return false; } ExampleSet es = (ExampleSet) o; return getAttributes().equals(es.getAttributes()); } /** Returns the hash code of all attributes. */ @Override public int hashCode() { return getAttributes().hashCode(); } @Override public IOObject copy() { return (IOObject)clone(); } /** Clones the example set by invoking a single argument clone constructor. Please note that a cloned * example set has no information about the attribute statistics. That means, that attribute statistics * must be (re-)calculated after the clone was created. */ @Override public Object clone() { try { Class<? extends AbstractExampleSet> clazz = getClass(); java.lang.reflect.Constructor cloneConstructor = clazz.getConstructor(new Class[] { clazz }); AbstractExampleSet result = (AbstractExampleSet)cloneConstructor.newInstance(new Object[] { this }); result.idMap = this.idMap; return result; } catch (IllegalAccessException e) { throw new RuntimeException("Cannot clone ExampleSet: " + e.getMessage()); } catch (NoSuchMethodException e) { throw new RuntimeException("'" + getClass().getName() + "' does not implement clone constructor!"); } catch (java.lang.reflect.InvocationTargetException e) { throw new RuntimeException("Cannot clone " + getClass().getName() + ": " + e + ". Target: " + e.getTargetException() + ". Cause: " + e.getCause() + "."); } catch (InstantiationException e) { throw new RuntimeException("Cannot clone " + getClass().getName() + ": " + e); } } // ============================================================================= public void remapIds() { idMap = new HashMap<Double, int[]>(size()); Attribute idAttribute = getAttributes().getSpecial(Attributes.ID_NAME); if (idAttribute != null) { int index = 0; for (Example example : this) { double value = example.getValue(idAttribute); if (!Double.isNaN(value)) { if (idMap.containsKey(value)) { int[] indices = idMap.get(value); int[] newIndices = new int[indices.length + 1]; for (int i = 0; i < indices.length; i++) { newIndices[i] = indices[i]; } newIndices[newIndices.length - 1] = index; idMap.put(value, newIndices); } else { idMap.put(value, new int[] { index }); } } index++; } } } // ============================================================================= /** * Recalculates the attribute statistics for all attributes. They are * average value, variance, minimum, and maximum. For nominal attributes the * occurences for all values are counted. This method collects all * attributes (regular and special) in a list and invokes * <code>recalculateAttributeStatistics(List attributes)</code> and * performs only one data scan. */ public void recalculateAllAttributeStatistics() { List<Attribute> allAttributes = new ArrayList<Attribute>(); Iterator<Attribute> a = getAttributes().allAttributes(); while (a.hasNext()) { allAttributes.add(a.next()); } recalculateAttributeStatistics(allAttributes); } /** Recalculate the attribute statistics of the given attribute. */ public void recalculateAttributeStatistics(Attribute attribute) { List<Attribute> allAttributes = new ArrayList<Attribute>(); allAttributes.add(attribute); recalculateAttributeStatistics(allAttributes); } /** * Here the Example Set is parsed only once, all the information is retained * for each example set. */ private void recalculateAttributeStatistics(List<Attribute> attributeList) { // do nothing if not desired if (attributeList.size() == 0) { return; } else { // init statistics for (Attribute attribute : attributeList) { Iterator<Statistics> stats = attribute.getAllStatistics(); while (stats.hasNext()) { Statistics statistics = stats.next(); statistics.startCounting(attribute); } } // calculate statistics Attribute weightAttribute = getAttributes().getWeight(); if ((weightAttribute != null) && (!weightAttribute.isNumerical())) // use only numerical weights weightAttribute = null; for (Example example : this) { for (Attribute attribute : attributeList) { double value = example.getValue(attribute); double weight = 1.0d; if (weightAttribute != null) { weight = example.getValue(weightAttribute); } Iterator<Statistics> stats = attribute.getAllStatistics(); while (stats.hasNext()) { Statistics statistics = stats.next(); statistics.count(value, weight); } } } // store cloned statistics for (Attribute attribute : attributeList) { List<Statistics> statisticsList = statisticsMap.get(attribute.getName()); // no stats known for this attribute at all --> new list if (statisticsList == null) { statisticsList = new LinkedList<Statistics>(); statisticsMap.put(attribute.getName(), statisticsList); } // in all cases: clear the list before adding new stats (clone of the calculations) statisticsList.clear(); Iterator<Statistics> stats = attribute.getAllStatistics(); while (stats.hasNext()) { Statistics statistics = (Statistics)stats.next().clone(); statisticsList.add(statistics); } } } } /** Returns the desired statistic for the given attribute. This method should be * preferred over the deprecated method Attribute#getStatistics(String) * since it correctly calculates and keep the statistics for the current example * set and does not overwrite the statistics in the attribute. * Invokes the method {@link #getStatistics(Attribute, String, String)} with a null * statistics parameter. */ public double getStatistics(Attribute attribute, String statisticsName) { return getStatistics(attribute, statisticsName, null); } /** Returns the desired statistic for the given attribute. This method should be * preferred over the deprecated method Attribute#getStatistics(String) * since it correctly calculates and keep the statistics for the current example * set and does not overwrite the statistics in the attribute. If the statistics * were not calculated before (via one of the recalculate methods) this method * will return NaN. If no statistics is available for the given name, also NaN * is returned. */ public double getStatistics(Attribute attribute, String statisticsName, String statisticsParameter) { List<Statistics> statisticsList = statisticsMap.get(attribute.getName()); if (statisticsList == null) return Double.NaN; for (Statistics statistics : statisticsList) { if (statistics.handleStatistics(statisticsName)) { return statistics.getStatistics(attribute, statisticsName, statisticsParameter); } } return Double.NaN; } }