/* * RapidMiner * * Copyright (C) 2001-2008 by Rapid-I and the contributors * * Complete list of developers available at our web site: * * http://rapid-i.com * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see http://www.gnu.org/licenses/. */ package com.rapidminer.operator.io; import java.io.File; import java.io.IOException; import java.util.LinkedList; import java.util.List; import java.util.Random; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; import org.w3c.dom.Document; import org.w3c.dom.Element; import org.w3c.dom.Node; import org.w3c.dom.NodeList; import org.xml.sax.SAXException; import com.rapidminer.example.Attribute; import com.rapidminer.example.ExampleSet; import com.rapidminer.example.table.AttributeFactory; import com.rapidminer.example.table.DataRowFactory; import com.rapidminer.example.table.MemoryExampleTable; import com.rapidminer.operator.IOObject; import com.rapidminer.operator.Operator; import com.rapidminer.operator.OperatorDescription; import com.rapidminer.operator.OperatorException; import com.rapidminer.operator.UserError; import com.rapidminer.parameter.ParameterType; import com.rapidminer.parameter.ParameterTypeCategory; import com.rapidminer.parameter.ParameterTypeDouble; import com.rapidminer.parameter.ParameterTypeFile; import com.rapidminer.parameter.ParameterTypeInt; import com.rapidminer.parameter.ParameterTypeString; import com.rapidminer.tools.Ontology; import com.rapidminer.tools.RandomGenerator; /** * <p>This operator can read XRFF files known from Weka. * The XRFF (eXtensible attribute-Relation File Format) is an XML-based extension of the ARFF format * in some sense similar to the original RapidMiner file format for attribute description files (.aml).</p> * * <p>Here you get a small example for the IRIS dataset represented as XRFF file:</p> * * <pre> * <?xml version="1.0" encoding="utf-8"?> * <dataset name="iris" version="3.5.3"> * <header> * <attributes> * <attribute name="sepallength" type="numeric"/> * <attribute name="sepalwidth" type="numeric"/> * <attribute name="petallength" type="numeric"/> * <attribute name="petalwidth" type="numeric"/> * <attribute class="yes" name="class" type="nominal"> * <labels> * <label>Iris-setosa</label> * <label>Iris-versicolor</label> * <label>Iris-virginica</label> * </labels> * </attribute> * </attributes> * </header> * * <body> * <instances> * <instance> * <value>5.1</value> * <value>3.5</value> * <value>1.4</value> * <value>0.2</value> * <value>Iris-setosa</value> * </instance> * <instance> * <value>4.9</value> * <value>3</value> * <value>1.4</value> * <value>0.2</value> * <value>Iris-setosa</value> * </instance> * ... * </instances> * </body> * </dataset> * </pre> * * <p>Please note that the sparse XRFF format is currently not supported, please use one of the * other options for sparse data files provided by RapidMiner.</p> * * <p>Since the XML representation takes up considerably more space since the data is wrapped * into XML tags, one can also compress the data via gzip. RapidMiner automatically recognizes a file * being gzip compressed, if the file's extension is .xrff.gz instead of .xrff.</p> * * <p>Similar to the native RapidMiner data definition via .aml and almost arbitrary data files, the XRFF * format contains some additional features. Via the class="yes" attribute in the attribute * specification in the header, one can define which attribute should used as a prediction label * attribute. Although the RapidMiner terminus for such classes is "label" instead of * "class" we support the terminus class in order to not break compatibility with * original XRFF files.</p> * * <p>Please note that loading attribute weights is currently not supported, please use * the other RapidMiner operators for attribute weight loading and writing for this * purpose.</p> * * <p>Instance weights can be defined via a weight XML attribute in each instance tag. * By default, the weight is 1. Here's an example:</p> * * <pre> * <instance weight="0.75"> * <value>5.1</value> * <value>3.5</value> * <value>1.4</value> * <value>0.2</value> * <value>Iris-setosa</value> * </instance> * </pre> * * <p>Since the XRFF format does not support id attributes one have to use one of the RapidMiner * operators in order to change on of the columns to the id column if desired. This has to be done * after loading the data.</p> * * @rapidminer.index xrff * @author Ingo Mierswa * @version $Id: XrffExampleSource.java,v 1.7 2008/07/07 07:06:39 ingomierswa Exp $ */ public class XrffExampleSource extends Operator { /** The parameter name for "The path to the data file." */ public static final String PARAMETER_DATA_FILE = "data_file"; /** The parameter name for "The (case sensitive) name of the id attribute" */ public static final String PARAMETER_ID_ATTRIBUTE = "id_attribute"; /** The parameter name for "Determines, how the data is represented internally." */ public static final String PARAMETER_DATAMANAGEMENT = "datamanagement"; /** The parameter name for "Character that is used as decimal point." */ public static final String PARAMETER_DECIMAL_POINT_CHARACTER = "decimal_point_character"; /** The parameter name for "The fraction of the data set which should be read (1 = all; only used if sample_size = -1)" */ public static final String PARAMETER_SAMPLE_RATIO = "sample_ratio"; /** The parameter name for "The exact number of samples which should be read (-1 = use sample ratio; if not -1, sample_ratio will not have any effect)" */ public static final String PARAMETER_SAMPLE_SIZE = "sample_size"; /** The parameter name for "Use the given random seed instead of global random numbers (only for permutation, -1: use global)." */ public static final String PARAMETER_LOCAL_RANDOM_SEED = "local_random_seed"; public XrffExampleSource(OperatorDescription description) { super(description); } public IOObject[] apply() throws OperatorException { File file = getParameterAsFile(PARAMETER_DATA_FILE); String idName = getParameterAsString(PARAMETER_ID_ATTRIBUTE); Attribute label = null; Attribute id = null; Attribute weight = null; boolean instanceWeightsUsed = false; MemoryExampleTable table = null; try { Document document = null; try { document = DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(file); } catch (SAXException e1) { throw new IOException(e1.getMessage()); } catch (ParserConfigurationException e1) { throw new IOException(e1.getMessage()); } Element datasetElement = document.getDocumentElement(); if (!datasetElement.getTagName().equals("dataset")) { throw new IOException("Outer tag of XRFF file must be <dataset>."); } // read attribute meta data Element headerElement = retrieveSingleNode(datasetElement, "header"); Element attributesElement = retrieveSingleNode(headerElement, "attributes"); List<Attribute> attributeList = new LinkedList<Attribute>(); NodeList attributes = attributesElement.getChildNodes(); for (int i = 0; i < attributes.getLength(); i++) { Node node = attributes.item(i); if (node instanceof Element) { Element attribute= (Element)node; String tagName = attribute.getTagName(); if (!tagName.equals("attribute")) throw new IOException("Only tags <attribute> are allowed inside <attributes>, was " + tagName); String name = attribute.getAttribute("name"); if (name == null) throw new IOException("The tag <attribute> needs a 'name' attribute."); String classAttribute = attribute.getAttribute("class"); boolean isClass = classAttribute != null && classAttribute.equals("yes"); String valueType = attribute.getAttribute("type"); if (valueType == null) throw new IOException("The tag <attribute> needs a 'type' attribute."); Attribute att = createAttribute(name, valueType); if (att.isNominal()) { Element labelsElement = retrieveSingleNode(attribute, "labels", false); if (labelsElement != null) { NodeList labels = labelsElement.getChildNodes(); for (int j = 0; j < labels.getLength(); j++) { Node labelNode = labels.item(j); if (labelNode instanceof Element) { String labelTagName = labelNode.getNodeName(); if (!labelTagName.equals("label")) throw new IOException("Only tags <label> are allowed inside <labels>, was " + labelTagName); String labelValue = labelNode.getTextContent(); att.getMapping().mapString(labelValue); } } } } if (isClass) label = att; if ((idName != null) && (name.equals(idName))) id = att; attributeList.add(att); } } // create weight attribute for instance weights // remove this later on if no instance weights were defined weight = AttributeFactory.createAttribute("weight", Ontology.REAL); attributeList.add(weight); // read data table = new MemoryExampleTable(attributeList); DataRowFactory factory = new DataRowFactory(getParameterAsInt(PARAMETER_DATAMANAGEMENT), getParameterAsString(PARAMETER_DECIMAL_POINT_CHARACTER).charAt(0)); Attribute[] attributeArray = new Attribute[attributeList.size()]; attributeList.toArray(attributeArray); Element bodyElement = retrieveSingleNode(datasetElement, "body"); Element instancesElement = retrieveSingleNode(bodyElement, "instances"); NodeList instances = instancesElement.getChildNodes(); int maxRows = getParameterAsInt(PARAMETER_SAMPLE_SIZE); double sampleProb = getParameterAsDouble(PARAMETER_SAMPLE_RATIO); Random random = RandomGenerator.getRandomGenerator(getParameterAsInt(PARAMETER_LOCAL_RANDOM_SEED)); int counter = 0; for (int i = 0; i < instances.getLength(); i++) { Node node = instances.item(i); if (node instanceof Element) { Element instance = (Element)node; String tagName = instance.getTagName(); if (!tagName.equals("instance")) throw new IOException("Only tags <instance> are allowed inside <instances>, was " + tagName); NodeList values = instance.getChildNodes(); int elementCount = 0; for (int j = 0; j < values.getLength(); j++) { if (values.item(j) instanceof Element) { elementCount++; } } if (elementCount != attributeList.size() - 1) { // -1 because of the add. weight att throw new IOException("Number of values must be the same than the number of attributes."); } String[] valueArray = new String[attributeList.size()]; int index = 0; for (int j = 0; j < values.getLength(); j++) { Node valueNode = values.item(j); if (valueNode instanceof Element) { Element valueElement = (Element)valueNode; String valueTagName = valueElement.getTagName(); if (!valueTagName.equals("value")) throw new IOException("Only tags <value> are allowed inside <instance>, was " + valueTagName); valueArray[index++] = valueNode.getTextContent(); } } String weightString = instance.getAttribute("weight"); if ((weightString != null) && (weightString.length() > 0)) { valueArray[valueArray.length - 1] = weightString; instanceWeightsUsed = true; } else { valueArray[valueArray.length - 1] = "1.0"; } if ((maxRows > -1) && (counter >= maxRows)) break; counter++; if (maxRows == -1) { if (random.nextDouble() > sampleProb) continue; } table.addDataRow(factory.create(valueArray, attributeArray)); } } } catch (IOException e) { throw new UserError(this, 302, getParameterAsString(PARAMETER_DATA_FILE), e.getMessage()); } ExampleSet result = table.createExampleSet(label, weight, id); if (!instanceWeightsUsed) { result.getAttributes().remove(weight); result.getExampleTable().removeAttribute(weight); } return new IOObject[] { result }; } private Element retrieveSingleNode(Element element, String nodeName) throws IOException { return retrieveSingleNode(element, nodeName, true); } private Element retrieveSingleNode(Element element, String nodeName, boolean exceptionOnFail) throws IOException { NodeList headerElements = element.getElementsByTagName(nodeName); if (headerElements.getLength() == 0) { if (exceptionOnFail) throw new IOException("A dataset must define a <"+nodeName+"> section for attribute meta data description."); else return null; } if (headerElements.getLength() > 1) { if (exceptionOnFail) throw new IOException("A dataset must not define more than one <"+nodeName+"> section."); else return null; } return (Element)headerElements.item(0); } private Attribute createAttribute(String name, String type) { int valueType = Ontology.NOMINAL; if (type.toLowerCase().equals("numeric")) { valueType = Ontology.NUMERICAL; } else if (type.toLowerCase().equals("real")) { valueType = Ontology.REAL; } else if (type.toLowerCase().equals("integer")) { valueType = Ontology.INTEGER; } else if (type.toLowerCase().equals("string")) { valueType = Ontology.STRING; } else if (type.toLowerCase().equals("date")) { valueType = Ontology.DATE; } return AttributeFactory.createAttribute(name, valueType); } public Class<?>[] getInputClasses() { return new Class[0]; } public Class<?>[] getOutputClasses() { return new Class[] { ExampleSet.class }; } public List<ParameterType> getParameterTypes() { List<ParameterType> types = super.getParameterTypes(); types.add(new ParameterTypeFile(PARAMETER_DATA_FILE, "The path to the data file.", "xrff", false)); types.add(new ParameterTypeString(PARAMETER_ID_ATTRIBUTE, "The (case sensitive) name of the id attribute")); types.add(new ParameterTypeCategory(PARAMETER_DATAMANAGEMENT, "Determines, how the data is represented internally.", DataRowFactory.TYPE_NAMES, DataRowFactory.TYPE_DOUBLE_ARRAY)); types.add(new ParameterTypeString(PARAMETER_DECIMAL_POINT_CHARACTER, "Character that is used as decimal point.", ".")); ParameterType type = new ParameterTypeDouble(PARAMETER_SAMPLE_RATIO, "The fraction of the data set which should be read (1 = all; only used if sample_size = -1)", 0.0d, 1.0d, 1.0d); type.setExpert(false); types.add(type); types.add(new ParameterTypeInt(PARAMETER_SAMPLE_SIZE, "The exact number of samples which should be read (-1 = use sample ratio; if not -1, sample_ratio will not have any effect)", -1, Integer.MAX_VALUE, -1)); types.add(new ParameterTypeInt(PARAMETER_LOCAL_RANDOM_SEED, "Use the given random seed instead of global random numbers (only for permutation, -1: use global).", -1, Integer.MAX_VALUE, -1)); return types; } }