/* * RapidMiner * * Copyright (C) 2001-2008 by Rapid-I and the contributors * * Complete list of developers available at our web site: * * http://rapid-i.com * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see http://www.gnu.org/licenses/. */ package com.rapidminer.tools.att; import java.io.File; import java.io.IOException; import java.io.PrintWriter; import java.util.Iterator; import java.util.LinkedList; import java.util.List; import java.util.StringTokenizer; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; import org.w3c.dom.Attr; import org.w3c.dom.Document; import org.w3c.dom.Element; import org.w3c.dom.Node; import org.w3c.dom.NodeList; import org.xml.sax.SAXException; import com.rapidminer.example.Attribute; import com.rapidminer.example.Attributes; import com.rapidminer.example.table.AttributeFactory; import com.rapidminer.tools.LogService; import com.rapidminer.tools.LoggingHandler; import com.rapidminer.tools.Ontology; import com.rapidminer.tools.Tools; import com.rapidminer.tools.XMLException; /** * Reference to source of an attribute, i.e. file, column number (token number). * Statics methods of this class can be used to parse an attribute description * file. * * @author Ingo Mierswa, Simon Fischer * @version $Id: AttributeDataSource.java,v 2.22 2006/03/27 13:22:03 ingomierswa * Exp $ */ public class AttributeDataSource { private File file; private int column; private Attribute attribute; private String attributeType; public AttributeDataSource(Attribute attribute, File file, int column, String attributeType) { this.attribute = attribute; this.file = file; this.column = column; this.attributeType = attributeType; } public void setAttribute(Attribute attribute) { this.attribute = attribute; } public Attribute getAttribute() { return attribute; } public int getColumn() { return column; } public File getFile() { return file; } public void setType(String type) { this.attributeType = type; } public String getType() { return attributeType; } public void setSource(File file, int column) { this.file = file; this.column = column; } public void writeXML(PrintWriter out, File defaultSource) throws IOException { out.println(" <" + Tools.escapeXML(attributeType)); out.println(" name = \"" + Tools.escapeXML(attribute.getName()) + "\""); if (!getFile().equals(defaultSource)) { out.println(" sourcefile = \"" + getFile().getAbsolutePath() + "\""); } out.println(" sourcecol = \"" + (getColumn() + 1) + "\""); out.print(" valuetype = \"" + Ontology.ATTRIBUTE_VALUE_TYPE.mapIndex(attribute.getValueType()) + "\""); if (!Ontology.ATTRIBUTE_BLOCK_TYPE.isA(attribute.getBlockType(), Ontology.SINGLE_VALUE)) out.print(Tools.getLineSeparator() + " blocktype = \"" + Ontology.ATTRIBUTE_BLOCK_TYPE.mapIndex(attribute.getBlockType()) + "\""); if ((Ontology.ATTRIBUTE_VALUE_TYPE.isA(attribute.getValueType(), Ontology.NOMINAL)) && (!attributeType.equals(Attributes.KNOWN_ATTRIBUTE_TYPES[Attributes.TYPE_ID]))) { out.println(">"); Iterator<String> i = attribute.getMapping().getValues().iterator(); while (i.hasNext()) { out.println(" <value>" + Tools.escapeXML(i.next()) + "</value>"); } out.println(" </" + Tools.escapeXML(attributeType) + ">" + Tools.getLineSeparator()); } else { // no values, simply end this attribute out.println("/>" + Tools.getLineSeparator()); } } /** Returns a list of {@link AttributeDataSource}s read from the file. */ public static AttributeDataSources createAttributeDataSources(File attributeDescriptionFile, boolean sourceColRequired, LoggingHandler logging) throws XMLException, ParserConfigurationException, SAXException, IOException { Document document = DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(attributeDescriptionFile); Element attributeSet = document.getDocumentElement(); if (!attributeSet.getTagName().equals("attributeset")) { throw new XMLException("Outer tag of attribute description file must be <attributeset>"); } File defaultSource = null; if (attributeSet.getAttribute("default_source") != null) { defaultSource = Tools.getFile(attributeDescriptionFile.getParentFile(), attributeSet.getAttribute("default_source")); } List<AttributeDataSource> attributeDataSources = new LinkedList<AttributeDataSource>(); NodeList attributes = attributeSet.getChildNodes(); for (int i = 0; i < attributes.getLength(); i++) { Node node = attributes.item(i); if (node instanceof Element) { Element attributeTag = (Element) node; String type = attributeTag.getTagName(); String name = attributeTag.getAttribute("name"); String file = null; Attr fileAttr = attributeTag.getAttributeNode("sourcefile"); if (fileAttr != null) file = fileAttr.getValue(); int firstSourceCol = -1; Attr sourcecolAttr = attributeTag.getAttributeNode("sourcecol"); if (sourcecolAttr != null) { if (sourcecolAttr.getValue().equals("none")) { firstSourceCol = -1; } else { try { firstSourceCol = Integer.parseInt(sourcecolAttr.getValue()) - 1; } catch (NumberFormatException e) { throw new XMLException("Attribute sourcecol must be 'none' or an integer (was: '" + sourcecolAttr.getValue() + "')!"); } } } int lastSourceCol = -1; Attr sourceEndAttr = attributeTag.getAttributeNode("sourcecol_end"); if (sourceEndAttr != null) { try { lastSourceCol = Integer.parseInt(sourceEndAttr.getValue()) - 1; } catch (NumberFormatException e) { throw new XMLException("Attribute sourcecol_end must be 'none' or an integer (was: '" + sourceEndAttr.getValue() + "')!"); } } int valueType = Ontology.VALUE_TYPE; Attr valueTypeAttr = attributeTag.getAttributeNode("valuetype"); if (valueTypeAttr != null) { try { valueType = Integer.parseInt(valueTypeAttr.getValue()); } catch (NumberFormatException e) { valueType = Ontology.ATTRIBUTE_VALUE_TYPE.mapName(valueTypeAttr.getValue()); if (valueType < 0) throw new XMLException("valuetype must be an index number or a legal value type name (was: '" + valueTypeAttr.getValue() + "')"); } } int blockType = Ontology.SINGLE_VALUE; Attr blockTypeAttr = attributeTag.getAttributeNode("blocktype"); if (blockTypeAttr != null) { try { blockType = Integer.parseInt(blockTypeAttr.getValue()); } catch (NumberFormatException e) { blockType = Ontology.ATTRIBUTE_BLOCK_TYPE.mapName(blockTypeAttr.getValue()); if (blockType < 0) throw new XMLException("blocktype must be an index number or a legal block type name (was: '" + blockTypeAttr.getValue() + "')"); } } List<String> classList = null; if (Ontology.ATTRIBUTE_VALUE_TYPE.isA(valueType, Ontology.NOMINAL)) { // nominal? check possible values... classList = new LinkedList<String>(); // try inner tags <value>...</value> NodeList values = attributeTag.getElementsByTagName("value"); for (int v = 0; v < values.getLength(); v++) { Node value = values.item(v); String valueText = value.getTextContent(); classList.add(valueText); } // if list is still empty try deprecated 'classes' attribute Attr classesAttr = attributeTag.getAttributeNode("classes"); if (classesAttr != null) { if (classList.size() == 0) { StringTokenizer tokenizer = new StringTokenizer(classesAttr.getValue()); while (tokenizer.hasMoreTokens()) { classList.add(tokenizer.nextToken()); } } else { logging.logWarning("XML attribute 'classes' ignored since possible values are already defined by inner <value>...</value> tags."); } } if (classList.size() == 0) { // stil empty class list? --> Warning if (type.equals(Attributes.ID_NAME)) { logging.logNote("The ID attribute '" + name + "' is defined with a nominal value type but the possible values are not defined! " + "Although this often does not lead to problems (unlike for labels or regular nominal attributes) you might want " + "to specify the possible values by inner tags <value>first</value><value>second</value>...."); } else if (type.equals(Attributes.LABEL_NAME)) { logging.logError("The label attribute (class) '" + name + "' is defined with a nominal value type but the possible values are not defined! " + "Please specify the possible values by inner tags <value>first</value><value>second</value>.... " + "Otherwise it might happen that the same nominal values of two example sets are handled in different ways which might cause flipped predictions."); } else { logging.logWarning("At least one of the attributes is defined with a nominal value type but the possible values are not defined! " + "Please specify the possible values by inner tags <value>first</value><value>second</value>.... " + "Otherwise it might happen that the same nominal values of two example sets are handled in different ways which might cause less accurate models."); } } } if (lastSourceCol == -1) lastSourceCol = firstSourceCol; if (sourceColRequired) { if (firstSourceCol < 0) throw new XMLException("sourcecol not defined for " + type + " '" + name + "'!"); if (lastSourceCol < firstSourceCol) throw new XMLException("sourcecol < sourcecol_end must hold."); } for (int col = firstSourceCol; col <= lastSourceCol; col++) { int thisBlockType = blockType; String theName = name; if (lastSourceCol > firstSourceCol) { theName = name + "_" + (col + 1); if ((col == firstSourceCol) && (blockType == Ontology.VALUE_SERIES)) thisBlockType = Ontology.VALUE_SERIES_START; if ((col == lastSourceCol) && (blockType == Ontology.VALUE_SERIES)) thisBlockType = Ontology.VALUE_SERIES_END; } Attribute attribute = AttributeFactory.createAttribute(theName, valueType, thisBlockType); if (attribute.isNominal() && (classList != null)) { Iterator c = classList.iterator(); while (c.hasNext()) { attribute.getMapping().mapString((String) c.next()); } } if (!attribute.isNominal() && (classList != null) && (classList.size() != 0)) { LogService.getGlobal().log("Ignoring classes for non-nominal attribute " + theName + ".", LogService.WARNING); } attributeDataSources.add(new AttributeDataSource(attribute, (file != null) ? Tools.getFile(attributeDescriptionFile.getParentFile(), file) : defaultSource, col, type)); } } } return new AttributeDataSources(attributeDataSources, defaultSource); } public String toString() { return attribute.getName() + " (type: " + attributeType + ", value type: " + Ontology.VALUE_TYPE_NAMES[attribute.getValueType()] + ") from " + file.getName() + " (" + column + ")"; } }