/* * RapidMiner * * Copyright (C) 2001-2011 by Rapid-I and the contributors * * Complete list of developers available at our web site: * * http://rapid-i.com * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see http://www.gnu.org/licenses/. */ package com.rapidminer.operator.ports.metadata; import java.io.IOException; import java.io.ObjectInputStream; import java.io.Serializable; import java.util.Date; import java.util.Iterator; import java.util.Set; import java.util.TreeSet; import com.rapidminer.RapidMiner; import com.rapidminer.example.Attribute; import com.rapidminer.example.AttributeRole; import com.rapidminer.example.Attributes; import com.rapidminer.example.ExampleSet; import com.rapidminer.example.Statistics; import com.rapidminer.operator.Annotations; import com.rapidminer.tools.Ontology; import com.rapidminer.tools.ParameterService; import com.rapidminer.tools.Tools; import com.rapidminer.tools.math.container.Range; /** Meta data about an attribute * * @author Simon Fischer * */ public class AttributeMetaData implements Serializable { private static final long serialVersionUID = 1L; private ExampleSetMetaData owner = null; private String name; private int type = Ontology.ATTRIBUTE_VALUE; private String role = null; private MDInteger numberOfMissingValues = new MDInteger(0); // it has to be ensured that the appropriate value set type is constructed anyway private SetRelation valueSetRelation = SetRelation.UNKNOWN; private Range valueRange = new Range(Double.NEGATIVE_INFINITY, Double.POSITIVE_INFINITY); private Set<String> valueSet = new TreeSet<String>(); private String mode; private MDReal mean = new MDReal(); private Annotations annotations = new Annotations(); public AttributeMetaData(String name, int type) { this(name, type, null); } /** * This will generate the complete meta data with all values. */ public AttributeMetaData(AttributeRole role, ExampleSet exampleSet) { this(role, exampleSet, false); } /** * This will generate the attribute meta data with the data's values shortened if the number of values exceeds the * respective property and the boolean flag is set to true. * If shortened only the first 100 characters of each nominal value is returned. */ public AttributeMetaData(AttributeRole role, ExampleSet exampleSet, boolean shortened) { this(role.getAttribute().getName(), role.getAttribute().getValueType(), role.getSpecialName()); Attribute att = role.getAttribute(); if (att.isNominal()) { int maxValues = shortened ? getMaximumNumberOfNominalValues(): Integer.MAX_VALUE; valueSet.clear(); for (String value: att.getMapping().getValues()) { if (shortened && value.length() > 100) value = value.substring(0, 100); valueSet.add(value); maxValues --; if (maxValues == 0) break; } valueSetRelation = SetRelation.EQUAL; } if (exampleSet != null) { numberOfMissingValues = new MDInteger((int)exampleSet.getStatistics(att, Statistics.UNKNOWN)); if (att.isNumerical() || Ontology.ATTRIBUTE_VALUE_TYPE.isA(att.getValueType(), Ontology.DATE_TIME)) { valueSetRelation = SetRelation.EQUAL; valueRange = new Range(exampleSet.getStatistics(att, Statistics.MINIMUM), exampleSet.getStatistics(att, Statistics.MAXIMUM)); setMean(new MDReal(exampleSet.getStatistics(att, Statistics.AVERAGE))); } if (att.isNominal()) { double modeIndex = exampleSet.getStatistics(att, Statistics.MODE); if (!Double.isNaN(modeIndex) && modeIndex >= 0 && modeIndex < att.getMapping().size()) { setMode(att.getMapping().mapIndex((int) modeIndex)); } } } else { numberOfMissingValues = new MDInteger(); if (att.isNumerical()) { setMean(new MDReal()); } if (att.isNominal()) { setMode(null); } } this.annotations.putAll(att.getAnnotations()); } public AttributeMetaData(String name, int type, String role) { this.name = name; this.type = type; this.role = role; } public AttributeMetaData(String name, String role, int nominalType, String...values) { this(name, role, values); this.type = nominalType; } public AttributeMetaData(String name, String role, String...values) { this.name = name; this.type = Ontology.NOMINAL; this.role = role; this.valueSetRelation = SetRelation.EQUAL; for (String string: values) valueSet.add(string); } public AttributeMetaData(String name, String role, Range range) { this.name = name; this.role = role; this.type = Ontology.REAL; this.valueRange = range; this.valueSetRelation = SetRelation.EQUAL; } public AttributeMetaData(String name, String role, int type, Range range) { this(name, role, range); this.type = type; } private AttributeMetaData(AttributeMetaData attributeMetaData) { // must not keep references on mutable objects! this.name = attributeMetaData.name; this.role = attributeMetaData.role; this.type= attributeMetaData.type; this.numberOfMissingValues = new MDInteger(attributeMetaData.numberOfMissingValues); this.mean = new MDReal(attributeMetaData.mean); this.mode = attributeMetaData.mode; this.valueSetRelation = attributeMetaData.getValueSetRelation(); this.valueRange = new Range(attributeMetaData.getValueRange()); this.valueSet = new TreeSet<String>(); this.annotations = new Annotations(attributeMetaData.annotations); valueSet.addAll(attributeMetaData.getValueSet()); } public AttributeMetaData(Attribute attribute) { this.name = attribute.getName(); this.type = attribute.getValueType(); this.annotations.putAll(attribute.getAnnotations()); } private void readObject(ObjectInputStream in) throws IOException, ClassNotFoundException { in.defaultReadObject(); if (annotations == null) { annotations = new Annotations(); } } public String getRole() { return role; } public String getName() { return name; } public void setName(String name) { String oldName = this.name; this.name = name; // informing ExampleSetMEtaData if one registered if (owner != null) owner.attributeRenamed(this, oldName); } public String getTypeName() { return Ontology.ATTRIBUTE_VALUE_TYPE.mapIndex(type); } public int getValueType() { return type; } /** * If you change the type, keep in mind to set the value sets and their relation */ public void setType(int type) { if (Ontology.ATTRIBUTE_VALUE_TYPE.isA(type, Ontology.NUMERICAL)) { valueSet.clear(); } else { setValueRange(new Range(Double.NEGATIVE_INFINITY, Double.POSITIVE_INFINITY), SetRelation.SUBSET); } this.type = type; } @Override public String toString() { return getDescription(); } public String getDescription() { StringBuilder buf = new StringBuilder(); if (role != null && !role.equals(Attributes.ATTRIBUTE_NAME)) { buf.append("<em>"); buf.append(role); buf.append("</em>: "); } buf.append(getName()); buf.append(" ("); buf.append(getValueTypeName()); if (valueSetRelation != SetRelation.UNKNOWN) { buf.append(" in "); appendValueSetDescription(buf); } else { if (isNominal()) buf.append(", values unkown"); else buf.append(", range unknown"); } switch (containsMissingValues()) { case NO: buf.append("; no missing values"); break; case YES: buf.append("; "); buf.append(numberOfMissingValues.toString()); buf.append(" missing values"); break; case UNKNOWN: buf.append("; may contain missing values"); break; } buf.append(")"); return buf.toString(); } public String getValueTypeName() { return Ontology.ATTRIBUTE_VALUE_TYPE.mapIndex(getValueType()); } public String getValueSetDescription() { StringBuilder buf = new StringBuilder(); appendValueSetDescription(buf); return buf.toString(); } private void appendValueSetDescription(StringBuilder buf) { if (isNominal()) { buf.append(valueSetRelation + " {"); boolean first = true; String mode = getMode(); int index = 0; for (String value : valueSet) { index++; if (first) { first = false; } else { buf.append(", "); } if (index >= 10) { buf.append("..."); break; } boolean isMode = value.equals(mode); if (isMode) { buf.append("<span style=\"text-decoration:underline\">"); } buf.append(value); if (isMode) { buf.append("</span>"); } } buf.append("}"); } if (isNumerical()) { buf.append(valueSetRelation + " ["); if (getValueRange() != null) { buf.append(Tools.formatNumber(getValueRange().getLower(), 3)); buf.append("..."); buf.append(Tools.formatNumber(getValueRange().getUpper(), 3)); buf.append("]"); } if (getMean().isKnown()) { buf.append("; mean "); buf.append(getMean().toString()); } } if (valueRange != null && Ontology.ATTRIBUTE_VALUE_TYPE.isA(getValueType(), Ontology.DATE_TIME) && !Double.isInfinite(getValueRange().getLower()) && !Double.isInfinite(getValueRange().getUpper())) { buf.append(valueSetRelation + " ["); switch (getValueType()) { case Ontology.DATE: buf.append(Tools.formatDate(new Date((long) getValueRange().getLower()))); buf.append("..."); buf.append(Tools.formatDate(new Date((long) getValueRange().getUpper()))); buf.append("]"); break; case Ontology.TIME: buf.append(Tools.formatTime(new Date((long) getValueRange().getLower()))); buf.append("..."); buf.append(Tools.formatTime(new Date((long) getValueRange().getUpper()))); buf.append("]"); break; case Ontology.DATE_TIME: buf.append(Tools.formatDateTime(new Date((long) getValueRange().getLower()))); buf.append("..."); buf.append(Tools.formatDateTime(new Date((long) getValueRange().getUpper()))); buf.append("]"); break; } } } protected String getDescriptionAsTableRow() { StringBuilder b = new StringBuilder(); b.append("<tr><td>"); String role2 = getRole(); if (role2 == null) { role2 = "-"; } b.append(role2).append("</td><td>"); b.append(getName()); String unit = getAnnotations().getAnnotation(Annotations.KEY_UNIT); if (unit != null) { b.append(" <em>[").append(unit).append("]</em>"); } b.append("</td><td>"); b.append(getValueTypeName()).append("</td><td>"); if (valueSetRelation != SetRelation.UNKNOWN) { appendValueSetDescription(b); } else { if (isNominal()) b.append("values unkown"); else b.append("range unknown"); } b.append("</td><td>"); switch (containsMissingValues()) { case NO: b.append("no missing values"); break; case YES: b.append(numberOfMissingValues.toString()); b.append(" missing values"); break; case UNKNOWN: b.append("may contain missing values"); break; } final String comment = getAnnotations().getAnnotation(Annotations.KEY_COMMENT); b.append("</td><td>").append(comment != null ? comment : "-").append("</tr></tr>"); return b.toString(); } @Override public AttributeMetaData clone() { return new AttributeMetaData(this); } public boolean isNominal() { return Ontology.ATTRIBUTE_VALUE_TYPE.isA(type, Ontology.NOMINAL); } public boolean isBinominal() { return Ontology.ATTRIBUTE_VALUE_TYPE.isA(type, Ontology.BINOMINAL); } public boolean isPolynominal() { return Ontology.ATTRIBUTE_VALUE_TYPE.isA(type, Ontology.POLYNOMINAL); } public boolean isNumerical() { return Ontology.ATTRIBUTE_VALUE_TYPE.isA(type, Ontology.NUMERICAL); } public MetaDataInfo containsMissingValues() { return numberOfMissingValues.isAtLeast(1); } public void setNumberOfMissingValues(MDInteger numberOfMissingValues) { this.numberOfMissingValues = numberOfMissingValues; } public MDInteger getNumberOfMissingValues() { return this.numberOfMissingValues; } public SetRelation getValueSetRelation() { return valueSetRelation; } public Set<String> getValueSet() { return valueSet; } public void setValueSet(Set<String> valueSet, SetRelation relation) { this.valueSetRelation = relation; this.valueSet = valueSet; } public Range getValueRange() { return valueRange; } public void setValueRange(Range range, SetRelation relation) { this.valueSetRelation = relation; this.valueRange = range; } public AttributeMetaData copy() { return new AttributeMetaData(this); } /** * Sets the role of this attribute. The name is equivalent with the names from Attributes. * To reset use null as parameter. */ public void setRole(String role) { this.role = role; } public void setRegular() { this.role = null; } public boolean isSpecial() { return role != null; } /** * This method returns a AttributeMetaData object for the prediction attribute * created on applying a model on an exampleset with the given label. */ public static AttributeMetaData createPredictionMetaData(AttributeMetaData labelMetaData) { AttributeMetaData result = labelMetaData.clone(); result.setName("prediction(" + result.getName() + ")"); result.setRole(Attributes.PREDICTION_NAME); return result; } /** * This method creates the attribute meta data for the confidence attributes in the given * exampleSetMetaData. If the values are not known precisely the attributeSet relation of the * exampleSetMetaData object is set appropriate. * @return */ public static ExampleSetMetaData createConfidenceAttributeMetaData(ExampleSetMetaData exampleSetMD) { if (exampleSetMD.hasSpecial(Attributes.LABEL_NAME) == MetaDataInfo.YES) { AttributeMetaData labelMetaData = exampleSetMD.getLabelMetaData(); if (labelMetaData.isNominal()) { for (String value: labelMetaData.getValueSet()) { AttributeMetaData conf = new AttributeMetaData(Attributes.CONFIDENCE_NAME + "_" + value, Ontology.REAL, Attributes.CONFIDENCE_NAME); conf.setValueRange(new Range(0d, 1d), SetRelation.EQUAL); exampleSetMD.addAttribute(conf); } // setting attribute set relation according to value set relation exampleSetMD.mergeSetRelation(labelMetaData.getValueSetRelation()); return exampleSetMD; } } return exampleSetMD; } public void setValueSetRelation(SetRelation valueSetRelation) { this.valueSetRelation = valueSetRelation; } public void setMean(MDReal mean) { this.mean = mean; } public MDReal getMean() { return mean; } public void setMode(String mode) { this.mode = mode; } public String getMode() { return mode; } /** Sets types and ranges to the superset of this and the argument. */ public void merge(AttributeMetaData amd) { if (amd.isNominal() != this.isNominal()) { this.type = Ontology.ATTRIBUTE_VALUE; } if (isNominal()) { if (amd.valueSet != null && this.valueSet != null) { if (!amd.valueSet.equals(this.valueSet)) { this.valueSetRelation.merge(SetRelation.SUBSET); } this.valueSet.addAll(amd.valueSet); } this.valueSetRelation.merge(amd.valueSetRelation); } if (isNumerical()) { if (valueRange != null && amd.valueRange != null) { double min = Math.min(amd.valueRange.getLower(), this.valueRange.getLower()); double max = Math.max(amd.valueRange.getUpper(), this.valueRange.getUpper()); this.valueRange = new Range(min, max); } this.valueSetRelation.merge(amd.valueSetRelation); } } /** Returns either the value range or the value set, depending on the type of attribute. */ public String getRangeString() { if (Ontology.ATTRIBUTE_VALUE_TYPE.isA(getValueType(), Ontology.DATE_TIME)) { if (!Double.isInfinite(getValueRange().getLower()) && !Double.isInfinite(getValueRange().getUpper())) { StringBuilder buf = new StringBuilder(); buf.append(valueSetRelation.toString()); if (valueSetRelation != SetRelation.UNKNOWN) { buf.append("["); switch (getValueType()) { case Ontology.DATE: buf.append(Tools.formatDate(new Date((long) getValueRange().getLower()))); buf.append(" \u2013 "); buf.append(Tools.formatDate(new Date((long) getValueRange().getUpper()))); break; case Ontology.TIME: buf.append(Tools.formatTime(new Date((long) getValueRange().getLower()))); buf.append(" \u2013 "); buf.append(Tools.formatTime(new Date((long) getValueRange().getUpper()))); break; case Ontology.DATE_TIME: buf.append(Tools.formatDateTime(new Date((long) getValueRange().getLower()))); buf.append(" \u2013 "); buf.append(Tools.formatDateTime(new Date((long) getValueRange().getUpper()))); break; } buf.append("]"); return buf.toString(); } else { return "Unknown date range"; } } return "Unbounded date range"; } else if (!isNominal() && valueRange != null) { return valueSetRelation.toString() + (valueSetRelation != SetRelation.UNKNOWN ? valueRange.toString() : ""); } else if (isNominal() && valueSet != null) { return valueSetRelation.toString() + (valueSetRelation != SetRelation.UNKNOWN ? valueSet.toString() : ""); } else { return "unknown"; } } /** Throws away nominal values until the value set size is at most the value specified by property * {@link RapidMiner#PROPERTY_RAPIDMINER_GENERAL_MAX_NOMINAL_VALUES}. */ public void shrinkValueSet() { int maxSize = getMaximumNumberOfNominalValues(); shrinkValueSet(maxSize); } /** Returns the maximum number of values to be used for meta data generation as specified by * {@link RapidMiner#PROPERTY_RAPIDMINER_GENERAL_MAX_NOMINAL_VALUES}. */ public static int getMaximumNumberOfNominalValues() { int maxSize = 100; String maxSizeString = ParameterService.getParameterValue(RapidMiner.PROPERTY_RAPIDMINER_GENERAL_MAX_NOMINAL_VALUES); if (maxSizeString != null) { maxSize = Integer.parseInt(maxSizeString); if (maxSize == 0) { maxSize = Integer.MAX_VALUE; } } return maxSize; } /** Throws away nominal values until the value set size is at most the given value.*/ private void shrinkValueSet(int maxSize) { if (valueSet != null) { if (valueSet.size() > maxSize) { Set<String> newSet = new TreeSet<String>(); Iterator<String> i = valueSet.iterator(); int count = 0; while (i.hasNext() && count < maxSize) { newSet.add(i.next()); count++; } this.valueSet = newSet; valueSetRelation = valueSetRelation.merge(SetRelation.SUPERSET); if (owner != null) { owner.setNominalDataWasShrinked(true); } } } } /** * This method is only to be used by ExampleSetMetaData to register as owner of this attributeMetaData. * Returnes is this object or a clone if this object already has an owner. */ /*pp*/ AttributeMetaData registerOwner(ExampleSetMetaData owner) { if (this.owner == null) { this.owner = owner; return this; } else { AttributeMetaData clone = this.clone(); clone.owner = owner; return clone; } } public void setAnnotations(Annotations annotations) { if (annotations == null) { this.annotations = new Annotations(); } else { this.annotations = annotations; } } public Annotations getAnnotations() { return annotations; } }