/******************************************************************************* * Copyright (C) 2006-2012 Dominik Jain. * * This file is part of ProbCog. * * ProbCog is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * ProbCog is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with ProbCog. If not, see <http://www.gnu.org/licenses/>. ******************************************************************************/ package probcog.srldb.datadict; import java.io.Serializable; import probcog.clustering.BasicClusterer; import probcog.clustering.ClusterNamer; import probcog.clustering.EMClusterer; import probcog.clustering.SimpleClusterer; import probcog.srldb.Database; import probcog.srldb.Item; import probcog.srldb.Database.AttributeClustering; import probcog.srldb.datadict.domain.AutomaticDomain; import probcog.srldb.datadict.domain.BooleanDomain; import probcog.srldb.datadict.domain.DiscardedDomain; import probcog.srldb.datadict.domain.Domain; import probcog.srldb.datadict.domain.OrderedStringDomain; import weka.clusterers.Clusterer; import kdl.prox3.dbmgr.DataTypeEnum; /** * Data dictionary definition of an attribute. * @author Dominik Jain */ public class DDAttribute implements Cloneable, Serializable { private static final long serialVersionUID = 1L; protected String name; protected Domain<?> domain; /** * whether this attribute is scheduled for clustering */ protected ClusteringTask clusteringTask; /** * whether this attribute is actually discarded/unused */ protected boolean discarded; /** * the item that owns this attribute (usually a DDObject) */ protected DDItem owner; protected class ClusteringTask { public Integer numClusters = null; public ClusterNamer<Clusterer> namer = null; public String toString() { return String.format("%s", numClusters == null ? "auto" : numClusters.toString()); } public AttributeClustering perform(Iterable<Item> items) throws Exception { DDAttribute attrib = DDAttribute.this; Domain<?> domain = getDomain(); AttributeClustering ac; // if the domain was specified by a user as an ordered list of strings, use K-Means // with the corresponding number of clusters, naming the clusters using the strings // (using the strings in ascending order of cluster centroid) if(domain instanceof OrderedStringDomain) { SimpleClusterer c = new SimpleClusterer(); ((SimpleClusterer)c).setNumClusters(domain.getValues().length); ac = Database.clusterAttribute(attrib, items, c, new ClusterNamer.Fixed(((OrderedStringDomain)domain).getValues())); } // if the domain was generated automatically (no user input), either use EM // clustering to determine a suitable number of clusters or, if the number is given, // K-means, and use default names (attribute name followed by index) else if(domain instanceof AutomaticDomain) { BasicClusterer<?> c; if(numClusters == null) { c = new EMClusterer(); System.out.println(" applying EM clustering to " + attrib); } else { c = new SimpleClusterer(); ((SimpleClusterer)c).setNumClusters(numClusters); System.out.printf(" applying %d-means clustering to " + attrib, numClusters); } ClusterNamer<Clusterer> namer = this.namer; if(namer == null) namer = new ClusterNamer.SimplePrefix(attrib.getName()); ac = Database.clusterAttribute(attrib, items, c, namer); } else throw new DDException("Don't know how to perform clustering for target domain " + " (" + domain.getClass() + ")"); return ac; } } protected DDAttribute(String name) { this.name = name; this.domain = null; clusteringTask = null; this.discarded = false; this.owner = null; } public DDAttribute(String name, Domain<?> domain) { this(name); this.domain = domain; } public DDAttribute(String name, Domain<?> domain, boolean doClustering) { this(name, domain); setClustering(doClustering); } /** * @param doClustering whether this attribute should be scheduled for clustering, * replacing all its values in instances with the respective clustering result */ public void setClustering(boolean doClustering, ClusterNamer<Clusterer> namer) { if(doClustering) { clusteringTask = new ClusteringTask(); clusteringTask.namer = namer; } else clusteringTask = null; } public void setClustering(boolean doClustering) { setClustering(doClustering, null); } public void setClustering(Integer numClusters, ClusterNamer<Clusterer> namer) { clusteringTask = new ClusteringTask(); clusteringTask.numClusters = numClusters; clusteringTask.namer = namer; } public void setClustering(Integer numClusters) { setClustering(numClusters, null); } public AttributeClustering doClustering(Iterable<Item> items) throws Exception { return clusteringTask.perform(items); } public String getName() { return name; } public DataTypeEnum getType() { return domain.getType(); } public boolean requiresClustering() { return clusteringTask != null; } public Domain<?> getDomain() { return domain; } public boolean isBoolean() { return domain.isBoolean(); } /** * marks this attribute as discarded/unused<br> * * An attribute may eventually be discarded even though it is defined, because, * for example, it requires clustering and too few instances to actually perform * clustering were found in the database. */ public void discard() { discarded = true; domain = DiscardedDomain.getInstance(); // avoid wasting space on domain data } public boolean isDiscarded() { return this.discarded; } public DDAttribute clone() { try { return (DDAttribute)super.clone(); } catch (CloneNotSupportedException e) { return null; } } public void setName(String name) { this.name = name; } public void setDomain(Domain<?> domain) { this.domain = domain; } public void setOwner(DDItem item) throws DDException { if(owner == null || item == null) owner = item; else throw new DDException("Error: Cannot add attribute " + this.getName() + " to more than one item; previously added to " + this.owner.getName()); } public DDItem getOwner() { return owner; } public String toString() { return String.format("DDAttribute:%s[domain=%s/size=%d, discarded=%s, clustering=%s]", name, domain.getClass().getSimpleName(), domain.getValues().length, Boolean.toString(discarded), clusteringTask == null ? "none" : clusteringTask.toString()); } }