/**
* Copyright (C) 2001-2017 by RapidMiner and the contributors
*
* Complete list of developers available at our web site:
*
* http://rapidminer.com
*
* This program is free software: you can redistribute it and/or modify it under the terms of the
* GNU Affero General Public License as published by the Free Software Foundation, either version 3
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
* even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License along with this program.
* If not, see http://www.gnu.org/licenses/.
*/
package com.rapidminer.operator.clustering;
import java.util.ArrayList;
import java.util.Collection;
import com.rapidminer.example.Attribute;
import com.rapidminer.example.Attributes;
import com.rapidminer.example.Example;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.example.set.ConditionedExampleSet;
import com.rapidminer.example.set.NoMissingAttributeValueCondition;
import com.rapidminer.example.table.AttributeFactory;
import com.rapidminer.example.table.NominalMapping;
import com.rapidminer.operator.AbstractModel;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.OperatorProgress;
import com.rapidminer.tools.Ontology;
import com.rapidminer.tools.Tools;
/**
* This class is the standard flat cluster model, using the example ids to remember which examples
* were assigned to which cluster. This information is stored within the single clusters. Since
* this, the id attribute needs to be unchanged when cluster model is applied onto an example set.
*
* @author Sebastian Land
*/
public class ClusterModel extends AbstractModel implements ClusterModelInterface {
private static final long serialVersionUID = 3780908886210272852L;
public static final int UNASSIGNABLE = -1;
private static final int OPERATOR_PROGRESS_STEPS = 50000;
/**
* The progress of this operator is split into 3 part-progresses. These values define how many
* percent are completed after part-progress 1 and part-progress 2.
*/
private static final int INTERMEDIATE_PROGRESS_1 = 10;
private static final int INTERMEDIATE_PROGRESS_2 = 30;
private boolean isAddingAsLabel;
private boolean isRemovingUnknown;
private ArrayList<Cluster> clusters;
public ClusterModel(ExampleSet exampleSet, int k, boolean addClusterAsLabel, boolean removeUnknown) {
super(exampleSet);
this.clusters = new ArrayList<Cluster>(k);
for (int i = 0; i < k; i++) {
clusters.add(new Cluster(i));
}
this.isAddingAsLabel = addClusterAsLabel;
this.isRemovingUnknown = removeUnknown;
}
@Override
public ExampleSet apply(ExampleSet exampleSet) throws OperatorException {
exampleSet = (ExampleSet) exampleSet.clone();
OperatorProgress progress = null;
if (getShowProgress() && getOperator() != null && getOperator().getProgress() != null) {
progress = getOperator().getProgress();
progress.setTotal(100);
}
Attributes attributes = exampleSet.getAttributes();
// additional checks
this.checkCapabilities(exampleSet);
// creating attribute
Attribute targetAttribute;
if (!isAddingAsLabel) {
targetAttribute = AttributeFactory.createAttribute("cluster", Ontology.NOMINAL);
exampleSet.getExampleTable().addAttribute(targetAttribute);
attributes.setCluster(targetAttribute);
} else {
targetAttribute = AttributeFactory.createAttribute("label", Ontology.NOMINAL);
exampleSet.getExampleTable().addAttribute(targetAttribute);
attributes.setLabel(targetAttribute);
}
if (progress != null) {
progress.setCompleted(INTERMEDIATE_PROGRESS_1);
}
// setting values
int[] clusterIndices = getClusterAssignments(exampleSet);
if (progress != null) {
progress.setCompleted(INTERMEDIATE_PROGRESS_2);
}
int i = 0;
for (Example example : exampleSet) {
if (clusterIndices[i] != ClusterModel.UNASSIGNABLE) {
example.setValue(targetAttribute, "cluster_" + clusterIndices[i]);
} else {
example.setValue(targetAttribute, Double.NaN);
}
i++;
if (progress != null && i % OPERATOR_PROGRESS_STEPS == 0) {
progress.setCompleted(
(int) ((100.0 - INTERMEDIATE_PROGRESS_2) * i / exampleSet.size() + INTERMEDIATE_PROGRESS_2));
}
}
// removing unknown examples if desired
if (isRemovingUnknown) {
exampleSet = new ConditionedExampleSet(exampleSet,
new NoMissingAttributeValueCondition(exampleSet, targetAttribute.getName()));
}
return exampleSet;
}
public int getNumberOfClusters() {
return clusters.size();
}
/**
* This method returns whether this cluster model should add the assignment as a label.
*/
public boolean isAddingLabel() {
return isAddingAsLabel;
}
/**
* This method returns whether examples which can't be assigned should be removed from the
* resulting example set.
*
* @return
*/
public boolean isRemovingUnknownAssignments() {
return isRemovingUnknown;
}
public void setClusterAssignments(int[] clusterId, ExampleSet exampleSet) {
Attribute id = exampleSet.getAttributes().getId();
if (id.isNominal()) {
NominalMapping mapping = id.getMapping();
int i = 0;
for (Example example : exampleSet) {
getCluster(clusterId[i]).assignExample(mapping.mapIndex((int) example.getValue(id)));
i++;
}
} else {
int i = 0;
for (Example example : exampleSet) {
getCluster(clusterId[i]).assignExample(example.getValue(id));
i++;
}
}
}
/**
* This method returns an array with the indices or the cluster for all examples in the set.
* This will work with new examples, if centroid based clustering has been used before.
* Otherwise new examples cannot be assigned.
*/
public int[] getClusterAssignments(ExampleSet exampleSet) {
int[] clusterAssignments = new int[exampleSet.size()];
Attribute idAttribute = exampleSet.getAttributes().getId();
if (idAttribute.isNominal()) {
int j = 0;
for (Example example : exampleSet) {
clusterAssignments[j] = getClusterIndexOfId(example.getValueAsString(idAttribute));
j++;
}
} else {
int j = 0;
for (Example example : exampleSet) {
clusterAssignments[j] = getClusterIndexOfId(example.getValue(idAttribute));
j++;
}
}
return clusterAssignments;
}
/**
* This method returns the index of the cluster, this Id's example has been assigned to. Please
* note, that this can only be applied to examples included in the clustering process. New
* examples might be assigned to clusters using the getClusterAssignments method, if and only if
* the cluster model supports this. Currently only centroid based cluster models do.
*/
public int getClusterIndexOfId(Object id) {
int index = 0;
for (Cluster cluster : clusters) {
if (cluster.containsExampleId(id)) {
return index;
}
index++;
}
return UNASSIGNABLE;
}
public Cluster getCluster(int i) {
return clusters.get(i);
}
public Collection<Cluster> getClusters() {
return clusters;
}
@Override
public String getExtension() {
return "cm";
}
@Override
public String getFileDescription() {
return "Cluster model";
}
public void checkCapabilities(ExampleSet exampleSet) throws OperatorException {
com.rapidminer.example.Tools.isIdTagged(exampleSet);
}
@Override
public String getName() {
return "Cluster Model";
}
@Override
public String toString() {
StringBuffer result = new StringBuffer();
int sum = 0;
for (int i = 0; i < getNumberOfClusters(); i++) {
Cluster cl = getCluster(i);
int numObjects = cl.getNumberOfExamples();
result.append("Cluster " + cl.getClusterId() + ": " + numObjects + " items" + Tools.getLineSeparator());
sum += numObjects;
}
result.append("Total number of items: " + sum + Tools.getLineSeparator());
return result.toString();
}
}