package de.tud.inf.operator.mm;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.Map.Entry;
import Jama.Matrix;
import com.rapidminer.example.Attribute;
import com.rapidminer.example.Example;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.example.table.AttributeFactory;
import com.rapidminer.example.table.DataRowFactory;
import com.rapidminer.example.table.MemoryExampleTable;
import com.rapidminer.operator.IOObject;
import com.rapidminer.operator.Operator;
import com.rapidminer.operator.OperatorDescription;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.parameter.ParameterType;
import com.rapidminer.parameter.ParameterTypeBoolean;
import com.rapidminer.parameter.ParameterTypeString;
import com.rapidminer.tools.Ontology;
import com.rapidminer.tools.att.AttributeSet;
import de.tud.inf.operator.mm.util.MetaConfig;
/**
* This class computes an NMI (normalized mutual information) matrix of the given clusterings.
*
* @version $Revision$
* @author Andre Jaehnig
*/
public class MutualInformationCalculator extends Operator {

    /************************************************************************************************
     * FIELDS
     ***********************************************************************************************/

    /** Indicates if the mutual information values should be normalized. */
    public static final String PARAMETER_NORMALIZE = "normalize";

    /** Indicates if an additional column with the sum of the row should be added. */
    public static final String PARAMETER_ADD_SNMI = "add_snmi";

    /** Column name for the sum of normalized mutual informations. */
    public static final String SNMI_COLUMN_NAME = "snmi";

    /** Filename of the meta configuration file. */
    public static final String PARAMETER_META_FILENAME = "meta_filename";

    /** Filename of the nmi file. */
    public static final String PARAMETER_NMI_FILENAME = "nmi_filename";

    /************************************************************************************************
     * GETTER & SETTER
     ***********************************************************************************************/

    /*
     * (non-Javadoc)
     *
     * @see com.rapidminer.operator.Operator#getInputClasses()
     */
    @Override
    public Class<?>[] getInputClasses() {
        return new Class[] { ExampleSet.class };
    }

    /*
     * (non-Javadoc)
     *
     * @see com.rapidminer.operator.Operator#getOutputClasses()
     */
    @Override
    public Class<?>[] getOutputClasses() {
        return new Class[] { ExampleSet.class };
    }

    /*
     * Registers the two boolean switches (normalize, add_snmi) and the two file name parameters
     * (meta configuration file, nmi file) in addition to the inherited parameter types.
     *
     * @see com.rapidminer.operator.Operator#getParameterTypes()
     */
    @Override
    public List<ParameterType> getParameterTypes() {
        List<ParameterType> types = super.getParameterTypes();
        types.add(new ParameterTypeBoolean(PARAMETER_NORMALIZE,
                "Indicates if the mutual information values should be normalized.", true));
        types.add(new ParameterTypeBoolean(
                PARAMETER_ADD_SNMI,
                "Indicates if an additional column with the sum of the row should be added. (If this value is true all values will be normalized regardless of the normalized-value.)",
                true));
        types.add(new ParameterTypeString(PARAMETER_META_FILENAME, "Filename of the meta configuration file."));
        types.add(new ParameterTypeString(PARAMETER_NMI_FILENAME, "Filename of the nmi file."));
        return types;
    }

    /************************************************************************************************
     * CONSTRUCTOR
     ***********************************************************************************************/

    /**
     * Constructor.
     *
     * @param description
     *            the operator description that is passed on to the super class
     */
    public MutualInformationCalculator(OperatorDescription description) {
        super(description);
    }

    /************************************************************************************************
     * PUBLIC METHODS
     ***********************************************************************************************/

    /*
     * Builds a symmetric matrix with one row and one column per clustering attribute of the input
     * example set. A clustering attribute is every regular attribute whose name starts with the
     * clustering column prefix taken from the meta configuration. Each cell holds the (optionally
     * normalized) mutual information of the two clusterings. If requested, an extra column holds
     * the row sum without the diagonal element. Finally the nmi settings are written back to the
     * meta configuration file.
     *
     * @see com.rapidminer.operator.Operator#apply()
     */
    @Override
    public IOObject[] apply() throws OperatorException {
        // get input example set
        ExampleSet clusterExampleSet = this.getInput(ExampleSet.class);
        int elementCount = clusterExampleSet.size();
        this.logNote("Input example-set has " + elementCount + " elements.");

        // read out parameters
        boolean normalizeValues = this.getParameterAsBoolean(PARAMETER_NORMALIZE);
        boolean addSnmiColumn = this.getParameterAsBoolean(PARAMETER_ADD_SNMI);
        String metaFileName = this.getParameterAsString(PARAMETER_META_FILENAME);
        String nmiFileName = this.getParameterAsString(PARAMETER_NMI_FILENAME);
        MetaConfig mc = MetaConfig.load(metaFileName);
        String clusterColumnPrefix = mc.getClusteringColumnPrefix();

        // create one output attribute for each clustering
        AttributeSet attributeSet = new AttributeSet();
        for (Attribute attribute : clusterExampleSet.getAttributes()) {
            String attrName = attribute.getName();
            if (attrName.startsWith(clusterColumnPrefix)) {
                attributeSet.addAttribute(AttributeFactory.createAttribute(attrName, Ontology.REAL));
            }
        }

        // create attribute for the sum of NMI (if desired)
        if (addSnmiColumn) {
            Attribute sumNMIAttr = AttributeFactory.createAttribute(SNMI_COLUMN_NAME, Ontology.REAL);
            attributeSet.addAttribute(sumNMIAttr);
            // adding this column implicates that all values will be normalized
            normalizeValues = true;
        }

        // create table for the output
        MemoryExampleTable table = new MemoryExampleTable(attributeSet.getAllAttributes());
        DataRowFactory drf = new DataRowFactory(DataRowFactory.TYPE_DOUBLE_ARRAY, '.');

        // group the elements of every clustering by their cluster number
        this.logNote("Creating a map with cluster-id-assignments for each clustering.");
        Map<String, Map<Integer, List<Integer>>> clusterAssign = collectClusterAssignments(clusterExampleSet,
                clusterColumnPrefix);

        // calculate entropy for each cluster result
        this.logNote("Calculating the entropy for each clustering.");
        Map<String, Double> entropies = computeEntropyTerms(clusterAssign, elementCount);
        int clusteringCount = entropies.size();

        /*
         * calculate mutual information for each combination of 2 clusterings
         *
         * only the lower triangle (including the diagonal) is computed; every value is mirrored so
         * that the matrix is exactly symmetric regardless of floating point rounding
         */
        this.logNote("Calculating the mutual information for each combination of 2 clusterings.");
        Matrix temporaryMIMatrix = new Matrix(clusteringCount, clusteringCount);
        List<String> clusteringNames = new ArrayList<String>(clusterAssign.keySet());
        for (int indexA = 0; indexA < clusteringCount; indexA++) {
            String keyA = clusteringNames.get(indexA);
            for (int indexB = 0; indexB <= indexA; indexB++) {
                String keyB = clusteringNames.get(indexB);
                double mutualInformation = computeMutualInformationSum(clusterAssign.get(keyA),
                        clusterAssign.get(keyB), elementCount);
                if (normalizeValues) {
                    // NOTE: a clustering that consists of a single cluster has an entropy term of
                    // 0, so this division yields NaN for every pair that involves it.
                    mutualInformation = mutualInformation / Math.sqrt(entropies.get(keyA) * entropies.get(keyB));
                }
                // store value at matrix (symmetric matrix)
                temporaryMIMatrix.set(indexA, indexB, mutualInformation);
                temporaryMIMatrix.set(indexB, indexA, mutualInformation);
            }
        }

        // copy matrix values into example set
        this.logNote("Copy MI-values into output example set.");
        Double[] values = new Double[attributeSet.getAllAttributes().size()];
        for (int row = 0; row < clusteringCount; row++) {
            double mutualInformationSum = 0.0d;
            int counter = 0;
            for (int col = 0; col < clusteringCount; col++) {
                double value = temporaryMIMatrix.get(row, col);
                values[counter++] = value;
                // the diagonal (clustering compared with itself) is not part of the sum
                if (row != col) {
                    mutualInformationSum += value;
                }
            }
            if (addSnmiColumn) {
                values[counter++] = mutualInformationSum;
            }
            // add values to the table
            table.addDataRow(drf.create(values, table.getAttributes()));
        }

        // create output example set
        ExampleSet newExampleSet = table.createExampleSet(attributeSet);

        // remember the nmi settings in the meta configuration
        mc.setNmiFileName(nmiFileName);
        mc.setNmiNormalized(normalizeValues);
        mc.setSnmiAdded(addSnmiColumn);
        mc.setSnmiColumnName(SNMI_COLUMN_NAME);
        mc.save(metaFileName);

        return new IOObject[] { newExampleSet };
    }

    /************************************************************************************************
     * PRIVATE METHODS
     ***********************************************************************************************/

    /**
     * Groups the elements of the example set by cluster number, separately for every clustering
     * attribute.
     *
     * An element is identified by its position in the iteration order of the example set. The
     * position is unique and ascending by construction, so the result does not depend on an id
     * attribute being present, unique or sorted.
     *
     * @param exampleSet
     *            the example set that contains the clustering attributes
     * @param clusterColumnPrefix
     *            name prefix that marks an attribute as a clustering attribute (the same prefix
     *            that is used to create the output attributes)
     * @return for each clustering attribute name (in attribute iteration order) a map from cluster
     *         number to the positions of the elements assigned to that cluster
     */
    private static Map<String, Map<Integer, List<Integer>>> collectClusterAssignments(ExampleSet exampleSet,
            String clusterColumnPrefix) {
        Map<String, Map<Integer, List<Integer>>> clusterAssign = new LinkedHashMap<String, Map<Integer, List<Integer>>>();
        Iterator<Example> it = exampleSet.iterator();
        int position = 0;
        // iterate through all elements
        while (it.hasNext()) {
            Example example = it.next();
            // iterate through all clustering attributes
            for (Attribute attribute : exampleSet.getAttributes()) {
                String attrName = attribute.getName();
                if (!attrName.startsWith(clusterColumnPrefix)) {
                    // not relevant
                    continue;
                }
                // get map of cluster-id-assignments for the current clustering
                Map<Integer, List<Integer>> clusters = clusterAssign.get(attrName);
                if (clusters == null) {
                    // add new map for this clustering
                    clusters = new HashMap<Integer, List<Integer>>();
                    clusterAssign.put(attrName, clusters);
                }
                // get element list for the current cluster number
                int clusterId = (int) example.getValue(attribute);
                List<Integer> members = clusters.get(clusterId);
                if (members == null) {
                    // add new list for this cluster
                    members = new ArrayList<Integer>();
                    clusters.put(clusterId, members);
                }
                members.add(position);
            }
            position++;
        }
        return clusterAssign;
    }

    /**
     * Calculates for every clustering the sum of <code>size * ln(size / elementCount)</code> over
     * all of its clusters. The value is never positive; the product of two such values is used as
     * the normalization term of the mutual information.
     *
     * @param clusterAssign
     *            cluster members for each clustering
     * @param elementCount
     *            total number of elements
     * @return the entropy term for each clustering name
     */
    private static Map<String, Double> computeEntropyTerms(Map<String, Map<Integer, List<Integer>>> clusterAssign,
            int elementCount) {
        Map<String, Double> entropies = new HashMap<String, Double>();
        for (Entry<String, Map<Integer, List<Integer>>> entry : clusterAssign.entrySet()) {
            double entropy = 0.0d;
            // iterate through clusters
            for (List<Integer> members : entry.getValue().values()) {
                double clusterSize = (double) members.size();
                entropy += clusterSize * Math.log(clusterSize / (double) elementCount);
            }
            entropies.put(entry.getKey(), entropy);
        }
        return entropies;
    }

    /**
     * Calculates the unnormalized mutual information term of two clusterings: the sum of
     * <code>common * ln(elementCount * common / (sizeA * sizeB))</code> over all pairs of one
     * cluster of A and one cluster of B that share at least one element.
     *
     * @param clustersA
     *            cluster members of the first clustering
     * @param clustersB
     *            cluster members of the second clustering
     * @param elementCount
     *            total number of elements
     * @return the count-weighted mutual information sum
     */
    private static double computeMutualInformationSum(Map<Integer, List<Integer>> clustersA,
            Map<Integer, List<Integer>> clustersB, int elementCount) {
        double mutualInformation = 0.0d;
        for (List<Integer> membersA : clustersA.values()) {
            // built once per cluster of A and reused for every cluster of B
            Set<Integer> memberSetA = new HashSet<Integer>(membersA);
            double sizeA = (double) membersA.size();
            for (List<Integer> membersB : clustersB.values()) {
                // get number of elements that both clusters have in common
                int commonCount = 0;
                for (Integer member : membersB) {
                    if (memberSetA.contains(member)) {
                        commonCount++;
                    }
                }
                if (commonCount == 0) {
                    // mutual information = 0.0
                    continue;
                }
                double sizeB = (double) membersB.size();
                // multiply as double: the int products overflow for large example sets
                mutualInformation += (double) commonCount
                        * Math.log(((double) elementCount * (double) commonCount) / (sizeA * sizeB));
            }
        }
        return mutualInformation;
    }
}