/*
* Copyright 2004-2010 Information & Software Engineering Group (188/1)
* Institute of Software Technology and Interactive Systems
* Vienna University of Technology, Austria
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.ifs.tuwien.ac.at/dm/somtoolbox/license.html
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package at.tuwien.ifs.somtoolbox.reportgenerator;
import java.awt.Color;
import java.io.File;
import java.util.Vector;
import java.util.logging.Logger;
import at.tuwien.ifs.somtoolbox.apps.viewer.CommonSOMViewerStateData;
import at.tuwien.ifs.somtoolbox.data.InputData;
import at.tuwien.ifs.somtoolbox.data.InputDatum;
import at.tuwien.ifs.somtoolbox.data.SOMLibClassInformation;
import at.tuwien.ifs.somtoolbox.data.SOMVisualisationData;
import at.tuwien.ifs.somtoolbox.data.TemplateVector;
import at.tuwien.ifs.somtoolbox.util.PCA;
import at.tuwien.ifs.somtoolbox.visualization.clustering.ClusterLabel;
import at.tuwien.ifs.somtoolbox.visualization.clustering.ClusterNode;
/**
* FIXME: most probably all the methods in this class should be part of {@link InputData} and
* {@link SOMLibClassInformation}, respectively ! <br>
* this class collects all available information about the values in the input dataset, like from the input file, the
* template vector file, ... and maybe computes some properties of its own. It's job is to give one centralized placed
* where the actual report generators (the output object) can ask for the data.
*
* @author Sebastian Skritek (0226286, Sebastian.Skritek@gmx.at)
* @version $Id: DatasetInformation.java 3590 2010-05-21 10:43:45Z mayer $
*/
public class DatasetInformation {
public static final int MIN_VALUE = 1;
public static final int MAX_VALUE = 2;
public static final int MEAN_VALUE = 3;
public static final int VAR_VALUE = 4;
public static final int ZERO_VALUE = 5;
public static final int ONLY01 = 6;
public static final int DISCRETE = 7;
private Vector<Integer> selectedIndices;
private InputData inputData;
private String inputDataFilename;
private String tvFilename;
private TemplateVector inputTemplate;
private SOMLibClassInformation classInfo;
private String[] classNames = null;
private String classInformationFilename = null;
private EditableReportProperties EP;
/* variables containing information about the data distribution */
/** we check whether there are values != 0 or 1 */
private boolean only01[] = null;
/** only an estimation - we call values discrete if they are integer values */
private boolean discrete[] = null;
/** holds for each dimension the minimal value */
private double[] min = null;
/** holds for each dimension the maximal value */
private double[] max = null;
/** holds for each dimension the mean value */
private double[] mean = null;
/** holds for each dimension the variance */
private double[] var = null;
/** holds for each dimension the number of 0 - values. Using this we estimate the missing values */
private int zeroValues[] = null;
boolean denseData = true;
// Input Data herrichten
/**
* creates a new object storing information about a given dataset
*
* @param selectedIndices Vector of indices of the input items selected for more information
* @param inputDataFilename the path to the file containing the input data
* @param tvFilename the path to the file containin the template vector
* @param classInformationFile the path to the file containing the class information
* @param EP the customized Report Features of the Semantic Report
*/
public DatasetInformation(Vector<Integer> selectedIndices, String inputDataFilename, String tvFilename,
String classInformationFile, EditableReportProperties EP) {
this(selectedIndices, inputDataFilename, tvFilename, classInformationFile, EP, new CommonSOMViewerStateData());
}
public DatasetInformation(Vector<Integer> selectedIndices, String inputDataFilename, String tvFilename,
String classInformationFile, EditableReportProperties EP, CommonSOMViewerStateData state) {
// check what files we already have in the state, and try to load the missing ones
if (state.inputDataObjects.getInputData() == null) { // need to load the input data
state.inputDataObjects.setFileName(SOMVisualisationData.INPUT_VECTOR, inputDataFilename);
state.inputDataObjects.readAvailableData();
}
this.inputData = state.inputDataObjects.getInputData();
if (state.inputDataObjects.getClassInfo() == null) {
if (new File(classInformationFile).exists()) {
this.classInformationFilename = classInformationFile;
state.inputDataObjects.setFileName(SOMVisualisationData.CLASS_INFO, classInformationFile);
state.inputDataObjects.readAvailableData();
if (state.inputDataObjects.getClassInfo() == null) {
Logger.getLogger("at.tuwien.ifs.somtoolbox.reports").warning(
"Could not read class information file from '" + classInformationFile
+ "' - generating report without class information.");
}
this.classInfo = state.inputDataObjects.getClassInfo();
} else {
Logger.getLogger("at.tuwien.ifs.somtoolbox.reports").warning(
"No class information file specified - generating report without class information.");
}
} else {
classInfo = state.inputDataObjects.getClassInfo();
}
this.EP = EP;
this.tvFilename = tvFilename;
this.selectedIndices = selectedIndices;
this.inputTemplate = this.inputData.templateVector();
this.inputDataFilename = inputDataFilename;
}
/**
* returns whether class information are attached to the input vectors does not check whether it is a valid file,
* only whether a String with length > 0 has been specified as path
*
* @return true if a class information file (.cls) has been specified, false otherwise
*/
public boolean classInfoAvailable() {
return classInfo != null;
}
public SOMLibClassInformation getClassInfo() {
return classInfo;
}
/**
* returns the number of input vectors used for training the SOM, that is the number of different vectors present in
* the input file for the SOM training.
*
* @return the number of input vectors that appear in the input file
*/
public int getNumberOfInputVectors() {
return inputData.numVectors();
}
/**
* returns the mean vector of all input items belonging to the given class
*
* @param classId the id of the class for which the mean vector shall be calculated
* @return the mean vector of the class
*/
public double[] getClassMeanVector(int classId) {
double[] mean = new double[this.getVectorDim()];
int n = this.classInfo.getNumberOfClassMembers(classId);
for (int i = 0; i < mean.length; i++) {
mean[i] = 0;
}
String[] ins = this.classInfo.getDataNamesInClass(this.getNameOfClass(classId));
for (String in : ins) {
double[] temp = this.inputData.getInputDatum(in).getVector().toArray();
for (int c = 0; c < temp.length; c++) {
mean[c] += temp[c] / n;
}
}
return mean;
}
/**
* returns the dimension of the input vectors, that is the same as the number of attributes used to describe the
* objects.
*
* @return the dimension of the input vectors
*/
public int getVectorDim() {
return inputData.dim();
}
/**
* returns whether the values in the given dimension are all only 0 or 1
*
* @param index the dimension (starting with 0) for which this property is requested
* @return true if all input vectors contain only 0 or 1 in this dimension, false otherwise
*/
public boolean is01(int index) {
if (this.only01 == null) {
this.checkDatatypes();
}
if (index > this.only01.length - 1) {
return false;
}
return this.only01[index];
}
/**
* returns whether our heuristic estimates this dimension to contain discrete values This is the case, if all values
* in this dimension are exact integer values.
*
* @param index the dimension (starting with 0) for which the estimation is requested
* @return true if all input vectors have only plain integers as values in this dimension, false otherwise
*/
public boolean isDiscrete(int index) {
if (this.discrete == null) {
this.checkDatatypes();
}
if (index > this.discrete.length - 1) {
return false;
}
return this.discrete[index];
}
/**
* returns the number of input vectors that have 0 as value in the given dimension
*
* @param index the dimension (starting with 0) for which the number is requested
* @return the number of input vectors having the value 0 in the given dimension
*/
public int getNumberOfZeroValues(int index) {
if (this.zeroValues == null) {
this.checkDatatypes();
}
if (index > this.zeroValues.length - 1) {
return -1;
}
return this.zeroValues[index];
}
/**
* returns whether the input set has been normalized (in fact, this functions returns the result of
* InputData.isNormalizedToUnitLength())
*
* @return true if data iset is normalized, false if not
*/
public boolean isNormalized() {
return this.inputData.isNormalizedToUnitLength();
}
/**
* FIXME: split this into simple single getter methods... !<br>
* returns the requested value describing the distribution of the input values. The types of information available
* are described by the constant members of this class (this function returns numerical properties):
* <ul>
* <li>min value of all input vectors(MIN_VALUE)</li>
* <li>max value of all input vectors(MAX_VALUE)</li>
* <li>mean value of all input vecotrs(MEAN_VALUE)</li>
* <li>variance of the values in the input vectors(VAR_VALUE)</li>
* <li>the number of input vectors having 0 as value(ZERO_VALUE) (is in fact int, not double)</li>
* </ul>
* all information are returned for the given dimension (argument attribute).
*
* @param type specifies the type of information to be returned: allowed are some constants defined by this class
* (see above)
* @param attribute the index of the attribute for which the value shall be returned (starting with 0)
* @return the requested value. if the requested type is not available, -1 is returned
*/
public double getNumericalDataProps(int type, int attribute) {
switch (type) {
case MIN_VALUE:
if (this.min == null) {
this.checkDatatypes();
}
return this.min[attribute];
case MAX_VALUE:
if (this.max == null) {
this.checkDatatypes();
}
return this.max[attribute];
case MEAN_VALUE:
if (this.mean == null) {
this.checkDatatypes();
}
return this.mean[attribute];
case VAR_VALUE:
if (this.var == null) {
this.checkDatatypes();
}
return this.var[attribute];
case ZERO_VALUE:
if (this.zeroValues == null) {
this.checkDatatypes();
}
return this.zeroValues[attribute];
default:
Logger.getLogger("at.tuwien.ifs.somtoolbox.reports").warning(
"Requested unknown dataset distribution value: " + type
+ " (in DatasetInformation.getNumericalDataProps())");
return -1;
}
}
/**
* FIXME: split this into simple single getter methods... !<br>
* returns the requested value describing the distribution of the input values. The types of information available
* are described by the constant members of this class (this function returns boolean properties):
* <ul>
* <li>whether all values are plain integers(DISCRETE)</li>
* <li>whether all values are either 0 or 1 (ONLY01)</li>
* </ul>
* all information are returned for the given dimension (argument attribute).
*
* @param type specifies the type of information to be returned: allowed are some constants defined by this class
* (see above)
* @param attribute the index of the attribute for which the value shall be returned (starting with 0)
* @return the requested value. if the requested type is not available, -1 is returned
*/
public boolean getBoolDataProps(int type, int attribute) {
switch (type) {
case DISCRETE:
if (this.discrete == null) {
this.checkDatatypes();
}
return this.discrete[attribute];
case ONLY01:
if (this.only01 == null) {
this.checkDatatypes();
}
return this.only01[attribute];
default:
Logger.getLogger("at.tuwien.ifs.somtoolbox.reports").warning(
"Requested unknown dataset distribution value: " + type
+ " (in DatasetInformation.getBoolDataProps())");
return false;
}
}
/**
* returns the label (that is the name defined for an attribute in the template vector file) for the specified
* attribute. If no template file is given, only the index of the attribute is returned.
*
* @param dim the index within the vector of the attribute whose label shall be returned
* @return the label specified in the template vector file or (if not present) the index of the attribute
*/
public String getAttributeLabel(int dim) {
if (this.inputTemplate != null) {
return inputTemplate.getLabel(dim);
} else {
return "" + dim;
}
}
/**
* returns the number of classes. If there are no class information are attached to data, -1 is returned.
*
* @return the number of classes or -1
*/
public int getNumberOfClasses() {
if (this.classInfo == null) {
return -1;
} else {
return this.classInfo.numClasses();
}
}
/**
* returns the name of the class specified by the index
*
* @param c the index of the class (starting with 0)
* @return the name of the class specified by the index, the empty String in case of any error finding the name
*/
public String getNameOfClass(int c) {
if (this.classInfo == null) {
return "";
}
if (this.classNames == null) {
this.classNames = this.classInfo.classNames();
}
if (this.classNames.length <= c) {
return "";
}
return this.classNames[c];
}
/**
* returns a list of labels of all input items belonging to the given class
*
* @param classId the id of the class for which the input items are requested
* @return a list containing the lables of the input items belonging to this class
*/
public String[] getInputLabelsofClass(int classId) {
return this.classInfo.getDataNamesInClass(this.getNameOfClass(classId));
}
/**
* returns an array of length three containing the r,g,b values of the colour used to colour the specified class
*
* @param c the index of the class for which the colour is requested
* @return an array containing the r, g and b definitions of a color
*/
public int[] getClassColorRGB(int c) {
int[] rgb = new int[3];
Color[] colors = this.classInfo.getClassColors();
if (colors.length > c) {
rgb[0] = colors[c].getRed();
rgb[1] = colors[c].getGreen();
rgb[2] = colors[c].getBlue();
} else {
rgb[0] = 255;
rgb[1] = 255;
rgb[2] = 255;
}
return rgb;
}
/**
* returns the number of input elements belonging to the given class if no class information is attached to this
* input, -1 is returned
*
* @param c the index of the class (starting with 0)
* @return the number of elements belonging to this class, or -1
*/
public int getNumberOfClassmembers(int c) {
if (this.classInfo == null) {
return -1;
} else {
return this.classInfo.getNumberOfClassMembers(c);
}
}
/** returns the index of the class the input vector specified by its index belongs to */
public int getClassIndexOfInput(String inputLabel) {
return this.classInfo.getClassIndex(inputLabel);
}
/**
* returns the path of the file containin the class information
*
* @return path to the file containting the class information
*/
public String getClassInformationFilename() {
return this.classInformationFilename;
}
/**
* runs over all dimensions of the input vectors and tries to fetch some information about their data ranges and
* other properties information gathered are:
* <ul>
* <li>min and max value within each dimension (this.min, this.max)</li>
* <li>does a dimension contain only 0/1 values (this.only01)</li>
* <li>does a dimension contain only plain integer values (this.discrete)</li>
* <li>how many 0 (=missing?) values are in each dimension (this.zeroValues)</li>
* </ul>
* the results are stored in the appropriate arrays
*/
private void checkDatatypes() {
int d = this.getVectorDim(); // dimension of input vectors
int n = this.getNumberOfInputVectors(); // the numbers of vectors
double temp;
this.only01 = new boolean[d];
this.discrete = new boolean[d];
this.min = new double[d];
this.max = new double[d];
this.mean = new double[d];
this.var = new double[d];
this.zeroValues = new int[d];
// check all dimensions
for (int i = 0; i < d; i++) {
// initialization of the arrays for this dimension
this.only01[i] = true;
this.discrete[i] = true;
this.min[i] = this.inputData.getInputDatum(0).getVector().get(i);
this.max[i] = this.inputData.getInputDatum(0).getVector().get(i);
this.mean[i] = 0;
this.mean[i] = 0;
this.zeroValues[i] = 0;
// and now for each input vector in this dimension
for (int j = 0; j < n; j++) {
temp = this.inputData.getInputDatum(j).getVector().get(i); // retrieve the current value
// discrete or 01?
if ((int) temp != temp && this.discrete[i]) {
// not even discrete value
this.only01[i] = false;
this.discrete[i] = false;
} else if ((double) (int) temp % 2 != temp && this.only01[i]) {
// no 01 value
this.only01[i] = false;
}
// min/max values:
if (temp < this.min[i]) {
this.min[i] = temp;
}
if (temp > this.max[i]) {
this.max[i] = temp;
}
// mean value:
this.mean[i] += temp / n;
// zero Values
if (temp == 0) {
this.zeroValues[i]++;
}
}
// now that we have the mean value for this dimension, we can calculate the variance
for (int j = 0; j < n; j++) {
temp = this.inputData.getInputDatum(j).getVector().get(i); // retrieve the current value
this.var[i] += (temp - this.mean[i]) * (temp - this.mean[i]) / (n - 1);
}
}
}
/**
* returns the InputData object storing information about the input data used for training the som. Needed by
* objects of type TestRunResult for some analysis
*
* @return the input data used to train the SOM
*/
public InputData getInputData() {
return this.inputData;
}
/** returns the InputDatum labelled with the specified name */
public InputDatum getInputDatum(String name) {
return this.inputData.getInputDatum(name);
}
/** returns the InputDatum at the specified index */
public InputDatum getInputDatum(int d) {
return this.inputData.getInputDatum(d);
}
/**
* returns the number of inputs the user has selected to get information about their position on the SOM
*
* @return the number of inputs selected by the user.
*/
public int getNumberOfSelectedInputs() {
return this.selectedIndices.size();
}
/**
* returns the id of the inputVector at position index in the list of selected inputs each input vector is
* identified by an id, which is its index in the complete input. The vectors selected by the user (to display their
* position on the SOM) are also stored in a list. To retrieve the "real" id of the vector at position index in this
* list, this function should be used
*
* @param index the index of the vector in the list of selected inputs
* @return the id of the corresponding input, that is the index in the complete input list, -1 if error
*/
public int getSelectedInputId(int index) {
if (index >= this.selectedIndices.size()) {
return -1;
}
return this.selectedIndices.get(index).intValue();
}
/**
* returns the complete filename of the file containing the input data complete filename means including the path.
* The string is not verified to point to a valid input file (or any file at all).
*
* @return the complete filename (including absolute path) of the input filename
*/
public String getInputDataFilename() {
return this.inputDataFilename;
}
/**
* returns the complete filename of the file containing the template data complete filename means including the
* path. The string is not verified to point to a valid template file (or any file at all).
*
* @return the complete filename (including absolute path) of the template filename
*/
public String getTemplateFilename() {
return this.tvFilename;
}
/**
* Tries to name a cluster by the input data mapped to units lying within the cluster For naming the cluster, some
* very simple heuristics are used: First, if there are any labels of the clusters, which correpsond to 0/1
* attributes, and their values are all 0 (or 1) in the cluster, the name of this attribute is included to the name
* of the cluster. (attributes of 0/1 type are supposed to encode any "has this property" yes/no information,
* thereby the value 1 is interpreted as "cluster has this property", whereas 0 is interpreted as "has not") If
* there are any labels that don't correspond not 0/1 attributes, it is checked whether both subclusters have the
* same value for this label. If yes, the name of this label is included to the name of the cluster If none of the
* properties above is valid, the first nodeDepth-1 labels of the cluster suggested by the clustering algorithm is
* used. (at least for the animal map this works quite well)
*
* @param node the node representing the cluster tha shall be named
* @param clusterByValue indicates whether the labels for the cluster shall be created by value (is handed unchanged
* to ClusterNode.getLabels(clusterByValue, boolen)
* @param nodeDepth the depth of the node in the tree, whereby the root (i.e. the cluster containing the whole map)
* node has depth 1
* @return the list of labels found for this cluster
*/
public Vector<String> getClusterName(ClusterNode node, int clusterByValue, int nodeDepth) {
Vector<String> texts = new Vector<String>();
ClusterLabel[] labels = node.getLabels(clusterByValue, false);
if (labels == null || labels.length == 0) {
texts.add("no label found");
return texts;
}
String curLabel;
int[] inds = new int[labels.length];
/*
* Following idea: - if the data is only 0/1, we take all those where we have 0/1 as value, and call it has label and has not label -
* otherwise we take all labels, that have the same value in both child nodes - we take at least one label for the cluster
*/
boolean[] labeled = new boolean[labels.length];
String[] createdLabels = new String[labels.length];
for (int i = 0; i < labeled.length; i++) {
labeled[i] = false;
}
// first check whether we have 0/1 values (indications for "has/has not" a given property
for (int i = 0; i < labels.length; i++) {
curLabel = labels[i].getName();
inds[i] = -1;
for (int j = 0; j < this.getVectorDim(); j++) {
if (this.inputTemplate.getLabel(j).equals(curLabel)) {
inds[i] = j;
break;
}
}
if (inds[i] >= 0 && this.is01(inds[i])) {
if (labels[i].getValue() == 0) {
// yes - strike
curLabel = "has/is no/not " + curLabel;
labeled[i] = true;
createdLabels[i] = curLabel;
} else if (labels[i].getValue() == 1) {
// ok - strike too
curLabel = "has/is " + curLabel;
labeled[i] = true;
createdLabels[i] = curLabel;
}
}
}
// then check for all labels left, whether the are the same as in both child nodes:
ClusterNode child1 = node.getChild1();
ClusterNode child2 = node.getChild2();
ClusterLabel[] labels1 = child1.getLabels(clusterByValue, false);
ClusterLabel[] labels2 = child2.getLabels(clusterByValue, false);
for (int i = 0; i < labeled.length; i++) {
if (labeled[i]) {
continue; // already done
}
if (inds[i] > 0 && is01(inds[i])) {
continue; // already done
}
if (labels1.length < i + 1 || labels2.length < i + 1) {
break;
}
if (labels[i].getValue() == labels1[i].getValue() && labels[i].getValue() == labels2[i].getValue()) {
curLabel = labels[i].getName() + " = " + String.format("%.6f", labels[i].getValue());
labeled[i] = true;
createdLabels[i] = curLabel;
}
}
for (int i = 0; i < createdLabels.length; i++) {
if (labeled[i]) {
texts.add(createdLabels[i]);
}
}
if (texts.size() == 0) {
/*
* we need some labels. the question is: which and how many. The idea is to stick to the labeling algorithm provided by the toolbox, and
* to simlpy pick the first k ones. We make k dependent from the depth of the node, by the idea that ideally each new cluster introduced a
* new dimension.
*/
for (int i = 0; i < nodeDepth - 1 && i < labels.length; i++) {
texts.add(labels[i].getName() + " = " + String.format("%.6f", labels[i].getValue()));
}
}
return texts;
}
/**
* This method calculates the most important Dimensions of the Dataset according to the results of a PCA, and rows
* the resulting dim-index in a new array on first index. On index 2, the corresponding % of the TotalVariance is
* calculated (as a quality measure)
*
* @return new array with most important dims ranked decreasingly.
*/
public double[][] getPCAdeterminedDims() {
double[][] result_array;
if (this.EP == null) {// If selected Number >this.getVectorDim() or if no specified value was entered, take all
// PCAComponents
result_array = new double[this.getVectorDim()][2];
} else {
if (this.EP.getMetroMapComponents() > this.getVectorDim() || this.EP.getMetroMapComponents() < 0) {
result_array = new double[this.getVectorDim()][2];
} else {
result_array = new double[this.EP.getMetroMapComponents()][2];
}
}
double[][] data = inputData.getData();
PCA pca = new PCA(data);
double BestAxisVar = Double.MAX_VALUE;
double CurrBestAxisVar = Double.MIN_VALUE;
int CurrBestAxisIndex = -1;
int counter = 0;
double temp = 0.0;
while (counter != result_array.length) { /* Loop it result array length times and get the best Eigenvalue. */
CurrBestAxisVar = Double.MIN_VALUE;
for (int curAxis = 0; curAxis < this.getVectorDim(); curAxis++) {
if (pca.info[curAxis] > CurrBestAxisVar && pca.info[curAxis] < BestAxisVar) {
CurrBestAxisVar = pca.info[curAxis];
CurrBestAxisIndex = curAxis;
}
}
temp = pca.info[CurrBestAxisIndex] / this.getVectorDim();
BestAxisVar = CurrBestAxisVar;
result_array[counter][0] = CurrBestAxisIndex; /* save the best Dim index on index 0 */
result_array[counter][1] = temp; /* and its corresponding Variance Value on index 1 */
counter++;
}
return result_array;
}
/**
* this method is just a small helper method, used to display the Dimensions in the top-part of the output document
* It accumulates the Variances and calculates this Percentage from the total Variance
*/
public double calculateAccumulatedVariance() {
double perc = 0.0;
double[][] array = this.getPCAdeterminedDims();
for (double[] element : array) {
perc += element[1];
}
return perc;
}
/** Returns the names of the 3 files, used for training */
public String[] getTrainingDataInfo() {
String[] list = new String[3];
list[0] = new String(applyNameFix(this.inputDataFilename));
list[1] = new String(applyNameFix(this.tvFilename));
if (this.classInformationFilename == null) {
list[2] = "no class information file";
} else {
list[2] = new String(applyNameFix(this.classInformationFilename));
}
;
return list;
}
/** small helper method for getTrainingDataInfo */
private static String applyNameFix(String target) {
int c1 = target.lastIndexOf(System.getProperty("file.separator"), target.length());
return target.substring(c1 + 1, target.length());
}
/** Returns the Editable Report Properties for the Semantic Report */
public EditableReportProperties getEP() {
return this.EP;
}
}