/*
* Copyright 2004-2010 Information & Software Engineering Group (188/1)
* Institute of Software Technology and Interactive Systems
* Vienna University of Technology, Austria
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.ifs.tuwien.ac.at/dm/somtoolbox/license.html
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package at.tuwien.ifs.somtoolbox.data;
import static org.apache.commons.lang.StringUtils.isBlank;
import java.awt.Color;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.logging.Logger;
import org.jfree.chart.plot.DefaultDrawingSupplier;
import org.jfree.chart.plot.DrawingSupplier;
import org.jfree.util.PaintList;
import com.martiansoftware.jsap.JSAPResult;
import at.tuwien.ifs.somtoolbox.SOMToolboxException;
import at.tuwien.ifs.somtoolbox.apps.config.OptionFactory;
import at.tuwien.ifs.somtoolbox.input.SOMLibFileFormatException;
import at.tuwien.ifs.somtoolbox.util.FileUtils;
import at.tuwien.ifs.somtoolbox.util.StdErrProgressWriter;
import at.tuwien.ifs.somtoolbox.util.StringIntegerComparator;
import at.tuwien.ifs.somtoolbox.util.StringUtils;
/**
* This class provides information about class labels for the {@link InputData} input vectors.<br>
* <p>
* The file format consists of a <code>header</code> and the content as follows:
* </p>
* <b>$TYPE</b> string, mandatory. Fixed to <i>class_information.</i> <br>
* <b>$NUM_CLASSES</b> integer, mandatory: gives the number of classes. <br>
* <b>$CLASS_NAMES</b> mandatory: a space-separated list of class names; the count has to be the same as in
* $NUM_CLASSES. <br>
* <b>$XDIM</b> integer, mandatory: number of units in x-direction. Fixed to <i>2</i>. <br>
* <b>$YDIM</b> integer, mandatory: dimensionality class information vector, equals the number of input vectors (
* {@link InputData#numVectors()}). <br>
* <b>labelName_n classIndex_n</b> the $YDIM number of mappings from the input vector label name to the class label
* index [0...($NUM_CLASSES-1)]. <br>
* <p>
* See also an example file from the <a href="../../../../../examples/iris.cls">Iris data set</a>.
* </p>
* <p>
* Alternatively, the file format can be simpler and not contain any file header. Then, there is only a list of
* lines with two tabulator-separated <code>Strings</code> in the form of <code>labelName className</code>.<br>
* The number of classes, and the indices of those classes, are computed automatically.
* </p>
* <p>
* Finally, the simplest form of the file is to have lines with just the class label; then, this class is assigned to
* the input datum with the index of the line number.<br>
* The number of classes, and the indices of those classes, are computed automatically.
* </p>
*
* @author Michael Dittenbach
* @author Thomas Lidy
* @author Rudolf Mayer
* @version $Id: SOMLibClassInformation.java 3888 2010-11-02 17:42:53Z frank $
*/
@SuppressWarnings("rawtypes")
public class SOMLibClassInformation {
    private static final Logger logger = Logger.getLogger("at.tuwien.ifs.somtoolbox");

    /** The file name to read from. */
    protected String classInformationFileName = null;

    /**
     * The number of classes. Either read from the file header, or computed from the distinct number of class names
     * collected via {@link #addItem(String, String)}.
     */
    private int numClasses = 0;

    /**
     * The names of the classes. Either read from the file header, or computed from the distinct class names
     * collected via {@link #addItem(String, String)}.
     */
    private String[] classNames = null;

    /** The number of inputs in each class, indexed by class index. */
    private int[] classMemberCount = null;

    /**
     * The number of input vectors. Either read from the file header, or computed from the number of items collected
     * via {@link #addItem(String, String)}.
     */
    protected int numData = 0;

    /** The labels of the input vectors, in the order in which they were read or added. */
    private String[] dataNames = null;

    /** A mapping input index =&gt; class index, for fast lookup. */
    private int[] dataClasses = null;

    /**
     * Mapping input vector label =&gt; class. While items are being collected with {@link #addItem(String, String)}
     * the value is the class <i>name</i> (a String); {@link #processItems(boolean)} and the header-based reader store
     * the class <i>index</i> (an Integer) instead.
     */
    private HashMap<String, Comparable> dataHash = null;

    /** Distinct class names collected by {@link #addItem(String, String)}. */
    private ArrayList<String> classNamesTemp;

    /** Distinct input labels collected by {@link #addItem(String, String)}. */
    private ArrayList<String> dataNamesTemp;

    /** The colour of each class, indexed by class index. */
    private PaintList paintList;

    /**
     * Constructor intended to be used e.g. when generating data, or when reading a file with the
     * {@link SOMPAKInputData}. Only the collections used by {@link #addItem(String, String)} are created; call
     * {@link #processItems(boolean)} once all items have been added.
     */
    public SOMLibClassInformation() {
        dataHash = new HashMap<String, Comparable>();
        classNamesTemp = new ArrayList<String>();
        dataNamesTemp = new ArrayList<String>();
    }

    /**
     * Creates the class information from a mapping of input label to class name. The class names are not sorted.
     *
     * @param classAssignment mapping input label =&gt; class name
     */
    public SOMLibClassInformation(Map<String, String> classAssignment) {
        this();
        for (Map.Entry<String, String> entry : classAssignment.entrySet()) {
            addItem(entry.getKey(), entry.getValue());
        }
        processItems(false);
    }

    /**
     * Constructor intended to be used when generating data.
     *
     * @param classNames the names of the classes
     * @param dataName for each class index, the labels of the inputs belonging to that class
     * @throws IllegalArgumentException if <code>dataName</code> has more entries than <code>classNames</code>
     */
    public SOMLibClassInformation(String[] classNames, String[][] dataName) {
        this();
        if (dataName.length > classNames.length) {
            throw new IllegalArgumentException("Got data names for " + dataName.length + " classes, but only "
                    + classNames.length + " class names.");
        }
        this.classNames = classNames;
        numClasses = classNames.length;
        classMemberCount = new int[numClasses];
        numData = 0;
        for (int i = 0; i < dataName.length; i++) {
            classMemberCount[i] = dataName[i].length;
            numData += dataName[i].length;
        }
        dataNames = new String[numData];
        dataClasses = new int[numData];
        int index = 0;
        for (int i = 0; i < dataName.length; i++) {
            for (int j = 0; j < dataName[i].length; j++) {
                dataNames[index] = dataName[i][j];
                dataClasses[index] = i;
                dataHash.put(dataName[i][j], Integer.valueOf(i));
                index++;
            }
        }
        initPaintList();
    }

    /**
     * Creates a new class information object by trying to read the given file, in this order, as a file with a header
     * ({@link #readSOMLibClassInformationFile()}), as a tab-separated file, and finally as a file containing only one
     * class label per line.
     *
     * @param classInformationFileName The file to read from
     * @throws SOMToolboxException if there is any error in the file format
     */
    public SOMLibClassInformation(String classInformationFileName) throws SOMToolboxException {
        this.classInformationFileName = classInformationFileName;
        try {
            try {
                readSOMLibClassInformationFile();
            } catch (ClassInfoHeaderNotFoundException nfe) {
                logger.info("Reading SOMLib Class infromation file format failed: " + nfe.getMessage());
                logger.info("Trying to read tab/space separated class info file...");
                try {
                    readTabSepClassInformationFile();
                } catch (SOMToolboxException e) {
                    logger.info("Reading tab/space separated class file failed: " + e.getMessage());
                    logger.info("Trying to read simple format...");
                    readSimple();
                }
            }
        } catch (IOException e) {
            throwClassInfoReadingError(classInformationFileName, e);
        }
        if (paintList == null) {
            initPaintList();
        }
        logger.info("Class information file correctly loaded.");
    }

    /** Converts an I/O problem into the file format exception reported to callers of the constructor. */
    private void throwClassInfoReadingError(String classInformationFileName, IOException e)
            throws SOMLibFileFormatException {
        throw new SOMLibFileFormatException("Problems reading class information file " + classInformationFileName
                + ": ' " + e.getMessage() + "'. Aborting.");
    }

    /** Closes the reader; a failure to close is only logged so that it cannot hide an earlier exception. */
    private static void closeQuietly(BufferedReader reader) {
        try {
            reader.close();
        } catch (IOException e) {
            logger.warning("Could not close class information reader: " + e.getMessage());
        }
    }

    /**
     * Translates a value stored in {@link #dataHash} into a class name. Integer values are class indices into the
     * given array; any other value (or an index outside the array) is returned unchanged.
     */
    private static Object storedClassName(Object stored, String[] names) {
        if (stored instanceof Integer && names != null) {
            int id = ((Integer) stored).intValue();
            if (id >= 0 && id < names.length) {
                return names[id];
            }
        }
        return stored;
    }

    /**
     * Reads a class information file containing no header, and tab-separated String entries for the input vector and
     * class labels. Empty lines are ignored.
     *
     * @throws SOMToolboxException if there is any error in the file format
     * @throws IOException if the file cannot be read
     */
    private void readTabSepClassInformationFile() throws SOMToolboxException, IOException {
        classNamesTemp = new ArrayList<String>();
        dataNamesTemp = new ArrayList<String>();
        dataHash = new HashMap<String, Comparable>();
        BufferedReader br = FileUtils.openFile("Class information file", classInformationFileName);
        try {
            String line;
            int lineNumber = 0;
            while ((line = br.readLine()) != null) {
                lineNumber++;
                if (line.trim().length() == 0) {
                    continue; // ignore empty lines
                }
                String[] lineElements = line.split("[\t]+");
                if (lineElements.length != 2) {
                    throw new SOMLibFileFormatException(
                            "Number of elements per line must be exactly 2! Error in line " + lineNumber);
                }
                addItem(lineElements[0], lineElements[1]);
            }
        } finally {
            closeQuietly(br);
        }
        processItems(true);
    }

    /**
     * Reads a file that contains just one class label per line; the input label is the zero-based line number. Blank
     * lines are only accepted at the end of the file.
     *
     * @throws SOMToolboxException if a blank line is followed by a non-blank line
     * @throws IOException if the file cannot be read
     */
    private void readSimple() throws SOMToolboxException, IOException {
        // A previously failed tab-separated attempt may have left partially filled collections behind; start clean.
        classNamesTemp = new ArrayList<String>();
        dataNamesTemp = new ArrayList<String>();
        dataHash = new HashMap<String, Comparable>();
        BufferedReader br = FileUtils.openFile("Class information file", classInformationFileName);
        try {
            boolean lastEmpty = false;
            String line;
            int lineNumber = 0;
            while ((line = br.readLine()) != null) {
                lineNumber++;
                if (isBlank(line)) {
                    lastEmpty = true;
                    continue;
                }
                if (lastEmpty) {
                    throw new SOMLibFileFormatException("Empty line # " + lineNumber);
                }
                addItem(String.valueOf(lineNumber - 1), line.trim());
            }
        } finally {
            closeQuietly(br);
        }
        processItems(false);
    }

    /**
     * Builds the lookup structures (class names, data names, class index per input, member counts, colours) from the
     * items collected with {@link #addItem(String, String)}. The method may be called again after further items were
     * added; class indices stored by an earlier call are translated back to their class names first.
     *
     * @param sort whether the class names shall be sorted with a {@link StringIntegerComparator}
     */
    public void processItems(boolean sort) {
        // Indices stored by a previous run refer to this array, so remember it before it is replaced.
        String[] previousClassNames = classNames;
        numData = dataNamesTemp.size();
        numClasses = classNamesTemp.size();
        classNames = classNamesTemp.toArray(new String[numClasses]);
        if (sort) {
            Arrays.sort(classNames, new StringIntegerComparator(classNames));
        }
        classNamesTemp = new ArrayList<String>(Arrays.asList(classNames));
        dataNames = dataNamesTemp.toArray(new String[numData]);
        dataClasses = new int[numData];
        classMemberCount = new int[numClasses];
        for (int i = 0; i < numData; i++) {
            String label = dataNames[i];
            Object classname = storedClassName(dataHash.get(label), previousClassNames);
            int classid = classNamesTemp.indexOf(classname);
            if (classid < 0) {
                System.out.printf("(%d) Did not find classindex for \"%s\", which is the class of \"%s\"%n", i,
                        classname, label);
                continue;
            }
            dataClasses[i] = classid;
            dataHash.put(label, Integer.valueOf(classid));
            classMemberCount[classid]++;
        }
        initPaintList();
    }

    /**
     * Registers an input label together with its class name. A label that is already known keeps its existing class;
     * a warning is logged if the new class differs. {@link #processItems(boolean)} has to be called afterwards to
     * make the item visible through the lookup methods.
     *
     * @param label the label of the input vector
     * @param classname the name of the class the input belongs to
     */
    public void addItem(String label, String classname) {
        if (!classNamesTemp.contains(classname)) {
            classNamesTemp.add(classname);
        }
        if (!dataHash.containsKey(label)) {
            dataHash.put(label, classname);
            dataNamesTemp.add(label);
            return;
        }
        // After processItems() the map holds class indices, so translate before comparing with the new name.
        Object existing = storedClassName(dataHash.get(label), classNames);
        if (existing == null || !existing.equals(classname)) {
            logger.warning("Ignoring duplicate label " + label + ", existing class is: '" + existing
                    + "', new class would have been: '" + classname + "'.");
        }
    }

    /**
     * Reads the integer value of a header line of the form <code>$KEY value</code>.
     *
     * @param line the complete header line
     * @param key the header key, used for the error message only
     * @return the parsed value
     * @throws SOMLibFileFormatException if the value is missing or not an integer
     */
    private static int parseHeaderInt(String line, String key) throws SOMLibFileFormatException {
        String[] lineElements = line.split(StringUtils.REGEX_SPACE_OR_TAB);
        if (lineElements.length < 2) {
            throw new SOMLibFileFormatException("Class information file format corrupt in " + key
                    + " line. Aborting.");
        }
        try {
            return Integer.parseInt(lineElements[1]);
        } catch (NumberFormatException e) {
            throw new SOMLibFileFormatException("Class information file format corrupt in " + key + " line: '"
                    + lineElements[1] + "' is not an integer. Aborting.");
        }
    }

    /**
     * Parses a <code>$CLASS_COLOURS</code> header line consisting of <code>r,g,b</code> triples into
     * {@link #paintList}. If fewer colours than classes are given, the remaining classes get colours from a
     * {@link DefaultDrawingSupplier}.
     *
     * @throws SOMLibFileFormatException if the line has no colours or a colour is malformed
     */
    private void parseClassColours(String line) throws SOMLibFileFormatException {
        String[] lineElements = line.split(StringUtils.REGEX_SPACE_OR_TAB);
        if (lineElements.length < 2) {
            throw new SOMLibFileFormatException(
                    "Class information file format corrupt in $CLASS_COLOURS line. Aborting.");
        }
        paintList = new PaintList();
        int colourIndex = 0; // number of colours stored so far == index of the next colour
        for (int i = 1; i < lineElements.length; i++) {
            String[] rgb = lineElements[i].split(",");
            if (rgb.length != 3) {
                throw new SOMLibFileFormatException(
                        "Class information file format corrupt in $CLASS_COLOURS - colour '" + lineElements[i]
                                + "' is not correct. Aborting.");
            }
            try {
                Color colour = new Color(Integer.parseInt(rgb[0]), Integer.parseInt(rgb[1]),
                        Integer.parseInt(rgb[2]));
                paintList.setPaint(colourIndex, colour);
                colourIndex++;
            } catch (IllegalArgumentException e) {
                // covers NumberFormatException as well as colour components outside the range accepted by Color
                throw new SOMLibFileFormatException(
                        "Class information file format corrupt in $CLASS_COLOURS - colour '" + lineElements[i]
                                + "' is not correct:" + e.getMessage() + "Aborting.");
            }
        }
        if (colourIndex < numClasses()) { // too few colours given
            logger.info("Class info file contained too few class colours (" + colourIndex + ", # of classes: "
                    + numClasses() + "), filling up with default classes.");
            DrawingSupplier supplier = new DefaultDrawingSupplier();
            for (int i = colourIndex; i < numClasses(); i++) {
                paintList.setPaint(i, supplier.getNextPaint());
            }
        }
    }

    /**
     * Reads a class information file containing a header and class indices.
     * <p>
     * Header lines start with <code>$</code>; lines starting with <code>#</code> inside the header are skipped. The
     * first line that starts with neither ends the header and is treated as the first data line. Blank data lines are
     * ignored.
     * </p>
     *
     * @throws ClassInfoHeaderNotFoundException if the file contains no header line at all
     * @throws SOMToolboxException if the header or the data lines are malformed
     * @throws IOException if the file cannot be read
     */
    protected void readSOMLibClassInformationFile() throws IOException, SOMToolboxException {
        String line = null;
        int index = 0; // number of header lines, later number of data lines
        int columns = 0;
        numData = 0;
        BufferedReader br = FileUtils.openFile("Class information file", classInformationFileName);
        try {
            // PROCESS HEADER as long as lines start with $
            while ((line = br.readLine()) != null) {
                if (line.startsWith("#")) {
                    continue; // comment line
                }
                if (!line.startsWith("$")) {
                    break;
                }
                index++;
                if (line.startsWith("$TYPE")) {
                    // ignore
                } else if (line.startsWith("$NUM_CLASSES")) {
                    int declaredClasses = parseHeaderInt(line, "$NUM_CLASSES");
                    if (declaredClasses < 0) {
                        throw new SOMLibFileFormatException(
                                "Class information file format corrupt in $NUM_CLASSES line. Aborting.");
                    }
                    numClasses = declaredClasses;
                } else if (line.startsWith("$CLASS_NAMES")) {
                    String[] lineElements = line.split(StringUtils.REGEX_SPACE_OR_TAB);
                    if (lineElements.length < 2) {
                        throw new SOMLibFileFormatException(
                                "Class information file format corrupt in $CLASS_NAMES line. Aborting.");
                    }
                    if (lineElements.length != numClasses + 1) {
                        throw new SOMLibFileFormatException(
                                "Class information file format corrupt in $CLASS_NAMES line; expecting to find "
                                        + numClasses + " classes, but found " + (lineElements.length - 1)
                                        + ". Aborting.");
                    }
                    classNames = new String[numClasses];
                    System.arraycopy(lineElements, 1, classNames, 0, numClasses);
                } else if (line.startsWith("$XDIM")) {
                    columns = parseHeaderInt(line, "$XDIM");
                    if (columns < 2) {
                        throw new SOMLibFileFormatException(
                                "Class information file format corrupt. At least 2 columns (name, classId) required. Aborting.");
                    }
                } else if (line.startsWith("$YDIM")) {
                    numData = parseHeaderInt(line, "$YDIM");
                } else if (line.startsWith("$CLASS_COLOURS")) {
                    parseClassColours(line);
                }
            }
            if (index == 0) {
                throw new ClassInfoHeaderNotFoundException(
                        "Class information file: no header line starting with $ found");
            }
            classMemberCount = new int[numClasses];
            // READ REST OF THE FILE
            if (numData <= 0) {
                throw new SOMLibFileFormatException(
                        "Class information file format corrupt. Missing $YDIM value. Aborting.");
            }
            dataNames = new String[numData];
            dataClasses = new int[numData];
            dataHash = new HashMap<String, Comparable>(numData);
            index = 0;
            for (; line != null; line = br.readLine()) {
                if (line.trim().length() == 0) {
                    continue; // ignore empty lines, as the tab-separated reader does
                }
                index++;
                if (index > numData) {
                    throw new SOMLibFileFormatException(
                            "Class information file format corrupt. More data items than the " + numData
                                    + " announced in $YDIM. Aborting.");
                }
                String[] lineElements = line.split(StringUtils.REGEX_SPACE_OR_TAB);
                if (lineElements.length != columns) {
                    throw new SOMLibFileFormatException("Class information file format corrupt in element number "
                            + index + ", incorrect number of columns: XDIM: " + columns + ", columns: "
                            + lineElements.length + ". Aborting.");
                }
                int classId;
                try {
                    classId = Integer.parseInt(lineElements[1]);
                } catch (NumberFormatException e) {
                    throw new SOMLibFileFormatException("Class id number format corrupt in element number "
                            + index + ": '" + lineElements[1] + "'. Aborting.");
                }
                if (classId < 0 || classId >= numClasses) {
                    throw new SOMLibFileFormatException("Class information file format corrupt in element number "
                            + index + ": class index " + classId + " is not within [0, " + (numClasses - 1)
                            + "]. Aborting.");
                }
                dataNames[index - 1] = lineElements[0];
                dataClasses[index - 1] = classId;
                dataHash.put(lineElements[0], Integer.valueOf(classId));
                classMemberCount[classId]++;
            }
            if (index != numData) {
                throw new SOMLibFileFormatException(
                        "Class information file format corrupt. Incorrect number of data items. Aborting.\n"
                                + Integer.toString(index) + " " + Integer.toString(numData));
            }
        } finally {
            closeQuietly(br);
        }
    }

    /** Returns the names of the classes. */
    public String[] getClassNames() {
        return classNames;
    }

    /** Returns, for each class, the labels of the inputs assigned to it. */
    public String[][] getDataNamesPerClass() {
        String[][] all = new String[classNames.length][];
        for (int i = 0; i < classNames.length; i++) {
            all[i] = getDataNamesInClass(classNames[i]);
        }
        return all;
    }

    /** Returns the labels of all inputs. */
    public String[] getDataNames() {
        return dataNames;
    }

    /**
     * Gets the number of classes, as read from $NUM_CLASSES, or computed.
     *
     * @return the number of classes.
     */
    public int numClasses() {
        return numClasses;
    }

    /**
     * Returns all the distinct class names.
     *
     * @return the class names.
     */
    public String[] classNames() {
        return classNames;
    }

    /**
     * Gets the class index assigned to an input vector label. Despite the parameter name, the lookup key is the label
     * of an input vector, not the name of a class.
     *
     * @param className the label of the input vector.
     * @return the index of the class of that input, or -1 if the label is unknown or no class index has been
     *         assigned to it yet.
     */
    public int getClassIndex(String className) {
        Object classid = dataHash.get(className);
        if (classid instanceof Integer) {
            return ((Integer) classid).intValue();
        }
        return -1;
    }

    /**
     * Gets the class label name for a given input vector index.
     *
     * @param index index of the input vector.
     * @return the name of the class.
     */
    public String getClassName(int index) {
        return classNames[dataClasses[index]];
    }

    /**
     * Gets the class name for a vector name.
     *
     * @param vectorName the name of the input vector.
     * @return the name of the class.
     * @throws SOMLibFileFormatException If there is no class information available for the given vector name/label
     */
    public String getClassName(String vectorName) throws SOMLibFileFormatException {
        Object object = dataHash.get(vectorName);
        if (object instanceof Integer) {
            return classNames[((Integer) object).intValue()];
        }
        throw new SOMLibFileFormatException(
                "Class information file incomplete. Error pairing input vectors with class names for vector label '"
                        + vectorName + "'.");
    }

    /** Returns whether a class assignment is stored for the given input vector label. */
    public boolean hasClassAssignmentForName(String vectorName) {
        return dataHash.containsKey(vectorName);
    }

    /**
     * Gets the class index for a vector name.
     *
     * @param vectorName the name of the input vector.
     * @return the index of the class.
     * @throws SOMLibFileFormatException If there is no class index available for the given vector name/label
     */
    public int getClassIndexForInput(String vectorName) throws SOMLibFileFormatException {
        Object object = dataHash.get(vectorName);
        if (object instanceof Integer) {
            return ((Integer) object).intValue();
        }
        throw new SOMLibFileFormatException(
                "Class information file corrupt. Error pairing input vectors with class names for vector: "
                        + vectorName);
    }

    /**
     * Gets the number of input vectors in the given class.
     *
     * @param classIndex the index of the class.
     * @return the total number of inputs in that class.
     */
    public int getNumberOfClassMembers(int classIndex) {
        return classMemberCount[classIndex];
    }

    /**
     * Gets the share of input vectors belonging to the given class.
     *
     * @param classIndex the index of the class.
     * @return the member count of the class divided by the number of inputs, i.e. a fraction rather than a value
     *         scaled to 100.
     */
    public double getPercentageOfClassMembers(int classIndex) {
        return classMemberCount[classIndex] / (double) numData;
    }

    /**
     * Gets the labels of all inputs assigned to the class with the given name.
     *
     * @param className the name of the class.
     * @return the labels of the inputs in that class; empty if there are none.
     */
    public String[] getDataNamesInClass(String className) {
        ArrayList<String> members = new ArrayList<String>();
        for (int i = 0; i < numData; i++) {
            if (getClassName(i).equals(className)) {
                members.add(dataNames[i]);
            }
        }
        return members.toArray(new String[members.size()]);
    }

    /**
     * Counts, per class, how many of the given input labels belong to it. Labels without a class index are reported
     * on standard error and not counted.
     *
     * @param labelNames the input labels to count; may be <code>null</code>
     * @return an array of length {@link #numClasses()} holding the number of labels per class
     */
    public int[] computeClassDistribution(String[] labelNames) {
        int[] values = new int[numClasses()]; // initialised to 0 by the JVM
        if (labelNames == null) {
            return values;
        }
        for (String labelName : labelNames) {
            int ci = getClassIndex(labelName);
            if (ci < 0) {
                System.err.println("ERROR: Class index could not be retrieved for item " + labelName);
            } else {
                values[ci]++;
            }
        }
        return values;
    }

    /** Initialise a standard paint list with one colour of a {@link DefaultDrawingSupplier} per class. */
    private void initPaintList() {
        paintList = new PaintList();
        DrawingSupplier supplier = new DefaultDrawingSupplier();
        for (int i = 0; i < numClasses(); i++) {
            paintList.setPaint(i, supplier.getNextPaint());
        }
    }

    /** Get the class colours as {@link PaintList}. */
    public PaintList getPaintList() {
        return paintList;
    }

    /** Get all class colours. */
    public Color[] getClassColors() {
        Color[] res = new Color[paintList.size()];
        for (int i = 0; i < res.length; i++) {
            res[i] = (Color) paintList.getPaint(i);
        }
        return res;
    }

    /** Get the colour for the given class index. */
    public Color getClassColor(int index) {
        return (Color) paintList.getPaint(index);
    }

    /** Set the colour for the given class index. */
    public void setClassColor(int index, Color color) {
        paintList.setPaint(index, color);
    }

    /**
     * Load colours from an external (non-classinfo) file. Each line has to consist of three space-separated integers
     * (red, green, blue); the n-th line sets the colour of the class with index n. Lines beyond the number of classes
     * are skipped with a warning, and classes without a line keep their current colour.
     *
     * @param file the colour file
     * @return <code>true</code> if the file was read without error, <code>false</code> otherwise
     */
    public boolean loadClassColours(File file) {
        BufferedReader br = null;
        try {
            br = new BufferedReader(new FileReader(file));
            String line;
            int i = 0;
            boolean done = false;
            while (!done && (line = br.readLine()) != null) {
                if (i >= numClasses()) {
                    logger.warning("Color file contains more colors than needed in current class visualization. Skipping colors.");
                    done = true;
                } else {
                    String[] strcols = line.split(" ");
                    if (strcols.length != 3) {
                        logger.severe("Color file: Error in line '" + line
                                + "' - did not find 3 int color parts (RGB)!");
                        return false;
                    }
                    int r = Integer.parseInt(strcols[0]);
                    int g = Integer.parseInt(strcols[1]);
                    int b = Integer.parseInt(strcols[2]);
                    // set new color
                    paintList.setPaint(i, new Color(r, g, b));
                    i++;
                }
            }
            if (i < numClasses()) {
                logger.warning("Color file contained less colors than needed in current class visualization. Keeping some old colors.");
            }
            logger.info("Successfully loaded " + i + " class colours from file '" + file.getAbsolutePath() + "'.");
            return true;
        } catch (Exception ex) {
            // best effort: any problem (I/O, malformed numbers, invalid colour values) is reported as failure
            logger.log(java.util.logging.Level.SEVERE,
                    "Could not read color file" + file.toString() + "! " + ex.getMessage(), ex);
            return false;
        } finally {
            if (br != null) {
                closeQuietly(br);
            }
        }
    }

    /**
     * Drops all class assignments whose label is not contained in the given input data, and prints a summary of the
     * resulting class distribution to standard output. Classes that end up without members are kept.
     *
     * @param inputData the input data to compare the labels against
     */
    public void removeNotPresentElements(SOMLibSparseInputData inputData) {
        int[] classMemberCountTemp = new int[numClasses];
        // insertion-ordered, so the reduced data keeps the original order
        LinkedHashMap<String, Comparable> newData = new LinkedHashMap<String, Comparable>();
        StdErrProgressWriter progress = new StdErrProgressWriter(dataNames.length, "Checking class info for item ",
                10);
        for (int i = 0; i < dataNames.length; i++) {
            if (inputData.getInputDatum(dataNames[i]) != null) {
                // keep the data
                newData.put(dataNames[i], dataClasses[i]);
                classMemberCountTemp[dataClasses[i]]++;
            }
            progress.progress();
        }
        System.out.println("Original class info size: " + dataNames.length);
        System.out.println("Reduced class info size: " + newData.size());
        System.out.println("Input data size: " + inputData.numVectors());
        System.out.println("The new class distribution:");
        for (int i = 0; i < classMemberCountTemp.length; i++) {
            System.out.println(" " + i + "\t" + classNames[i] + "\t: " + classMemberCountTemp[i]);
        }
        // FIXME: we should also check if there are some classes that are not present any more!
        classMemberCount = classMemberCountTemp;
        dataNames = newData.keySet().toArray(new String[newData.size()]);
        dataClasses = new int[dataNames.length];
        for (int i = 0; i < dataClasses.length; i++) {
            // the map is keyed by label, so the class index has to be looked up by name, not by position
            dataClasses[i] = ((Integer) newData.get(dataNames[i])).intValue();
        }
        numData = newData.size();
        dataHash = newData;
        for (String element : inputData.dataNames) {
            if (dataHash.get(element) == null) {
                System.out.println("Could not find class for input '" + element + "'.");
            }
        }
    }

    /** Method for stand-alone execution to convert a file to the SOMLibClassInformation format. */
    public static void main(String[] args) throws SOMToolboxException, IOException {
        // register and parse all options
        JSAPResult config = OptionFactory.parseResults(args, OptionFactory.getOptClassInformationFile(true),
                OptionFactory.getOptOutputFileName(true));
        InputDataWriter.writeAsSOMLib(new SOMLibClassInformation(config.getString("classInformationFile")),
                config.getString("output"));
    }
}