/* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ /* * MultiLabelInstances.java * Copyright (C) 2009-2010 Aristotle University of Thessaloniki, Thessaloniki, Greece */ package mulan.data; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import mulan.core.ArgumentNullException; import mulan.core.MulanRuntimeException; import weka.core.Attribute; import weka.core.Instance; import weka.core.Instances; /** * Implements multi-label instances data set. Multi-label data are stored in Weka's * {@link Instances}. The class is a convenient wrapper. The data are loaded form * data file, checked for valid format. If hierarchy for labels is specified via * XML meta-data file, the data file is cross-checked with XML for consistency. 
* <br></br>
 * Applied rules:<br></br>
 * - label names must be unique<br></br>
 * - all labels in XML meta-data must be defined also in ARFF data set<br></br>
 * - each label attribute must be nominal with binary values ("0" and "1")<br></br>
 * - if labels have a hierarchy, then whenever a child label is <code>true</code> for some
 *   data instance, all its parent labels must also be <code>true</code> for that instance<br></br>
 *
 * @author Jozef Vilcek
 */
public class MultiLabelInstances {

    /** Nominal value of a label attribute which marks the label as set for an instance. */
    private static final String LABEL_SET_VALUE = "1";
    /** Nominal value of a label attribute which marks the label as not set for an instance. */
    private static final String LABEL_UNSET_VALUE = "0";

    private final Instances dataSet;
    private final LabelsMetaData labelsMetaData;

    /**
     * Creates a new instance of {@link MultiLabelInstances} data.
     * The label attributes are assumed to be at the end of ARFF data file. The count
     * is specified by parameter. Based on these attributes the {@link LabelsMetaData}
     * are created.
     *
     * @param arffFilePath the path to ARFF file containing the data
     * @param numLabelAttributes the number of ARFF data set attributes which are labels.
     * @throws ArgumentNullException if arffFilePath is null
     * @throws IllegalArgumentException if numLabelAttributes is less than 2, is greater than
     *         the number of attributes of the data set, or the ARFF file does not exist
     * @throws InvalidDataFormatException if format of loaded multi-label data is invalid
     * @throws DataLoadException if ARFF data file can not be loaded
     */
    public MultiLabelInstances(String arffFilePath, int numLabelAttributes) throws InvalidDataFormatException {
        if (arffFilePath == null) {
            throw new ArgumentNullException("arffFilePath");
        }
        if (numLabelAttributes < 2) {
            throw new IllegalArgumentException("The number of label attributes must me at least 2 or higher.");
        }

        Instances data = loadInstances(new File(arffFilePath));
        LabelsMetaData labelsData = loadLabelsMeta(data, numLabelAttributes);
        validate(data, labelsData);

        dataSet = data;
        labelsMetaData = labelsData;
    }

    /**
     * Creates a new instance of {@link MultiLabelInstances} data from the supplied {@link InputStream}
     * data source. The data in the stream are assumed to be in ARFF format.
     * The label attributes in ARFF data are assumed to be the last ones. Based on those attributes
     * the {@link LabelsMetaData} are created. The supplied stream is not closed by this constructor.
     *
     * @param arffDataStream the {@link InputStream} data source to load data in ARFF format
     * @param numLabelAttributes the number of last ARFF data set attributes which are labels.
     * @throws ArgumentNullException if {@link InputStream} data source is null
     * @throws IllegalArgumentException if number of labels attributes is less than 2 or is
     *         greater than the number of attributes of the data set
     * @throws InvalidDataFormatException if format of loaded multi-label data is invalid
     * @throws DataLoadException if ARFF data can not be loaded
     */
    public MultiLabelInstances(InputStream arffDataStream, int numLabelAttributes) throws InvalidDataFormatException {
        if (arffDataStream == null) {
            throw new ArgumentNullException("arffDataStream");
        }
        if (numLabelAttributes < 2) {
            throw new IllegalArgumentException("The number of label attributes must me at least 2 or higher.");
        }

        Instances data = loadInstances(arffDataStream);
        LabelsMetaData labelsData = loadLabelsMeta(data, numLabelAttributes);
        validate(data, labelsData);

        dataSet = data;
        labelsMetaData = labelsData;
    }

    /**
     * Creates a new instance of {@link MultiLabelInstances} data.
     * The Instances object is supplied by the caller and labels meta-data are loaded from XML file.
     * The load failure is indicated by {@link DataLoadException}. When meta-data are loaded,
     * validations are applied to ensure consistency between the data and specified labels meta-data.
     * The data set is not copied; only the reference is stored.
     *
     * @param data the Instances object containing the data
     * @param xmlLabelsDefFilePath the path to XML file containing labels meta-data
     * @throws ArgumentNullException if input parameters are null
     * @throws InvalidDataFormatException if format of loaded multi-label data is invalid
     * @throws DataLoadException if XML meta-data can not be loaded
     */
    public MultiLabelInstances(Instances data, String xmlLabelsDefFilePath) throws InvalidDataFormatException {
        // Fail early with a descriptive exception, consistently with the other constructors,
        // instead of a NullPointerException from the validation step.
        if (data == null) {
            throw new ArgumentNullException("data");
        }
        if (xmlLabelsDefFilePath == null) {
            throw new ArgumentNullException("xmlLabelsDefFilePath");
        }

        LabelsMetaData labelsData = loadLabelsMeta(xmlLabelsDefFilePath);
        validate(data, labelsData);

        dataSet = data;
        labelsMetaData = labelsData;
    }

    /**
     * Creates a new instance of {@link MultiLabelInstances} data.
     * The ARFF data file and labels meta-data are loaded separately. The load failure is
     * indicated by {@link DataLoadException}. When data are loaded, validations are applied
     * to ensure consistency between ARFF data and specified labels meta-data.
     *
     * @param arffFilePath the path to ARFF file containing the data
     * @param xmlLabelsDefFilePath the path to XML file containing labels meta-data
     * @throws ArgumentNullException if input parameters are null
     * @throws IllegalArgumentException if the ARFF file does not exist
     * @throws InvalidDataFormatException if format of loaded multi-label data is invalid
     * @throws DataLoadException if XML meta-data or ARFF data file can not be loaded
     */
    public MultiLabelInstances(String arffFilePath, String xmlLabelsDefFilePath) throws InvalidDataFormatException {
        if (arffFilePath == null) {
            throw new ArgumentNullException("arffFilePath");
        }
        if (xmlLabelsDefFilePath == null) {
            throw new ArgumentNullException("xmlLabelsDefFilePath");
        }

        Instances data = loadInstances(new File(arffFilePath));
        LabelsMetaData labelsData = loadLabelsMeta(xmlLabelsDefFilePath);
        validate(data, labelsData);

        dataSet = data;
        labelsMetaData = labelsData;
    }

    /**
     * Creates a new instance of {@link MultiLabelInstances} data from the supplied {@link InputStream}
     * data source. The data in the stream are assumed to be in ARFF format.
     * The labels meta data for ARFF data are retrieved separately from the different {@link InputStream}
     * data source. The meta data are assumed to be in XML format and conform to valid schema.
     * Data load failure is indicated by {@link DataLoadException}. When data are loaded, validations
     * are applied to ensure consistency between ARFF data and specified labels meta-data.
     * The supplied streams are not closed by this constructor.
     *
     * @param arffDataStream the {@link InputStream} data source to load data in ARFF format
     * @param xmlLabelsDefStream the {@link InputStream} data source to load XML labels meta data
     * @throws ArgumentNullException if input parameters are null
     * @throws InvalidDataFormatException if format of loaded multi-label data is invalid
     * @throws DataLoadException if XML meta-data or ARFF data can not be loaded
     */
    public MultiLabelInstances(InputStream arffDataStream, InputStream xmlLabelsDefStream) throws InvalidDataFormatException {
        if (arffDataStream == null) {
            throw new ArgumentNullException("arffDataStream");
        }
        if (xmlLabelsDefStream == null) {
            throw new ArgumentNullException("xmlLabelsDefStream");
        }

        Instances data = loadInstances(arffDataStream);
        LabelsMetaData labelsData = loadLabelsMeta(xmlLabelsDefStream);
        validate(data, labelsData);

        dataSet = data;
        labelsMetaData = labelsData;
    }

    /**
     * Creates a new instance of {@link MultiLabelInstances} data from existing {@link Instances}
     * and {@link LabelsMetaData}. The input parameters are not copied. Internally are stored only
     * references.<br></br>
     * The data set and labels meta data are validated against each other. Any violation of
     * validation criteria result in {@link InvalidDataFormatException}.
     *
     * @param dataSet the data set with data instances in multi-label format
     * @param labelsMetaData the meta-data about label attributes of data set
     * @throws ArgumentNullException if input parameters are null
     * @throws InvalidDataFormatException if multi-label data format is not valid
     */
    public MultiLabelInstances(Instances dataSet, LabelsMetaData labelsMetaData) throws InvalidDataFormatException {
        if (dataSet == null) {
            throw new ArgumentNullException("dataSet");
        }
        if (labelsMetaData == null) {
            throw new ArgumentNullException("labelsMetaData");
        }

        validate(dataSet, labelsMetaData);

        this.dataSet = dataSet;
        this.labelsMetaData = labelsMetaData;
    }

    /**
     * Gets the number of labels (label attributes)
     * @return number of labels
     */
    public int getNumLabels() {
        return labelsMetaData.getNumLabels();
    }

    /**
     * Gets the number of instances
     * @return number of instances
     */
    public int getNumInstances() {
        return dataSet.numInstances();
    }

    /**
     * Gets the cardinality of the dataset, i.e. the number of label values equal to "1"
     * summed over all instances and divided by the number of instances.
     *
     * @return dataset cardinality; <code>NaN</code> if the data set contains no instances
     */
    public double getCardinality() {
        double labelCardinality = 0;

        int numInstances = dataSet.numInstances();
        int[] labelIndices = getLabelIndices();

        for (int i = 0; i < numInstances; i++) {
            Instance instance = dataSet.instance(i);
            for (int j = 0; j < labelIndices.length; j++) {
                if (instance.stringValue(labelIndices[j]).equals(LABEL_SET_VALUE)) {
                    labelCardinality++;
                }
            }
        }

        labelCardinality /= numInstances;
        return labelCardinality;
    }

    /**
     * Gets the indices of the label attributes, in the order in which the attributes
     * appear in the underlying data set.
     *
     * @return an array with the indices of the label attributes inside the
     * Instances object
     */
    public int[] getLabelIndices() {
        int[] labelIndices = new int[labelsMetaData.getNumLabels()];
        int numAttributes = dataSet.numAttributes();
        Set<String> labelNames = labelsMetaData.getLabelNames();
        int counter = 0;
        for (int index = 0; index < numAttributes; index++) {
            Attribute attr = dataSet.attribute(index);
            if (labelNames.contains(attr.name())) {
                labelIndices[counter] = index;
                counter++;
            }
        }
        return labelIndices;
    }

    /**
     * Gets the order of label attributes. Each label name is mapped to its 0-based position
     * among the label attributes (not among all attributes), counted in the order in which
     * the attributes appear in the underlying data set.
     *
     * @return a mapping of label attribute names to their position among the label attributes
     */
    public Map<String, Integer> getLabelsOrder() {
        int numAttributes = dataSet.numAttributes();
        Set<String> labelNames = labelsMetaData.getLabelNames();
        HashMap<String, Integer> assoc = new HashMap<String, Integer>();

        int counter = 0;
        for (int index = 0; index < numAttributes; index++) {
            Attribute attr = dataSet.attribute(index);
            if (labelNames.contains(attr.name())) {
                assoc.put(attr.name(), counter);
                counter++;
            }
        }
        return assoc;
    }

    /**
     * Gets the {@link Set} of label {@link Attribute} instances of
     * this {@link MultiLabelInstances} instance.
     * @return the Set of label Attribute instances
     */
    public Set<Attribute> getLabelAttributes() {
        Set<String> labelNames = labelsMetaData.getLabelNames();
        Set<Attribute> labelAttributes = new HashSet<Attribute>(getNumLabels());
        int numAttributes = dataSet.numAttributes();
        for (int index = 0; index < numAttributes; index++) {
            Attribute attr = dataSet.attribute(index);
            if (labelNames.contains(attr.name())) {
                labelAttributes.add(attr);
            }
        }
        return labelAttributes;
    }

    /**
     * Gets the array with indices of feature attributes stored in
     * underlying {@link Instances} data set. A feature attribute is every attribute
     * whose name is not a label name in the meta-data.
     *
     * @return an array with the indices of the feature attributes, in ascending order
     */
    public int[] getFeatureIndices() {
        int numAttributes = dataSet.numAttributes();
        Set<String> labelNames = labelsMetaData.getLabelNames();

        // Collect into an over-sized buffer first; the exact count of features is
        // known only after the scan.
        int[] buffer = new int[numAttributes];
        int counter = 0;
        for (int index = 0; index < numAttributes; index++) {
            if (!labelNames.contains(dataSet.attribute(index).name())) {
                buffer[counter] = index;
                counter++;
            }
        }

        int[] featureIndices = new int[counter];
        System.arraycopy(buffer, 0, featureIndices, 0, counter);
        return featureIndices;
    }

    /**
     * Gets the {@link Set} of feature {@link Attribute} instances of
     * this {@link MultiLabelInstances} instance.
     * @return the {@link Set} of feature {@link Attribute} instances
     */
    public Set<Attribute> getFeatureAttributes() {
        Set<String> labelNames = labelsMetaData.getLabelNames();
        Set<Attribute> featureAttributes = new HashSet<Attribute>();
        int numAttributes = dataSet.numAttributes();
        for (int index = 0; index < numAttributes; index++) {
            Attribute attr = dataSet.attribute(index);
            if (!labelNames.contains(attr.name())) {
                featureAttributes.add(attr);
            }
        }
        return featureAttributes;
    }

    /**
     * Gets the {@link LabelsMetaData} instance, which contains descriptive meta-data about
     * label attributes stored in underlying {@link Instances} data set.
     *
     * @return descriptive meta-data about label attributes
     */
    public LabelsMetaData getLabelsMetaData() {
        return labelsMetaData;
    }

    /**
     * Gets underlying {@link Instances}, which contains all data.
     * The returned object is the internally stored reference, not a copy.
     *
     * @return underlying Instances object which contains all data
     */
    public Instances getDataSet() {
        return dataSet;
    }

    /**
     * If {@link Instances} data set are retrieved from {@link MultiLabelInstances} and
     * post-processed, modified by custom code, it can be again reintegrated into
     * {@link MultiLabelInstances} if needed. A copy of the underlying {@link LabelsMetaData} is
     * modified to reflect changes in data set. The method creates new instance of
     * {@link MultiLabelInstances} with modified data set and new meta-data; this instance
     * is left unchanged.
     * <br></br>
     * The supported changes are:<br></br>
     * - remove of label {@link Attribute} to the existing {@link Instances}<br></br>
     * - add/remove of {@link Instance} from the existing {@link Instances}<br></br>
     * - add/remove of feature/predictor {@link Attribute} to the existing {@link Instances}<br></br>
     *
     * @param modifiedDataSet the modified data set
     * @return new {@link MultiLabelInstances} wrapping the modified data set
     * @throws IllegalArgumentException if specified modified data set is null
     * @throws InvalidDataFormatException if multi-label data format with specified modifications is not valid
     */
    public MultiLabelInstances reintegrateModifiedDataSet(Instances modifiedDataSet) throws InvalidDataFormatException {
        if (modifiedDataSet == null) {
            throw new IllegalArgumentException("The modified data set is null.");
        }

        //TODO: add support for addition of label attributes to modified data set if necessary

        LabelsMetaDataImpl newMetaData = (LabelsMetaDataImpl) labelsMetaData.clone();
        // Iterate over names of the original meta-data, so removals from the copy
        // do not interfere with the iteration.
        Set<String> origLabelNames = labelsMetaData.getLabelNames();
        for (String labelName : origLabelNames) {
            if (modifiedDataSet.attribute(labelName) == null) {
                newMetaData.removeLabelNode(labelName);
            }
        }

        return new MultiLabelInstances(modifiedDataSet, newMetaData);
    }

    /**
     * Returns a copy of the {@link MultiLabelInstances} instance, built from a clone of
     * the labels meta-data and a copy of the underlying {@link Instances}.
     *
     * @return the copy of this instance
     * @throws MulanRuntimeException if the copied data do not pass validation
     */
    @Override
    public MultiLabelInstances clone() {
        LabelsMetaData metaDataCopy = labelsMetaData.clone();
        Instances dataSetCopy = new Instances(dataSet);
        try {
            return new MultiLabelInstances(dataSetCopy, metaDataCopy);
        } catch (InvalidDataFormatException ex) {
            // '%s' is required here: a bare '%' is an invalid conversion and would make
            // String.format throw, hiding the real cause of the failure.
            throw new MulanRuntimeException(
                    String.format("The cloning of '%s' class instance failed", getClass()), ex);
        }
    }

    /**
     * Create a HashMap that contains every label, with its depth in the Hierarchical tree
     * @return a HashMap that contains every label with its depth in the Hierarchical tree
     */
    public HashMap<String, Integer> getLabelDepth() {
        int numAttributes = dataSet.numAttributes();
        Set<String> labelNames = labelsMetaData.getLabelNames();
        HashMap<String, Integer> assoc = new HashMap<String, Integer>();

        for (int index = 0; index < numAttributes; index++) {
            Attribute attr = dataSet.attribute(index);
            if (labelNames.contains(attr.name())) {
                assoc.put(attr.name(), getDepth(attr.name()));
            }
        }
        return assoc;
    }

    /**
     * Calculates the depth of a label in the hierarchy of the tree of labels.
     * A root label (a label without parent) has the depth 1, each level below
     * adds 1.
     *
     * @param labelName the name of the label for which the depth is calculated
     * @return the depth of a label
     * @throws IllegalArgumentException if the meta-data hold no label node for the name
     */
    public int getDepth(String labelName) {
        LabelNode node = labelsMetaData.getLabelNode(labelName);
        if (node == null) {
            throw new IllegalArgumentException(
                    String.format("The label '%s' is not defined in labels meta-data.", labelName));
        }

        int depth = 1;
        while (node.hasParent()) {
            depth++;
            node = node.getParent();
        }
        return depth;
    }

    /**
     * Gets the depths of all labels in the labels hierarchy. The depths are ordered
     * in the same way as label attributes appear in the underlying data set.
     *
     * @return an array with the depth of each label attribute
     */
    public int[] getLabelDepthIndices() {
        int[] labelDepthIndices = new int[labelsMetaData.getNumLabels()];
        int numAttributes = dataSet.numAttributes();
        Set<String> labelNames = labelsMetaData.getLabelNames();
        int counter = 0;
        for (int index = 0; index < numAttributes; index++) {
            Attribute attr = dataSet.attribute(index);
            if (labelNames.contains(attr.name())) {
                labelDepthIndices[counter] = getDepth(attr.name());
                counter++;
            }
        }
        return labelDepthIndices;
    }

    /**
     * Method that checks whether an instance has missing labels
     *
     * @param instance one instance of this dataset
     * @return true if the instance has missing labels
     */
    public boolean hasMissingLabels(Instance instance) {
        int[] labelIndices = getLabelIndices();
        for (int j = 0; j < labelIndices.length; j++) {
            if (instance.isMissing(labelIndices[j])) {
                return true;
            }
        }
        return false;
    }

    /**
     * Loads the data set from ARFF file. The file stream opened here is always closed
     * before the method returns.
     */
    private static Instances loadInstances(File arffFile) {
        if (!arffFile.exists()) {
            throw new IllegalArgumentException(
                    String.format("The arff data file does not exists under specified path '%s'.",
                    arffFile.getAbsolutePath()));
        }

        FileInputStream fileStream;
        try {
            fileStream = new FileInputStream(arffFile);
        } catch (FileNotFoundException exception) {
            throw new DataLoadException(
                    String.format("The specified data file '%s' can not be found.",
                    arffFile.getAbsolutePath()), exception);
        }

        try {
            return loadInstances(fileStream);
        } finally {
            try {
                fileStream.close();
            } catch (IOException ignored) {
                // Either the data are already loaded, or loading failed with a more
                // meaningful exception; a failure to close must not mask that.
            }
        }
    }

    /**
     * Loads the data set in ARFF format from the stream. The stream is owned by the
     * caller and is not closed here.
     */
    private static Instances loadInstances(InputStream stream) {
        try {
            return new Instances(new InputStreamReader(stream));
        } catch (IOException exception) {
            // Pass the exception as the cause, so the origin of the failure is preserved.
            throw new DataLoadException(
                    "Error creating Instances data from supplied Reader data source", exception);
        }
    }

    /** Loads labels meta-data from XML file under the specified path. */
    private static LabelsMetaData loadLabelsMeta(String xmlLabelsDefFilePath) {
        try {
            return LabelsBuilder.createLabels(xmlLabelsDefFilePath);
        } catch (LabelsBuilderException exception) {
            throw new DataLoadException(
                    String.format("Error loading labels meta-data from xml file '%s'.",
                    xmlLabelsDefFilePath), exception);
        }
    }

    /** Loads labels meta-data in XML format from the stream. */
    private static LabelsMetaData loadLabelsMeta(InputStream xmlLabelsDefStream) {
        try {
            return LabelsBuilder.createLabels(xmlLabelsDefStream);
        } catch (LabelsBuilderException exception) {
            throw new DataLoadException("Error loading labels meta-data from input stream.", exception);
        }
    }

    /**
     * Creates flat (non-hierarchical) labels meta-data from the last <code>numLabels</code>
     * attributes of the data set. Each of these attributes becomes a root label node.
     */
    private static LabelsMetaData loadLabelsMeta(Instances data, int numLabels) throws InvalidDataFormatException {
        int numAttributes = data.numAttributes();
        if (numLabels > numAttributes) {
            // Without this check the loop below would start at a negative attribute index.
            throw new IllegalArgumentException(
                    String.format("The data set has only '%s' attributes, which is less than the requested '%s' label attributes.",
                    numAttributes, numLabels));
        }

        LabelsMetaDataImpl labelsData = new LabelsMetaDataImpl();
        for (int index = numAttributes - numLabels; index < numAttributes; index++) {
            String attrName = data.attribute(index).name();
            labelsData.addRootNode(new LabelNodeImpl(attrName));
        }

        // Fewer label nodes than requested label attributes is reported as duplicated names.
        if (labelsData.getNumLabels() < numLabels) {
            throw new InvalidDataFormatException("The names of label attributes are not unique.");
        }

        return labelsData;
    }

    /**
     * Does validation and integrity checks between data set and meta-data. The appropriate exception is
     * thrown if any inconsistencies of validation rules breached.
     * The passed data set and meta-data are not modified in any way.
     */
    private static void validate(Instances dataSet, LabelsMetaData labelsMetaData) throws InvalidDataFormatException {
        Set<String> labelNames = labelsMetaData.getLabelNames();
        if (labelNames.size() < 2) {
            throw new InvalidDataFormatException(
                    String.format("There must be at least 2 label attributes specified, but only '%s' are defined in metadata",
                    labelNames.size()));
        }

        int numAttributes = dataSet.numAttributes();
        int numMatches = 0;
        for (int index = 0; index < numAttributes; index++) {
            Attribute attribute = dataSet.attribute(index);
            if (labelNames.contains(attribute.name())) {
                numMatches++;
                if (!checkLabelAttributeFormat(attribute)) {
                    throw new InvalidDataFormatException(
                            String.format("The format of label attribute '%s' is not valid.",
                            attribute.name()));
                }
            }
        }

        if (numMatches != labelNames.size()) {
            throw new InvalidDataFormatException(
                    "Not all labels defined in meta-data are present in ARFF data file.");
        }

        if (labelsMetaData.isHierarchy()) {
            checkLabelsConsistency(dataSet, labelsMetaData.getRootLabels());
        }
    }

    // Checks label attribute, if it is nominal and has exactly the two
    // binary values "0" and "1" (in any order).
    private static boolean checkLabelAttributeFormat(Attribute attribute) {
        if (!attribute.isNominal()) {
            return false;
        }
        if (attribute.numValues() != 2) {
            return false;
        }

        boolean hasUnsetValue = false;
        boolean hasSetValue = false;
        for (int index = 0; index < 2; index++) {
            String value = attribute.value(index);
            if (LABEL_UNSET_VALUE.equals(value)) {
                hasUnsetValue = true;
            } else if (LABEL_SET_VALUE.equals(value)) {
                hasSetValue = true;
            }
        }
        return hasUnsetValue && hasSetValue;
    }

    // Checks the consistency of labels if there is a hierarchy between them.
    // If child labels is 'true' for some instance, all its parent labels should be
    // also 'true' for the instance.
    private static void checkLabelsConsistency(Instances dataSet, Set<LabelNode> rootLabelNodes) throws InvalidDataFormatException {
        // create an index for faster access to attribute based on name
        Map<String, Attribute> attributesIndex = new HashMap<String, Attribute>();
        for (int index = 0; index < dataSet.numAttributes(); index++) {
            Attribute attribute = dataSet.attribute(index);
            attributesIndex.put(attribute.name(), attribute);
        }

        int numInstances = dataSet.numInstances();
        for (int index = 0; index < numInstances; index++) {
            Instance instance = dataSet.instance(index);
            for (LabelNode labelNode : rootLabelNodes) {
                // a root label has no parent, so it is always allowed to be set
                checkSubtreeConsistency(labelNode, instance, true, attributesIndex);
            }
        }
    }

    // Recursively checks the subtree rooted at the node. The label of the node may be set
    // only if 'canBeLabelSet' is true, i.e. if the label of its parent is set.
    private static void checkSubtreeConsistency(LabelNode node, Instance instance, boolean canBeLabelSet,
            Map<String, Attribute> attributesIndex) throws InvalidDataFormatException {
        boolean isLabelSet = isLabelSet(instance, node.getName(), attributesIndex);
        if (isLabelSet && !canBeLabelSet) {
            throw new InvalidDataFormatException(
                    String.format("Consistency of labels hierarchy is breached for: Label='%s', Instance='%s'",
                    node.getName(), instance.toString()));
        }
        if (node.hasChildren()) {
            Set<LabelNode> childNodes = node.getChildren();
            for (LabelNode child : childNodes) {
                checkSubtreeConsistency(child, instance, isLabelSet, attributesIndex);
            }
        }
    }

    // Tells whether the value of the specified label attribute is "1" for the instance.
    private static boolean isLabelSet(Instance instance, String labelName, Map<String, Attribute> attributesIndex) {
        return instance.stringValue(attributesIndex.get(labelName)).equals(LABEL_SET_VALUE);
    }
}