/* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ /* * DataPair.java * Copyright (C) 2009-2010 Aristotle University of Thessaloniki, Thessaloniki, Greece */ package mulan.classifier.neural; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import mulan.core.ArgumentNullException; import mulan.data.MultiLabelInstances; import weka.core.Attribute; import weka.core.Instance; import weka.core.Instances; /** * Class for representation of a data-pair instance. The data pair contains * an input pattern and respected true or expected output pattern for the input pattern. * * @author Jozef Vilcek */ public class DataPair { private final double[] input; private final double[] output; private boolean[] outputBoolean; /** * Creates a {@link DataPair} instance. * @param inputPattern the input pattern * @param trueOutput the true/expected output pattern for the input */ public DataPair(final double[] inputPattern, final double[] trueOutput) { if (inputPattern == null) { throw new ArgumentNullException("inputPattern"); } if (trueOutput == null) { throw new ArgumentNullException("trueOutput"); } this.input = Arrays.copyOf(inputPattern, inputPattern.length); this.output = Arrays.copyOf(trueOutput, trueOutput.length); } /** * Gets the input pattern. * @return the input pattern */ public double[] getInput() { return input; } /** * Gets the ideal/expected output pattern. * @return the output pattern */ public double[] getOutput() { return output; } /** * Gets the ideal/expected output pattern as boolean values. * This is useful when output represents labels bipartition. * If output values in <code>double[]</code> are not in boolean representation, * then output of this method is might not be valid. * The computation is as follows:<br></br> * - if value is equal to 1, then output is <code>true</code> in boolean<br></br> * - if value is other than 1, then output is <code>false</code> in boolean * * @return the boolean representation of the output pattern */ public boolean[] getOutputBoolean() { if (outputBoolean == null) { outputBoolean = new boolean[output.length]; for (int i = 0; i < output.length; i++) { outputBoolean[i] = (output[i] == 1) ? true : false; } } return outputBoolean; } /** * Creates a {@link DataPair} representation for each {@link Instance} contained in * {@link MultiLabelInstances} data set. The {@link DataPair} is a light weight representation * of instance values (by double values), which is useful when iteration over the data and its * values. * * @param mlDataSet the {@link MultiLabelInstances} which content has to be * converted to list of {@link DataPair} * @param bipolarOutput indicates whether output values should be converted * to bipolar values, or left intact as binary * @return the list of data pairs */ // TODO: this method should be in some kind of "data utils". public static List<DataPair> createDataPairs(MultiLabelInstances mlDataSet, boolean bipolarOutput) { Instances data = mlDataSet.getDataSet(); int[] featureIndices = mlDataSet.getFeatureIndices(); int[] labelIndices = mlDataSet.getLabelIndices(); int numFeatures = featureIndices.length; int numLabels = mlDataSet.getNumLabels(); int numInstances = data.numInstances(); List<DataPair> dataPairs = new ArrayList<DataPair>(numInstances); for (int index = 0; index < numInstances; index++) { Instance instance = data.instance(index); double[] input = new double[numFeatures]; for (int i = 0; i < numFeatures; i++) { int featureIndex = featureIndices[i]; Attribute featureAttr = instance.attribute(featureIndex); // if attribute is binary, parse the string value ... it is expected to be '0' or '1' if (featureAttr.isNominal() && featureAttr.numValues() == 2) { input[i] = Double.parseDouble(instance.stringValue(featureIndex)); } // else : // a) the attribute is nominal with multiple values, use indexes as nominal values // do not have to be numbers in general ... this is fall-back ... should be rare case // b) is numeric attribute else { input[i] = instance.value(featureIndex); } } if (mlDataSet.hasMissingLabels(instance)) continue; double[] output = new double[numLabels]; for (int i = 0; i < numLabels; i++) { output[i] = Double.parseDouble(data.attribute(labelIndices[i]).value((int) instance.value(labelIndices[i]))); if (bipolarOutput && output[i] == 0) { output[i] = -1; } } dataPairs.add(new DataPair(input, output)); } return dataPairs; } }