/* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ /* * EuclideanDistance.java * Copyright (C) 2002 University of Waikato * */ package weka.core; import java.io.Serializable; import java.io.*; /** * Implementing Euclidean distance (or similarity) function. * * One object defines not one distance but the data model in which * the distances between objects of that data model can be computed. * * Attention: For efficiency reasons the use of consistency checks (like are * the data models of the two instances exactly the same), is low. * * @author Gabi Schmidberger (gabi@cs.waikato.ac.nz) * @version $Revision: 1.1.1.1 $ */ public class EuclideanDistance extends DistanceFunction implements Cloneable, Serializable { /** True if normalization should be done */ protected boolean m_Normalize = true; /** The number of attributes the contribute to a prediction */ protected double m_NumAttributesUsed; /** * Constructs an Euclidean Distance object. * @param data the instances the distance function should work on */ public EuclideanDistance() { } /** * Constructs an Euclidean Distance object. * @param data the instances the distance function should work on */ public EuclideanDistance(Instances data) { super(data); setNumAttributesUsed(); } /** * Constructs an Euclidean Distance object. * @param data the instances the distance function should work on * @param normalize if true normalization is done */ public EuclideanDistance(Instances data, boolean normalize) { super(data); m_Normalize = normalize; setNumAttributesUsed(); } /** * Constructs an Euclidean Distance object. Ranges are already given. * @param data the instances the distance function should work on * @param ranges the min and max values of the attribute values */ public EuclideanDistance(Instances data, double [][] ranges) { super(data, ranges); setNumAttributesUsed(); } /** * Constructs an Euclidean Distance object. Ranges are already given. * @param data the instances the distance function should work on * @param ranges the min and max values of the attribute values * @param normalize if true normalization is done */ public EuclideanDistance(Instances data, double [][] ranges, boolean normalize) { super(data, ranges); m_Normalize = normalize; setNumAttributesUsed(); } /** * Computes and sets the number of attributes used. */ public void setNumAttributesUsed() { m_NumAttributesUsed = 0.0; for (int i = 0; i < m_Model.numAttributes(); i++) { if ((i != m_Model.classIndex()) && (m_Model.attribute(i).isNominal() || m_Model.attribute(i).isNumeric())) { m_NumAttributesUsed += 1.0; } } } /** * Calculates the distance (or similarity) between two instances. * @param first the first instance * @param second the second instance * @return the distance between the two given instances, */ public double distance(Instance first, Instance second) throws Exception { //if (!Instances.inRanges(first,m_Ranges)) // OOPS("Not in ranges"); //OOPS(" dist first "+ first); //if (!Instances.inRanges(second,m_Ranges)) // OOPS("Not in ranges"); //OOPS(" dist second "+ second); double distance = 0; int firstI, secondI; for (int p1 = 0, p2 = 0; p1 < first.numValues() || p2 < second.numValues();) { if (p1 >= first.numValues()) { // model !! todo numinstances might change firstI = m_Model.numAttributes(); } else { // only in case instance is sparseInstance, firstI is different to p1 firstI = first.index(p1); } if (p2 >= second.numValues()) { secondI = m_Model.numAttributes(); } else { secondI = second.index(p2); } // ignore class values if (firstI == m_Model.classIndex()) { p1++; continue; } if (secondI == m_Model.classIndex()) { p2++; continue; } double diff; if (firstI == secondI) { diff = difference(firstI, first.valueSparse(p1), second.valueSparse(p2)); p1++; p2++; } else if (firstI > secondI) { diff = difference(secondI, 0, second.valueSparse(p2)); p2++; } else { diff = difference(firstI, first.valueSparse(p1), 0); p1++; } distance += diff * diff; } distance = Math.sqrt(distance / m_NumAttributesUsed); return distance; } /** * Computes the difference between two given attribute values. * @param index the index of the current attribute * @param val1 the first attribute value * @param val2 the second attribute value * @return the distance between the two given attribute values */ private double difference(int index, double val1, double val2) throws Exception { // If attribute is numeric if (Instance.isMissingValue(val1) || Instance.isMissingValue(val2)) { throw new Exception("Missing value not allowed."); } else { return norm(val1, index) - norm(val2, index); } } /** * Returns value in the middle of the two parameter values. * @param range the ranges to this dimension * @return the middle value */ public double getMiddle(double[] ranges) { double middle = ranges[R_MIN] + ranges[R_WIDTH] * 0.5; return middle; } /** * Checks the instances. * Dataset should only contain nominal or stringumeric attributes. */ public void checkInstances() throws Exception { for (int i = 0; i < m_Model.numAttributes(); i++) { if (m_Model.classIndex() != i) { if (!m_Model.attribute(i).isNumeric()) throw new Exception("Euclidean Distance only allows numeric attributes."); } } } /** * Returns true if the value of the given dimension is smaller or equal the * value to be compared with. * @param instance the instance where the value should be taken of * @param dim the dimension of the value * @param the value to compare with * @return true is value of instance is smaller or equal value */ public boolean valueIsSmallerEqual(Instance instance, int dim, double value) { return instance.value(dim) <= value; } /** * Normalises a given value of a numeric attribute. * @param ranges the min max values of the attributes * @param x the value to be normalized * @param i the attribute's index */ private double norm(double x, int i) { if (!m_Normalize) { return x; } else if (Double.isNaN(m_Ranges[i][R_MIN]) || Utils.eq(m_Ranges[i][R_MAX], m_Ranges[i][R_MIN])) { return 0; } else { return (x - m_Ranges[i][R_MIN]) / (m_Ranges[i][R_WIDTH]); } } /** * Documents the content of an EuclideanDistance object in a string. * @return the converted string */ public String toString() { StringBuffer text = new StringBuffer(); //todo text.append("\n"); return text.toString(); } /** * Used for debug println's. * @param output string that is printed */ private void OOPS(String output) { System.out.println(output); } /** * Main method for testing this class. */ public static void main(String[] args) { try { Reader r = null; if (args.length > 1) { throw (new Exception("Usage: EuclideanDistance <filename>")); } else if (args.length == 0) { r = new BufferedReader(new InputStreamReader(System.in)); } else { r = new BufferedReader(new FileReader(args[0])); } Instances i = new Instances(r); EuclideanDistance test = new EuclideanDistance(i); System.out.println("test:\n " + test); } catch (Exception e) { e.printStackTrace(); } } }