/*******************************************************************************
 *  Copyright 2007, 2009 Stephen O'Rourke (stephen.orourke@sydney.edu.au)
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *  	http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 *******************************************************************************/
package tml.utils;

import weka.core.Instance;
import weka.core.matrix.DoubleVector;

/**
 * A library for calculating the semantic distance between instances.
 * <p>
 * Every measure reads the attribute values of two {@link Instance}s and returns
 * a single {@code double}. Note that {@link #cosine} and {@link #jaccard}
 * return a <em>similarity</em> (larger means more alike), whereas
 * {@link #euclidean}, {@link #kullbackLeibler} and {@link #jensenShannon}
 * return a distance/divergence (larger means less alike).
 *
 * @author Stephen O'Rourke
 *
 */
public class DistanceLib {

    /** The measures that {@link #distance} can dispatch to. */
    public static enum DistanceMeasure {
        COSINE, EUCLIDEAN, JACCARD, JENSEN_SHANNON, KULLBACK_LEIBLER
    }

    /**
     * Computes the requested measure between two instances.
     *
     * @param distanceMeasure which measure to compute
     * @param inst1 the first instance
     * @param inst2 the second instance
     * @return the value of the selected measure, or {@link Double#NaN} if the
     *         measure is not handled by this method
     */
    public static double distance(DistanceMeasure distanceMeasure, Instance inst1, Instance inst2) {
        switch (distanceMeasure) {
            case COSINE:
                return cosine(inst1, inst2);
            case EUCLIDEAN:
                return euclidean(inst1, inst2);
            case JACCARD:
                return jaccard(inst1, inst2);
            case JENSEN_SHANNON:
                return jensenShannon(inst1, inst2);
            case KULLBACK_LEIBLER:
                return kullbackLeibler(inst1, inst2);
            default:
                return Double.NaN;
        }
    }

    /**
     * Euclidean distance: the L2 norm of the element-wise difference of the
     * two instances' attribute values.
     *
     * @param inst1 the first instance
     * @param inst2 the second instance
     * @return the L2 norm of {@code inst1 - inst2}
     */
    public static double euclidean(Instance inst1, Instance inst2) {
        DoubleVector x = new DoubleVector(inst1.toDoubleArray());
        DoubleVector y = new DoubleVector(inst2.toDoubleArray());
        return x.minus(y).norm2();
    }

    /**
     * Cosine similarity: the dot product of the two instances' attribute
     * values divided by the product of their L2 norms.
     *
     * @param inst1 the first instance
     * @param inst2 the second instance
     * @return the cosine of the angle between the two attribute vectors; the
     *         division yields {@link Double#NaN} when either vector has a
     *         zero L2 norm
     */
    public static double cosine(Instance inst1, Instance inst2) {
        DoubleVector x = new DoubleVector(inst1.toDoubleArray());
        DoubleVector y = new DoubleVector(inst2.toDoubleArray());

        // The dot product must be the SIGNED sum of the element-wise products.
        // Taking the L1 norm of the product vector would sum absolute values,
        // which overstates the result whenever two components have opposite
        // signs and can never produce a negative cosine.
        double dotXY = 0.0;
        for (int i = 0; i < x.size(); i++) {
            dotXY += x.get(i) * y.get(i);
        }

        return dotXY / (x.norm2() * y.norm2());
    }

    /**
     * Generalised (weighted) Jaccard similarity: the sum of element-wise
     * minima divided by {@code norm1(x) + norm1(y) - intersection}.
     *
     * @param inst1 the first instance
     * @param inst2 the second instance
     * @return {@code intersection / union}, or {@code 0.0} when the summed
     *         minima are not strictly positive
     */
    public static double jaccard(Instance inst1, Instance inst2) {
        DoubleVector x = new DoubleVector(inst1.toDoubleArray());
        DoubleVector y = new DoubleVector(inst2.toDoubleArray());

        double intersection = 0.0;
        for (int i = 0; i < x.size(); i++) {
            intersection += Math.min(x.get(i), y.get(i));
        }

        // No overlap: report zero similarity and avoid dividing by a
        // potentially zero union.
        if (!(intersection > 0.0)) {
            return 0.0;
        }

        double union = x.norm1() + y.norm1() - intersection;
        return intersection / union;
    }

    /**
     * Kullback-Leibler divergence of {@code inst1} from {@code inst2},
     * expressed in base 2.
     * <p>
     * Attributes for which either instance has the value zero contribute
     * nothing to the sum; they are skipped rather than treated as infinite.
     *
     * @param inst1 the first instance (the "p" values)
     * @param inst2 the second instance (the "q" values)
     * @return the sum over attributes of {@code p * log2(p / q)}
     */
    public static double kullbackLeibler(Instance inst1, Instance inst2) {
        double divergence = 0.0;
        for (int i = 0; i < inst1.numAttributes(); ++i) {
            double p = inst1.value(i);
            double q = inst2.value(i);
            if (p != 0 && q != 0) {
                divergence += p * Math.log(p / q);
            }
        }
        // Math.log is the natural logarithm; convert the total to base 2.
        divergence /= Math.log(2);
        return divergence;
    }

    /**
     * Jensen-Shannon divergence: the mean of the Kullback-Leibler divergences
     * of each instance from the attribute-wise average of the two.
     *
     * @param inst1 the first instance
     * @param inst2 the second instance
     * @return {@code (KL(inst1, avg) + KL(inst2, avg)) / 2}
     */
    public static double jensenShannon(Instance inst1, Instance inst2) {
        int numAttributes = inst1.numAttributes();

        // Build the midpoint instance attribute by attribute.
        Instance averageInst = new Instance(numAttributes);
        for (int i = 0; i < numAttributes; i++) {
            averageInst.setValue(i, (inst1.value(i) + inst2.value(i)) / 2);
        }

        return (kullbackLeibler(inst1, averageInst) + kullbackLeibler(inst2, averageInst)) / 2;
    }
}