/**
 * This class computes the Earth Mover's Distance, using the EMD-HAT algorithm
 * created by Ofir Pele and Michael Werman.
 * <p>
 * This implementation is closely based on the C++ code by the same authors,
 * which can be found here:
 * http://www.cs.huji.ac.il/~ofirpele/FastEMD/code/
 * <p>
 * Some of the author's comments on the original were kept or edited for
 * this context.
 */
package com.formulasearchengine.mathosphere.mathpd.distances.earthmover;

import java.util.*;

/**
 * @author Telmo Menezes (telmo@telmomenezes.com)
 * @author Ofir Pele
 */
public class JFastEMD {
    /**
     * The interface of this method is similar to Rubner's interface. See:
     * http://www.cs.duke.edu/~tomasi/software/emd.htm
     *
     * To get the same results as Rubner's code you should set extra_mass_penalty to 0,
     * and divide by the minimum of the sums of the two signatures' weights. However, I
     * suggest not to do this, as you lose the metric property and, more importantly, in
     * my experience the performance is better with emd_hat. For more on the difference
     * between emd and emd_hat, see the paper:
     * A Linear Time Histogram Metric for Improved SIFT Matching
     * Ofir Pele, Michael Werman
     * ECCV 2008
     *
     * To get a shorter running time, set the ground distance function to
     * be a thresholded distance. For example: min(L2, T), where T is some threshold.
     * Note that the running time is shorter with smaller T values. Note also that
     * thresholding the distance will probably increase accuracy. Finally, a thresholded
     * metric is also a metric. See paper:
     * Fast and Robust Earth Mover's Distances
     * Ofir Pele, Michael Werman
     * ICCV 2009
     *
     * If you use this code, please cite the papers.
     */
    static public double distance(Signature signature1, Signature signature2,
                                  double extraMassPenalty) {
        Vector<Double> P = new Vector<Double>();
        Vector<Double> Q = new Vector<Double>();
        for (int i = 0; i < signature1.getNumberOfFeatures()
                + signature2.getNumberOfFeatures(); i++) {
            P.add(0.0);
            Q.add(0.0);
        }
        for (int i = 0; i < signature1.getNumberOfFeatures(); i++) {
            P.set(i, signature1.getWeights()[i]);
        }
        for (int j = 0; j < signature2.getNumberOfFeatures(); j++) {
            Q.set(j + signature1.getNumberOfFeatures(), signature2.getWeights()[j]);
        }

        Vector<Vector<Double>> C = new Vector<Vector<Double>>();
        for (int i = 0; i < P.size(); i++) {
            Vector<Double> vec = new Vector<Double>();
            for (int j = 0; j < P.size(); j++) {
                vec.add(0.0);
            }
            C.add(vec);
        }
        for (int i = 0; i < signature1.getNumberOfFeatures(); i++) {
            for (int j = 0; j < signature2.getNumberOfFeatures(); j++) {
                double dist = signature1.getFeatures()[i]
                        .groundDist(signature2.getFeatures()[j]);
                assert (dist >= 0);
                C.get(i).set(j + signature1.getNumberOfFeatures(), dist);
                C.get(j + signature1.getNumberOfFeatures()).set(i, dist);
            }
        }

        return emdHat(P, Q, C, extraMassPenalty);
    }

    static private long emdHatImplLongLongInt(Vector<Long> Pc, Vector<Long> Qc,
                                              Vector<Vector<Long>> C,
                                              long extraMassPenalty) {
        int N = Pc.size();
        assert (Qc.size() == N);

        // Ensuring that the supplier (P) has more mass.
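        // Added background (not in the original comments): emd_hat is solved
        // below as a min-cost-flow problem over 2N + 2 vertices: the N bins
        // of P (sources), the N bins of Q (sinks), a threshold node, and an
        // artificial node that guarantees feasibility. The vector b holds
        // each vertex's supply (positive) or demand (negative).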
        // Note that we assume here that C is symmetric.
        Vector<Long> P;
        Vector<Long> Q;
        long absDiffSumPSumQ;
        long sumP = 0;
        long sumQ = 0;
        for (int i = 0; i < N; i++)
            sumP += Pc.get(i);
        for (int i = 0; i < N; i++)
            sumQ += Qc.get(i);
        if (sumQ > sumP) {
            P = Qc;
            Q = Pc;
            absDiffSumPSumQ = sumQ - sumP;
        } else {
            P = Pc;
            Q = Qc;
            absDiffSumPSumQ = sumP - sumQ;
        }

        // Creating the b vector that contains all vertices.
        Vector<Long> b = new Vector<Long>();
        for (int i = 0; i < 2 * N + 2; i++) {
            b.add(0L);
        }
        int THRESHOLD_NODE = 2 * N;
        int ARTIFICIAL_NODE = 2 * N + 1; // needs to be last!
        for (int i = 0; i < N; i++) {
            b.set(i, P.get(i));
        }
        for (int i = N; i < 2 * N; i++) {
            b.set(i, Q.get(i - N));
        }

        // remark*) I put here a deficit of the extra mass, as mass that flows
        // to the threshold node can be absorbed from all sources with cost
        // zero (this is in reverse order from the paper, where incoming edges
        // to the threshold node had the cost of the threshold and outgoing
        // edges had the cost of zero).
        // This also makes the sum of b zero.
        b.set(THRESHOLD_NODE, -absDiffSumPSumQ);
        b.set(ARTIFICIAL_NODE, 0L);

        long maxC = 0;
        for (int i = 0; i < N; i++) {
            for (int j = 0; j < N; j++) {
                assert (C.get(i).get(j) >= 0);
                if (C.get(i).get(j) > maxC)
                    maxC = C.get(i).get(j);
            }
        }
        if (extraMassPenalty == -1)
            extraMassPenalty = maxC;

        Set<Integer> sourcesThatFlowNotOnlyToThresh = new HashSet<Integer>();
        Set<Integer> sinksThatGetFlowNotOnlyFromThresh = new HashSet<Integer>();
        long preFlowCost = 0;

        // Regular edges between sinks and sources, without threshold edges.
        Vector<List<Edge>> c = new Vector<List<Edge>>();
        for (int i = 0; i < b.size(); i++) {
            c.add(new LinkedList<Edge>());
        }
        for (int i = 0; i < N; i++) {
            if (b.get(i) == 0)
                continue;
            for (int j = 0; j < N; j++) {
                if (b.get(j + N) == 0)
                    continue;
                if (C.get(i).get(j) == maxC)
                    continue;
                c.get(i).add(new Edge(j + N, C.get(i).get(j)));
            }
        }

        // Checking which vertices are not isolated.
        for (int i = 0; i < N; i++) {
            if (b.get(i) == 0)
                continue;
            for (int j = 0; j < N; j++) {
                if (b.get(j + N) == 0)
                    continue;
                if (C.get(i).get(j) == maxC)
                    continue;
                sourcesThatFlowNotOnlyToThresh.add(i);
                sinksThatGetFlowNotOnlyFromThresh.add(j + N);
            }
        }

        // Converting all sinks to negative.
        for (int i = N; i < 2 * N; i++) {
            b.set(i, -b.get(i));
        }

        // Add edges from/to the threshold node. Note that the costs are
        // reversed relative to the paper (see also remark* above); it is
        // important that it stays this way because of remark* above.
        for (int i = 0; i < N; ++i) {
            c.get(i).add(new Edge(THRESHOLD_NODE, 0));
        }
        for (int j = 0; j < N; ++j) {
            c.get(THRESHOLD_NODE).add(new Edge(j + N, maxC));
        }

        // Artificial arcs - note the restriction that only one edge i,j is
        // artificial, so I ignore it...
        for (int i = 0; i < ARTIFICIAL_NODE; i++) {
            c.get(i).add(new Edge(ARTIFICIAL_NODE, maxC + 1));
            c.get(ARTIFICIAL_NODE).add(new Edge(i, maxC + 1));
        }

        // Remove nodes with a supply/demand of 0, and vertices that are
        // connected only to the threshold vertex.
        int currentNodeName = 0;
        // Note: the new names must be signed ints (not unsigned) because -1
        // is used as a special flag!
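        // Added background (not in the original comments): vertices with zero
        // supply/demand are dropped, and vertices connected only to the
        // threshold node are "pre-flowed": their mass is folded into
        // b[THRESHOLD_NODE] (and, for sinks, into preFlowCost). The remaining
        // vertices are renumbered consecutively, so the min-cost-flow solver
        // works on a smaller problem.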
        int REMOVE_NODE_FLAG = -1;
        Vector<Integer> nodesNewNames = new Vector<Integer>();
        Vector<Integer> nodesOldNames = new Vector<Integer>();
        for (int i = 0; i < b.size(); i++) {
            nodesNewNames.add(REMOVE_NODE_FLAG);
        }
        for (int i = 0; i < N * 2; i++) {
            if (b.get(i) != 0) {
                if (sourcesThatFlowNotOnlyToThresh.contains(i)
                        || sinksThatGetFlowNotOnlyFromThresh.contains(i)) {
                    nodesNewNames.set(i, currentNodeName);
                    nodesOldNames.add(i);
                    currentNodeName++;
                } else {
                    if (i >= N) {
                        preFlowCost -= (b.get(i) * maxC);
                    }
                    // Add mass (i < N) or deficit (i >= N).
                    b.set(THRESHOLD_NODE, b.get(THRESHOLD_NODE) + b.get(i));
                }
            }
        }
        nodesNewNames.set(THRESHOLD_NODE, currentNodeName);
        nodesOldNames.add(THRESHOLD_NODE);
        currentNodeName++;
        nodesNewNames.set(ARTIFICIAL_NODE, currentNodeName);
        nodesOldNames.add(ARTIFICIAL_NODE);
        currentNodeName++;

        Vector<Long> bb = new Vector<Long>();
        for (int i = 0; i < currentNodeName; i++) {
            bb.add(0L);
        }
        int j = 0;
        for (int i = 0; i < b.size(); i++) {
            if (nodesNewNames.get(i) != REMOVE_NODE_FLAG) {
                bb.set(j, b.get(i));
                j++;
            }
        }

        Vector<List<Edge>> cc = new Vector<List<Edge>>();
        for (int i = 0; i < bb.size(); i++) {
            cc.add(new LinkedList<Edge>());
        }
        for (int i = 0; i < c.size(); i++) {
            if (nodesNewNames.get(i) == REMOVE_NODE_FLAG)
                continue;
            for (Edge it : c.get(i)) {
                if (nodesNewNames.get(it._to) != REMOVE_NODE_FLAG) {
                    cc.get(nodesNewNames.get(i)).add(
                            new Edge(nodesNewNames.get(it._to), it._cost));
                }
            }
        }

        MinCostFlow mcf = new MinCostFlow();
        long myDist;

        Vector<List<Edge0>> flows = new Vector<List<Edge0>>(bb.size());
        for (int i = 0; i < bb.size(); i++) {
            flows.add(new LinkedList<Edge0>());
        }

        long mcfDist = mcf.compute(bb, cc, flows);

        myDist = preFlowCost // pre-flowing on cases where it was possible
                + mcfDist // solution of the transportation problem
                + (absDiffSumPSumQ * extraMassPenalty); // emd-hat extra mass penalty

        return myDist;
    }

    static private double emdHat(Vector<Double> P, Vector<Double> Q,
                                 Vector<Vector<Double>> C, double extraMassPenalty) {

        // This condition should hold:
        // 2^(sizeof(CONVERT_TO_T) * 8) >= MULT_FACTOR^2
        // (CONVERT_TO_T is long in this Java port.) Note that checking it in
        // code can itself overflow; the original author simply checked it
        // with Linux calc, which has arbitrary precision.
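        // Added check (not in the original comments): with 64-bit signed long
        // and MULT_FACTOR = 10^6, MULT_FACTOR^2 = 10^12, while 2^63 - 1 is
        // about 9.22 * 10^18, so the condition holds with a wide margin.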
        double MULT_FACTOR = 1000000;

        // Constructing the input.
        int N = P.size();
        Vector<Long> iP = new Vector<Long>();
        Vector<Long> iQ = new Vector<Long>();
        Vector<Vector<Long>> iC = new Vector<Vector<Long>>();
        for (int i = 0; i < N; i++) {
            iP.add(0L);
            iQ.add(0L);
            Vector<Long> vec = new Vector<Long>();
            for (int j = 0; j < N; j++) {
                vec.add(0L);
            }
            iC.add(vec);
        }

        // Converting to CONVERT_TO_T (long in this port): scaling by the
        // normalization factors and rounding to the nearest integer.
        double sumP = 0.0;
        double sumQ = 0.0;
        double maxC = C.get(0).get(0);
        for (int i = 0; i < N; i++) {
            sumP += P.get(i);
            sumQ += Q.get(i);
            for (int j = 0; j < N; j++) {
                if (C.get(i).get(j) > maxC)
                    maxC = C.get(i).get(j);
            }
        }
        double minSum = Math.min(sumP, sumQ);
        double maxSum = Math.max(sumP, sumQ);
        double PQnormFactor = MULT_FACTOR / maxSum;
        double CnormFactor = MULT_FACTOR / maxC;
        for (int i = 0; i < N; i++) {
            iP.set(i, (long) (Math.floor(P.get(i) * PQnormFactor + 0.5)));
            iQ.set(i, (long) (Math.floor(Q.get(i) * PQnormFactor + 0.5)));
            for (int j = 0; j < N; j++) {
                iC.get(i).set(j,
                        (long) (Math.floor(C.get(i).get(j) * CnormFactor + 0.5)));
            }
        }

        // Computing the distance without the extra mass penalty.
        double dist = emdHatImplLongLongInt(iP, iQ, iC, 0);

        // Unnormalize.
        dist = dist / PQnormFactor;
        dist = dist / CnormFactor;

        // Adding the extra mass penalty.
        if (extraMassPenalty == -1)
            extraMassPenalty = maxC;
        dist += (maxSum - minSum) * extraMassPenalty;

        return dist;
    }
}
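/*
 * Illustrative sketch only, not part of Pele and Werman's code: a ground
 * distance of the thresholded form min(L2, T) recommended in the notes above
 * distance(). It assumes Feature is an interface in this package declaring
 * double groundDist(Feature f), consistent with how it is called in
 * distance(); the class and field names here are hypothetical.
 */
class ThresholdedEuclideanFeature implements Feature {
    private final double x;
    private final double y;
    private final double threshold; // T: smaller values give shorter running times

    ThresholdedEuclideanFeature(double x, double y, double threshold) {
        this.x = x;
        this.y = y;
        this.threshold = threshold;
    }

    @Override
    public double groundDist(Feature f) {
        ThresholdedEuclideanFeature other = (ThresholdedEuclideanFeature) f;
        double dx = x - other.x;
        double dy = y - other.y;
        // min(L2, T): a thresholded metric is still a metric (Pele and
        // Werman, ICCV 2009), and thresholding speeds up the computation.
        return Math.min(Math.sqrt(dx * dx + dy * dy), threshold);
    }
}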