/**
 * This class computes the Earth Mover's Distance, using the EMD-HAT algorithm
 * created by Ofir Pele and Michael Werman.
 * <p>
 * This implementation is closely based on the C++ code by the same authors,
 * which can be found here:
 * http://www.cs.huji.ac.il/~ofirpele/FastEMD/code/
 * <p>
 * Some of the author's comments on the original were kept or edited for
 * this context.
 */
package com.formulasearchengine.mathosphere.mathpd.distances.earthmover;

import java.util.*;

/**
 * @author Telmo Menezes (telmo@telmomenezes.com)
 * @author Ofir Pele
 */
public class JFastEMD {
    /**
     * The interface of this method is similar to Rubner's interface. See:
     * http://www.cs.duke.edu/~tomasi/software/emd.htm
     *
     * To get the same results as Rubner's code you should set extra_mass_penalty to 0,
     * and divide by the minimum of the sums of the two signatures' weights. However, I
     * suggest not to do this, as you lose the metric property and, more importantly, in
     * my experience the performance is better with emd_hat. For more on the difference
     * between emd and emd_hat, see the paper:
     * A Linear Time Histogram Metric for Improved SIFT Matching
     * Ofir Pele, Michael Werman
     * ECCV 2008
     *
     * To get a shorter running time, set the ground distance function to
     * be a thresholded distance. For example: min(L2, T), where T is some threshold.
     * Note that the running time is shorter with smaller T values. Note also that
     * thresholding the distance will probably increase accuracy. Finally, a thresholded
     * metric is also a metric. See paper:
     * Fast and Robust Earth Mover's Distances
     * Ofir Pele, Michael Werman
     * ICCV 2009
     *
     * If you use this code, please cite the papers.
     */
    static public double distance(Signature signature1, Signature signature2,
                                  double extraMassPenalty) {
        Vector<Double> P = new Vector<Double>();
        Vector<Double> Q = new Vector<Double>();
        for (int i = 0; i < signature1.getNumberOfFeatures()
                + signature2.getNumberOfFeatures(); i++) {
            P.add(0.0);
            Q.add(0.0);
        }
        for (int i = 0; i < signature1.getNumberOfFeatures(); i++) {
            P.set(i, signature1.getWeights()[i]);
        }
        for (int j = 0; j < signature2.getNumberOfFeatures(); j++) {
            Q.set(j + signature1.getNumberOfFeatures(), signature2.getWeights()[j]);
        }

        Vector<Vector<Double>> C = new Vector<Vector<Double>>();
        for (int i = 0; i < P.size(); i++) {
            Vector<Double> vec = new Vector<Double>();
            for (int j = 0; j < P.size(); j++) {
                vec.add(0.0);
            }
            C.add(vec);
        }
        for (int i = 0; i < signature1.getNumberOfFeatures(); i++) {
            for (int j = 0; j < signature2.getNumberOfFeatures(); j++) {
                double dist = signature1.getFeatures()[i]
                        .groundDist(signature2.getFeatures()[j]);
                assert (dist >= 0);
                C.get(i).set(j + signature1.getNumberOfFeatures(), dist);
                C.get(j + signature1.getNumberOfFeatures()).set(i, dist);
            }
        }

        return emdHat(P, Q, C, extraMassPenalty);
    }

    static private long emdHatImplLongLongInt(Vector<Long> Pc, Vector<Long> Qc,
                                              Vector<Vector<Long>> C,
                                              long extraMassPenalty) {
        int N = Pc.size();
        assert (Qc.size() == N);

        // Ensuring that the supplier (P) has more mass.
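        // Added background (not in the original comments): emd_hat is solved
        // below as a min-cost-flow problem over 2N + 2 vertices: the N bins
        // of P (sources), the N bins of Q (sinks), a threshold node, and an
        // artificial node that guarantees feasibility. The vector b holds
        // each vertex's supply (positive) or demand (negative).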
        // Note that we assume here that C is symmetric.
        Vector<Long> P;
        Vector<Long> Q;
        long absDiffSumPSumQ;
        long sumP = 0;
        long sumQ = 0;
        for (int i = 0; i < N; i++)
            sumP += Pc.get(i);
        for (int i = 0; i < N; i++)
            sumQ += Qc.get(i);
        if (sumQ > sumP) {
            P = Qc;
            Q = Pc;
            absDiffSumPSumQ = sumQ - sumP;
        } else {
            P = Pc;
            Q = Qc;
            absDiffSumPSumQ = sumP - sumQ;
        }

        // Creating the b vector that contains all vertices.
        Vector<Long> b = new Vector<Long>();
        for (int i = 0; i < 2 * N + 2; i++) {
            b.add(0L);
        }
        int THRESHOLD_NODE = 2 * N;
        int ARTIFICIAL_NODE = 2 * N + 1; // needs to be last!
        for (int i = 0; i < N; i++) {
            b.set(i, P.get(i));
        }
        for (int i = N; i < 2 * N; i++) {
            b.set(i, Q.get(i - N));
        }

        // remark*) I put here a deficit of the extra mass, as mass that flows
        // to the threshold node can be absorbed from all sources with cost
        // zero (this is in reverse order from the paper, where incoming edges
        // to the threshold node had the cost of the threshold and outgoing
        // edges had the cost of zero).
        // This also makes the sum of b zero.
        b.set(THRESHOLD_NODE, -absDiffSumPSumQ);
        b.set(ARTIFICIAL_NODE, 0L);

        long maxC = 0;
        for (int i = 0; i < N; i++) {
            for (int j = 0; j < N; j++) {
                assert (C.get(i).get(j) >= 0);
                if (C.get(i).get(j) > maxC)
                    maxC = C.get(i).get(j);
            }
        }
        if (extraMassPenalty == -1)
            extraMassPenalty = maxC;

        Set<Integer> sourcesThatFlowNotOnlyToThresh = new HashSet<Integer>();
        Set<Integer> sinksThatGetFlowNotOnlyFromThresh = new HashSet<Integer>();
        long preFlowCost = 0;

        // Regular edges between sinks and sources, without threshold edges.
        Vector<List<Edge>> c = new Vector<List<Edge>>();
        for (int i = 0; i < b.size(); i++) {
            c.add(new LinkedList<Edge>());
        }
        for (int i = 0; i < N; i++) {
            if (b.get(i) == 0)
                continue;
            for (int j = 0; j < N; j++) {
                if (b.get(j + N) == 0)
                    continue;
                if (C.get(i).get(j) == maxC)
                    continue;
                c.get(i).add(new Edge(j + N, C.get(i).get(j)));
            }
        }

        // Checking which vertices are not isolated.
        for (int i = 0; i < N; i++) {
            if (b.get(i) == 0)
                continue;
            for (int j = 0; j < N; j++) {
                if (b.get(j + N) == 0)
                    continue;
                if (C.get(i).get(j) == maxC)
                    continue;
                sourcesThatFlowNotOnlyToThresh.add(i);
                sinksThatGetFlowNotOnlyFromThresh.add(j + N);
            }
        }

        // Converting all sinks to negative.
        for (int i = N; i < 2 * N; i++) {
            b.set(i, -b.get(i));
        }

        // Add edges from/to the threshold node. Note that the costs are
        // reversed relative to the paper (see also remark* above); it is
        // important that it stays this way because of remark* above.
        for (int i = 0; i < N; ++i) {
            c.get(i).add(new Edge(THRESHOLD_NODE, 0));
        }
        for (int j = 0; j < N; ++j) {
            c.get(THRESHOLD_NODE).add(new Edge(j + N, maxC));
        }

        // Artificial arcs - note the restriction that only one edge i,j is
        // artificial, so I ignore it...
        for (int i = 0; i < ARTIFICIAL_NODE; i++) {
            c.get(i).add(new Edge(ARTIFICIAL_NODE, maxC + 1));
            c.get(ARTIFICIAL_NODE).add(new Edge(i, maxC + 1));
        }

        // Remove nodes with a supply/demand of 0, and vertices that are
        // connected only to the threshold vertex.
        int currentNodeName = 0;
        // Note: the new names must be signed ints (not unsigned) because -1
        // is used as a special flag!
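        // Added background (not in the original comments): vertices with zero
        // supply/demand are dropped, and vertices connected only to the
        // threshold node are "pre-flowed": their mass is folded into
        // b[THRESHOLD_NODE] (and, for sinks, into preFlowCost). The remaining
        // vertices are renumbered consecutively, so the min-cost-flow solver
        // works on a smaller problem.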
        int REMOVE_NODE_FLAG = -1;
        Vector<Integer> nodesNewNames = new Vector<Integer>();
        Vector<Integer> nodesOldNames = new Vector<Integer>();
        for (int i = 0; i < b.size(); i++) {
            nodesNewNames.add(REMOVE_NODE_FLAG);
        }
        for (int i = 0; i < N * 2; i++) {
            if (b.get(i) != 0) {
                if (sourcesThatFlowNotOnlyToThresh.contains(i)
                        || sinksThatGetFlowNotOnlyFromThresh.contains(i)) {
                    nodesNewNames.set(i, currentNodeName);
                    nodesOldNames.add(i);
                    currentNodeName++;
                } else {
                    if (i >= N) {
                        preFlowCost -= (b.get(i) * maxC);
                    }
                    // Add mass (i < N) or deficit (i >= N).
                    b.set(THRESHOLD_NODE, b.get(THRESHOLD_NODE) + b.get(i));
                }
            }
        }
        nodesNewNames.set(THRESHOLD_NODE, currentNodeName);
        nodesOldNames.add(THRESHOLD_NODE);
        currentNodeName++;
        nodesNewNames.set(ARTIFICIAL_NODE, currentNodeName);
        nodesOldNames.add(ARTIFICIAL_NODE);
        currentNodeName++;

        Vector<Long> bb = new Vector<Long>();
        for (int i = 0; i < currentNodeName; i++) {
            bb.add(0L);
        }
        int j = 0;
        for (int i = 0; i < b.size(); i++) {
            if (nodesNewNames.get(i) != REMOVE_NODE_FLAG) {
                bb.set(j, b.get(i));
                j++;
            }
        }

        Vector<List<Edge>> cc = new Vector<List<Edge>>();
        for (int i = 0; i < bb.size(); i++) {
            cc.add(new LinkedList<Edge>());
        }
        for (int i = 0; i < c.size(); i++) {
            if (nodesNewNames.get(i) == REMOVE_NODE_FLAG)
                continue;
            for (Edge it : c.get(i)) {
                if (nodesNewNames.get(it._to) != REMOVE_NODE_FLAG) {
                    cc.get(nodesNewNames.get(i)).add(
                            new Edge(nodesNewNames.get(it._to), it._cost));
                }
            }
        }

        MinCostFlow mcf = new MinCostFlow();
        long myDist;

        Vector<List<Edge0>> flows = new Vector<List<Edge0>>(bb.size());
        for (int i = 0; i < bb.size(); i++) {
            flows.add(new LinkedList<Edge0>());
        }

        long mcfDist = mcf.compute(bb, cc, flows);

        myDist = preFlowCost // pre-flowing on cases where it was possible
                + mcfDist // solution of the transportation problem
                + (absDiffSumPSumQ * extraMassPenalty); // emd-hat extra mass penalty

        return myDist;
    }

    static private double emdHat(Vector<Double> P, Vector<Double> Q,
                                 Vector<Vector<Double>> C, double extraMassPenalty) {

        // This condition should hold:
        // 2^(sizeof(CONVERT_TO_T) * 8) >= MULT_FACTOR^2
        // (CONVERT_TO_T is long in this Java port.) Note that checking it in
        // code can itself overflow; the original author simply checked it
        // with Linux calc, which has arbitrary precision.
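        // Added check (not in the original comments): with 64-bit signed long
        // and MULT_FACTOR = 10^6, MULT_FACTOR^2 = 10^12, while 2^63 - 1 is
        // about 9.22 * 10^18, so the condition holds with a wide margin.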
        double MULT_FACTOR = 1000000;

        // Constructing the input.
        int N = P.size();
        Vector<Long> iP = new Vector<Long>();
        Vector<Long> iQ = new Vector<Long>();
        Vector<Vector<Long>> iC = new Vector<Vector<Long>>();
        for (int i = 0; i < N; i++) {
            iP.add(0L);
            iQ.add(0L);
            Vector<Long> vec = new Vector<Long>();
            for (int j = 0; j < N; j++) {
                vec.add(0L);
            }
            iC.add(vec);
        }

        // Converting to CONVERT_TO_T (long in this port): scaling by the
        // normalization factors and rounding to the nearest integer.
        double sumP = 0.0;
        double sumQ = 0.0;
        double maxC = C.get(0).get(0);
        for (int i = 0; i < N; i++) {
            sumP += P.get(i);
            sumQ += Q.get(i);
            for (int j = 0; j < N; j++) {
                if (C.get(i).get(j) > maxC)
                    maxC = C.get(i).get(j);
            }
        }
        double minSum = Math.min(sumP, sumQ);
        double maxSum = Math.max(sumP, sumQ);
        double PQnormFactor = MULT_FACTOR / maxSum;
        double CnormFactor = MULT_FACTOR / maxC;
        for (int i = 0; i < N; i++) {
            iP.set(i, (long) (Math.floor(P.get(i) * PQnormFactor + 0.5)));
            iQ.set(i, (long) (Math.floor(Q.get(i) * PQnormFactor + 0.5)));
            for (int j = 0; j < N; j++) {
                iC.get(i).set(j,
                        (long) (Math.floor(C.get(i).get(j) * CnormFactor + 0.5)));
            }
        }

        // Computing the distance without the extra mass penalty.
        double dist = emdHatImplLongLongInt(iP, iQ, iC, 0);

        // Unnormalize.
        dist = dist / PQnormFactor;
        dist = dist / CnormFactor;

        // Adding the extra mass penalty.
        if (extraMassPenalty == -1)
            extraMassPenalty = maxC;
        dist += (maxSum - minSum) * extraMassPenalty;

        return dist;
    }
}
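/*
 * Illustrative sketch only, not part of Pele and Werman's code: a ground
 * distance of the thresholded form min(L2, T) recommended in the notes above
 * distance(). It assumes Feature is an interface in this package declaring
 * double groundDist(Feature f), consistent with how it is called in
 * distance(); the class and field names here are hypothetical.
 */
class ThresholdedEuclideanFeature implements Feature {
    private final double x;
    private final double y;
    private final double threshold; // T: smaller values give shorter running times

    ThresholdedEuclideanFeature(double x, double y, double threshold) {
        this.x = x;
        this.y = y;
        this.threshold = threshold;
    }

    @Override
    public double groundDist(Feature f) {
        ThresholdedEuclideanFeature other = (ThresholdedEuclideanFeature) f;
        double dx = x - other.x;
        double dy = y - other.y;
        // min(L2, T): a thresholded metric is still a metric (Pele and
        // Werman, ICCV 2009), and thresholding speeds up the computation.
        return Math.min(Math.sqrt(dx * dx + dy * dy), threshold);
    }
}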