/** * Copyright (C) 2013-2014 Olaf Lessenich * Copyright (C) 2014-2015 University of Passau, Germany * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, * MA 02110-1301 USA * * Contributors: * Olaf Lessenich <lessenic@fim.uni-passau.de> * Georg Seibt <seibt@fim.uni-passau.de> */ package de.fosd.jdime.matcher.cost_model; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; import java.util.HashSet; import java.util.LinkedHashSet; import java.util.List; import java.util.ListIterator; import java.util.Map; import java.util.NoSuchElementException; import java.util.Set; import java.util.concurrent.atomic.AtomicInteger; import java.util.function.Consumer; import java.util.function.Function; import java.util.function.Predicate; import java.util.function.Supplier; import java.util.logging.Level; import java.util.logging.Logger; import java.util.stream.Stream; import de.fosd.jdime.artifact.Artifact; import de.fosd.jdime.artifact.ArtifactList; import de.fosd.jdime.artifact.Artifacts; import de.fosd.jdime.config.merge.MergeContext; import de.fosd.jdime.matcher.MatcherInterface; import de.fosd.jdime.matcher.matching.Matching; import de.fosd.jdime.matcher.matching.Matchings; import de.fosd.jdime.util.Tuple; import org.apache.commons.math3.random.RandomGenerator; import static de.fosd.jdime.matcher.cost_model.Bounds.BY_LOWER_UPPER; import static java.lang.Integer.toHexString; import static java.lang.System.identityHashCode; import static java.util.Comparator.comparing; import static java.util.logging.Level.FINER; import static java.util.logging.Level.FINEST; import static java.util.stream.Collectors.summingDouble; import static java.util.stream.Collectors.toList; import static java.util.stream.Collectors.toSet; import static java.util.stream.Stream.concat; /** * A <code>MatcherInterface</code> implementation based on the Flexible Tree Matching algorithm. * * @param <T> the type of the artifacts being matched * @see <a href="http://theory.stanford.edu/~tim/papers/ijcai11.pdf">The Paper</a> */ public class CostModelMatcher<T extends Artifact<T>> implements MatcherInterface<T> { private static final Logger LOG = Logger.getLogger(CostModelMatcher.class.getCanonicalName()); /** * A function weighing a matching that incurred a cost. * * @param <T> the type of the artifacts */ @FunctionalInterface public interface SimpleWeightFunction<T extends Artifact<T>> { float weigh(CMMatching<T> matching); } /** * A function weighing a matching that incurred a specific cost. * * @param <T> the type of the artifacts */ @FunctionalInterface public interface WeightFunction<T extends Artifact<T>> { float weigh(CMMatching<T> matching, float quantity); } /** * The return type of {@link #objective(CMMatchings, CMParameters)} containing the value of the objective * function and the exact cost of the newly proposed set of <code>CMMatching</code>s. */ private final class ObjectiveValue { public final double objValue; public final float matchingsCost; public ObjectiveValue(double objValue, float matchingsCost) { this.objValue = objValue; this.matchingsCost = matchingsCost; } } /** * The return type of {@link #acceptanceProb(double, CMMatchings, CMParameters)} containing the probability * of the newly proposed set of <code>CMMatching</code>s being accepted for the next iteration and the * <code>ObjectiveValue</code> for the proposed matchings. */ private final class AcceptanceProbability { public final double acceptanceProbability; public final ObjectiveValue mHatObjectiveValue; public AcceptanceProbability(double acceptanceProbability, ObjectiveValue mHatObjectiveValue) { this.acceptanceProbability = acceptanceProbability; this.mHatObjectiveValue = mHatObjectiveValue; } } /** * Returns the exact cost of the given set of <code>matchings</code>. * * @param context * the <code>MergeContext</code> containing the parameters to be used * @param matchings * the matchings to calculate the cost for * @param left * the left root * @param right * the right root * @return the exact cost based on the weights in <code>context</code> */ public float cost(MergeContext context, Matchings<T> matchings, T left, T right) { if (matchings.isEmpty()) { return 0; } Set<T> leftUnmatched = new LinkedHashSet<>(Artifacts.dfs(left)); Set<T> rightUnmatched = new LinkedHashSet<>(Artifacts.dfs(right)); CMMatchings<T> cmMatchings = new CMMatchings<>(left, right); for (Matching<T> matching : matchings) { cmMatchings.add(new CMMatching<>(matching.getLeft(), matching.getRight())); leftUnmatched.remove(matching.getLeft()); rightUnmatched.remove(matching.getRight()); } for (T l : leftUnmatched) { cmMatchings.add(new CMMatching<>(l, null)); } for (T r : rightUnmatched) { cmMatchings.add(new CMMatching<>(null, r)); } return cost(cmMatchings, new CMParameters<>(context)); } /** * Returns the exact cost of the given <code>matchings</code>. This assumes that <code>matchings</code> contains * for every node in the left and right tree exactly one <code>CMMatching</code> containing the node. * The exact cost computed for every <code>CMMatching</code> can be retrieved using * ({@link CMMatching#getExactCost()} after this call. * * @param matchings * the <code>CMMatchings</code>s to evaluate * @param parameters * the <code>CMParameters</code> to use * @return the cost based on the weight functions in <code>parameters</code> */ private float cost(CMMatchings<T> matchings, CMParameters<T> parameters) { if (!matchings.sane()) { throw new IllegalArgumentException("The given list of matchings has an invalid format. A list of " + "matchings where every artifact from the left and right tree occurs in exactly one matching is " + "required. Matchings matching artifacts that do not occur in the left or right tree are not " + "allowed."); } if (matchings.isEmpty()) { return 0; } if (parameters.parallel) { matchings.parallelStream().forEach(m -> cost(m, matchings, parameters)); } else { matchings.forEach(m -> cost(m, matchings, parameters)); } float sumCost = matchings.stream().collect(summingDouble(CMMatching::getExactCost)).floatValue(); sumCost *= (1.0f / (matchings.left.getTreeSize() + matchings.right.getTreeSize())); parameters.clearExactCaches(); return sumCost; } /** * Sets the exact cost ({@link CMMatching#setExactCost(float)}) of the given <code>matching</code> based on * the given set of <code>matchings</code>. * * @param matching * the <code>CMMatching</code> to compute the cost for * @param matchings * the complete <code>CMMatching</code>s * @param parameters * the <code>CMParameters</code> to use */ private void cost(CMMatching<T> matching, CMMatchings<T> matchings, CMParameters<T> parameters) { if (matching.isNoMatch()) { matching.setExactCost(parameters.wn); return; } float cR = renamingCost(matching, parameters); float cA = ancestryViolationCost(matching, matchings, parameters); float cS = siblingGroupBreakupCost(matching, matchings, parameters); float cO = orderingCost(matching, matchings, parameters); matching.setExactCost(cR + cA + cS + cO); } /** * Returns the cost for renaming the node. The cost will be zero if the <code>Artifact</code>s match according to * {@link Artifact#matches(Artifact)}, otherwise it is determined by the set renaming weight function * in <code>parameters</code>. * * @param matching * the <code>CMMatching</code> to compute the cost for * @return the exact renaming cost of the <code>matching</code> */ private float renamingCost(CMMatching<T> matching, CMParameters<T> parameters) { if (matching.m.matches(matching.n)) { return 0; } else { return parameters.wr.weigh(matching); } } /** * Returns the exact ancestry violation cost for <code>matching</code>. * * @param matching * the matching to calculate the cost for * @param matchings * all matchings * @param parameters * the cost model parameters * @return the exact ancestry violation cost */ private float ancestryViolationCost(CMMatching<T> matching, CMMatchings<T> matchings, CMParameters<T> parameters) { int numM = numAncestryViolatingChildren(matching.m, matching.n, matchings, parameters); int numN = numAncestryViolatingChildren(matching.n, matching.m, matchings, parameters); return parameters.wa.weigh(matching, numM + numN); } /** * Returns the number of children of <code>m</code> that violate ancestry of <code>m</code> is matched with * <code>n</code>. * * @param m * the artifact to return the number of ancestry violating children for * @param n * the artifact <code>m</code> is being matched with * @param matchings * all matchings * @param parameters * the cost model parameters * @return the number of children of <code>m</code> violating ancestry */ private int numAncestryViolatingChildren(T m, T n, CMMatchings<T> matchings, CMParameters<T> parameters) { ArtifactList<T> mChildren = m.getChildren(); ArtifactList<T> nChildren = n.getChildren(); Predicate<T> filter = a -> a != null && !nChildren.contains(a); return (int) mChildren.stream().map(mChild -> image(mChild, matchings, parameters)).filter(filter).count(); } /** * Returns the exact sibling group breakup cost for <code>matching</code>. * * @param matching * the matching to calculate the cost for * @param matchings * all matchings * @param parameters * the cost model parameters * @return the exact sibling group breakup cost */ private float siblingGroupBreakupCost(CMMatching<T> matching, CMMatchings<T> matchings, CMParameters<T> parameters) { List<T> dMm, iMm; Set<T> fMm; List<T> dMn, iMn; Set<T> fMn; float mCost; float nCost; dMm = siblingDivergentSubset(matching.m, matching.n, matchings, parameters); if (dMm.isEmpty()) { mCost = 0; } else { iMm = siblingInvariantSubset(matching.m, matching.n, matchings, parameters); fMm = distinctSiblingFamilies(matching.m, matchings, parameters); mCost = (float) dMm.size() / (iMm.size() * fMm.size()); } dMn = siblingDivergentSubset(matching.n, matching.m, matchings, parameters); if (dMn.isEmpty()) { nCost = 0; } else { iMn = siblingInvariantSubset(matching.n, matching.m, matchings, parameters); fMn = distinctSiblingFamilies(matching.n, matchings, parameters); nCost = (float) dMn.size() / (iMn.size() * fMn.size()); } return parameters.ws.weigh(matching, mCost + nCost); } /** * Returns the sibling invariant subset of siblings of <code>m</code>. * * @param m * the artifact for whose siblings the sibling invariant subset is to be returned * @param n * the artifact <code>m</code> is being matched with * @param matchings * all matchings * @param parameters * the cost model parameters * @return the sibling invariant subset */ private List<T> siblingInvariantSubset(T m, T n, CMMatchings<T> matchings, CMParameters<T> parameters) { List<T> mSiblings = siblings(m, matchings, parameters); List<T> nSiblings = siblings(n, matchings, parameters); return mSiblings.stream().filter(s -> nSiblings.contains(image(s, matchings, parameters))).collect(toList()); } /** * Returns the sibling divergent subset of siblings of <code>m</code>. * * @param m * the artifact for whose siblings the sibling divergent subset is to be returned * @param n * the artifact <code>m</code> is being matched with * @param matchings * all matchings * @param parameters * the cost model parameters * @return the sibling divergent subset */ private List<T> siblingDivergentSubset(T m, T n, CMMatchings<T> matchings, CMParameters<T> parameters) { List<T> inv = siblingInvariantSubset(m, n, matchings, parameters); List<T> sibs = siblings(m, matchings, parameters); return sibs.stream().filter(sibling -> !inv.contains(sibling) && image(sibling, matchings, parameters) != null) .collect(toList()); } /** * Returns the set of distinct sibling families that siblings of <code>m</code> are matched into represented by * their parent artifact. For the root, <code>null</code> will be included in the set. * * @param m * the artifact for whose siblings the distinct sibling families are to be returned * @param matchings * all matchings * @param parameters * the cost model parameters * @return the distinct sibling family representatives */ private Set<T> distinctSiblingFamilies(T m, CMMatchings<T> matchings, CMParameters<T> parameters) { Function<T, T> image = mChild -> image(mChild, matchings, parameters); Predicate<T> notNull = t -> t != null; Function<T, T> getParent = Artifact::getParent; return siblings(m, matchings, parameters).stream().map(image).filter(notNull).map(getParent).collect(toSet()); } /** * Returns the exact ordering cost for <code>matching</code>. * * @param matching * the matching to calculate the cost for * @param matchings * all matchings * @param parameters * the cost model paramters * @return the exact ordering cost */ private float orderingCost(CMMatching<T> matching, CMMatchings<T> matchings, CMParameters<T> parameters) { Stream<T> leftSiblings = otherSiblings(matching.m, matchings, parameters).stream(); Stream<T> rightSiblings = otherSiblings(matching.n, matchings, parameters).stream(); Stream<CMMatching<T>> s = concat(leftSiblings, rightSiblings).map(a -> matching(a, matchings, parameters)) .filter(m -> !m.isNoMatch()).distinct(); if (s.anyMatch(toCheck -> violatesOrdering(toCheck, matching, matchings, parameters))) { return parameters.wo.weigh(matching); } else { return 0; } } /** * Tests whether <code>toCheck</code> violates the ordering induced by <code>matching</code>. * * @param toCheck * the matching to check * @param matching * the matching introducing an ordering * @param matchings * all matchings * @param parameters * the cost model parameters * @return true iff <code>toCheck</code> violates the ordering induced by <code>matching</code> */ private boolean violatesOrdering(CMMatching<T> toCheck, CMMatching<T> matching, CMMatchings<T> matchings, CMParameters<T> parameters) { Tuple<T, T> leftSides = lca(toCheck.m, matching.m, matchings, parameters); Tuple<T, T> rightSides = lca(toCheck.n, matching.n, matchings, parameters); List<T> leftSiblings = siblings(leftSides.x, matchings, parameters); List<T> rightSiblings = siblings(rightSides.x, matchings, parameters); if (concat(leftSiblings.stream(), rightSiblings.stream()).noneMatch(T::isOrdered)) { return false; } int leftXi = leftSiblings.indexOf(leftSides.x); int leftYi = leftSiblings.indexOf(leftSides.y); int rightXi = rightSiblings.indexOf(rightSides.x); int rightYi = rightSiblings.indexOf(rightSides.y); if (leftXi < leftYi) { return rightXi > rightYi; } else if (leftXi > leftYi) { return rightXi < rightYi; } return false; // TODO weird case, maybe true is better? } /** * Returns the path from the given <code>artifact</code> to the root node of the tree it is a part of. * * @param artifact * the <code>Artifact</code> to return the path for * @return the path represented by a list of <code>Artifact</code>s beginning with <code>artifact</code> and ending * with the root of the tree */ private List<T> pathToRoot(T artifact) { List<T> path = new ArrayList<>(); do { path.add(artifact); artifact = artifact.getParent(); } while (artifact != null); return path; } /** * Finds the lowest pair of (possibly different) ancestors of <code>a</code> and <code>b</code> that are part of the * same sibling group. * * @param a * the first <code>Artifact</code> * @param b * the second <code>Artifact</code> * @param matchings * the current <code>CMMatching</code> * @param parameters * the <code>CMParameters</code> to use * @return the ancestor of the first <code>Artifact</code> in the first position, that of the second in the second * position */ private Tuple<T, T> lca(T a, T b, CMMatchings<T> matchings, CMParameters<T> parameters) { return parameters.lcaCache.computeIfAbsent(Tuple.of(a, b), ab -> { Tuple<T, T> ba = Tuple.of(b, a); if (parameters.lcaCache.containsKey(ba)) { Tuple<T, T> baLCS = parameters.lcaCache.get(ba); return Tuple.of(baLCS.y, baLCS.x); } if (siblings(a, matchings, parameters).contains(b)) { return ab; } List<T> aPath = pathToRoot(a); List<T> bPath = pathToRoot(b); ListIterator<T> aIt = aPath.listIterator(aPath.size()); ListIterator<T> bIt = bPath.listIterator(bPath.size()); T l, r; do { l = aIt.previous(); r = bIt.previous(); } while (l == r && (aIt.hasPrevious() && bIt.hasPrevious())); return Tuple.of(l, r); }); } /** * Finds the (first) <code>CMMatching</code> in <code>matchings</code> containing the given * <code>artifact</code>. * * @param artifact * the <code>Artifact</code> for which the containing <code>CMMatching</code> is to be returned * @param matchings * the current matchings * @param parameters * the <code>CMParameters</code> to use * @return the <code>CMMatching</code> containing the <code>artifact</code> * @throws NoSuchElementException * if no <code>CMMatching</code> containing <code>artifact</code> can be found in * <code>matchings</code> */ private CMMatching<T> matching(T artifact, CMMatchings<T> matchings, CMParameters<T> parameters) { return parameters.exactContainsCache.computeIfAbsent(artifact, a -> matchings.stream().filter(m -> m.contains(a)).findFirst().orElseThrow(() -> new NoSuchElementException("No matching containing " + artifact + " found.") ) ); } /** * Finds the (first) <code>CMMatching</code> in <code>matchings</code> containing the given * <code>artifact</code> and returns the other <code>Artifact</code> in the <code>CMMatching</code>. * * @param artifact * the <code>Artifact</code> whose image is to be returned * @param matchings * the current matchings * @return the matching partner of <code>artifact</code> in the given <code>matchings</code> * @throws NoSuchElementException * if no <code>CMMatching</code> containing <code>artifact</code> can be found in * <code>matchings</code> */ private T image(T artifact, CMMatchings<T> matchings, CMParameters<T> parameters) { return matching(artifact, matchings, parameters).other(artifact); } /** * Sets the bounds ({@link CMMatching#setCostBounds(Bounds)}) for the cost of all current matchings. * * @param currentMatchings * the current <code>CMMatchings</code>s being considered * @param parameters * the <code>CMParameters</code> to use */ private void boundCost(CMMatchings<T> currentMatchings, CMParameters<T> parameters) { LOG.finer(() -> "Bounding " + currentMatchings.size() + " matchings."); AtomicInteger mCount = LOG.isLoggable(FINEST) ? new AtomicInteger() : null; Consumer<CMMatching<T>> mPeek = m -> LOG.finest(() -> "Done with matching " + mCount.getAndIncrement() + " " + m); if (parameters.parallel) { currentMatchings.parallelStream().peek(mPeek).forEach(m -> boundCost(m, currentMatchings, parameters)); } else { currentMatchings.stream().peek(mPeek).forEach(m -> boundCost(m, currentMatchings, parameters)); } parameters.clearBoundCaches(); } /** * Sets the bounds ({@link CMMatching#setCostBounds(Bounds)}) for the cost of the given <code>matching</code> * based on the given <code>currentMatchings</code>. * * @param matching * the <code>CMMatching</code> whose costs are to be bounded * @param currentMatchings * the current <code>CMMatchings</code>s being considered * @param parameters * the <code>CMParameters</code> to use */ private void boundCost(CMMatching<T> matching, CMMatchings<T> currentMatchings, CMParameters<T> parameters) { if (matching.isNoMatch()) { matching.setBounds(parameters.wn, parameters.wn); return; } float cR = renamingCost(matching, parameters); Bounds cABounds = boundAncestryViolationCost(matching, currentMatchings, parameters); Bounds cSBounds = boundSiblingGroupBreakupCost(matching, currentMatchings, parameters); Bounds cOBounds = boundOrderingCost(matching, currentMatchings, parameters); float lower = cR + cABounds.getLower() + cSBounds.getLower() + cOBounds.getLower(); float upper = cR + cABounds.getUpper() + cSBounds.getUpper() + cOBounds.getUpper(); matching.setBounds(lower, upper); } /** * Returns the bounded ancestry violation cost for <code>matching</code>. * * @param matching * the matching to calculate the bounds for * @param currentMatchings * all matchings * @param parameters * the cost model parameters * @return the bounded ancestry violation cost */ private Bounds boundAncestryViolationCost(CMMatching<T> matching, CMMatchings<T> currentMatchings, CMParameters<T> parameters) { T m = matching.m; T n = matching.n; Stream<T> mLower = m.getChildren().stream().filter(mChild -> ancestryIndicator(mChild, n, currentMatchings, false, parameters)); Stream<T> nLower = n.getChildren().stream().filter(nChild -> ancestryIndicator(nChild, m, currentMatchings, false, parameters)); Stream<T> mUpper = m.getChildren().stream().filter(mChild -> ancestryIndicator(mChild, n, currentMatchings, true, parameters)); Stream<T> nUpper = n.getChildren().stream().filter(nChild -> ancestryIndicator(nChild, m, currentMatchings, true, parameters)); int lowerBound = (int) (mLower.count() + nLower.count()); int upperBound = (int) (mUpper.count() + nUpper.count()); return new Bounds(parameters.wa.weigh(matching, lowerBound), parameters.wa.weigh(matching, upperBound)); } /** * Evaluates the upper/lower ancestry violation indicator. * * @param child * the child for which to check whether ancestry violation is possible/unavoidable * @param n * the matching partner of the parent of <code>child</code> * @param currentMatchings * all matchings * @param upper * whether to evaluate the upper or lower indicator * @param parameters * the cost model parameters * @return the value of the indicator function */ private boolean ancestryIndicator(T child, T n, CMMatchings<T> currentMatchings, boolean upper, CMParameters<T> parameters) { if (upper) { Predicate<CMMatching<T>> indicator = match -> { T partner = match.other(child); return !(partner == null || n.getChildren().contains(partner)); }; return containing(child, currentMatchings, parameters).stream().anyMatch(indicator); } else { Predicate<CMMatching<T>> indicator = match -> { T partner = match.other(child); return partner == null || n.getChildren().contains(partner); }; return containing(child, currentMatchings, parameters).stream().noneMatch(indicator); } } /** * Bounds the sibling group breakup cost for <code>matching</code>. * * @param matching * the matching to bound the cost for * @param currentMatchings * all matchings * @param parameters * the cost model parameters * @return the bounded sibling group breakup cost */ private Bounds boundSiblingGroupBreakupCost(CMMatching<T> matching, CMMatchings<T> currentMatchings, CMParameters<T> parameters) { T m = matching.m; T n = matching.n; float mnLower, nmLower, lower, mnUpper, nmUpper, upper; Bounds dMN = boundDivergentSiblings(m, n, currentMatchings, parameters); Bounds dNM = boundDivergentSiblings(n, m, currentMatchings, parameters); if (dMN.getLower() != 0 || dMN.getUpper() != 0) { Bounds iMN = boundInvariantSiblings(m, n, currentMatchings, parameters); mnLower = dMN.getLower() / (iMN.getUpper() * (dMN.getLower() + 1)); mnUpper = dMN.getUpper() / iMN.getLower(); } else { mnLower = 0; mnUpper = 0; } if (dNM.getLower() != 0 || dNM.getUpper() != 0) { Bounds iNM = boundInvariantSiblings(n, m, currentMatchings, parameters); nmLower = dNM.getLower() / (iNM.getUpper() * (dNM.getLower() + 1)); nmUpper = dNM.getUpper() / iNM.getLower(); } else { nmLower = 0; nmUpper = 0; } lower = parameters.ws.weigh(matching, mnLower + nmLower); upper = parameters.ws.weigh(matching, (mnUpper + nmUpper) / 2); return new Bounds(lower, upper); } /** * Bounds the size of the divergent sibling subset of siblings of <code>m</code>. * * @param m * the artifact for whose siblings the size of the sibling divergent subset is to be bounded * @param n * the artifact <code>m</code> is being matched with * @param currentMatchings * all matchings * @param parameters * the cost model parameters * @return the bounded size of the divergent sibling subset */ private Bounds boundDivergentSiblings(T m, T n, CMMatchings<T> currentMatchings, CMParameters<T> parameters) { List<T> osibs = otherSiblings(m, currentMatchings, parameters); long lower = osibs.stream().filter(mSib -> divergentSiblingIndicator(mSib, n, currentMatchings, false, parameters)).count(); long upper = osibs.stream().filter(mSib -> divergentSiblingIndicator(mSib, n, currentMatchings, true, parameters)).count(); return new Bounds(lower, upper); } /** * Evaluates the upper/lower divergent sibling subset indicator. * * @param sibling * the sibling for which to check whether inclusion in the sibling divergent subset is possible/unavoidable * @param n * the artifact that the sibling of <code>sibling</code> is matched with * @param currentMatchings * all matchings * @param upper * whether to evaluate the upper or lower indicator * @param parameters * the cost model parameters * @return the value of the indicator function */ private boolean divergentSiblingIndicator(T sibling, T n, CMMatchings<T> currentMatchings, boolean upper, CMParameters<T> parameters) { if (upper) { Predicate<CMMatching<T>> indicator = match -> { T partner = match.other(sibling); return !(partner == null || otherSiblings(n, currentMatchings, parameters).contains(partner)); }; return containing(sibling, currentMatchings, parameters).stream().anyMatch(indicator); } else { Predicate<CMMatching<T>> indicator = match -> { T partner = match.other(sibling); return partner == null || otherSiblings(n, currentMatchings, parameters).contains(partner); }; return containing(sibling, currentMatchings, parameters).stream().noneMatch(indicator); } } /** * Bounds the size of the invariant sibling subset of siblings of <code>m</code>. * * @param m * the artifact for whose siblings the size of the sibling invariant subset is to be bounded * @param n * the artifact <code>m</code> is being matched with * @param currentMatchings * all matchings * @param parameters * the cost model parameters * @return the bounded size of the invariant sibling subset */ private Bounds boundInvariantSiblings(T m, T n, CMMatchings<T> currentMatchings, CMParameters<T> parameters) { List<T> osibs = otherSiblings(m, currentMatchings, parameters); long lower = osibs.stream().filter(mSib -> invariantSiblingIndicator(mSib, n, currentMatchings, false, parameters)).count(); long upper = osibs.stream().filter(mSib -> invariantSiblingIndicator(mSib, n, currentMatchings, true, parameters)).count(); return new Bounds(lower + 1, upper + 1); } /** * Evaluates the upper/lower invariant sibling subset indicator. * * @param sibling * the sibling for which to check whether inclusion in the sibling invariant subset is possible/unavoidable * @param n * the artifact that the sibling of <code>sibling</code> is matched with * @param currentMatchings * all matchings * @param upper * whether to evaluate the upper or lower indicator * @param parameters * the cost model parameters * @return the value of the indicator function */ private boolean invariantSiblingIndicator(T sibling, T n, CMMatchings<T> currentMatchings, boolean upper, CMParameters<T> parameters) { Predicate<CMMatching<T>> indicator = match -> otherSiblings(n, currentMatchings, parameters).contains(match.other(sibling)); if (upper) { return containing(sibling, currentMatchings, parameters).stream().anyMatch(indicator); } else { return containing(sibling, currentMatchings, parameters).stream().allMatch(indicator); } } /** * Bounds the ordering violation cost of <code>matching</code>. * * @param matching * the matching to bound the cost for * @param currentMatchings * all matchings * @param parameters * the cost model parameters * @return the bounded ordering violation cost */ private Bounds boundOrderingCost(CMMatching<T> matching, CMMatchings<T> currentMatchings, CMParameters<T> parameters) { float lower, upper; List<T> mosibs = otherSiblings(matching.m, currentMatchings, parameters); List<T> nosibs = otherSiblings(matching.n, currentMatchings, parameters); Stream<T> siblings = concat(mosibs.stream(), nosibs.stream()); boolean orderingPossible = siblings.allMatch(sib -> containing(sib, currentMatchings, parameters).stream().anyMatch(match -> match.isNoMatch() || !violatesOrdering(match, matching, currentMatchings, parameters) ) ); if (!orderingPossible) { lower = parameters.wo.weigh(matching); upper = lower; } else { lower = 0; siblings = concat(mosibs.stream(), nosibs.stream()); boolean violationPossible = siblings.anyMatch(sib -> containing(sib, currentMatchings, parameters).stream().anyMatch(match -> !match.isNoMatch() && violatesOrdering(match, matching, currentMatchings, parameters) ) ); upper = violationPossible ? parameters.wo.weigh(matching) : 0; } return new Bounds(lower, upper); } /** * Returns a new <code>List</code> containing the children of the parent of <code>artifact</code> or an empty * <code>List</code> for the root node. This includes the <code>artifact</code> itself. * * @param artifact * the <code>Artifact</code> whose siblings are to be returned * @param matchings * the current <code>CMMatchings</code> * @param parameters * the <code>CMParameters</code> to use * @return the siblings of the given <code>artifact</code> */ private List<T> siblings(T artifact, CMMatchings<T> matchings, CMParameters<T> parameters) { return parameters.siblingCache.computeIfAbsent(artifact, a -> { List<T> siblings; if (artifact == matchings.left || artifact == matchings.right) { siblings = new ArrayList<>(Collections.singleton(a)); } else { T parent = a.getParent(); siblings = parent.getChildren() .stream() .filter(s -> s != a && parameters.siblingCache.containsKey(s)) .map(s -> parameters.siblingCache.get(s)).findFirst() .orElseGet(() -> new ArrayList<>(parent.getChildren())); } return siblings; }); } /** * Returns the siblings of <code>artifact</code> as in {@link #siblings(Artifact, CMMatchings, CMParameters)} but * does not include <code>artifact</code> itself. * * @param artifact * the <code>Artifact</code> whose siblings are to be returned * @param matchings * the current <code>CMMatchings</code> *@param parameters * the <code>CMParameters</code> to use @return the siblings of the given <code>artifact</code> */ private List<T> otherSiblings(T artifact, CMMatchings<T> matchings, CMParameters<T> parameters) { return parameters.otherSiblingsCache.computeIfAbsent(artifact, a -> { List<T> siblings = new ArrayList<>(siblings(a, matchings, parameters)); siblings.remove(a); return siblings; }); } /** * Returns all matchings containing <code>artifact</code> from <code>currentMatchings</code>. * * @param artifact * the artifact to search for * @param currentMatchings * all matchings * @param parameters * the cost model parameters * @return all matchings containig <code>artifact</code> */ private List<CMMatching<T>> containing(T artifact, CMMatchings<T> currentMatchings, CMParameters<T> parameters) { return parameters.boundContainsCache.computeIfAbsent(artifact, a -> currentMatchings.stream().filter(m -> m.contains(a)).collect(toList()) ); } @Override public Matchings<T> match(MergeContext context, T left, T right) { return match(context, left, right, new CMMatchings<>(left, right)); } /** * Matches the trees rooted in <code>left</code> and <code>right</code>. The matchings contained in * <code>preFixed</code> will be considered fixed and returned as is in addition to any matchings between previously * unmatched artifacts. * * @param context * the <code>MergeContext</code> containing the parameters to use for the Flexible Tree Matching * algorithm * @param left * the left root * @param right * the right root * @param preFixed * the matchings between the left and right tree that are fixed * @return the resulting matchings */ public Matchings<T> match(MergeContext context, T left, T right, Matchings<T> preFixed) { CMMatchings<T> cmPreFixed = new CMMatchings<>(left, right); for (Matching<T> matching : preFixed.optimized()) { cmPreFixed.add(new CMMatching<>(matching.getLeft(), matching.getRight())); } return match(context, left, right, cmPreFixed); } /** * Matches the tress rooted in <code>left</code> and <code>right</code> using the Metropolis algorithm and the * Flexible Tree Matching cost model. * * @param context * the <code>MergeContext</code> containing the parameters to use for the Flexible Tree Matching * algorithm * @param left * the left root * @param right * the right root * @param preFixed * the matchings between the left and right tree that are fixed * @return the resulting matchings */ private Matchings<T> match(MergeContext context, T left, T right, CMMatchings<T> preFixed) { CMParameters<T> parameters = new CMParameters<>(context); LOG.fine("Matching " + left + " and " + right + " using the " + getClass().getSimpleName()); CMMatchings<T> m = initialize(preFixed, parameters); ObjectiveValue mObjVal = objective(m, parameters); CMMatchings<T> lowest = m; float lowestCost = mObjVal.matchingsCost; for (int i = 0; i < context.getCostModelIterations(); i++) { CMMatchings<T> mHat = propose(m, preFixed, parameters); AcceptanceProbability mHatAccProb = acceptanceProb(mObjVal.objValue, mHat, parameters); if (chance(parameters.rng, mHatAccProb.acceptanceProbability)) { log(FINER, mHat, () -> "Accepting the matchings."); m = mHat; mObjVal = mHatAccProb.mHatObjectiveValue; } if (mHatAccProb.mHatObjectiveValue.matchingsCost < lowestCost) { lowest = mHat; lowestCost = mHatAccProb.mHatObjectiveValue.matchingsCost; float finalLowestCost = lowestCost; log(FINER, mHat, () -> "New lowest cost matchings with cost " + finalLowestCost + " found."); } LOG.fine("End of iteration " + i); } LOG.fine(() -> "Matching ended after " + context.getCostModelIterations() + " iterations."); return convert(lowest); } /** * Returns <code>true</code> with a probability of <code>p</code>. * * @param rng * the PRNG to sample from * @param p * a number between 0.0 and 1.0 * @return true or false depending on the next double returned by the PRNG */ boolean chance(RandomGenerator rng, double p) { return rng.nextDouble() < p; } /** * Converts a <code>List</code> of <code>CMMatching</code>s to an equivalent <code>Set</code> of * <code>Matching</code>s. * * @param matchings * the <code>CMMatching</code>s to convert * @return the resulting <code>Matchings</code> */ private Matchings<T> convert(CMMatchings<T> matchings) { Map<T, T> mMap = matchings.asMap(); Function<CMMatching<T>, Matching<T>> toMatching = m -> { Set<T> ls = new HashSet<>(Artifacts.dfs(m.m)); Set<T> rs = new HashSet<>(Artifacts.dfs(m.n)); int score = (int) ls.stream().filter(a -> rs.contains(mMap.get(a))).count(); Matching<T> matching = new Matching<>(m.m, m.n, score); matching.setAlgorithm(CostModelMatcher.class.getSimpleName()); return matching; }; return matchings.stream().filter(m -> !m.isNoMatch()).map(toMatching) .collect(Matchings::new, Matchings::add, Matchings::addAll); } /** * Proposes a new set of <code>CMMatching</code>s based on the previous matchings <code>m</code>. * * @param m * the matchings from the previous iteration * @param preFixed * the matchings between the left and right tree that are fixed * @return the proposed matchings for the next iteration */ private CMMatchings<T> propose(CMMatchings<T> m, CMMatchings<T> preFixed, CMParameters<T> parameters) { CMMatchings<T> mVariable = new CMMatchings<>(m, m.left, m.right); mVariable.removeAll(preFixed); int j; if (parameters.fixRandomPercentage) { int lower = (int) (parameters.fixLower * mVariable.size()); int upper = (int) (parameters.fixUpper * mVariable.size()); Collections.shuffle(mVariable, parameters.rng); // TODO a switch to turn this off j = intFromRange(lower, upper, parameters); } else { //TODO sort by exact cost? Collections.sort(mVariable, Comparator.comparing(CMMatching::getExactCost)); j = parameters.rng.nextInt(mVariable.size()); } CMMatchings<T> fixed = new CMMatchings<>(mVariable.subList(0, j), m.left, m.right); log(FINER, m, () -> "Fixing the first " + j + "variable matchings from the last iteration."); log(FINEST, m, () -> "They are: " + fixed); fixed.addAll(preFixed); CMMatchings<T> proposition = complete(fixed, parameters); log(FINER, proposition, () -> "Proposing matchings for the next iteration."); log(FINEST, proposition, () -> "Proposition is: " + proposition); return proposition; } /** * Returns a uniformly distributed random integer from the given range (inclusive). * * @param lower * the lower bound * @param upper * the upper bound * @param parameters * the cost model parameters * @return a random int from [<code>lower</code>, <code>upper</code>] */ private int intFromRange(int lower, int upper, CMParameters<T> parameters) { return lower + (int) (parameters.rng.nextFloat() * ((upper - lower) + 1)); } /** * Constructs the initial set of matchings. * * @param preFixed * the matchings between the left and right tree that are fixed * @param parameters * the cost model parameters * @return the initial matchings */ private CMMatchings<T> initialize(CMMatchings<T> preFixed, CMParameters<T> parameters) { CMMatchings<T> initial = complete(preFixed, parameters); log(FINER, initial, () -> "Initial set of matchings assembled."); log(FINEST, initial, () -> "Initial set is: " + initial); return initial; } /** * Completes the given <code>fixedMatchings</code> to a set of matchings in which every artifact from the left * and right tree is covered by exactly one matching. * * @param fixedMatchings * the fixed matchings to complete * @param parameters * the cost model parameters * @return the completed set of matchings */ private CMMatchings<T> complete(CMMatchings<T> fixedMatchings, CMParameters<T> parameters) { CMMatchings<T> current = completeBipartiteGraph(fixedMatchings.left, fixedMatchings.right, parameters); CMMatchings<T> fixed = new CMMatchings<>(fixedMatchings, fixedMatchings.left, fixedMatchings.right); fixed.forEach(m -> prune(m, current)); while (fixed.size() != current.size()) { boundCost(current, parameters); Collections.sort(current, comparing(CMMatching::getCostBounds, BY_LOWER_UPPER)); CMMatchings<T> available = new CMMatchings<>(current, current.left, current.right); available.removeAll(fixed); int i; do { i = parameters.assignDist.sample(); } while (i >= available.size()); CMMatching<T> matching = available.get(i); fixed.add(matching); prune(matching, current); } return fixed; } /** * Removes the other matchings containing an artifact matched in <code>matching</code> from <code>g</code>. * * @param matching * the matching to prune for * @param g * the matchings to prune from */ private void prune(CMMatching<T> matching, CMMatchings<T> g) { for (ListIterator<CMMatching<T>> it = g.listIterator(); it.hasNext();) { CMMatching<T> current = it.next(); boolean neq = !matching.equals(current); if (neq && ((matching.m != null && matching.m == current.m) || (matching.n != null && matching.n == current.n))) { it.remove(); } } } /** * Returns the (randomly ordered) complete bipartite graph between the trees rooted in <code>left</code> and * <code>right</code> with the addition of one no-match node (represented by <code>null</code>) each. * * @param left * the left root * @param right * the right root * @param parameters * the cost model parameters * @return the complete bipartite graph with its edges represented by <code>CMMatching</code>s */ private CMMatchings<T> completeBipartiteGraph(T left, T right, CMParameters<T> parameters) { List<T> leftNodes = Artifacts.bfs(left); List<T> rightNodes = Artifacts.bfs(right); // add the "No Match" node leftNodes.add(null); rightNodes.add(null); CMMatchings<T> bipartiteGraph = new CMMatchings<>(left, right); for (T lNode : leftNodes) { for (T rNode : rightNodes) { if (lNode != null || rNode != null) { bipartiteGraph.add(new CMMatching<>(lNode, rNode)); } } } Collections.shuffle(bipartiteGraph, parameters.rng); return bipartiteGraph; } /** * Returns the value of the objective function. * * @param matchings * the matchings to return the objective function value for * @param parameters * the cost model parameters * @return the value of the objective function and the cost that was calculated as part of it */ private ObjectiveValue objective(CMMatchings<T> matchings, CMParameters<T> parameters) { float cost = cost(matchings, parameters); double objVal = Math.exp(-(parameters.beta * cost)); log(FINER, matchings, () -> "Cost of matchings is " + cost); log(FINER, matchings, () -> "Objective function value for matchings is " + objVal); return new ObjectiveValue(objVal, cost); } /** * Returns the acceptance probability for the proposed set of matchings <code>mHat</code>. * * @param mObjectiveValue * the objective value for the current reference set of matchings * @param mHat * the newly proposed set of matchings * @param parameters * the cost model parameters * @return the acceptance probability including the <code>ObjectiveValue</code> calculated for <code>mHat</code> */ private AcceptanceProbability acceptanceProb(double mObjectiveValue, CMMatchings<T> mHat, CMParameters<T> parameters) { ObjectiveValue mHatObjectiveValue = objective(mHat, parameters); double acceptanceProb = Math.min(1, mHatObjectiveValue.objValue / mObjectiveValue); log(FINER, mHat, () -> "Acceptance probability for matchings is " + acceptanceProb); return new AcceptanceProbability(acceptanceProb, mHatObjectiveValue); } /** * Returns the hexadecimal identity hash code of the given <code>Object</code> as a <code>String</code>. * * @param o * the <code>Object</code> to return the <code>String</code> id for * @return the <code>String</code> id */ private String id(Object o) { return toHexString(identityHashCode(o)); } /** * Logs the given <code>msg</code> using the {@link #LOG} and prepends the {@link #id(Object)} of the given * matchings. * * @param level * the level to log at * @param matchings * the matchings the message concerns * @param msg * the message to log */ private void log(Level level, CMMatchings<T> matchings, Supplier<String> msg) { LOG.log(level, () -> String.format("%-10s%s", id(matchings), msg.get())); } }