/**
* Copyright (C) 2013-2014 Olaf Lessenich
* Copyright (C) 2014-2015 University of Passau, Germany
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
* MA 02110-1301 USA
*
* Contributors:
* Olaf Lessenich <lessenic@fim.uni-passau.de>
* Georg Seibt <seibt@fim.uni-passau.de>
*/
package de.fosd.jdime.matcher.cost_model;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.ListIterator;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.function.Consumer;
import java.util.function.Function;
import java.util.function.Predicate;
import java.util.function.Supplier;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.stream.Stream;
import de.fosd.jdime.artifact.Artifact;
import de.fosd.jdime.artifact.ArtifactList;
import de.fosd.jdime.artifact.Artifacts;
import de.fosd.jdime.config.merge.MergeContext;
import de.fosd.jdime.matcher.MatcherInterface;
import de.fosd.jdime.matcher.matching.Matching;
import de.fosd.jdime.matcher.matching.Matchings;
import de.fosd.jdime.util.Tuple;
import org.apache.commons.math3.random.RandomGenerator;
import static de.fosd.jdime.matcher.cost_model.Bounds.BY_LOWER_UPPER;
import static java.lang.Integer.toHexString;
import static java.lang.System.identityHashCode;
import static java.util.Comparator.comparing;
import static java.util.logging.Level.FINER;
import static java.util.logging.Level.FINEST;
import static java.util.stream.Collectors.summingDouble;
import static java.util.stream.Collectors.toList;
import static java.util.stream.Collectors.toSet;
import static java.util.stream.Stream.concat;
/**
* A <code>MatcherInterface</code> implementation based on the Flexible Tree Matching algorithm.
*
* @param <T> the type of the artifacts being matched
* @see <a href="http://theory.stanford.edu/~tim/papers/ijcai11.pdf">The Paper</a>
*/
public class CostModelMatcher<T extends Artifact<T>> implements MatcherInterface<T> {
// Logger shared by all instances of this matcher; it is named after the canonical class name.
private static final Logger LOG = Logger.getLogger(CostModelMatcher.class.getCanonicalName());
/**
 * A function weighing a matching that incurred a cost. In contrast to {@link WeightFunction} the weight
 * depends on the matching alone, no quantity is passed.
 *
 * @param <T> the type of the artifacts
 */
@FunctionalInterface
public interface SimpleWeightFunction<T extends Artifact<T>> {
/**
 * Returns the weight to be used for the given <code>matching</code>.
 *
 * @param matching
 *         the matching that incurred the cost
 * @return the weight for the matching
 */
float weigh(CMMatching<T> matching);
}
/**
 * A function weighing a matching that incurred a specific cost. In addition to the matching the function
 * receives the quantity of the cost that was incurred.
 *
 * @param <T> the type of the artifacts
 */
@FunctionalInterface
public interface WeightFunction<T extends Artifact<T>> {
/**
 * Returns the weighted cost for the given <code>matching</code> and <code>quantity</code>.
 *
 * @param matching
 *         the matching that incurred the cost
 * @param quantity
 *         the amount of cost incurred (e.g. a number of violations or a ratio)
 * @return the weighted cost
 */
float weigh(CMMatching<T> matching, float quantity);
}
/**
* The return type of {@link #objective(CMMatchings, CMParameters)} containing the value of the objective
* function and the exact cost of the newly proposed set of <code>CMMatching</code>s.
*/
private final class ObjectiveValue {
public final double objValue;
public final float matchingsCost;
public ObjectiveValue(double objValue, float matchingsCost) {
this.objValue = objValue;
this.matchingsCost = matchingsCost;
}
}
/**
* The return type of {@link #acceptanceProb(double, CMMatchings, CMParameters)} containing the probability
* of the newly proposed set of <code>CMMatching</code>s being accepted for the next iteration and the
* <code>ObjectiveValue</code> for the proposed matchings.
*/
private final class AcceptanceProbability {
public final double acceptanceProbability;
public final ObjectiveValue mHatObjectiveValue;
public AcceptanceProbability(double acceptanceProbability, ObjectiveValue mHatObjectiveValue) {
this.acceptanceProbability = acceptanceProbability;
this.mHatObjectiveValue = mHatObjectiveValue;
}
}
/**
 * Computes the exact cost of the given set of <code>matchings</code>. Every artifact of the two trees that is
 * not part of one of the <code>matchings</code> is treated as unmatched, i.e. paired with <code>null</code>.
 *
 * @param context
 *         the <code>MergeContext</code> containing the parameters to be used
 * @param matchings
 *         the matchings to calculate the cost for
 * @param left
 *         the left root
 * @param right
 *         the right root
 * @return the exact cost based on the weights in <code>context</code>, 0 if <code>matchings</code> is empty
 */
public float cost(MergeContext context, Matchings<T> matchings, T left, T right) {

    if (matchings.isEmpty()) {
        return 0;
    }

    // Start out with all artifacts being unmatched, matched ones are struck off below.
    Set<T> unmatchedLeft = new LinkedHashSet<>(Artifacts.dfs(left));
    Set<T> unmatchedRight = new LinkedHashSet<>(Artifacts.dfs(right));
    CMMatchings<T> complete = new CMMatchings<>(left, right);

    for (Matching<T> pair : matchings) {
        T matchedLeft = pair.getLeft();
        T matchedRight = pair.getRight();

        complete.add(new CMMatching<>(matchedLeft, matchedRight));
        unmatchedLeft.remove(matchedLeft);
        unmatchedRight.remove(matchedRight);
    }

    // The cost model requires an explicit no-match entry for every remaining artifact.
    unmatchedLeft.forEach(l -> complete.add(new CMMatching<>(l, null)));
    unmatchedRight.forEach(r -> complete.add(new CMMatching<>(null, r)));

    return cost(complete, new CMParameters<>(context));
}
/**
 * Computes the exact cost of the given <code>matchings</code>. The <code>matchings</code> must contain, for
 * every node of the left and right tree, exactly one <code>CMMatching</code> containing that node.
 * After this call the exact cost computed for every <code>CMMatching</code> can be retrieved using
 * {@link CMMatching#getExactCost()}.
 *
 * @param matchings
 *         the <code>CMMatchings</code> to evaluate
 * @param parameters
 *         the <code>CMParameters</code> to use
 * @return the cost based on the weight functions in <code>parameters</code>
 * @throws IllegalArgumentException
 *         if <code>matchings</code> is not sane according to {@link CMMatchings#sane()}
 */
private float cost(CMMatchings<T> matchings, CMParameters<T> parameters) {

    if (!matchings.sane()) {
        String msg = "The given list of matchings has an invalid format. A list of " +
                "matchings where every artifact from the left and right tree occurs in exactly one matching is " +
                "required. Matchings matching artifacts that do not occur in the left or right tree are not " +
                "allowed.";

        throw new IllegalArgumentException(msg);
    }

    if (matchings.isEmpty()) {
        return 0;
    }

    // Stores the exact cost of a single matching in the matching itself.
    Consumer<CMMatching<T>> assignCost = m -> cost(m, matchings, parameters);

    if (parameters.parallel) {
        matchings.parallelStream().forEach(assignCost);
    } else {
        matchings.forEach(assignCost);
    }

    double total = matchings.stream().collect(summingDouble(CMMatching::getExactCost));
    float sumCost = (float) total;

    // Normalize the sum by the combined size of both trees.
    sumCost = sumCost * (1.0f / (matchings.left.getTreeSize() + matchings.right.getTreeSize()));

    parameters.clearExactCaches();

    return sumCost;
}
/**
 * Computes the exact cost of the given <code>matching</code> with respect to the complete set of
 * <code>matchings</code> and stores it using {@link CMMatching#setExactCost(float)}. A no-match costs
 * <code>parameters.wn</code>, any other matching costs the sum of its renaming, ancestry violation, sibling
 * group breakup and ordering cost.
 *
 * @param matching
 *         the <code>CMMatching</code> to compute the cost for
 * @param matchings
 *         the complete <code>CMMatching</code>s
 * @param parameters
 *         the <code>CMParameters</code> to use
 */
private void cost(CMMatching<T> matching, CMMatchings<T> matchings, CMParameters<T> parameters) {
    float exactCost;

    if (matching.isNoMatch()) {
        exactCost = parameters.wn;
    } else {
        float renaming = renamingCost(matching, parameters);
        float ancestry = ancestryViolationCost(matching, matchings, parameters);
        float siblingGroup = siblingGroupBreakupCost(matching, matchings, parameters);
        float ordering = orderingCost(matching, matchings, parameters);

        exactCost = renaming + ancestry + siblingGroup + ordering;
    }

    matching.setExactCost(exactCost);
}
/**
 * Computes the cost for renaming the node. Artifacts that match according to
 * {@link Artifact#matches(Artifact)} incur no cost, for all others the renaming weight function set in
 * <code>parameters</code> determines the cost.
 *
 * @param matching
 *         the <code>CMMatching</code> to compute the cost for
 * @param parameters
 *         the <code>CMParameters</code> containing the renaming weight function
 * @return the exact renaming cost of the <code>matching</code>
 */
private float renamingCost(CMMatching<T> matching, CMParameters<T> parameters) {
    boolean sameName = matching.m.matches(matching.n);
    return sameName ? 0 : parameters.wr.weigh(matching);
}
/**
 * Computes the exact ancestry violation cost for <code>matching</code>, i.e. the weighted number of children
 * of either side of the matching that violate ancestry.
 *
 * @param matching
 *         the matching to calculate the cost for
 * @param matchings
 *         all matchings
 * @param parameters
 *         the cost model parameters
 * @return the exact ancestry violation cost
 */
private float ancestryViolationCost(CMMatching<T> matching, CMMatchings<T> matchings, CMParameters<T> parameters) {
    T left = matching.m;
    T right = matching.n;

    int violations = numAncestryViolatingChildren(left, right, matchings, parameters)
                   + numAncestryViolatingChildren(right, left, matchings, parameters);

    return parameters.wa.weigh(matching, violations);
}
/**
 * Counts the children of <code>m</code> that violate ancestry if <code>m</code> is matched with
 * <code>n</code>. A child violates ancestry if it is matched (its image is not <code>null</code>) and its
 * image is not a child of <code>n</code>.
 *
 * @param m
 *         the artifact to return the number of ancestry violating children for
 * @param n
 *         the artifact <code>m</code> is being matched with
 * @param matchings
 *         all matchings
 * @param parameters
 *         the cost model parameters
 * @return the number of children of <code>m</code> violating ancestry
 */
private int numAncestryViolatingChildren(T m, T n, CMMatchings<T> matchings, CMParameters<T> parameters) {
    ArtifactList<T> childrenOfM = m.getChildren();
    ArtifactList<T> childrenOfN = n.getChildren();
    int violations = 0;

    for (T child : childrenOfM) {
        T childImage = image(child, matchings, parameters);

        if (childImage != null && !childrenOfN.contains(childImage)) {
            violations++;
        }
    }

    return violations;
}
/**
 * Computes the exact sibling group breakup cost for <code>matching</code>. The cost is the weighted sum of
 * the breakup ratios of both sides of the matching.
 *
 * @param matching
 *         the matching to calculate the cost for
 * @param matchings
 *         all matchings
 * @param parameters
 *         the cost model parameters
 * @return the exact sibling group breakup cost
 */
private float siblingGroupBreakupCost(CMMatching<T> matching, CMMatchings<T> matchings, CMParameters<T> parameters) {
    float mCost = siblingBreakupRatio(matching.m, matching.n, matchings, parameters);
    float nCost = siblingBreakupRatio(matching.n, matching.m, matchings, parameters);

    return parameters.ws.weigh(matching, mCost + nCost);
}

/**
 * Computes the size of the sibling divergent subset of <code>a</code> divided by the product of the sizes of
 * the sibling invariant subset and the set of distinct sibling families. The ratio is 0 if there are no
 * divergent siblings.
 *
 * @param a
 *         the artifact whose sibling group is examined
 * @param b
 *         the artifact <code>a</code> is being matched with
 * @param matchings
 *         all matchings
 * @param parameters
 *         the cost model parameters
 * @return the breakup ratio for the sibling group of <code>a</code>
 */
private float siblingBreakupRatio(T a, T b, CMMatchings<T> matchings, CMParameters<T> parameters) {
    List<T> divergent = siblingDivergentSubset(a, b, matchings, parameters);

    if (divergent.isEmpty()) {
        return 0;
    }

    List<T> invariant = siblingInvariantSubset(a, b, matchings, parameters);
    Set<T> families = distinctSiblingFamilies(a, matchings, parameters);

    return (float) divergent.size() / (invariant.size() * families.size());
}
/**
 * Collects the sibling invariant subset of the siblings of <code>m</code>, i.e. those siblings of
 * <code>m</code> whose image is a sibling of <code>n</code>.
 *
 * @param m
 *         the artifact for whose siblings the sibling invariant subset is to be returned
 * @param n
 *         the artifact <code>m</code> is being matched with
 * @param matchings
 *         all matchings
 * @param parameters
 *         the cost model parameters
 * @return the sibling invariant subset
 */
private List<T> siblingInvariantSubset(T m, T n, CMMatchings<T> matchings, CMParameters<T> parameters) {
    List<T> groupOfM = siblings(m, matchings, parameters);
    List<T> groupOfN = siblings(n, matchings, parameters);
    List<T> invariant = new ArrayList<>();

    for (T sibling : groupOfM) {
        if (groupOfN.contains(image(sibling, matchings, parameters))) {
            invariant.add(sibling);
        }
    }

    return invariant;
}
/**
 * Collects the sibling divergent subset of the siblings of <code>m</code>, i.e. those siblings of
 * <code>m</code> that are matched but are not part of the sibling invariant subset.
 *
 * @param m
 *         the artifact for whose siblings the sibling divergent subset is to be returned
 * @param n
 *         the artifact <code>m</code> is being matched with
 * @param matchings
 *         all matchings
 * @param parameters
 *         the cost model parameters
 * @return the sibling divergent subset
 */
private List<T> siblingDivergentSubset(T m, T n, CMMatchings<T> matchings, CMParameters<T> parameters) {
    List<T> invariant = siblingInvariantSubset(m, n, matchings, parameters);
    List<T> group = siblings(m, matchings, parameters);
    List<T> divergent = new ArrayList<>();

    for (T sibling : group) {
        if (invariant.contains(sibling)) {
            continue;
        }

        if (image(sibling, matchings, parameters) != null) {
            divergent.add(sibling);
        }
    }

    return divergent;
}
/**
 * Collects the distinct sibling families that the siblings of <code>m</code> are matched into. Every family
 * is represented by its parent artifact, unmatched siblings are ignored. If a sibling is matched with a root,
 * <code>null</code> (the parent of the root) will be included in the set.
 *
 * @param m
 *         the artifact for whose siblings the distinct sibling families are to be returned
 * @param matchings
 *         all matchings
 * @param parameters
 *         the cost model parameters
 * @return the distinct sibling family representatives
 */
private Set<T> distinctSiblingFamilies(T m, CMMatchings<T> matchings, CMParameters<T> parameters) {
    Set<T> families = new HashSet<>();

    for (T sibling : siblings(m, matchings, parameters)) {
        T siblingImage = image(sibling, matchings, parameters);

        if (siblingImage != null) {
            families.add(siblingImage.getParent());
        }
    }

    return families;
}
/**
 * Computes the exact ordering cost for <code>matching</code>. The ordering weight is incurred as soon as one
 * of the (non no-match) matchings of the other siblings of either side violates the ordering induced by
 * <code>matching</code>.
 *
 * @param matching
 *         the matching to calculate the cost for
 * @param matchings
 *         all matchings
 * @param parameters
 *         the cost model parameters
 * @return the exact ordering cost
 */
private float orderingCost(CMMatching<T> matching, CMMatchings<T> matchings, CMParameters<T> parameters) {
    List<T> candidates = new ArrayList<>(otherSiblings(matching.m, matchings, parameters));
    candidates.addAll(otherSiblings(matching.n, matchings, parameters));

    // Matchings reachable from both sides need to be checked only once.
    Set<CMMatching<T>> alreadyChecked = new HashSet<>();

    for (T sibling : candidates) {
        CMMatching<T> toCheck = matching(sibling, matchings, parameters);

        if (toCheck.isNoMatch() || !alreadyChecked.add(toCheck)) {
            continue;
        }

        if (violatesOrdering(toCheck, matching, matchings, parameters)) {
            return parameters.wo.weigh(matching);
        }
    }

    return 0;
}
/**
 * Checks whether <code>toCheck</code> violates the ordering induced by <code>matching</code>. This is the
 * case if the relative order of the two left sides (lifted to a common sibling group) is the opposite of the
 * relative order of the two right sides. If none of the artifacts in the two sibling groups is ordered, there
 * can be no violation.
 *
 * @param toCheck
 *         the matching to check
 * @param matching
 *         the matching introducing an ordering
 * @param matchings
 *         all matchings
 * @param parameters
 *         the cost model parameters
 * @return true iff <code>toCheck</code> violates the ordering induced by <code>matching</code>
 */
private boolean violatesOrdering(CMMatching<T> toCheck, CMMatching<T> matching, CMMatchings<T> matchings, CMParameters<T> parameters) {
    Tuple<T, T> leftPair = lca(toCheck.m, matching.m, matchings, parameters);
    Tuple<T, T> rightPair = lca(toCheck.n, matching.n, matchings, parameters);

    List<T> leftGroup = siblings(leftPair.x, matchings, parameters);
    List<T> rightGroup = siblings(rightPair.x, matchings, parameters);

    boolean anyOrdered = leftGroup.stream().anyMatch(s -> s.isOrdered())
                      || rightGroup.stream().anyMatch(s -> s.isOrdered());

    if (!anyOrdered) {
        return false;
    }

    // Negative: x comes before y, positive: x comes after y, zero: same position.
    int leftOrder = Integer.compare(leftGroup.indexOf(leftPair.x), leftGroup.indexOf(leftPair.y));
    int rightOrder = Integer.compare(rightGroup.indexOf(rightPair.x), rightGroup.indexOf(rightPair.y));

    // A violation requires strictly opposite orders on the two sides.
    // TODO weird case (equal indices on the left side yield false), maybe true is better?
    return leftOrder * rightOrder < 0;
}
/**
 * Builds the path from the given <code>artifact</code> up to the root node of the tree it is a part of by
 * following the parent references.
 *
 * @param artifact
 *         the <code>Artifact</code> to return the path for
 * @return the path represented by a list of <code>Artifact</code>s beginning with <code>artifact</code> and
 *         ending with the root of the tree
 */
private List<T> pathToRoot(T artifact) {
    List<T> path = new ArrayList<>();
    T current = artifact;

    path.add(current);

    while ((current = current.getParent()) != null) {
        path.add(current);
    }

    return path;
}
/**
 * Finds the lowest pair of (possibly different) ancestors of <code>a</code> and <code>b</code> that are part of the
 * same sibling group. Results are cached in <code>parameters.lcaCache</code> using the tuple
 * <code>(a, b)</code> as the key.
 *
 * @param a
 *         the first <code>Artifact</code>
 * @param b
 *         the second <code>Artifact</code>
 * @param matchings
 *         the current <code>CMMatchings</code>
 * @param parameters
 *         the <code>CMParameters</code> to use
 * @return the ancestor of the first <code>Artifact</code> in the first position, that of the second in the second
 *         position
 */
private Tuple<T, T> lca(T a, T b, CMMatchings<T> matchings, CMParameters<T> parameters) {
return parameters.lcaCache.computeIfAbsent(Tuple.of(a, b), ab -> {
// The result for (b, a) is the mirrored result for (a, b), reuse it if it was already computed.
Tuple<T, T> ba = Tuple.of(b, a);
if (parameters.lcaCache.containsKey(ba)) {
Tuple<T, T> baLCS = parameters.lcaCache.get(ba);
return Tuple.of(baLCS.y, baLCS.x);
}
// a and b are already part of the same sibling group, the key tuple itself is the result.
if (siblings(a, matchings, parameters).contains(b)) {
return ab;
}
List<T> aPath = pathToRoot(a);
List<T> bPath = pathToRoot(b);
// The paths end with the root, so they are walked backwards (from the root downwards).
ListIterator<T> aIt = aPath.listIterator(aPath.size());
ListIterator<T> bIt = bPath.listIterator(bPath.size());
T l, r;
// Descend both paths in lockstep until they diverge or one of them is exhausted.
do {
l = aIt.previous();
r = bIt.previous();
} while (l == r && (aIt.hasPrevious() && bIt.hasPrevious()));
return Tuple.of(l, r);
});
}
/**
 * Looks up the (first) <code>CMMatching</code> in <code>matchings</code> that contains the given
 * <code>artifact</code>. Successful lookups are cached in <code>parameters.exactContainsCache</code>.
 *
 * @param artifact
 *         the <code>Artifact</code> for which the containing <code>CMMatching</code> is to be returned
 * @param matchings
 *         the current matchings
 * @param parameters
 *         the <code>CMParameters</code> to use
 * @return the <code>CMMatching</code> containing the <code>artifact</code>
 * @throws NoSuchElementException
 *         if no <code>CMMatching</code> containing <code>artifact</code> can be found in
 *         <code>matchings</code>
 */
private CMMatching<T> matching(T artifact, CMMatchings<T> matchings, CMParameters<T> parameters) {
    return parameters.exactContainsCache.computeIfAbsent(artifact, a -> {

        for (CMMatching<T> candidate : matchings) {
            if (candidate.contains(a)) {
                return candidate;
            }
        }

        throw new NoSuchElementException("No matching containing " + artifact + " found.");
    });
}
/**
 * Looks up the (first) <code>CMMatching</code> in <code>matchings</code> that contains the given
 * <code>artifact</code> and returns the other <code>Artifact</code> of that <code>CMMatching</code>.
 *
 * @param artifact
 *         the <code>Artifact</code> whose image is to be returned
 * @param matchings
 *         the current matchings
 * @param parameters
 *         the <code>CMParameters</code> to use
 * @return the matching partner of <code>artifact</code> in the given <code>matchings</code>
 * @throws NoSuchElementException
 *         if no <code>CMMatching</code> containing <code>artifact</code> can be found in
 *         <code>matchings</code>
 */
private T image(T artifact, CMMatchings<T> matchings, CMParameters<T> parameters) {
    CMMatching<T> holder = matching(artifact, matchings, parameters);
    return holder.other(artifact);
}
/**
 * Bounds the cost of all current matchings and stores the bounds in the matchings themselves
 * ({@link CMMatching#setCostBounds(Bounds)}). Depending on <code>parameters.parallel</code> the matchings are
 * processed in parallel or sequentially. The bound caches of <code>parameters</code> are cleared afterwards.
 *
 * @param currentMatchings
 *         the current <code>CMMatchings</code> being considered
 * @param parameters
 *         the <code>CMParameters</code> to use
 */
private void boundCost(CMMatchings<T> currentMatchings, CMParameters<T> parameters) {
    LOG.finer(() -> "Bounding " + currentMatchings.size() + " matchings.");

    // The counter is only needed (and only dereferenced) when FINEST messages are actually produced.
    final AtomicInteger doneCounter;

    if (LOG.isLoggable(FINEST)) {
        doneCounter = new AtomicInteger();
    } else {
        doneCounter = null;
    }

    Consumer<CMMatching<T>> logDone = m -> LOG.finest(() -> "Done with matching " + doneCounter.getAndIncrement() + " " + m);
    Stream<CMMatching<T>> toBound = parameters.parallel ? currentMatchings.parallelStream() : currentMatchings.stream();

    toBound.peek(logDone).forEach(m -> boundCost(m, currentMatchings, parameters));

    parameters.clearBoundCaches();
}
/**
 * Bounds the cost of the given <code>matching</code> based on the given <code>currentMatchings</code> and
 * stores the bounds in the <code>matching</code>. For a no-match both bounds are <code>parameters.wn</code>.
 * Otherwise the bounds are the exact renaming cost plus the lower/upper bounds of the ancestry violation,
 * sibling group breakup and ordering cost.
 *
 * @param matching
 *         the <code>CMMatching</code> whose costs are to be bounded
 * @param currentMatchings
 *         the current <code>CMMatchings</code> being considered
 * @param parameters
 *         the <code>CMParameters</code> to use
 */
private void boundCost(CMMatching<T> matching, CMMatchings<T> currentMatchings, CMParameters<T> parameters) {

    if (matching.isNoMatch()) {
        matching.setBounds(parameters.wn, parameters.wn);
        return;
    }

    // The renaming cost does not depend on the other matchings and is therefore exact.
    float renaming = renamingCost(matching, parameters);

    Bounds ancestry = boundAncestryViolationCost(matching, currentMatchings, parameters);
    Bounds siblingGroup = boundSiblingGroupBreakupCost(matching, currentMatchings, parameters);
    Bounds ordering = boundOrderingCost(matching, currentMatchings, parameters);

    float lowerBound = renaming + ancestry.getLower() + siblingGroup.getLower() + ordering.getLower();
    float upperBound = renaming + ancestry.getUpper() + siblingGroup.getUpper() + ordering.getUpper();

    matching.setBounds(lowerBound, upperBound);
}
/**
 * Bounds the ancestry violation cost for <code>matching</code>. The lower bound weighs the number of children
 * (of both sides) for which the lower ancestry indicator holds, the upper bound weighs the number of children
 * for which the upper ancestry indicator holds.
 *
 * @param matching
 *         the matching to calculate the bounds for
 * @param currentMatchings
 *         all matchings
 * @param parameters
 *         the cost model parameters
 * @return the bounded ancestry violation cost
 */
private Bounds boundAncestryViolationCost(CMMatching<T> matching, CMMatchings<T> currentMatchings, CMParameters<T> parameters) {
    T m = matching.m;
    T n = matching.n;

    long unavoidable = countAncestryIndicated(m, n, currentMatchings, false, parameters)
                     + countAncestryIndicated(n, m, currentMatchings, false, parameters);
    long possible = countAncestryIndicated(m, n, currentMatchings, true, parameters)
                  + countAncestryIndicated(n, m, currentMatchings, true, parameters);

    int lowerBound = (int) unavoidable;
    int upperBound = (int) possible;

    return new Bounds(parameters.wa.weigh(matching, lowerBound), parameters.wa.weigh(matching, upperBound));
}

/**
 * Counts the children of <code>parent</code> for which the (upper or lower) ancestry indicator holds.
 *
 * @param parent
 *         the artifact whose children are examined
 * @param partner
 *         the artifact <code>parent</code> is being matched with
 * @param currentMatchings
 *         all matchings
 * @param upper
 *         whether to evaluate the upper or lower indicator
 * @param parameters
 *         the cost model parameters
 * @return the number of children for which the indicator is true
 */
private long countAncestryIndicated(T parent, T partner, CMMatchings<T> currentMatchings, boolean upper, CMParameters<T> parameters) {
    long count = 0;

    for (T child : parent.getChildren()) {
        if (ancestryIndicator(child, partner, currentMatchings, upper, parameters)) {
            count++;
        }
    }

    return count;
}
/**
 * Evaluates the upper/lower ancestry violation indicator. A candidate matching of <code>child</code> is
 * violating if the partner of <code>child</code> is not <code>null</code> and not a child of <code>n</code>.
 * The upper indicator holds if any candidate is violating (a violation is possible), the lower indicator
 * holds if all candidates are violating (a violation is unavoidable).
 *
 * @param child
 *         the child for which to check whether ancestry violation is possible/unavoidable
 * @param n
 *         the matching partner of the parent of <code>child</code>
 * @param currentMatchings
 *         all matchings
 * @param upper
 *         whether to evaluate the upper or lower indicator
 * @param parameters
 *         the cost model parameters
 * @return the value of the indicator function
 */
private boolean ancestryIndicator(T child, T n, CMMatchings<T> currentMatchings, boolean upper, CMParameters<T> parameters) {
    Predicate<CMMatching<T>> violating = match -> {
        T partner = match.other(child);
        return partner != null && !n.getChildren().contains(partner);
    };

    List<CMMatching<T>> candidates = containing(child, currentMatchings, parameters);

    return upper ? candidates.stream().anyMatch(violating) : candidates.stream().allMatch(violating);
}
/**
 * Bounds the sibling group breakup cost for <code>matching</code>. For both directions (m to n and n to m)
 * a lower and an upper ratio are derived from the bounds on the sizes of the divergent and invariant sibling
 * subsets; a direction without any (possible) divergent siblings contributes 0 to both bounds.
 *
 * @param matching
 *         the matching to bound the cost for
 * @param currentMatchings
 *         all matchings
 * @param parameters
 *         the cost model parameters
 * @return the bounded sibling group breakup cost
 */
private Bounds boundSiblingGroupBreakupCost(CMMatching<T> matching, CMMatchings<T> currentMatchings, CMParameters<T> parameters) {
    T m = matching.m;
    T n = matching.n;

    Bounds divergentMN = boundDivergentSiblings(m, n, currentMatchings, parameters);
    Bounds divergentNM = boundDivergentSiblings(n, m, currentMatchings, parameters);

    float lowerMN = 0;
    float upperMN = 0;

    if (!(divergentMN.getLower() == 0 && divergentMN.getUpper() == 0)) {
        Bounds invariantMN = boundInvariantSiblings(m, n, currentMatchings, parameters);

        lowerMN = divergentMN.getLower() / (invariantMN.getUpper() * (divergentMN.getLower() + 1));
        upperMN = divergentMN.getUpper() / invariantMN.getLower();
    }

    float lowerNM = 0;
    float upperNM = 0;

    if (!(divergentNM.getLower() == 0 && divergentNM.getUpper() == 0)) {
        Bounds invariantNM = boundInvariantSiblings(n, m, currentMatchings, parameters);

        lowerNM = divergentNM.getLower() / (invariantNM.getUpper() * (divergentNM.getLower() + 1));
        upperNM = divergentNM.getUpper() / invariantNM.getLower();
    }

    float lowerCost = parameters.ws.weigh(matching, lowerMN + lowerNM);
    float upperCost = parameters.ws.weigh(matching, (upperMN + upperNM) / 2);

    return new Bounds(lowerCost, upperCost);
}
/**
 * Bounds the size of the divergent sibling subset of the siblings of <code>m</code> by counting the other
 * siblings of <code>m</code> for which the lower resp. upper divergent sibling indicator holds.
 *
 * @param m
 *         the artifact for whose siblings the size of the sibling divergent subset is to be bounded
 * @param n
 *         the artifact <code>m</code> is being matched with
 * @param currentMatchings
 *         all matchings
 * @param parameters
 *         the cost model parameters
 * @return the bounded size of the divergent sibling subset
 */
private Bounds boundDivergentSiblings(T m, T n, CMMatchings<T> currentMatchings, CMParameters<T> parameters) {
    List<T> others = otherSiblings(m, currentMatchings, parameters);

    long unavoidable = 0;

    for (T sibling : others) {
        if (divergentSiblingIndicator(sibling, n, currentMatchings, false, parameters)) {
            unavoidable++;
        }
    }

    long possible = 0;

    for (T sibling : others) {
        if (divergentSiblingIndicator(sibling, n, currentMatchings, true, parameters)) {
            possible++;
        }
    }

    return new Bounds(unavoidable, possible);
}
/**
 * Evaluates the upper/lower divergent sibling subset indicator. A candidate matching of <code>sibling</code>
 * is divergent if the partner of <code>sibling</code> is not <code>null</code> and not one of the other
 * siblings of <code>n</code>. The upper indicator holds if any candidate is divergent, the lower indicator
 * holds if all candidates are divergent.
 *
 * @param sibling
 *         the sibling for which to check whether inclusion in the sibling divergent subset is
 *         possible/unavoidable
 * @param n
 *         the artifact that the sibling of <code>sibling</code> is matched with
 * @param currentMatchings
 *         all matchings
 * @param upper
 *         whether to evaluate the upper or lower indicator
 * @param parameters
 *         the cost model parameters
 * @return the value of the indicator function
 */
private boolean divergentSiblingIndicator(T sibling, T n, CMMatchings<T> currentMatchings, boolean upper, CMParameters<T> parameters) {
    Predicate<CMMatching<T>> divergent = match -> {
        T partner = match.other(sibling);
        return partner != null && !otherSiblings(n, currentMatchings, parameters).contains(partner);
    };

    List<CMMatching<T>> candidates = containing(sibling, currentMatchings, parameters);

    return upper ? candidates.stream().anyMatch(divergent) : candidates.stream().allMatch(divergent);
}
/**
 * Bounds the size of the invariant sibling subset of the siblings of <code>m</code> by counting the other
 * siblings of <code>m</code> for which the lower resp. upper invariant sibling indicator holds. Both bounds
 * are increased by one before being returned.
 *
 * @param m
 *         the artifact for whose siblings the size of the sibling invariant subset is to be bounded
 * @param n
 *         the artifact <code>m</code> is being matched with
 * @param currentMatchings
 *         all matchings
 * @param parameters
 *         the cost model parameters
 * @return the bounded size of the invariant sibling subset
 */
private Bounds boundInvariantSiblings(T m, T n, CMMatchings<T> currentMatchings, CMParameters<T> parameters) {
    List<T> others = otherSiblings(m, currentMatchings, parameters);

    long guaranteed = 0;

    for (T sibling : others) {
        if (invariantSiblingIndicator(sibling, n, currentMatchings, false, parameters)) {
            guaranteed++;
        }
    }

    long possible = 0;

    for (T sibling : others) {
        if (invariantSiblingIndicator(sibling, n, currentMatchings, true, parameters)) {
            possible++;
        }
    }

    return new Bounds(guaranteed + 1, possible + 1);
}
/**
 * Evaluates the upper/lower invariant sibling subset indicator. A candidate matching of <code>sibling</code>
 * is invariant if the partner of <code>sibling</code> is one of the other siblings of <code>n</code>. The
 * upper indicator holds if any candidate is invariant, the lower indicator holds if all candidates are.
 *
 * @param sibling
 *         the sibling for which to check whether inclusion in the sibling invariant subset is
 *         possible/unavoidable
 * @param n
 *         the artifact that the sibling of <code>sibling</code> is matched with
 * @param currentMatchings
 *         all matchings
 * @param upper
 *         whether to evaluate the upper or lower indicator
 * @param parameters
 *         the cost model parameters
 * @return the value of the indicator function
 */
private boolean invariantSiblingIndicator(T sibling, T n, CMMatchings<T> currentMatchings, boolean upper, CMParameters<T> parameters) {

    for (CMMatching<T> match : containing(sibling, currentMatchings, parameters)) {
        boolean invariant = otherSiblings(n, currentMatchings, parameters).contains(match.other(sibling));

        // upper: the first invariant candidate decides (true); lower: the first non-invariant one does (false).
        if (invariant == upper) {
            return upper;
        }
    }

    // No deciding candidate was found: "any" is false, "all" is true.
    return !upper;
}
/**
 * Bounds the ordering violation cost of <code>matching</code>. If one of the other siblings (of either side)
 * has only candidate matchings that violate the ordering, the violation is unavoidable and both bounds are
 * the ordering weight. Otherwise the lower bound is 0 and the upper bound is the ordering weight iff any
 * candidate matching of any sibling violates the ordering.
 *
 * @param matching
 *         the matching to bound the cost for
 * @param currentMatchings
 *         all matchings
 * @param parameters
 *         the cost model parameters
 * @return the bounded ordering violation cost
 */
private Bounds boundOrderingCost(CMMatching<T> matching, CMMatchings<T> currentMatchings, CMParameters<T> parameters) {
    List<T> allSiblings = new ArrayList<>(otherSiblings(matching.m, currentMatchings, parameters));
    allSiblings.addAll(otherSiblings(matching.n, currentMatchings, parameters));

    Predicate<CMMatching<T>> violates = match ->
            !match.isNoMatch() && violatesOrdering(match, matching, currentMatchings, parameters);

    boolean violationUnavoidable = allSiblings.stream().anyMatch(sib ->
            containing(sib, currentMatchings, parameters).stream().allMatch(violates));

    float lower;
    float upper;

    if (violationUnavoidable) {
        lower = parameters.wo.weigh(matching);
        upper = lower;
    } else {
        boolean violationPossible = allSiblings.stream().anyMatch(sib ->
                containing(sib, currentMatchings, parameters).stream().anyMatch(violates));

        lower = 0;
        upper = violationPossible ? parameters.wo.weigh(matching) : 0;
    }

    return new Bounds(lower, upper);
}
/**
 * Returns a <code>List</code> containing the children of the parent of <code>artifact</code>. This includes the
 * <code>artifact</code> itself. For the two root nodes (<code>matchings.left</code> and
 * <code>matchings.right</code>) the <code>List</code> contains only the root itself. The result is cached in
 * <code>parameters.siblingCache</code>; artifacts with the same parent may share the same <code>List</code>
 * instance.
 *
 * @param artifact
 *         the <code>Artifact</code> whose siblings are to be returned
 * @param matchings
 *         the current <code>CMMatchings</code>
 * @param parameters
 *         the <code>CMParameters</code> to use
 * @return the siblings of the given <code>artifact</code>
 */
private List<T> siblings(T artifact, CMMatchings<T> matchings, CMParameters<T> parameters) {
return parameters.siblingCache.computeIfAbsent(artifact, a -> {
List<T> siblings;
// A root has no parent, its sibling group consists of just the root itself.
if (artifact == matchings.left || artifact == matchings.right) {
siblings = new ArrayList<>(Collections.singleton(a));
} else {
T parent = a.getParent();
// Reuse the list cached for another child of the same parent if there is one, otherwise copy the
// children of the parent into a new list.
siblings = parent.getChildren()
.stream()
.filter(s -> s != a && parameters.siblingCache.containsKey(s))
.map(s -> parameters.siblingCache.get(s)).findFirst()
.orElseGet(() -> new ArrayList<>(parent.getChildren()));
}
return siblings;
});
}
/**
 * Returns the siblings of <code>artifact</code> as in {@link #siblings(Artifact, CMMatchings, CMParameters)}
 * but without the <code>artifact</code> itself. The result is cached in
 * <code>parameters.otherSiblingsCache</code>.
 *
 * @param artifact
 *         the <code>Artifact</code> whose siblings are to be returned
 * @param matchings
 *         the current <code>CMMatchings</code>
 * @param parameters
 *         the <code>CMParameters</code> to use
 * @return the siblings of the given <code>artifact</code> excluding the <code>artifact</code>
 */
private List<T> otherSiblings(T artifact, CMMatchings<T> matchings, CMParameters<T> parameters) {
    Function<T, List<T>> withoutSelf = self -> {
        // Copy the (possibly shared) sibling list before removing the artifact from it.
        List<T> others = new ArrayList<>(siblings(self, matchings, parameters));
        others.remove(self);
        return others;
    };

    return parameters.otherSiblingsCache.computeIfAbsent(artifact, withoutSelf);
}
/**
 * Collects all matchings from <code>currentMatchings</code> that contain <code>artifact</code>. The result
 * is cached in <code>parameters.boundContainsCache</code>.
 *
 * @param artifact
 *         the artifact to search for
 * @param currentMatchings
 *         all matchings
 * @param parameters
 *         the cost model parameters
 * @return all matchings containing <code>artifact</code>
 */
private List<CMMatching<T>> containing(T artifact, CMMatchings<T> currentMatchings, CMParameters<T> parameters) {
    return parameters.boundContainsCache.computeIfAbsent(artifact, a -> {
        List<CMMatching<T>> hits = new ArrayList<>();

        for (CMMatching<T> candidate : currentMatchings) {
            if (candidate.contains(a)) {
                hits.add(candidate);
            }
        }

        return hits;
    });
}
/**
 * Matches the trees rooted in <code>left</code> and <code>right</code> without any fixed matchings.
 *
 * @param context
 *         the <code>MergeContext</code> containing the parameters to use
 * @param left
 *         the left root
 * @param right
 *         the right root
 * @return the resulting matchings
 */
@Override
public Matchings<T> match(MergeContext context, T left, T right) {
    CMMatchings<T> nothingFixed = new CMMatchings<>(left, right);
    return match(context, left, right, nothingFixed);
}
/**
 * Matches the trees rooted in <code>left</code> and <code>right</code>. The (optimized) matchings contained
 * in <code>preFixed</code> are considered fixed and are returned as is in addition to any matchings between
 * previously unmatched artifacts.
 *
 * @param context
 *         the <code>MergeContext</code> containing the parameters to use for the Flexible Tree Matching
 *         algorithm
 * @param left
 *         the left root
 * @param right
 *         the right root
 * @param preFixed
 *         the matchings between the left and right tree that are fixed
 * @return the resulting matchings
 */
public Matchings<T> match(MergeContext context, T left, T right, Matchings<T> preFixed) {
    CMMatchings<T> fixed = new CMMatchings<>(left, right);

    // Translate the fixed matchings into the representation used by the cost model.
    preFixed.optimized().forEach(pair -> fixed.add(new CMMatching<>(pair.getLeft(), pair.getRight())));

    return match(context, left, right, fixed);
}
/**
 * Matches the trees rooted in <code>left</code> and <code>right</code> using the Metropolis algorithm and the
 * Flexible Tree Matching cost model.
 * <p>
 * Starting from an initial completion of <code>preFixed</code>, every iteration proposes a new set of matchings
 * and adopts it as the reference set with the computed acceptance probability. Independently of whether a
 * proposal was accepted, the proposal with the lowest matchings cost seen so far is remembered and is what gets
 * converted and returned in the end.
 *
 * @param context
 *         the <code>MergeContext</code> containing the parameters to use for the Flexible Tree Matching
 *         algorithm
 * @param left
 *         the left root
 * @param right
 *         the right root
 * @param preFixed
 *         the matchings between the left and right tree that are fixed
 * @return the resulting matchings
 */
private Matchings<T> match(MergeContext context, T left, T right, CMMatchings<T> preFixed) {
    CMParameters<T> parameters = new CMParameters<>(context);

    // Suppliers keep the (potentially expensive) message construction from running when FINE is disabled.
    LOG.fine(() -> "Matching " + left + " and " + right + " using the " + getClass().getSimpleName());

    CMMatchings<T> m = initialize(preFixed, parameters);
    ObjectiveValue mObjVal = objective(m, parameters);

    CMMatchings<T> lowest = m;
    float lowestCost = mObjVal.matchingsCost;

    for (int i = 0; i < context.getCostModelIterations(); i++) {
        CMMatchings<T> mHat = propose(m, preFixed, parameters);
        AcceptanceProbability mHatAccProb = acceptanceProb(mObjVal.objValue, mHat, parameters);

        if (chance(parameters.rng, mHatAccProb.acceptanceProbability)) {
            log(FINER, mHat, () -> "Accepting the matchings.");

            m = mHat;
            mObjVal = mHatAccProb.mHatObjectiveValue;
        }

        // The best proposal is tracked even if it was not accepted as the new reference set.
        if (mHatAccProb.mHatObjectiveValue.matchingsCost < lowestCost) {
            lowest = mHat;
            lowestCost = mHatAccProb.mHatObjectiveValue.matchingsCost;

            float finalLowestCost = lowestCost;
            log(FINER, mHat, () -> "New lowest cost matchings with cost " + finalLowestCost + " found.");
        }

        // The loop counter is not effectively final, copy it for the lambda.
        int iteration = i;
        LOG.fine(() -> "End of iteration " + iteration);
    }

    LOG.fine(() -> "Matching ended after " + context.getCostModelIterations() + " iterations.");

    return convert(lowest);
}
/**
 * Draws the next double from <code>rng</code> and reports whether it is strictly smaller than <code>p</code>.
 * Assuming the PRNG samples uniformly from [0.0, 1.0), this returns <code>true</code> with a probability of
 * <code>p</code>.
 *
 * @param rng
 *         the PRNG to sample from
 * @param p
 *         a number between 0.0 and 1.0
 * @return true or false depending on the next double returned by the PRNG
 */
boolean chance(RandomGenerator rng, double p) {
    double sample = rng.nextDouble();

    return sample < p;
}
/**
 * Converts the given <code>CMMatching</code>s to an equivalent set of <code>Matching</code>s. Matchings for which
 * <code>isNoMatch()</code> holds are skipped.
 * <p>
 * The score of a resulting <code>Matching</code> is the number of distinct artifacts found by a DFS from the left
 * side of the matching whose partner (according to <code>matchings.asMap()</code>) is among the artifacts found by
 * a DFS from the right side of the matching.
 *
 * @param matchings
 *         the <code>CMMatching</code>s to convert
 * @return the resulting <code>Matchings</code>
 */
private Matchings<T> convert(CMMatchings<T> matchings) {
    Map<T, T> partnerOf = matchings.asMap();
    Matchings<T> converted = new Matchings<>();

    for (CMMatching<T> cmMatching : matchings) {
        if (cmMatching.isNoMatch()) {
            continue;
        }

        Set<T> leftSubtree = new HashSet<>(Artifacts.dfs(cmMatching.m));
        Set<T> rightSubtree = new HashSet<>(Artifacts.dfs(cmMatching.n));
        int score = 0;

        for (T leftArtifact : leftSubtree) {
            if (rightSubtree.contains(partnerOf.get(leftArtifact))) {
                score++;
            }
        }

        Matching<T> matching = new Matching<>(cmMatching.m, cmMatching.n, score);
        matching.setAlgorithm(CostModelMatcher.class.getSimpleName());

        converted.add(matching);
    }

    return converted;
}
/**
 * Proposes a new set of <code>CMMatching</code>s based on the previous matchings <code>m</code>.
 * <p>
 * The matchings from <code>m</code> that are not in <code>preFixed</code> are either shuffled or sorted by their
 * exact cost (depending on <code>parameters.fixRandomPercentage</code>), the first <code>j</code> of them are
 * kept, and the result (together with <code>preFixed</code>) is completed to a full set of matchings again.
 *
 * @param m
 *         the matchings from the previous iteration
 * @param preFixed
 *         the matchings between the left and right tree that are fixed
 * @param parameters
 *         the cost model parameters
 * @return the proposed matchings for the next iteration
 */
private CMMatchings<T> propose(CMMatchings<T> m, CMMatchings<T> preFixed, CMParameters<T> parameters) {
    CMMatchings<T> mVariable = new CMMatchings<>(m, m.left, m.right);
    mVariable.removeAll(preFixed);

    int j;

    if (parameters.fixRandomPercentage) {
        int lower = (int) (parameters.fixLower * mVariable.size());
        int upper = (int) (parameters.fixUpper * mVariable.size());

        Collections.shuffle(mVariable, parameters.rng); // TODO a switch to turn this off
        j = intFromRange(lower, upper, parameters);
    } else if (mVariable.isEmpty()) {
        // Nothing is variable (everything was pre-fixed); nextInt(0) would be rejected by the PRNG.
        j = 0;
    } else {
        //TODO sort by exact cost?
        Collections.sort(mVariable, Comparator.comparing(CMMatching::getExactCost));
        j = parameters.rng.nextInt(mVariable.size());
    }

    CMMatchings<T> fixed = new CMMatchings<>(mVariable.subList(0, j), m.left, m.right);

    log(FINER, m, () -> "Fixing the first " + j + " variable matchings from the last iteration.");
    log(FINEST, m, () -> "They are: " + fixed);

    fixed.addAll(preFixed);

    CMMatchings<T> proposition = complete(fixed, parameters);

    log(FINER, proposition, () -> "Proposing matchings for the next iteration.");
    log(FINEST, proposition, () -> "Proposition is: " + proposition);

    return proposition;
}
/**
 * Picks a random integer from the given range (inclusive) by scaling the next float drawn from
 * <code>parameters.rng</code> to the size of the range and truncating the result.
 *
 * @param lower
 *         the lower bound
 * @param upper
 *         the upper bound
 * @param parameters
 *         the cost model parameters
 * @return a random int from [<code>lower</code>, <code>upper</code>]
 */
private int intFromRange(int lower, int upper, CMParameters<T> parameters) {
    float fraction = parameters.rng.nextFloat();
    int span = (upper - lower) + 1;

    return lower + (int) (fraction * span);
}
/**
 * Builds the set of matchings the algorithm starts from by completing <code>preFixed</code> and logs the result.
 *
 * @param preFixed
 *         the matchings between the left and right tree that are fixed
 * @param parameters
 *         the cost model parameters
 * @return the initial matchings
 */
private CMMatchings<T> initialize(CMMatchings<T> preFixed, CMParameters<T> parameters) {
    CMMatchings<T> startingSet = complete(preFixed, parameters);

    log(FINER, startingSet, () -> "Initial set of matchings assembled.");
    log(FINEST, startingSet, () -> "Initial set is: " + startingSet);

    return startingSet;
}
/**
 * Completes the given <code>fixedMatchings</code> to a set of matchings in which every artifact from the left
 * and right tree is covered by exactly one matching.
 * <p>
 * Starting from the complete bipartite graph between the two trees, all edges conflicting with an already chosen
 * matching are pruned. Then, until the chosen matchings and the remaining edges have the same size, the remaining
 * edges are (re-)bounded and sorted by their cost bounds, one not yet chosen edge is selected using an index
 * sampled from <code>parameters.assignDist</code>, and the graph is pruned for that edge.
 *
 * @param fixedMatchings
 *         the fixed matchings to complete
 * @param parameters
 *         the cost model parameters
 * @return the completed set of matchings
 */
private CMMatchings<T> complete(CMMatchings<T> fixedMatchings, CMParameters<T> parameters) {
    CMMatchings<T> candidates = completeBipartiteGraph(fixedMatchings.left, fixedMatchings.right, parameters);
    CMMatchings<T> chosen = new CMMatchings<>(fixedMatchings, fixedMatchings.left, fixedMatchings.right);

    for (CMMatching<T> preChosen : chosen) {
        prune(preChosen, candidates);
    }

    while (chosen.size() != candidates.size()) {
        boundCost(candidates, parameters);
        Collections.sort(candidates, comparing(CMMatching::getCostBounds, BY_LOWER_UPPER));

        CMMatchings<T> selectable = new CMMatchings<>(candidates, candidates.left, candidates.right);
        selectable.removeAll(chosen);

        // Sample again until the index points into the selectable matchings.
        // NOTE(review): if 'selectable' were ever empty here this would not terminate -- confirm it cannot be.
        int index = parameters.assignDist.sample();

        while (index >= selectable.size()) {
            index = parameters.assignDist.sample();
        }

        CMMatching<T> next = selectable.get(index);

        chosen.add(next);
        prune(next, candidates);
    }

    return chosen;
}
/**
 * Removes from <code>g</code> every matching that is not equal to <code>matching</code> but shares its
 * (non-<code>null</code>) left or right artifact with it. Artifacts are compared by identity.
 *
 * @param matching
 *         the matching to prune for
 * @param g
 *         the matchings to prune from
 */
private void prune(CMMatching<T> matching, CMMatchings<T> g) {
    ListIterator<CMMatching<T>> candidates = g.listIterator();

    while (candidates.hasNext()) {
        CMMatching<T> candidate = candidates.next();

        // The matching itself stays in the graph.
        if (matching.equals(candidate)) {
            continue;
        }

        boolean sharesLeft = matching.m != null && matching.m == candidate.m;
        boolean sharesRight = matching.n != null && matching.n == candidate.n;

        if (sharesLeft || sharesRight) {
            candidates.remove();
        }
    }
}
/**
 * Builds the complete bipartite graph between the artifacts of the trees rooted in <code>left</code> and
 * <code>right</code> (collected via BFS), where each side additionally receives one no-match node represented by
 * <code>null</code>. The edge between the two no-match nodes is left out. The resulting edges are shuffled using
 * <code>parameters.rng</code>.
 *
 * @param left
 *         the left root
 * @param right
 *         the right root
 * @param parameters
 *         the cost model parameters
 * @return the complete bipartite graph with its edges represented by <code>CMMatching</code>s
 */
private CMMatchings<T> completeBipartiteGraph(T left, T right, CMParameters<T> parameters) {
    List<T> leftSide = Artifacts.bfs(left);
    List<T> rightSide = Artifacts.bfs(right);

    // null stands for the "No Match" node on either side
    leftSide.add(null);
    rightSide.add(null);

    CMMatchings<T> edges = new CMMatchings<>(left, right);

    for (T leftNode : leftSide) {
        for (T rightNode : rightSide) {
            // Pairing the two "No Match" nodes with each other carries no information.
            if (leftNode == null && rightNode == null) {
                continue;
            }

            edges.add(new CMMatching<>(leftNode, rightNode));
        }
    }

    Collections.shuffle(edges, parameters.rng);

    return edges;
}
/**
 * Evaluates the objective function <code>exp(-(beta * cost))</code> for the given matchings, where
 * <code>beta</code> is taken from <code>parameters</code> and <code>cost</code> is the cost of the matchings.
 *
 * @param matchings
 *         the matchings to return the objective function value for
 * @param parameters
 *         the cost model parameters
 * @return the value of the objective function and the cost that was calculated as part of it
 */
private ObjectiveValue objective(CMMatchings<T> matchings, CMParameters<T> parameters) {
    float matchingsCost = cost(matchings, parameters);
    double exponent = -(parameters.beta * matchingsCost);
    double objectiveValue = Math.exp(exponent);

    log(FINER, matchings, () -> "Cost of matchings is " + matchingsCost);
    log(FINER, matchings, () -> "Objective function value for matchings is " + objectiveValue);

    return new ObjectiveValue(objectiveValue, matchingsCost);
}
/**
 * Computes the acceptance probability for the proposed set of matchings <code>mHat</code> as the ratio of its
 * objective value to <code>mObjectiveValue</code>, capped at 1 via <code>Math.min</code>.
 *
 * @param mObjectiveValue
 *         the objective value for the current reference set of matchings
 * @param mHat
 *         the newly proposed set of matchings
 * @param parameters
 *         the cost model parameters
 * @return the acceptance probability including the <code>ObjectiveValue</code> calculated for <code>mHat</code>
 */
private AcceptanceProbability acceptanceProb(double mObjectiveValue, CMMatchings<T> mHat, CMParameters<T> parameters) {
    ObjectiveValue proposedValue = objective(mHat, parameters);
    double ratio = proposedValue.objValue / mObjectiveValue;
    double probability = Math.min(1, ratio);

    log(FINER, mHat, () -> "Acceptance probability for matchings is " + probability);

    return new AcceptanceProbability(probability, proposedValue);
}
/**
 * Produces a short <code>String</code> id for the given <code>Object</code>: its identity hash code in
 * hexadecimal notation.
 *
 * @param o
 *         the <code>Object</code> to return the <code>String</code> id for
 * @return the <code>String</code> id
 */
private String id(Object o) {
    int identityHash = System.identityHashCode(o);

    return Integer.toHexString(identityHash);
}
/**
 * Logs the given <code>msg</code> using the {@link #LOG}, prefixed with the {@link #id(Object)} of the given
 * matchings (left-aligned and padded to a width of 10 characters). The message is passed to the logger as a
 * <code>Supplier</code>.
 *
 * @param level
 *         the level to log at
 * @param matchings
 *         the matchings the message concerns
 * @param msg
 *         the message to log
 */
private void log(Level level, CMMatchings<T> matchings, Supplier<String> msg) {
    Supplier<String> prefixed = () -> String.format("%-10s%s", id(matchings), msg.get());

    LOG.log(level, prefixed);
}
}