/**
* diqube: Distributed Query Base.
*
* Copyright (C) 2015 Bastian Gloeckle
*
* This file is part of diqube.
*
* diqube is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package org.diqube.data.types.str.dict;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Deque;
import java.util.LinkedList;
/**
* Helper class to analyze the values of two tries and compare them to each other.
*
* @author Bastian Gloeckle
*/
public class TrieValueAnalyzer {
/**
 * Walks two tries side by side and reports, for the terminal IDs of the first trie, how their values relate to the
 * values stored in the second trie (see {@link TrieValueAnalyzerCallback}).
 *
 * <p>
 * This is the public entry point; it starts the internal traversal at the two given nodes without any required
 * prefix on either side.
 *
 * @param ourNode
 *          The root node of the first trie.
 * @param otherNode
 *          The root node of the second trie.
 * @param callback
 *          Receives the findings. A callback method will be called at least once for every terminal ID of ourNode.
 *          The three methods differ in the strength of the guarantee they carry: "equal" is the strongest, "greater"
 *          the second strongest and "smaller" the weakest. If "equal" is called for a specific ourId, that value is
 *          equal to a value of the other trie. If not, and "greater" is called, then "greater" will be called with
 *          the smallest otherId whose value is greater than the value of ourId. Only if neither "equal" nor "greater"
 *          are called for an ourId, "smaller" will be called with the greatest otherId whose value is smaller than
 *          the value of ourId.
 */
public void analyzeTries(ParentNode ourNode, ParentNode otherNode, TrieValueAnalyzerCallback callback) {
  // On the outermost level neither trie is restricted to a prefix.
  final char[] noRequiredPrefix = null;
  analyzeTriesInternal(ourNode, noRequiredPrefix, otherNode, noRequiredPrefix, callback);
}
/**
* Traverses two Tries and identifies the terminal IDs.
*
* <p>
* The general goal of one execution of this method is to identify any directly matching strings (which are identified
* by early {@link TerminalNode}s in the tries), or if two {@link ParentNode}s are identified as children, this method
* calls itself recursively.
*
* <p>
* The children of ourNode and otherNode are walked with one index each. In every iteration the (possibly prefixed)
* character arrays of the two current children are compared; depending on the comparison result and on whether the
* children are {@link TerminalNode}s or {@link ParentNode}s, callback methods are invoked and one or both of the
* indices are advanced. After that loop, remaining children of ourNode are reported through
* {@link TrieValueAnalyzerCallback#foundSmallerId(long, long)} using the max ID of otherNode.
*
* @param ourNode
* The node of the first trie whose children are inspected (the root node on the outermost call).
* @param ourRequiredPrefix
* If unset, this is <code>null</code>. Otherwise this contains a character array which all children of
* ourNode need to adhere to - those that don't won't be inspected. Technically, this array is prepended to
* the characters of each child of otherNode before the comparison is done.
* @param otherNode
* The node of the second trie whose children are inspected (the root node on the outermost call).
* @param otherRequiredPrefix
* If unset, this is <code>null</code>. Otherwise this contains a character array which all children of
* otherNode need to adhere to - those that don't won't be inspected. Technically, this array is prepended
* to the characters of each child of ourNode before the comparison is done.
* @param callback
* The callback that will be called when specific situations are encountered. The methods of the callback
* will be called for all terminal IDs of ourNode at least once. Please note that the equalIds method is the
* "strongest" guarantee this class can make. The "greater" method is the second strongest guarantee and the
* "smaller" method is the weakest guarantee. This means that if the "equal" method is called for a
* specific ourId, then that node is equal to another. If not and if the "greater" method is called, then the
* greater method will be called with the smallest otherId where the otherNodes value is greater than the
* value of ourNode. And only if neither "equal" nor "greater" are called for a node, only then the "smaller"
* method will be called with the greatest otherId whose value is smaller than the value of ourId.
*/
/*
* Note to callback guarantees: This implementation walks along the trie of "ourNode" and synchronously along the trie
* of otherNode. If it finds a "smaller" inequality to be true, it will call the "smaller" method, meaning that at the
* current location the value of otherNode is smaller than ourNode. This though does not yet guarantee that otherNode
* is the node with the greatest otherId where that inequality is true - we might find better matching nodes later. If
* we though find the "greater" inequality to be true, it is for a specific node in our trie where we identified at
* least once the smallest node of otherTrie where that inequality holds - this is true because we walk mainly along
* ourTrie and just walk along otherTrie synchronously. If we find a node where equality holds, that overrules
* everything, of course.
*/
private void analyzeTriesInternal(ParentNode ourNode, char[] ourRequiredPrefix, ParentNode otherNode,
char[] otherRequiredPrefix, TrieValueAnalyzerCallback callback) {
// Index of the child of ourNode/otherNode that is currently being looked at.
int ourIdx = 0;
int otherIdx = 0;
// Flags that are re-computed in each iteration: true if the current child on that side was handled completely and
// the corresponding index can be advanced.
boolean doneOnWholeSubTrieOur = false;
boolean doneOnWholeSubTrieOther = false;
while (ourIdx < ourNode.getChildNodes().length && otherIdx < otherNode.getChildNodes().length) {
// match the children of ourNode and otherNode - try to find matching pairs at ourIdx/otherIdx.
TrieNode<?> ourChild = ourNode.getChildNodes()[ourIdx];
// the other side needs to have a specific prefix. So lets prefix ourChildChars accordingly.
char[] ourChildChars = prefixIfNonNull(otherRequiredPrefix, ourNode.getChildChars()[ourIdx]);
TrieNode<?> otherChild = otherNode.getChildNodes()[otherIdx];
// our side needs to have a specific prefix. So lets prefix otherChildChars accordingly.
char[] otherChildChars = prefixIfNonNull(ourRequiredPrefix, otherNode.getChildChars()[otherIdx]);
doneOnWholeSubTrieOur = false;
doneOnWholeSubTrieOther = false;
// NOTE(review): TrieUtil.compareChars is not visible here. The branches below treat a result of 0 as "all of
// ourChildChars was matched" (otherChildChars may still be longer), a negative result as "ours is smaller" and a
// positive result that is greater than otherChildChars.length as "all of otherChildChars was matched" - confirm
// against TrieUtil.
int compareRes = TrieUtil.compareChars(ourChildChars, otherChildChars, 0);
if (compareRes == 0) {
// whole string in ourChild was matched.
if (ourChild instanceof TerminalNode) {
// Our child is a Terminal node, means our tree ends here with a final string that is contained in our dict.
if (ourChildChars.length == otherChildChars.length) {
// we did not only match the whole string of our node, but also of the other node. If the otherNode string
// is longer, there is no equal string, as the string in the other dict is longer than the one in our dict.
if (otherChild instanceof TerminalNode) {
// The otherDict also contains a TerminalNode -> we have a match!
callback.foundEqualIds(((TerminalNode) ourChild).getTerminalId(),
((TerminalNode) otherChild).getTerminalId());
doneOnWholeSubTrieOther = true; // done, because we worked on a TerminalNode that matched perfectly.
} else {
// The otherDict contains the same string, but does not have a TerminalNode, but a ParentNode. It could be
// that that ParentNode in turn contains a TerminalNode for an empty string as sub-node, which would lead
// to a match. Check that.
// As the full string of ourNode matched, it cannot be a match to a string in otherDict if we'd have to
// go deeper in otherDict -> there is at max one level!
// Only the first child (index 0) of the other ParentNode is checked for being such an empty-string terminal.
ParentNode otherChildParent = (ParentNode) otherChild;
if (otherChildParent.getChildChars().length > 0 && otherChildParent.getChildChars()[0].length == 0
&& otherChildParent.getChildNodes()[0] instanceof TerminalNode) {
TerminalNode otherChildChildTerm = (TerminalNode) otherChildParent.getChildNodes()[0];
callback.foundEqualIds(((TerminalNode) ourChild).getTerminalId(), otherChildChildTerm.getTerminalId());
} else
// we fully matched ourChildChars, but otherNode contains a ParentNode that does not have a direct
// terminalNode -> all terminals referred to by otherNode are bigger than our node.
callback.foundGreaterId(((TerminalNode) ourChild).getTerminalId(), otherChildParent.getMinId());
}
} else {
// ourChild is a terminal node whose value is a pure prefix to otherChild -> otherChild is larger.
callback.foundGreaterId(((TerminalNode) ourChild).getTerminalId(), getMinId(otherChild));
}
doneOnWholeSubTrieOur = true; // done, because our trie was a terminalNode.
} else {
// matched whole string in ourChild, but we do not have a terminalNode as ourChild.
// check if otherChild is a terminalNode and see if we can match that terminal string to one in ourNode.
if (otherChild instanceof TerminalNode) {
// find Characters in otherDict that still need to be matched: search the part of otherChildChars that starts
// at index ourChildChars.length inside the sub-trie of ourChild.
long ourId = TrieUtil.findIdOfValue(otherChildChars, ourChildChars.length, ourChild);
// A negative result is decoded as -(ourId + 1) - presumably an insertion point similar to a binary search
// result; verify against TrieUtil.findIdOfValue.
long turnaroundPoint = (ourId < 0) ? -(ourId + 1) : ourId;
// Report every terminal of ourChild: the one with the found ID as equal, those with an ID below the
// turnaround point via foundGreaterId (other value is greater) and all remaining ones via foundSmallerId.
findAllTerminalNodes(ourChild).forEach(term -> {
if (term.getTerminalId() == ourId)
callback.foundEqualIds(ourId, ((TerminalNode) otherChild).getTerminalId());
else if (term.getTerminalId() < turnaroundPoint)
callback.foundGreaterId(term.getTerminalId(), ((TerminalNode) otherChild).getTerminalId());
else
callback.foundSmallerId(term.getTerminalId(), ((TerminalNode) otherChild).getTerminalId());
});
doneOnWholeSubTrieOther = true; // matched TerminalNode otherNode fully.
} else {
// both, ourChild and otherChild are a ParentNode with the same prefix -> go recursive.
if (ourChildChars.length == otherChildChars.length) {
// string fully matched
analyzeTriesInternal((ParentNode) ourChild, null, (ParentNode) otherChild, null, callback);
// we worked on both sub-tries and they cannot match any other in the tries -> mark both as done.
doneOnWholeSubTrieOther = true;
doneOnWholeSubTrieOur = true;
} else {
// we matched all of ourChildChars, but otherChildChars contains more elements -> we need to set a
// ourPrefix for the following recursive call. The prefix consists of the unmatched tail of otherChildChars.
char[] newOurPrefix = new char[otherChildChars.length - ourChildChars.length];
// TODO #6 remove double-arraycopy, as the recursive call will copy right away again.
System.arraycopy(otherChildChars, ourChildChars.length, newOurPrefix, 0, newOurPrefix.length);
analyzeTriesInternal((ParentNode) ourChild, newOurPrefix, (ParentNode) otherChild, null, callback);
// mark the trie with the more specific string as done (in this case otherChild), as the more general one
// might actually match more sub-tries of the other node.
doneOnWholeSubTrieOther = true;
}
}
}
} else if (compareRes < 0) {
// not all of ourChildChars were matched, our value is smaller than the other -> there is a differing character
// at a specific index.
// Mark all terminalNodes as being smaller than otherChild, i.e. report the min ID of otherChild as being
// greater than each of our terminals.
long otherMinId = getMinId(otherChild);
findAllTerminalNodes(ourChild).forEach(term -> callback.foundGreaterId(term.getTerminalId(), otherMinId));
// we worked on all strings contained in the trie of ourChild.
doneOnWholeSubTrieOur = true;
} else {
// compareRes > 0 -> ourChildChars is greater than otherChildChars. We though might have matched all of
// otherChildChars, in which case we'd need to recurse deeper in otherChild.
if (otherChildChars.length == 0) {
// special case, where ourChild has characters, but others does not - we cannot match that node! so lets skip
// it as soon as possible.
long otherMaxId = getMaxId(otherChild);
findAllTerminalNodes(ourChild).forEach(term -> callback.foundSmallerId(term.getTerminalId(), otherMaxId));
doneOnWholeSubTrieOther = true;
} else if (compareRes > otherChildChars.length) {
// we matched all of otherChildChars.
if (ourChild instanceof TerminalNode) {
// our node is a terminalNode, lets search for the remaining string in otherChild.
long otherId = TrieUtil.findIdOfValue(ourChildChars, otherChildChars.length, otherChild);
if (otherId >= 0)
callback.foundEqualIds(((TerminalNode) ourChild).getTerminalId(), otherId);
else {
// No equal value found. The negative result is decoded as -(otherId + 1); presumably this is the ID at which
// our value would have to be inserted into the other trie - verify against TrieUtil.findIdOfValue.
long insertionPoint = -(otherId + 1);
// Report the ID directly before the insertion point as smaller (if there is one) and the ID at the
// insertion point as greater (if it does not exceed the max ID of otherChild).
if (insertionPoint > 0)
callback.foundSmallerId(((TerminalNode) ourChild).getTerminalId(), insertionPoint - 1);
if (insertionPoint <= getMaxId(otherChild))
callback.foundGreaterId(((TerminalNode) ourChild).getTerminalId(), insertionPoint);
}
doneOnWholeSubTrieOur = true; // our trie was a TerminalNode only, so we worked on the whole trie.
} else if (otherChild instanceof TerminalNode) {
// other string is shorter than ours, but other node is terminal -> there are no more nodes in
// otherChild that we could use to match our string.
findAllTerminalNodes(ourChild).forEach(
term -> callback.foundSmallerId(term.getTerminalId(), ((TerminalNode) otherChild).getTerminalId()));
// go forward on that node, that was more specific - the less specific one could match additional nodes!
// NOTE(review): this branch is only entered when all of otherChildChars was matched and compareRes != 0, so
// ourChildChars is presumably always the longer array here and the else branch may be unreachable - confirm.
if (ourChildChars.length > otherChildChars.length)
doneOnWholeSubTrieOur = true;
else
doneOnWholeSubTrieOther = true;
} else {
// both child nodes are ParentNodes, where our ParentNode has a string that starts with the whole string of
// otherNode. The unmatched tail of ourChildChars becomes the prefix required on the other side.
char[] newOtherPrefix = new char[ourChildChars.length - otherChildChars.length];
System.arraycopy(ourChildChars, otherChildChars.length, newOtherPrefix, 0, newOtherPrefix.length);
analyzeTriesInternal((ParentNode) ourChild, null, (ParentNode) otherChild, newOtherPrefix, callback);
// mark the trie with the more specific string as done (in this case ourChild), as the more general one
// might actually match more sub-tries of the other node.
doneOnWholeSubTrieOur = true;
}
} else {
// ourChildChars is greater than otherChildChars and we did not match all of the otherChildChars. This means
// there is a conflicting character somewhere, where ourChild is > otherChild.
long otherMaxId = getMaxId(otherChild);
// Note: We found a node that is smaller than our node, but especially here, there is no guarantee that this
// is the maximum otherId for which this inequality holds - we might find better matches in the next
// iteration.
findAllTerminalNodes(ourChild).forEach(term -> callback.foundSmallerId(term.getTerminalId(), otherMaxId));
// we worked on the full other trie, ourChild though might still match a following sub-trie of otherNode.
doneOnWholeSubTrieOther = true;
}
}
// Advance to the next child on each side that was handled completely in this iteration.
if (doneOnWholeSubTrieOther)
otherIdx++;
if (doneOnWholeSubTrieOur)
ourIdx++;
}
// If we stopped iterating the while loop because otherNode did not have any more children, we know that all
// children of ourNode that we did not look at are bigger than the biggest sub-node of otherNode. As we guarantee
// that a callback method will be called for each terminalId in ourNode, we need to call the corresponding method.
long maxOtherId = getMaxId(otherNode);
// NOTE(review): if the while loop above never executed (one of the nodes has no children), the flag is still false
// and the first child of ourNode is skipped here without any callback having been called for it - confirm that
// otherNode always has at least one child.
if (!doneOnWholeSubTrieOur)
// we did not proceed ourIdx in the last execution of the while loop - that means we worked on the corresponding
// child already and do not need to inspect it further.
ourIdx++;
while (ourIdx < ourNode.getChildNodes().length) {
findAllTerminalNodes(ourNode.getChildNodes()[ourIdx])
.forEach(term -> callback.foundSmallerId(term.getTerminalId(), maxOtherId));
ourIdx++;
}
}
/**
 * Prepends an optional prefix to a char array.
 *
 * @param prefixOrNull
 *          The characters to put in front of actualValue, or <code>null</code> if there is no prefix.
 * @param actualValue
 *          The characters that should follow the prefix.
 * @return actualValue itself (no copy) if prefixOrNull is <code>null</code>, otherwise a newly allocated array
 *         containing prefixOrNull followed by actualValue.
 */
private char[] prefixIfNonNull(char[] prefixOrNull, char[] actualValue) {
  if (prefixOrNull == null) {
    return actualValue;
  }
  // Copy the prefix into an array that is already big enough for both parts, then append the actual value.
  char[] combined = Arrays.copyOf(prefixOrNull, prefixOrNull.length + actualValue.length);
  System.arraycopy(actualValue, 0, combined, prefixOrNull.length, actualValue.length);
  return combined;
}
/**
 * Collects all {@link TerminalNode}s that are reachable from the given node.
 *
 * <p>
 * The sub-trie is walked breadth-first using a FIFO work queue, so the returned terminals are ordered level by
 * level. If the given node is a {@link TerminalNode} itself, the result contains exactly that node.
 *
 * @param node
 *          The node to start at. Every node that is not a {@link TerminalNode} is treated as a {@link ParentNode}.
 * @return A newly created collection containing the terminal nodes in the order they were encountered.
 */
private Collection<TerminalNode> findAllTerminalNodes(TrieNode<?> node) {
  Collection<TerminalNode> terminals = new ArrayList<>();
  // ArrayDeque is the array-backed FIFO queue of the JDK; it avoids the per-element node allocation of LinkedList.
  Deque<TrieNode<?>> pending = new ArrayDeque<>();
  pending.add(node);
  while (!pending.isEmpty()) {
    TrieNode<?> current = pending.poll();
    if (current instanceof TerminalNode) {
      terminals.add((TerminalNode) current);
    } else {
      // Enqueue the children in array order, which keeps the breadth-first visiting order.
      for (TrieNode<?> child : ((ParentNode) current).getChildNodes()) {
        pending.add(child);
      }
    }
  }
  return terminals;
}
/**
 * Returns the smallest ID of the given node: the terminal ID for a {@link TerminalNode}, otherwise the min ID
 * reported by the {@link ParentNode}.
 */
private long getMinId(TrieNode<?> node) {
  boolean isTerminal = node instanceof TerminalNode;
  return isTerminal ? ((TerminalNode) node).getTerminalId() : ((ParentNode) node).getMinId();
}
/**
 * Returns the greatest ID of the given node: the terminal ID for a {@link TerminalNode}, otherwise the max ID
 * reported by the {@link ParentNode}.
 */
private long getMaxId(TrieNode<?> node) {
  boolean isTerminal = node instanceof TerminalNode;
  return isTerminal ? ((TerminalNode) node).getTerminalId() : ((ParentNode) node).getMaxId();
}
/**
 * Receiver of the findings made by
 * {@link TrieValueAnalyzer#analyzeTries(ParentNode, ParentNode, TrieValueAnalyzerCallback)}.
 *
 * <p>
 * In all methods, "ourId" is an ID of a value reached from the ourNode parameter of analyzeTries and "otherId" is an
 * ID of a value reached from the otherNode parameter.
 */
public interface TrieValueAnalyzerCallback {
  /**
   * Called when the same value is contained in both tries.
   *
   * @param ourId
   *          ID of the value in the trie of ourNode.
   * @param otherId
   *          ID of the equal value in the trie of otherNode.
   */
  void foundEqualIds(long ourId, long otherId);

  /**
   * Called when the value of otherId is greater than the value of ourId.
   *
   * <p>
   * This method may be called several times with the same ourId but different otherIds. For each ourId that is not
   * reported through {@link #foundEqualIds(long, long)}, {@link TrieValueAnalyzer} guarantees that either (1) this
   * method is called at least once with the smallest otherId of all otherIds whose values are greater than the value
   * of ourId, or (2) {@link #foundSmallerId(long, long)} is called at least once with the greatest otherId of all
   * otherIds whose values are smaller than the value of ourId.
   *
   * @param ourId
   *          ID of the value in the trie of ourNode.
   * @param otherId
   *          ID of a greater value in the trie of otherNode.
   */
  void foundGreaterId(long ourId, long otherId);

  /**
   * Called when the value of otherId is smaller than the value of ourId.
   *
   * <p>
   * This method may be called several times with the same ourId but different otherIds. For each ourId that is not
   * reported through {@link #foundEqualIds(long, long)}, {@link TrieValueAnalyzer} guarantees that either (1) this
   * method is called at least once with the greatest otherId of all otherIds whose values are smaller than the value
   * of ourId, or (2) {@link #foundGreaterId(long, long)} is called at least once with the smallest otherId of all
   * otherIds whose values are greater than the value of ourId.
   *
   * @param ourId
   *          ID of the value in the trie of ourNode.
   * @param otherId
   *          ID of a smaller value in the trie of otherNode.
   */
  void foundSmallerId(long ourId, long otherId);
}
}