package com.github.liblevenshtein.collection.dictionary;
import java.util.ArrayDeque;
import java.util.Collection;
import java.util.Deque;
import java.util.HashMap;
import java.util.Map;
import lombok.NonNull;
/**
* <p>
* Node reference-based DAWG implementation that requires the input collection
* to be sorted before it can be built. The sortation is required for space and
* time efficiency.
* </p>
* <p>
* The algorithm for constructing the DAWG (Direct Acyclic Word Graph) from the
* input dictionary of words (DAWGs are otherwise known as an MA-FSA, or Minimal
* Acyclic Finite-State Automata), was taken and modified from the following
* blog from Steve Hanov:
* </p>
* <ul>
* <li>http://stevehanov.ca/blog/index.php?id=115</li>
* </ul>
* <p>
* The algorithm therein was taken from the following paper:
* </p>
* <pre>
* <code>
* {@literal @}MISC {Daciuk00incrementalconstruction,
* author = {Jan Daciuk and
* Bruce W. Watson and
* Richard E. Watson and
* Stoyan Mihov},
* title = {Incremental Construction of Minimal Acyclic Finite-State Automata},
* year = {2000}
* }
* </code>
* </pre>
* @author Dylon Edwards
* @since 2.1.0
*/
public class SortedDawg extends Dawg {
private static final long serialVersionUID = 1L;
/** Transitions that have not been checked for redundancy. */
private Deque<Transition> uncheckedTransitions = new ArrayDeque<>();
/** Nodes that have been checked for redundancy. */
private Map<DawgNode, DawgNode> minimizedNodes = new HashMap<>();
/** References the term that was last added. */
private String previousTerm = "";
/**
* Constructs a new SortedDawg instance.
*/
public SortedDawg() {
super();
}
/**
* Constructs a new SortedDawg instance.
* @param terms Collection of terms to add to this dictionary. This is assumed
* to be sorted ascendingly, in lexicographical order (case-sensitive),
* because the behavior of the current DAWG implementation is unstable if it
* is not.
*/
public SortedDawg(@NonNull final Collection<String> terms) {
this();
if (!addAll(terms)) {
throw new IllegalStateException("Failed to add all terms");
}
finish();
}
/**
* Constructs a new SortedDawg instance.
* @param size Number of terms in this dictionary.
* @param root Root node of this dictionary.
*/
public SortedDawg(
final int size,
@NonNull final DawgNode root) {
super(root, size);
}
/**
* {@inheritDoc}
*/
@Override
public synchronized boolean add(@NonNull final String term) {
if (term.compareTo(previousTerm) < 0) {
throw new IllegalArgumentException(
"Due to caveats with the current DAWG implementation, terms must be "
+ "inserted in ascending order");
}
// Special Case: Empty String
if (term.isEmpty()) {
root = new FinalDawgNode();
return true;
}
final int upperBound = term.length() < previousTerm.length()
? term.length()
: previousTerm.length();
// Find the length of the longest, common prefix between term and
// previousTerm
int i = 0;
while (i < upperBound && term.charAt(i) == previousTerm.charAt(i)) {
i += 1;
}
// Check the unchecked nodes for redundancy, proceeding from the last one
// down to the common prefix size. Then, truncate the list at that point.
minimize(i);
// Add the suffix, starting from the correct node, mid-way through the graph
DawgNode node = (null == uncheckedTransitions.peekFirst())
? root
: uncheckedTransitions.peekFirst().target();
for (int k = term.length() - 1; i < k; i += 1) {
final char label = term.charAt(i);
final DawgNode nextNode = new DawgNode();
uncheckedTransitions.addFirst(new Transition(node, label, nextNode));
node = nextNode;
}
if (i < term.length()) {
final char label = term.charAt(i);
final DawgNode nextNode = new FinalDawgNode();
uncheckedTransitions.addFirst(new Transition(node, label, nextNode));
}
previousTerm = term;
size += 1;
return true;
}
/**
* Finishes processing the pending transitions.
*/
public void finish() {
minimize(0);
}
/**
* Builds this DAWG in such a way that it remains a minimal trie.
* @param lowerBound Number of pending transitions to leave for the next
* round (they will be the most-recent transitions).
*/
private void minimize(final int lowerBound) {
// Proceed from the leaf up to a certain point
for (int j = uncheckedTransitions.size(); j > lowerBound; j -= 1) {
final Transition transition = uncheckedTransitions.removeFirst();
final DawgNode source = transition.source();
final char label = transition.label();
final DawgNode target = transition.target();
final DawgNode existing = minimizedNodes.get(target);
if (null != existing) {
source.addEdge(label, existing);
}
else {
source.addEdge(label, target);
minimizedNodes.put(target, target);
}
}
}
/**
* {@inheritDoc}
*/
@Override
public boolean remove(final Object object) {
throw new UnsupportedOperationException(
"SortedDawg does not support removing terms");
}
}