/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.util;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.ListIterator;
/**
* TrieStringMatcher is a base class for simple tree-based string
* matching.
*
*/
public abstract class TrieStringMatcher {
protected TrieNode root;
protected TrieStringMatcher() {
this.root= new TrieNode('\000', false);
}
/**
* Node class for the character tree.
*/
protected class TrieNode implements Comparable<TrieNode> {
protected TrieNode[] children;
protected LinkedList<TrieNode> childrenList;
protected char nodeChar;
protected boolean terminal;
/**
* Creates a new TrieNode, which contains the given
* <code>nodeChar</code>. If <code>isTerminal</code> is
* <code>true</code>, the new node is a <em>terminal</em> node in
* the trie.
*/
TrieNode(char nodeChar, boolean isTerminal) {
this.nodeChar= nodeChar;
this.terminal= isTerminal;
this.childrenList= new LinkedList<TrieNode>();
}
/**
* Returns <code>true</code> if this node is a <em>terminal</em>
* node in the trie.
*/
boolean isTerminal() {
return terminal;
}
/**
* Returns the child node of this node whose node-character is
* <code>nextChar</code>. If no such node exists, one will be is
* added. If <em>isTerminal</em> is <code>true</code>, the node
* will be a terminal node in the trie.
*/
TrieNode getChildAddIfNotPresent(char nextChar, boolean isTerminal) {
if (childrenList == null) {
childrenList= new LinkedList<TrieNode>();
childrenList.addAll(Arrays.asList(children));
children= null;
}
if (childrenList.size() == 0) {
TrieNode newNode= new TrieNode(nextChar, isTerminal);
childrenList.add(newNode);
return newNode;
}
ListIterator<TrieNode> iter= childrenList.listIterator();
TrieNode node= iter.next();
while ( (node.nodeChar < nextChar) && iter.hasNext() )
node= iter.next();
if (node.nodeChar == nextChar) {
node.terminal= node.terminal | isTerminal;
return node;
}
if (node.nodeChar > nextChar)
iter.previous();
TrieNode newNode= new TrieNode(nextChar, isTerminal);
iter.add(newNode);
return newNode;
}
/**
* Returns the child node of this node whose node-character is
* <code>nextChar</code>. If no such node exists,
* <code>null</code> is returned.
*/
TrieNode getChild(char nextChar) {
if (children == null) {
children= childrenList.toArray(new TrieNode[childrenList.size()]);
childrenList= null;
Arrays.sort(children);
}
int min= 0;
int max= children.length - 1;
int mid= 0;
while (min < max) {
mid= (min + max) / 2;
if (children[mid].nodeChar == nextChar)
return children[mid];
if (children[mid].nodeChar < nextChar)
min= mid + 1;
else // if (children[mid].nodeChar > nextChar)
max= mid - 1;
}
if (min == max)
if (children[min].nodeChar == nextChar)
return children[min];
return null;
}
public int compareTo(TrieNode other) {
if (this.nodeChar < other.nodeChar)
return -1;
if (this.nodeChar == other.nodeChar)
return 0;
// if (this.nodeChar > other.nodeChar)
return 1;
}
}
/**
* Returns the next {@link TrieNode} visited, given that you are at
* <code>node</code>, and the the next character in the input is
* the <code>idx</code>'th character of <code>s</code>.
*/
protected final TrieNode matchChar(TrieNode node, String s, int idx) {
return node.getChild(s.charAt(idx));
}
/**
* Adds any necessary nodes to the trie so that the given
* <code>String</code> can be decoded and the last character is
* represented by a terminal node. Zero-length <code>Strings</code>
* are ignored.
*/
protected final void addPatternForward(String s) {
TrieNode node= root;
int stop= s.length() - 1;
int i;
if (s.length() > 0) {
for (i= 0; i < stop; i++)
node= node.getChildAddIfNotPresent(s.charAt(i), false);
node= node.getChildAddIfNotPresent(s.charAt(i), true);
}
}
/**
* Adds any necessary nodes to the trie so that the given
* <code>String</code> can be decoded <em>in reverse</em> and the
* first character is represented by a terminal node. Zero-length
* <code>Strings</code> are ignored.
*/
protected final void addPatternBackward(String s) {
TrieNode node= root;
if (s.length() > 0) {
for (int i= s.length()-1; i > 0; i--)
node= node.getChildAddIfNotPresent(s.charAt(i), false);
node= node.getChildAddIfNotPresent(s.charAt(0), true);
}
}
/**
* Returns true if the given <code>String</code> is matched by a
* pattern in the trie
*/
public abstract boolean matches(String input);
/**
* Returns the shortest substring of <code>input<code> that is
* matched by a pattern in the trie, or <code>null<code> if no match
* exists.
*/
public abstract String shortestMatch(String input);
/**
* Returns the longest substring of <code>input<code> that is
* matched by a pattern in the trie, or <code>null<code> if no match
* exists.
*/
public abstract String longestMatch(String input);
}