/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.commoncrawl.util; import java.util.Arrays; import java.util.LinkedList; import java.util.ListIterator; /** * TrieStringMatcher is a base class for simple tree-based string matching. * */ public abstract class TrieStringMatcher { protected TrieNode root; protected TrieStringMatcher() { this.root = new TrieNode('\000', false); } /** * Node class for the character tree. */ protected class TrieNode implements Comparable<TrieNode> { protected TrieNode[] children; protected LinkedList<TrieNode> childrenList; protected char nodeChar; protected boolean terminal; /** * Creates a new TrieNode, which contains the given <code>nodeChar</code>. * If <code>isTerminal</code> is <code>true</code>, the new node is a * <em>terminal</em> node in the trie. */ TrieNode(char nodeChar, boolean isTerminal) { this.nodeChar = nodeChar; this.terminal = isTerminal; this.childrenList = new LinkedList<TrieNode>(); } /** * Returns <code>true</code> if this node is a <em>terminal</em> node in the * trie. */ boolean isTerminal() { return terminal; } /** * Returns the child node of this node whose node-character is * <code>nextChar</code>. If no such node exists, one will be is added. If * <em>isTerminal</em> is <code>true</code>, the node will be a terminal * node in the trie. */ TrieNode getChildAddIfNotPresent(char nextChar, boolean isTerminal) { if (childrenList == null) { childrenList = new LinkedList<TrieNode>(); childrenList.addAll(Arrays.asList(children)); children = null; } if (childrenList.size() == 0) { TrieNode newNode = new TrieNode(nextChar, isTerminal); childrenList.add(newNode); return newNode; } ListIterator<TrieNode> iter = childrenList.listIterator(); TrieNode node = iter.next(); while ((node.nodeChar < nextChar) && iter.hasNext()) node = iter.next(); if (node.nodeChar == nextChar) { node.terminal = node.terminal | isTerminal; return node; } if (node.nodeChar > nextChar) iter.previous(); TrieNode newNode = new TrieNode(nextChar, isTerminal); iter.add(newNode); return newNode; } /** * Returns the child node of this node whose node-character is * <code>nextChar</code>. If no such node exists, <code>null</code> is * returned. */ TrieNode getChild(char nextChar) { if (children == null) { children = childrenList.toArray(new TrieNode[childrenList.size()]); childrenList = null; Arrays.sort(children); } int min = 0; int max = children.length - 1; int mid = 0; while (min < max) { mid = (min + max) / 2; if (children[mid].nodeChar == nextChar) return children[mid]; if (children[mid].nodeChar < nextChar) min = mid + 1; else // if (children[mid].nodeChar > nextChar) max = mid - 1; } if (min == max) if (children[min].nodeChar == nextChar) return children[min]; return null; } public int compareTo(TrieNode other) { if (this.nodeChar < other.nodeChar) return -1; if (this.nodeChar == other.nodeChar) return 0; // if (this.nodeChar > other.nodeChar) return 1; } } /** * Returns the next {@link TrieNode} visited, given that you are at * <code>node</code>, and the the next character in the input is the * <code>idx</code>'th character of <code>s</code>. */ protected final TrieNode matchChar(TrieNode node, String s, int idx) { return node.getChild(s.charAt(idx)); } /** * Adds any necessary nodes to the trie so that the given <code>String</code> * can be decoded and the last character is represented by a terminal node. * Zero-length <code>Strings</code> are ignored. */ protected final void addPatternForward(String s) { TrieNode node = root; int stop = s.length() - 1; int i; if (s.length() > 0) { for (i = 0; i < stop; i++) node = node.getChildAddIfNotPresent(s.charAt(i), false); node = node.getChildAddIfNotPresent(s.charAt(i), true); } } /** * Adds any necessary nodes to the trie so that the given <code>String</code> * can be decoded <em>in reverse</em> and the first character is represented * by a terminal node. Zero-length <code>Strings</code> are ignored. */ protected final void addPatternBackward(String s) { TrieNode node = root; if (s.length() > 0) { for (int i = s.length() - 1; i > 0; i--) node = node.getChildAddIfNotPresent(s.charAt(i), false); node = node.getChildAddIfNotPresent(s.charAt(0), true); } } /** * Returns true if the given <code>String</code> is matched by a pattern in * the trie */ public abstract boolean matches(String input); /** * Returns the shortest substring of <code>input<code> that is * matched by a pattern in the trie, or <code>null<code> if no match * exists. */ public abstract String shortestMatch(String input); /** * Returns the longest substring of <code>input<code> that is * matched by a pattern in the trie, or <code>null<code> if no match * exists. */ public abstract String longestMatch(String input); }