/** * Copyright 2008 - CommonCrawl Foundation * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. * **/ package org.commoncrawl.util; import java.io.IOException; import java.util.List; import java.util.Stack; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; /** * * @author rana * * @param <MetadataType> */ public class NameTree<MetadataType> { private static final Log LOG = LogFactory.getLog(NameTree.class); private Node<MetadataType> _root = new Node<MetadataType>(); private Stack<TreePosition> _iterationStack = new Stack<TreePosition>(); private long numberOfNodes = 0; private long numberOfNodesRemoved = 0; private static final int searchMode_NodeChar = 1; private static final int searchMode_MultiNodeChar = 2; public NameTree() { _iterationStack.ensureCapacity(1024); } public synchronized long getActiveNodeCount() { return numberOfNodes - numberOfNodesRemoved; } public synchronized void clear() { _root = new Node<MetadataType>(); _iterationStack.clear(); numberOfNodes = 0; numberOfNodesRemoved = 0; } public static class DNSResult { DNSResult(int ipAddress,long ipAddressTTL,String cName) { _ipV4Address = ipAddress; _ttl = ipAddressTTL; _cName = cName; } public int getIPAddress() { return _ipV4Address; } public long getTTL() { return _ttl; } public String getCannonicalName() { return _cName; } int _ipV4Address; long _ttl; String _cName; } private static final class TreePosition { TreePosition(Node node,int index) { _node = node; _index = index; } public final Node getNode() { return _node; } public final int getIndex() { return _index; } public final void setIndex(int index) { _index = index; } public final Node resolve() { if (_index < _node.getChildCount()) { return _node.getChildAt(_index); } return null; } private Node _node; private int _index; } static int numberOfNodesChildEQ1 =0; static int numberOfNodesChildLTEQ4 =0; static int numberOfNodesChildLTEQ8 =0; static int numberOfNodesChildGT8 =0; public static class Node<MetadataType> { private static final int GROWTH_FACTOR = 1; private char _nodeChar; private char _nodeCharArray[]; private int _flags = 0; private int _count = 0; private Object _children; private Node<MetadataType> _parent = null; private long _lastTouched = -1; private long _longData = 0; private MetadataType _metadata = null; /* private Object _metadata = null; */ public static final short Flag_Is_RootNode = 1 << 0; public static final short Flag_Is_TerminalNode = 1 << 1; public static final short Flag_Is_SuperNode = 1 << 2; public static final short Flag_Is_MultiCharNode = 1 << 3; public static final short Flag_NEXT_AVAILABLE_BIT_POS = 4; private Node() { _parent = null; _flags = Flag_Is_RootNode; } public boolean isRootNode() { return (_flags & Flag_Is_RootNode) != 0; } public Node<MetadataType> getParentNode() { return _parent; } public Node(Node<MetadataType> parent,char nodeChar,int flags) { numberOfNodesChildEQ1++; _nodeChar = nodeChar; _flags = (short)flags; _parent = parent; } public final char getNodeChar() { return _nodeChar; } public final char[] getMultiCharArray() { return _nodeCharArray; } public final boolean isTerminalNode() { return (_flags & Flag_Is_TerminalNode) != 0; } public final void markAsTerminalNode() { _flags |= Flag_Is_TerminalNode; } public final boolean isSuperNode() { return (_flags & Flag_Is_SuperNode) != 0; } public final void markAsSuperNode() { _flags |=Flag_Is_SuperNode; } public final boolean isMultiCharNode() { return (_flags & Flag_Is_MultiCharNode) != 0; } public final void setFlag(short flag) { _flags |= flag; } public final void clearFlag(short flag) { _flags &= ~flag; } public final boolean isFlagSet(short flag) { return (_flags & flag) != 0; } /** update last touched time **/ public final void setLastTouchedTime(long timeInMilliseconds) { _lastTouched = timeInMilliseconds; } /** get last touched time **/ public final long getLastTouchedTime() { return _lastTouched; } public final void setLongData(long data) { _longData = data; } public final long getLongData() { return _longData; } public final void setMetadata(MetadataType metadata) { _metadata = metadata; } public MetadataType getMetadata() { return _metadata; } public String getFullName() { StringBuffer nameOut = new StringBuffer(); Node currentNode = this; while (!currentNode.isRootNode()) { if (currentNode.isMultiCharNode()) { nameOut.append(currentNode.getMultiCharArray()); } nameOut.append(currentNode.getNodeChar()); currentNode = currentNode.getParentNode(); } return nameOut.toString(); } public final int compareTo(char c) { if (_nodeChar < c) return -1; else if (_nodeChar > c) return 1; else return 0; } public final int getChildCount() { return _count; } public final Node<MetadataType> getChildAt(int index) { return (_count == 1) ? (Node)_children : ((Node[])_children)[index]; } public final void removeChildAt(int index) { if (index >= _count) throw new RuntimeException("Invalid Index" ); if (_count == 1) { _children = null; } else { int rightOfIndexCount = _count - (index + 1); if (rightOfIndexCount > 0) { if (_count > 2) System.arraycopy(_children, index + 1, _children, index, rightOfIndexCount); else _children = ((Node[])_children)[1]; } else { if (_count == 2) { _children = ((Node[])_children)[0]; } } } _count--; } public final Node<MetadataType> findOrAddChild(NameTree<MetadataType> cacheObject,char nodeChar,boolean addChild) { int itemPosition= -1; if (_count == 1) { if (((Node)_children).getNodeChar() == nodeChar) { itemPosition = 0; } else if (((Node)_children).getNodeChar() < nodeChar ) { itemPosition = -2; } } else if (_count > 1) { itemPosition = binarySearch((Node[])_children,0,_count,nodeChar); } // lt 0 means item was not found ... if (itemPosition < 0 && addChild) { // allocate a new Node Node newNode = new Node(this,nodeChar,0); // increment stats ... cacheObject.numberOfNodes++; // normalize item positoin ... itemPosition = Math.abs(itemPosition + 1); // check some bounding conditions. if (_count == 0) { _children = newNode; } else { // get some basic bounding information established ... int leftCopyItems = itemPosition; int rightCopyItems = _count - itemPosition; Node copyArray[] = (_count == 1) ? null: (Node[]) _children; // if the array is full ... if (_count ==1 || _count == ((Node[])_children).length) { if (_count == 1) { numberOfNodesChildEQ1 --; numberOfNodesChildLTEQ4++; } else if (_count == 4) { numberOfNodesChildLTEQ4 --; numberOfNodesChildLTEQ8++; } else if (_count == 8) { numberOfNodesChildLTEQ8--; numberOfNodesChildGT8++; } // allocate a new array int growAmount = ((_count / GROWTH_FACTOR) + 1) * GROWTH_FACTOR; copyArray = new Node[growAmount]; } // copy lbound items only if copying into new array ... if (leftCopyItems != 0 && copyArray != null) { if (_count == 1) copyArray[0] = (Node)_children; else System.arraycopy(_children, 0, copyArray, 0, leftCopyItems); } // copy rbound items no matter what ... if (rightCopyItems != 0) { if (_count == 1) copyArray[1] = (Node)_children; else System.arraycopy(_children, itemPosition, copyArray, itemPosition + 1, rightCopyItems); } // and assign copy array to children _children = copyArray; // insert new item into array ... ((Node[])_children)[itemPosition] = newNode; } // increment count _count++; } if (itemPosition >=0) { return (_count == 1) ? (Node)_children : ((Node[])_children)[itemPosition]; } return null; } // Like public version, but without range checks. private static int binarySearch(Node[] a, int fromIndex, int toIndex,char key) { int low = fromIndex; int high = toIndex - 1; while (low <= high) { int mid = (low + high) >>> 1; Node midVal = a[mid]; int cmp = midVal.compareTo(key); if (cmp < 0) low = mid + 1; else if (cmp > 0) high = mid - 1; else return mid; // key found } return -(low + 1); // key not found. } public void markAsMultiCharNode(char s[], int startOffset,int length) { if (length == 0) { throw new RuntimeException(); } _flags |= Flag_Is_MultiCharNode; _nodeCharArray = new char[length]; System.arraycopy(s, startOffset, _nodeCharArray, 0, length); } public Node<MetadataType> splitMultiCharNodeAt(NameTree cacheObject,int splitIdx) { // allocate new intermediate node Node newIntermediateNode = new Node(this,_nodeCharArray[splitIdx],0); //increment stats ... cacheObject.numberOfNodes++; newIntermediateNode._longData = this._longData; newIntermediateNode._metadata = this._metadata; this._longData = 0; this._metadata = null; // this._metadata = null; newIntermediateNode._flags =(short) (this._flags & ~Flag_Is_MultiCharNode); this._flags = 0; int leftOfSplitLength = splitIdx; int rightOfSplitLength = _nodeCharArray.length - splitIdx - 1; if (leftOfSplitLength != 0) { newIntermediateNode.markAsMultiCharNode(_nodeCharArray,0,leftOfSplitLength); } if (rightOfSplitLength != 0) { char array[] = new char[rightOfSplitLength]; System.arraycopy(_nodeCharArray,splitIdx+1 , array, 0, rightOfSplitLength); _nodeCharArray = array; _flags |= Flag_Is_MultiCharNode; } else { _nodeCharArray = null; _flags &= ~Flag_Is_MultiCharNode; } // absorb the old node's children ... newIntermediateNode._children = this._children; newIntermediateNode._count = this._count; // and re-parent our children to point to new intermediate branch node for (int i=0;i<newIntermediateNode.getChildCount();++i) { newIntermediateNode.getChildAt(i)._parent = newIntermediateNode; } // make the new node the old node's only child ... this._children = newIntermediateNode; this._count = 1; return newIntermediateNode; } } /** add a name node **/ public Node<MetadataType> addNameNode(String nodeName) { return addNode(_root,nodeName); } /** underlying routine shared by name and ip lookup routines **/ private Node<MetadataType> addNode(Node<MetadataType> rootNode,String path){ // search super node in reverse order ... String s = path.toLowerCase(); Node<MetadataType> node = rootNode; if (s.length() > 0) { int multiNodeIdx = -1; int searchMode = searchMode_NodeChar; for (int i= s.length()-1; i >= 0; i--) { if (searchMode == searchMode_NodeChar) { // find the node for the next character ... node= node.findOrAddChild(this,s.charAt(i),true); // update/set the node's ttl //node.setTimeToLive(Math.max(node.getTimeToLive(),ttl)); // if the returned node is a multi-char node, search into the node ... if (node.isMultiCharNode()) { searchMode = searchMode_MultiNodeChar; multiNodeIdx = node.getMultiCharArray().length - 1; } // otherwise... if the node has no children and the index is not zero and current char is not '.' token ... else if (node.getChildCount() == 0 && i != 0 && s.charAt(i) != '.' && !node.isTerminalNode()) { int multiNodeScanStart = i-1; int multiNodeCharEndPos = multiNodeScanStart; // walk backwards until either end of string is reached, or a '.' token is located ... while (multiNodeCharEndPos >= 0 && s.charAt(multiNodeCharEndPos) != '.') multiNodeCharEndPos--; // if we actually accumulated something in the scan buffer ... if (multiNodeScanStart - multiNodeCharEndPos != 0) { // if scan terminated before the end of the string, then a '.' token terminated the scan ... if (multiNodeCharEndPos != -1) { // gobble up the appropriate number of characters ... node.markAsMultiCharNode(s.toCharArray(),multiNodeCharEndPos + 1,i-(multiNodeCharEndPos + 1)); //set up i so that loop can continue ... i = multiNodeCharEndPos + 1; // and break out of multi-char scan mode ... searchMode = searchMode_NodeChar; } else { // gobble up remaining characters and covert node to multi-char node ... node.markAsMultiCharNode(s.toCharArray(),0,i); // and break out of loop... break; } } } } else { if (multiNodeIdx ==-1 || i == -1) { throw new RuntimeException(); } if (node.getMultiCharArray()[multiNodeIdx] != s.charAt(i)) { node.splitMultiCharNodeAt(this,multiNodeIdx); i+=1; searchMode = searchMode_NodeChar; } else { multiNodeIdx -= 1; if (multiNodeIdx < 0) searchMode = searchMode_NodeChar; } } } // finally, if we are still in multi-char search mode and multiNodeIdx != -1 // this means that we terminated a successfull match int the MIDDLE of a multi-char node if (searchMode == searchMode_MultiNodeChar && multiNodeIdx != -1) { // in this case, we have to split the multi-char node appropriately ... node.splitMultiCharNodeAt(this,multiNodeIdx); } node.markAsTerminalNode(); return node; } return null; } public Node<MetadataType> findNode(String nodeName) { Node<MetadataType> nodeOut = _findNode(_root,nodeName); if (nodeOut != null && nodeOut.isTerminalNode()) { nodeOut.setLastTouchedTime(System.currentTimeMillis()); } return nodeOut; } private synchronized Node _findNode(Node rootNode,String nodeName) { String s = nodeName.toLowerCase(); Node node= rootNode; if (s.length() > 0) { Node lastSubTerminalNode = null; for (int i= s.length()-1; i >= 0 && node != null; i--) { if (s.charAt(i) == '.' && node != _root) { lastSubTerminalNode = node; } node= node.findOrAddChild(this,s.charAt(i),false); // now if this is a multi-char node ... if (node != null && node.isMultiCharNode()) { int innerScanPos = i-1; int multiCharArrayScanPos = node.getMultiCharArray().length -1; while (innerScanPos >= 0 && multiCharArrayScanPos >=0) { if (s.charAt(innerScanPos) == node.getMultiCharArray()[multiCharArrayScanPos]) { innerScanPos--; multiCharArrayScanPos--; } else { break; } } // now first condition for a successfull match is than we completely scanned the mutli-char array ... if (multiCharArrayScanPos == -1) { // now if char scan also reached the beginning of the search string ... if (innerScanPos == -1) { // this is a true match ... return (node.isTerminalNode()) ? node : null; } // otherwise ... check the character at new position ... else { // set i to inner scan pos + 1 (so that out loop will properly adjust value) i=innerScanPos+1; } } else { node = null; break; } } } if (node == null && lastSubTerminalNode != null) { if (lastSubTerminalNode.isSuperNode()) { node = lastSubTerminalNode; } } if (node != null) return (node.isTerminalNode()) ? node : null; } return null; } private synchronized void collectTerminalNodes(Node node,List<Node> terminalNodeVector)throws IOException { if (node.isTerminalNode()) { terminalNodeVector.add(node); } for (int i=0;i<node.getChildCount();++i) { collectTerminalNodes(node.getChildAt(i),terminalNodeVector); } } public synchronized void collectTerminalNodes(List<Node> terminalNodeVector){ try { collectTerminalNodes(_root,terminalNodeVector); } catch (IOException e) { } } }