// Jericho HTML Parser - Java based library for analysing and manipulating HTML // Version 3.2 // Copyright (C) 2004-2009 Martin Jericho // http://jericho.htmlparser.net/ // // This library is free software; you can redistribute it and/or // modify it under the terms of either one of the following licences: // // 1. The Eclipse Public License (EPL) version 1.0, // included in this distribution in the file licence-epl-1.0.html // or available at http://www.eclipse.org/legal/epl-v10.html // // 2. The GNU Lesser General Public License (LGPL) version 2.1 or later, // included in this distribution in the file licence-lgpl-2.1.txt // or available at http://www.gnu.org/licenses/lgpl.txt // // This library is distributed on an "AS IS" basis, // WITHOUT WARRANTY OF ANY KIND, either express or implied. // See the individual licence texts for more details. package net.htmlparser.jericho; import java.util.*; final class TagTypeRegister { private TagTypeRegister parent=null; private char ch=NULL_CHAR; private TagTypeRegister[] children=null; // always in alphabetical order private TagType[] tagTypes=null; // in descending order of priority private static final char NULL_CHAR='\u0000'; private static final TagType[] DEFAULT_TAG_TYPES={ StartTagType.UNREGISTERED, StartTagType.NORMAL, StartTagType.COMMENT, StartTagType.MARKUP_DECLARATION, StartTagType.DOCTYPE_DECLARATION, StartTagType.CDATA_SECTION, StartTagType.XML_PROCESSING_INSTRUCTION, StartTagType.XML_DECLARATION, StartTagType.SERVER_COMMON, StartTagType.SERVER_COMMON_ESCAPED, StartTagType.SERVER_COMMON_COMMENT, EndTagType.UNREGISTERED, EndTagType.NORMAL }; private static TagTypeRegister root=new TagTypeRegister(); static { add(DEFAULT_TAG_TYPES); } private TagTypeRegister() {} private static synchronized void add(final TagType[] tagTypes) { for (int i=0; i<tagTypes.length; i++) add(tagTypes[i]); } public static synchronized void add(final TagType tagType) { TagTypeRegister cursor=root; final String startDelimiter=tagType.getStartDelimiter(); for (int i=0; i<startDelimiter.length(); i++) { final char ch=startDelimiter.charAt(i); TagTypeRegister child=cursor.getChild(ch); if (child==null) { child=new TagTypeRegister(); child.parent=cursor; child.ch=ch; cursor.addChild(child); } cursor=child; } cursor.addTagType(tagType); } public static synchronized void remove(final TagType tagType) { TagTypeRegister cursor=root; final String startDelimiter=tagType.getStartDelimiter(); for (int i=0; i<startDelimiter.length(); i++) { final char ch=startDelimiter.charAt(i); final TagTypeRegister child=cursor.getChild(ch); if (child==null) return; cursor=child; } cursor.removeTagType(tagType); // clean up any unrequired children: while (cursor!=root && cursor.tagTypes==null && cursor.children==null) { cursor.parent.removeChild(cursor); cursor=cursor.parent; } } // list is in order of lowest to highest precedence public static List<TagType> getList() { final ArrayList<TagType> list=new ArrayList<TagType>(); root.addTagTypesToList(list); return list; } private void addTagTypesToList(final List<TagType> list) { if (tagTypes!=null) for (int i=tagTypes.length-1; i>=0; i--) list.add(tagTypes[i]); if (children!=null) for (TagTypeRegister tagTypeRegister : children) tagTypeRegister.addTagTypesToList(list); } public static final String getDebugInfo() { return root.appendDebugInfo(new StringBuilder(),0).toString(); } static final class ProspectiveTagTypeIterator implements Iterator<TagType> { private TagTypeRegister cursor; private int tagTypeIndex=0; public ProspectiveTagTypeIterator(final Source source, final int pos) { // returns empty iterator if pos out of range final ParseText parseText=source.getParseText(); cursor=root; int posIndex=0; try { // find deepest node that matches the text at pos: while (true) { final TagTypeRegister child=cursor.getChild(parseText.charAt(pos+(posIndex++))); if (child==null) break; cursor=child; } } catch (IndexOutOfBoundsException ex) {} // not avoiding this exception is expensive but only happens in the very rare circumstance that the end of file is encountered in the middle of a potential tag. // go back up until we reach a node that contains a list of tag types: while (cursor.tagTypes==null) if ((cursor=cursor.parent)==null) break; } public boolean hasNext() { return cursor!=null; } public TagType next() { final TagType[] tagTypes=cursor.tagTypes; final TagType nextTagType=tagTypes[tagTypeIndex]; if ((++tagTypeIndex)==tagTypes.length) { tagTypeIndex=0; do {cursor=cursor.parent;} while (cursor!=null && cursor.tagTypes==null); } return nextTagType; } public void remove() { throw new UnsupportedOperationException(); } } public String toString() { return appendDebugInfo(new StringBuilder(),0).toString(); } private StringBuilder appendDebugInfo(final StringBuilder sb, final int level) { for (int i=0; i<level; i++) sb.append(" "); if (ch!=NULL_CHAR) sb.append(ch).append(' '); if (tagTypes!=null) { sb.append('('); for (TagType tagType : tagTypes) sb.append(tagType.getDescription()).append(", "); sb.setLength(sb.length()-2); sb.append(')'); } sb.append(Config.NewLine); if (children!=null) { final int childLevel=level+1; for (TagTypeRegister tagTypeRegister : children) tagTypeRegister.appendDebugInfo(sb,childLevel); } return sb; } private TagTypeRegister getChild(final char ch) { if (children==null) return null; if (children.length==1) return children[0].ch==ch ? children[0] : null; // perform binary search: int low=0; int high=children.length-1; while (low<=high) { int mid=(low+high) >> 1; final char midChar=children[mid].ch; if (midChar<ch) low=mid+1; else if (midChar>ch) high=mid-1; else return children[mid]; } return null; } private void addChild(final TagTypeRegister child) { // assumes the character associated with the child register does not already exist in this register's children. if (children==null) { children=new TagTypeRegister[] {child}; } else { final TagTypeRegister[] newChildren=new TagTypeRegister[children.length+1]; int i=0; while (i<children.length && children[i].ch<=child.ch) { newChildren[i]=children[i]; i++; } newChildren[i++]=child; while (i<newChildren.length) { newChildren[i]=children[i-1]; i++; } children=newChildren; } } private void removeChild(final TagTypeRegister child) { // this method assumes that the specified child exists in the children array if (children.length==1) { children=null; return; } final TagTypeRegister[] newChildren=new TagTypeRegister[children.length-1]; int offset=0; for (int i=0; i<children.length; i++) { if (children[i]==child) offset=-1; else newChildren[i+offset]=children[i]; } children=newChildren; } private int indexOfTagType(final TagType tagType) { if (tagTypes==null) return -1; for (int i=0; i<tagTypes.length; i++) if (tagTypes[i]==tagType) return i; return -1; } private void addTagType(final TagType tagType) { final int indexOfTagType=indexOfTagType(tagType); if (indexOfTagType==-1) { if (tagTypes==null) { tagTypes=new TagType[] {tagType}; } else { final TagType[] newTagTypes=new TagType[tagTypes.length+1]; newTagTypes[0]=tagType; for (int i=0; i<tagTypes.length; i++) newTagTypes[i+1]=tagTypes[i]; tagTypes=newTagTypes; } } else { // tagType already exists in the list, just move it to the front for (int i=indexOfTagType; i>0; i--) tagTypes[i]=tagTypes[i-1]; tagTypes[0]=tagType; } } private void removeTagType(final TagType tagType) { final int indexOfTagType=indexOfTagType(tagType); if (indexOfTagType==-1) return; if (tagTypes.length==1) { tagTypes=null; return; } final TagType[] newTagTypes=new TagType[tagTypes.length-1]; for (int i=0; i<indexOfTagType; i++) newTagTypes[i]=tagTypes[i]; for (int i=indexOfTagType; i<newTagTypes.length; i++) newTagTypes[i]=tagTypes[i+1]; tagTypes=newTagTypes; } }