/* * Title: FastPageParser * Description: * * This software is published under the terms of the OpenSymphony Software * License version 1.1, of which a copy has been included with this * distribution in the LICENSE.txt file. */ package com.opensymphony.module.sitemesh.parser; import com.opensymphony.module.sitemesh.DefaultSitemeshBuffer; import com.opensymphony.module.sitemesh.Page; import com.opensymphony.module.sitemesh.PageParser; import com.opensymphony.module.sitemesh.SitemeshBuffer; import com.opensymphony.module.sitemesh.html.util.CharArray; import com.opensymphony.module.sitemesh.util.CharArrayReader; import com.opensymphony.module.sitemesh.util.CharArrayWriter; import java.io.IOException; import java.io.Reader; import java.util.Collections; import java.util.HashMap; import java.util.Map; /** * Very fast PageParser implementation for parsing HTML. * * <p>Produces FastPage.</p> * * @author <a href="mailto:salaman@qoretech.com">Victor Salaman</a> * @version $Revision: 1.13 $ * @deprecated Use HTMLPageParser instead - it performs better and is more extensible. */ public final class FastPageParser implements PageParser { private static final int TOKEN_NONE = -0; private static final int TOKEN_EOF = -1; private static final int TOKEN_TEXT = -2; private static final int TOKEN_TAG = -3; private static final int TOKEN_COMMENT = -4; private static final int TOKEN_CDATA = -5; private static final int TOKEN_SCRIPT = -6; private static final int TOKEN_DOCTYPE = -7; private static final int TOKEN_EMPTYTAG = -8; private static final int STATE_EOF = -1; private static final int STATE_TEXT = -2; private static final int STATE_TAG = -3; private static final int STATE_COMMENT = -4; private static final int STATE_TAG_QUOTE = -5; private static final int STATE_CDATA = -6; private static final int STATE_SCRIPT = -7; private static final int STATE_DOCTYPE = -8; private static final int TAG_STATE_NONE = 0; private static final int TAG_STATE_HTML = -1; private static final int TAG_STATE_HEAD = -2; private static final int TAG_STATE_TITLE = -3; private static final int TAG_STATE_BODY = -4; private static final int TAG_STATE_XML = -6; private static final int TAG_STATE_XMP = -7; // These hashcodes are hardcoded because swtich statements can only // switch on compile-time constants. // In theory it is possible for there to be a hashcode collision with // other HTML tags, however in practice it is *very* unlikely because // tags are generally only a few characters long and hence are likely // to produce unique values. private static final int SLASH_XML_HASH = 1518984; // "/xml".hashCode(); private static final int XML_HASH = 118807; // "xml".hashCode(); private static final int SLASH_XMP_HASH = 1518988; // "/xmp".hashCode(); private static final int XMP_HASH = 118811; // "xmp".hashCode(); private static final int HTML_HASH = 3213227; // "html".hashCode(); private static final int SLASH_HTML_HASH = 46618714; // "/html".hashCode(); private static final int HEAD_HASH = 3198432; // "head".hashCode(); private static final int TITLE_HASH = 110371416; // "title".hashCode(); private static final int SLASH_TITLE_HASH = 1455941513; // "/title".hashCode(); private static final int PARAMETER_HASH = 1954460585; // "parameter".hashCode(); private static final int META_HASH = 3347973; // "meta".hashCode(); private static final int SLASH_HEAD_HASH = 46603919; // "/head".hashCode(); private static final int FRAMESET_HASH = -1644953643; // "frameset".hashCode(); private static final int FRAME_HASH = 97692013; // "frame".hashCode(); private static final int BODY_HASH = 3029410; // "body".hashCode(); private static final int SLASH_BODY_HASH = 46434897; // "/body".hashCode(); private static final int CONTENT_HASH = 951530617; // "content".hashCode(); public Page parse(char[] buffer) throws IOException { return parse(new DefaultSitemeshBuffer(buffer)); } public Page parse(SitemeshBuffer buffer) throws IOException { CharArrayReader reader = new CharArrayReader(buffer.getCharArray(), 0, buffer.getBufferLength()); CharArray _buffer = new CharArray(4096); CharArray _body = new CharArray(4096); CharArray _head = new CharArray(512); CharArray _title = new CharArray(128); Map _htmlProperties = null; Map _metaProperties = new HashMap(6); Map _sitemeshProperties = new HashMap(6); Map _bodyProperties = null; CharArray _currentTaggedContent = new CharArray(1024); String _contentTagId = null; boolean tagged = false; boolean _frameSet = false; int _state = STATE_TEXT; int _tokenType = TOKEN_NONE; int _pushBack = 0; int _comment = 0; int _quote = 0; boolean hide = false; int state = TAG_STATE_NONE; int laststate = TAG_STATE_NONE; boolean doneTitle = false; // This tag object gets reused each iteration. Tag tagObject = new Tag(); while (_tokenType != TOKEN_EOF) { if(tagged) { if(_tokenType == TOKEN_TAG || _tokenType == TOKEN_EMPTYTAG) { if(_buffer==null || _buffer.length()==0) { _tokenType=TOKEN_NONE; continue; } if (parseTag(tagObject, _buffer) == null) continue; if (_buffer.compareLowerSubstr("/content")) // Note that the '/' survives the | 32 operation { tagged = false; if(_contentTagId != null) { state = TAG_STATE_NONE; _sitemeshProperties.put(_contentTagId, _currentTaggedContent.toString()); _currentTaggedContent.setLength(0); _contentTagId = null; } } else { _currentTaggedContent.append('<').append(_buffer).append('>'); } } else { if(_buffer.length() > 0) _currentTaggedContent.append(_buffer); } } else { if(_tokenType == TOKEN_TAG || _tokenType == TOKEN_EMPTYTAG) { if(_buffer==null || _buffer.length()==0) { _tokenType=TOKEN_NONE; continue; } if(parseTag(tagObject, _buffer) == null) { _tokenType=TOKEN_TEXT; continue; } int tagHash = _buffer.substrHashCode(); if(state == TAG_STATE_XML || state == TAG_STATE_XMP) { writeTag(state, laststate, hide, _head, _buffer, _body); if( (state == TAG_STATE_XML && tagHash == SLASH_XML_HASH) ||(state == TAG_STATE_XMP && tagHash == SLASH_XMP_HASH) ) { state = laststate; } } else { boolean doDefault = false; switch (tagHash) { case HTML_HASH: if (!_buffer.compareLowerSubstr("html")) { // skip any accidental hash collisions doDefault = true; break; } state = TAG_STATE_HTML; _htmlProperties = parseProperties(tagObject, _buffer).properties; break; case HEAD_HASH: if (!_buffer.compareLowerSubstr("head")) { // skip any accidental hash collisions doDefault = true; break; } state = TAG_STATE_HEAD; break; case XML_HASH: if (!_buffer.compareLowerSubstr("xml")) { // skip any accidental hash collisions doDefault = true; break; } laststate = state; writeTag(state, laststate, hide, _head, _buffer, _body); state = TAG_STATE_XML; break; case XMP_HASH: if (!_buffer.compareLowerSubstr("xmp")) { // skip any accidental hash collisions doDefault = true; break; } laststate = state; writeTag(state, laststate, hide, _head, _buffer, _body); state = TAG_STATE_XMP; break; case TITLE_HASH: if (!_buffer.compareLowerSubstr("title")) { // skip any accidental hash collisions doDefault = true; break; } if (doneTitle) { hide = true; } else { laststate = state; state = TAG_STATE_TITLE; } break; case SLASH_TITLE_HASH: if (!_buffer.compareLowerSubstr("/title")) { // skip any accidental hash collisions doDefault = true; break; } if (doneTitle) { hide = false; } else { doneTitle = true; state = laststate; } break; case PARAMETER_HASH: if (!_buffer.compareLowerSubstr("parameter")) { // skip any accidental hash collisions doDefault = true; break; } parseProperties(tagObject, _buffer); String name = (String) tagObject.properties.get("name"); String value = (String) tagObject.properties.get("value"); if (name != null && value != null) { _sitemeshProperties.put(name, value); } break; case META_HASH: if (!_buffer.compareLowerSubstr("meta")) { // skip any accidental hash collisions doDefault = true; break; } CharArray metaDestination = state == TAG_STATE_HEAD ? _head : _body; metaDestination.append('<'); metaDestination.append(_buffer); metaDestination.append('>'); parseProperties(tagObject, _buffer); name = (String) tagObject.properties.get("name"); value = (String) tagObject.properties.get("content"); if (name == null) { String httpEquiv = (String) tagObject.properties.get("http-equiv"); if (httpEquiv != null) { name = "http-equiv." + httpEquiv; } } if (name != null && value != null) { _metaProperties.put(name, value); } break; case SLASH_HEAD_HASH: if (!_buffer.compareLowerSubstr("/head")) { // skip any accidental hash collisions doDefault = true; break; } state = TAG_STATE_HTML; break; case FRAME_HASH: if (!_buffer.compareLowerSubstr("frame")) { // skip any accidental hash collisions doDefault = true; break; } _frameSet = true; break; case FRAMESET_HASH: if (!_buffer.compareLowerSubstr("frameset")) { // skip any accidental hash collisions doDefault = true; break; } _frameSet = true; break; case BODY_HASH: if (!_buffer.compareLowerSubstr("body")) { // skip any accidental hash collisions doDefault = true; break; } if (_tokenType == TOKEN_EMPTYTAG) { state = TAG_STATE_BODY; } _bodyProperties = parseProperties(tagObject, _buffer).properties; break; case CONTENT_HASH: if (!_buffer.compareLowerSubstr("content")) { // skip any accidental hash collisions doDefault = true; break; } state = TAG_STATE_NONE; Map props = parseProperties(tagObject, _buffer).properties; if (props != null) { tagged = true; _contentTagId = (String) props.get("tag"); } break; case SLASH_XMP_HASH: if (!_buffer.compareLowerSubstr("/xmp")) { // skip any accidental hash collisions doDefault = true; break; } hide = false; break; case SLASH_BODY_HASH: if (!_buffer.compareLowerSubstr("/body")) { // skip any accidental hash collisions doDefault = true; break; } state = TAG_STATE_NONE; hide = true; break; case SLASH_HTML_HASH: if (!_buffer.compareLowerSubstr("/html")) { // skip any accidental hash collisions doDefault = true; break; } state = TAG_STATE_NONE; hide = true; break; default: doDefault = true; } if (doDefault) writeTag(state, laststate, hide, _head, _buffer, _body); } } else if (!hide) { if (_tokenType == TOKEN_TEXT) { if (state == TAG_STATE_TITLE) { _title.append(_buffer); } else if (shouldWriteToHead(state, laststate)) { _head.append(_buffer); } else { _body.append(_buffer); } } else if (_tokenType == TOKEN_COMMENT) { final CharArray commentDestination = shouldWriteToHead(state, laststate) ? _head : _body; commentDestination.append("<!--"); commentDestination.append(_buffer); commentDestination.append("-->"); } else if (_tokenType == TOKEN_CDATA) { final CharArray commentDestination = state == TAG_STATE_HEAD ? _head : _body; commentDestination.append("<![CDATA["); commentDestination.append(_buffer); commentDestination.append("]]>"); } else if (_tokenType == TOKEN_SCRIPT) { final CharArray commentDestination = state == TAG_STATE_HEAD ? _head : _body; commentDestination.append('<'); commentDestination.append(_buffer); } } } _buffer.setLength(0); start: while (true) { int c; if(_pushBack != 0) { c = _pushBack; _pushBack = 0; } else { try { c = reader.read(); } catch(IOException e) { _tokenType = TOKEN_EOF; break start; } } if(c < 0) { int tmpstate = _state; _state = STATE_EOF; if(_buffer.length() > 0 && tmpstate == STATE_TEXT) { _tokenType = TOKEN_TEXT; break start; } else { _tokenType = TOKEN_EOF; break start; } } switch(_state) { case STATE_TAG: { int buflen = _buffer.length(); if(c == '>') { if (_buffer.length() > 1 && _buffer.charAt(_buffer.length() - 1) == '/') { _tokenType = TOKEN_EMPTYTAG; } else { _tokenType = TOKEN_TAG; } _state = STATE_TEXT; break start; } else if(c == '/') { _buffer.append('/'); } else if(c == '<' && buflen == 0) { _buffer.append("<<"); _state = STATE_TEXT; } else if(c == '-' && buflen == 2 && _buffer.charAt(1) == '-' && _buffer.charAt(0) == '!') { _buffer.setLength(0); _state = STATE_COMMENT; } else if(c == '[' && buflen == 7 && _buffer.charAt(0) == '!' && _buffer.charAt(1) == '[' && _buffer.compareLower("cdata", 2)) { _buffer.setLength(0); _state = STATE_CDATA; } else if((c == 'e' || c == 'E') && buflen == 7 && _buffer.charAt(0) == '!' && _buffer.compareLower("doctyp", 1)) { _buffer.append((char)c); _state = STATE_DOCTYPE; } else if((c == 'T' || c == 't') && buflen == 5 && _buffer.compareLower("scrip", 0)) { _buffer.append((char)c); _state = STATE_SCRIPT; } else if(c == '"' || c == '\'') { _quote = c; _buffer.append(( char ) c); _state = STATE_TAG_QUOTE; } else { _buffer.append(( char ) c); } } break; case STATE_TEXT: { if(c == '<') { _state = STATE_TAG; if(_buffer.length() > 0) { _tokenType = TOKEN_TEXT; break start; } } else { _buffer.append(( char ) c); } } break; case STATE_TAG_QUOTE: { if(c == '>') { _pushBack = c; _state = STATE_TAG; } else { _buffer.append(( char ) c); if(c == _quote) { _state = STATE_TAG; } } } break; case STATE_COMMENT: { if(c == '>' && _comment >= 2) { _buffer.setLength(_buffer.length() - 2); _comment = 0; _state = STATE_TEXT; _tokenType = TOKEN_COMMENT; break start; } else if(c == '-') { _comment++; } else { _comment = 0; } _buffer.append(( char ) c); } break; case STATE_CDATA: { if(c == '>' && _comment >= 2) { _buffer.setLength(_buffer.length() - 2); _comment = 0; _state = STATE_TEXT; _tokenType = TOKEN_CDATA; break start; } else if(c == ']') { _comment++; } else { _comment = 0; } _buffer.append(( char ) c); } break; case STATE_SCRIPT: { _buffer.append((char) c); if (c == '<') { _comment = 0; } else if ((c == '/' && _comment == 0) ||((c == 's' || c == 'S' ) && _comment == 1) ||((c == 'c' || c == 'C' ) && _comment == 2) ||((c == 'r' || c == 'R' ) && _comment == 3) ||((c == 'i' || c == 'I' ) && _comment == 4) ||((c == 'p' || c == 'P' ) && _comment == 5) ||((c == 't' || c == 'T' ) && _comment == 6) ) { _comment++; } else if(c == '>' && _comment >= 7) { _comment = 0; _state = STATE_TEXT; _tokenType = TOKEN_SCRIPT; break start; } } break; case STATE_DOCTYPE: { _buffer.append((char) c); if (c == '>') { _state = STATE_TEXT; _tokenType = TOKEN_DOCTYPE; break start; } else { _comment = 0; } } break; } } } // Help the GC _currentTaggedContent = null; _buffer = null; return new FastPage(buffer, _sitemeshProperties, _htmlProperties, _metaProperties, _bodyProperties, _title.toString().trim(), _head.toString().trim(), _body.toString().trim(), _frameSet); } private static void writeTag(int state, int laststate, boolean hide, CharArray _head, CharArray _buffer, CharArray _body) { if (!hide) { if (shouldWriteToHead(state, laststate)) { _head.append('<').append(_buffer).append('>'); } else { _body.append('<').append(_buffer).append('>'); } } } private static boolean shouldWriteToHead(int state, int laststate) { return state == TAG_STATE_HEAD ||(laststate == TAG_STATE_HEAD && (state == TAG_STATE_XML || state == TAG_STATE_XMP)); } /** * Populates a {@link Tag} object using data from the supplied {@link CharArray}. * * The supplied tag parameter is reset and reused - this avoids excess object * creation which hwlps performance. * * @return the same tag instance that was passed in, except it will be populated * with a new <tt>name</tt> value (and the corresponding <tt>nameEndIdx</tt> value). * However if the tag contained nathing but whitespace, this method will return * <tt>null</tt>. */ private Tag parseTag(Tag tag, CharArray buf) { int len = buf.length(); int idx = 0; int begin; // Skip over any leading whitespace in the tag while (idx < len && Character.isWhitespace(buf.charAt(idx))) idx++; if(idx == len) return null; // Find out where the non-whitespace characters end. This will give us the tag name. begin = idx; while (idx < len && !Character.isWhitespace(buf.charAt(idx))) idx++; // Mark the tag name as a substring within the buffer. This allows us to perform // a substring comparison against it at a later date buf.setSubstr(begin, buf.charAt(idx - 1) == '/' ? idx - 1 : idx); // Remember where the name finishes so we can pull out the properties later if need be tag.nameEndIdx = idx; return tag; } /** * This is called when we need to extract the properties for the tag from the tag's HTML. * We only call this when necessary since it has quite a lot of overhead. * * @param tag the tag that is currently being processed. This should be the * tag that was returned as a result of a call to {@link #parseTag(FastPageParser.Tag, CharArray)} * (ie, it has the <tt>name</tt> and <tt>nameEndIdx</tt> fields set correctly for the * tag in question. The <tt>properties</tt> field can be in an undefined state - it * will get replaced regardless). * @param buffer a <tt>CharArray</tt> containing the entire tag that is being parsed. * @return the same tag instance that was passed in, only it will now be populated * with any properties that were specified in the tag's HTML. */ private static Tag parseProperties(Tag tag, CharArray buffer) { int len = buffer.length(); int idx = tag.nameEndIdx; // Start with an empty hashmap. A new HashMap is lazy-created if we happen to find any properties tag.properties = Collections.EMPTY_MAP; int begin; while (idx < len) { // Skip forward to the next non-whitespace character while (idx < len && Character.isWhitespace(buffer.charAt(idx))) idx++; if(idx == len) continue; begin = idx; if(buffer.charAt(idx) == '"') { idx++; while (idx < len && buffer.charAt(idx) != '"') idx++; if(idx == len) continue; idx++; } else if(buffer.charAt(idx) == '\'') { idx++; while (idx < len && buffer.charAt(idx) != '\'') idx++; if(idx == len) continue; idx++; } else { while (idx < len && !Character.isWhitespace(buffer.charAt(idx)) && buffer.charAt(idx) != '=') idx++; } // Mark the substring. This is the attribute name buffer.setSubstr(begin, idx); if(idx < len && Character.isWhitespace(buffer.charAt(idx))) { while (idx < len && Character.isWhitespace(buffer.charAt(idx))) idx++; } if(idx == len || buffer.charAt(idx) != '=') continue; idx++; if(idx == len) continue; while(idx < len && (buffer.charAt(idx) == '\n' || buffer.charAt(idx) == '\r')) idx++; if(buffer.charAt(idx) == ' ') { while (idx < len && Character.isWhitespace(buffer.charAt(idx))) idx++; if(idx == len || (buffer.charAt(idx) != '"' && buffer.charAt(idx) != '"')) continue; } begin = idx; int end; if(buffer.charAt(idx) == '"') { idx++; begin = idx; while (idx < len && buffer.charAt(idx) != '"') idx++; if(idx == len) continue; end = idx; idx++; } else if(buffer.charAt(idx) == '\'') { idx++; begin = idx; while (idx < len && buffer.charAt(idx) != '\'') idx++; if(idx == len) continue; end = idx; idx++; } else { while (idx < len && !Character.isWhitespace(buffer.charAt(idx))) idx++; end = idx; } // Extract the name and value as String objects and add them to the property map String name = buffer.getLowerSubstr(); String value = buffer.substring(begin, end); tag.addProperty(name, value); } return tag; } private class Tag { // The index where the name string ends. This is used as the starting // offet if we need to continue processing to find the tag's properties public int nameEndIdx = 0; // This holds a map of the various properties for a particular tag. // This map is only populated when required - normally it will remain empty public Map properties = Collections.EMPTY_MAP; /** * Adds a name/value property pair to this tag. Each property that is * added represents a property that was parsed from the tag's HTML. */ public void addProperty(String name, String value) { if(properties==Collections.EMPTY_MAP) { properties = new HashMap(8); } properties.put(name, value); } } }