/* $Id$ */ /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.manifoldcf.connectorcommon.fuzzyml; import org.apache.manifoldcf.core.interfaces.*; import org.apache.manifoldcf.core.system.Logging; import java.util.*; /** This class represents a basic xml/html tag parser. * It is capable of recognizing the following xml and html constructs: * * '<' <token> <attrs> '>' ... '</' <token> '>' * '<' <token> <attrs> '/>' * '<?' <token> <attrs> '?>' * '<![' [<token>] '[' ... ']]>' * '<!' <token> ... '>' * '<!--' ... '-->' * * Each of these, save the comment, has supporting protected methods that will be * called by the parsing engine. Overriding these methods will allow an extending * class to perform higher-level data extraction and parsing. * * Of these, the messiest is the <! ... > construct, since there can be multiple nested * btags, cdata-like escapes, and qtags inside. Ideally the parser should produce a * sequence of preparsed tokens from these tags. Since they can be nested, keeping * track of the depth is also essential, so we do that with a btag depth counter. * Thus, in this case, it is not the state that matters, but the btag depth, to determine * if the parser is operating inside a btag. */ public class TagParseState extends SingleCharacterReceiver { protected static final int TAGPARSESTATE_NORMAL = 0; protected static final int TAGPARSESTATE_SAWLEFTANGLE = 1; protected static final int TAGPARSESTATE_SAWEXCLAMATION = 2; protected static final int TAGPARSESTATE_SAWDASH = 3; protected static final int TAGPARSESTATE_IN_COMMENT = 4; protected static final int TAGPARSESTATE_SAWCOMMENTDASH = 5; protected static final int TAGPARSESTATE_SAWSECONDCOMMENTDASH = 6; protected static final int TAGPARSESTATE_IN_TAG_NAME = 7; protected static final int TAGPARSESTATE_IN_ATTR_NAME = 8; protected static final int TAGPARSESTATE_IN_ATTR_VALUE = 9; protected static final int TAGPARSESTATE_IN_TAG_SAW_SLASH = 10; protected static final int TAGPARSESTATE_IN_END_TAG_NAME = 11; protected static final int TAGPARSESTATE_IN_ATTR_LOOKING_FOR_VALUE = 12; protected static final int TAGPARSESTATE_IN_SINGLE_QUOTES_ATTR_VALUE = 13; protected static final int TAGPARSESTATE_IN_DOUBLE_QUOTES_ATTR_VALUE = 14; protected static final int TAGPARSESTATE_IN_UNQUOTED_ATTR_VALUE = 15; protected static final int TAGPARSESTATE_IN_QTAG_NAME = 16; protected static final int TAGPARSESTATE_IN_QTAG_ATTR_NAME = 17; protected static final int TAGPARSESTATE_IN_QTAG_SAW_QUESTION = 18; protected static final int TAGPARSESTATE_IN_QTAG_ATTR_VALUE = 19; protected static final int TAGPARSESTATE_IN_QTAG_ATTR_LOOKING_FOR_VALUE = 20; protected static final int TAGPARSESTATE_IN_QTAG_SINGLE_QUOTES_ATTR_VALUE = 21; protected static final int TAGPARSESTATE_IN_QTAG_DOUBLE_QUOTES_ATTR_VALUE = 22; protected static final int TAGPARSESTATE_IN_QTAG_UNQUOTED_ATTR_VALUE = 23; protected static final int TAGPARSESTATE_IN_BRACKET_TOKEN = 24; protected static final int TAGPARSESTATE_NEED_FINAL_BRACKET = 25; protected static final int TAGPARSESTATE_IN_BANG_TOKEN = 26; protected static final int TAGPARSESTATE_IN_CDATA_BODY = 27; protected static final int TAGPARSESTATE_SAWRIGHTBRACKET = 28; protected static final int TAGPARSESTATE_SAWSECONDRIGHTBRACKET = 29; protected static final int TAGPARSESTATE_IN_UNQUOTED_ATTR_VALUE_SAW_SLASH = 30; protected int currentState = TAGPARSESTATE_NORMAL; /** The btag depth, which indicates btag behavior when > 0. */ protected int bTagDepth = 0; /** This is the only buffer we actually accumulate stuff in. */ protected StringBuilder accumBuffer = new StringBuilder(); // The following are pointers to the accum buffer above, when allocated. protected StringBuilder currentTagNameBuffer = null; protected StringBuilder currentAttrNameBuffer = null; protected StringBuilder currentValueBuffer = null; protected String currentTagName = null; protected String currentAttrName = null; protected List<AttrNameValue> currentAttrList = null; // Body decoding state /** Whether we've seen an ampersand */ protected boolean inAmpersand = false; /** Buffer of characters seen after ampersand. */ protected StringBuilder ampBuffer = new StringBuilder(); protected static final Map<String,String> mapLookup = new HashMap<String,String>(); static { mapLookup.put("amp","&"); mapLookup.put("lt","<"); mapLookup.put("gt",">"); mapLookup.put("quot","\""); mapLookup.put("apos","'"); } public TagParseState() { super(65536); } /** Deal with a character. No exceptions are allowed, since those would represent * syntax errors, and we don't want those to cause difficulty. */ @Override public boolean dealWithCharacter(char thisChar) throws ManifoldCFException { // At this level we want basic lexical analysis - that is, we deal with identifying tags and comments, that's it. // We don't even attempt to map to lower case, that's how naive this is. switch (currentState) { case TAGPARSESTATE_NORMAL: if (thisChar == '<') { if (inAmpersand) { outputAmpBuffer(); inAmpersand = false; } currentState = TAGPARSESTATE_SAWLEFTANGLE; } else if (bTagDepth > 0 && thisChar == '>') { // Output current token, if any if (currentTagNameBuffer != null && currentTagNameBuffer.length() > 0) { currentTagName = currentTagNameBuffer.toString(); if (noteBTagToken(currentTagName)) return true; currentTagName = null; currentTagNameBuffer = null; } if (noteEndBTag()) return true; bTagDepth--; } else if (bTagDepth == 0) { if (inAmpersand) { if (thisChar == ';') { // We append the semi so that the output function can make good decisions ampBuffer.append(thisChar); if (outputAmpBuffer()) return true; inAmpersand = false; } else if (isWhitespace(thisChar)) { // Interpret ampersand buffer. if (outputAmpBuffer()) return true; inAmpersand = false; if (noteNormalCharacter(thisChar)) return true; } else ampBuffer.append(thisChar); } else if (thisChar == '&') { inAmpersand = true; ampBuffer.setLength(0); } else { if (noteNormalCharacter(thisChar)) return true; } } else { // In btag; accumulate tokens if (isPunctuation(thisChar)) { if (currentTagNameBuffer != null && currentTagNameBuffer.length() > 0) { currentTagName = currentTagNameBuffer.toString(); if (noteBTagToken(currentTagName)) return true; currentTagNameBuffer = null; currentTagName = null; } if (noteBTagToken(new StringBuilder().append(thisChar).toString())) return true; } else if (isWhitespace(thisChar)) { if (currentTagNameBuffer != null && currentTagNameBuffer.length() > 0) { currentTagName = currentTagNameBuffer.toString(); if (noteBTagToken(currentTagName)) return true; currentTagNameBuffer = null; currentTagName = null; } } else { if (currentTagNameBuffer == null) currentTagNameBuffer = newBuffer(); currentTagNameBuffer.append(thisChar); } } break; case TAGPARSESTATE_IN_CDATA_BODY: if (thisChar == ']') currentState = TAGPARSESTATE_SAWRIGHTBRACKET; else { if (noteEscapedCharacter(thisChar)) return true; } break; case TAGPARSESTATE_SAWRIGHTBRACKET: if (thisChar == ']') currentState = TAGPARSESTATE_SAWSECONDRIGHTBRACKET; else { currentState = TAGPARSESTATE_IN_CDATA_BODY; if (noteEscapedCharacter(']')) return true; if (noteEscapedCharacter(thisChar)) return true; } break; case TAGPARSESTATE_SAWSECONDRIGHTBRACKET: if (thisChar == '>') currentState = TAGPARSESTATE_NORMAL; else if (thisChar == ']') { // currentstate unchanged; emit the first bracket if (noteEscapedCharacter(']')) return true; } else { currentState = TAGPARSESTATE_IN_CDATA_BODY; if (noteEscapedCharacter(']')) return true; if (noteEscapedCharacter(']')) return true; if (noteEscapedCharacter(thisChar)) return true; } break; case TAGPARSESTATE_SAWLEFTANGLE: if (thisChar == '!') currentState = TAGPARSESTATE_SAWEXCLAMATION; else if (thisChar == '?') { currentState = TAGPARSESTATE_IN_QTAG_NAME; currentTagNameBuffer = newBuffer(); } else if (bTagDepth == 0 && thisChar == '/') { currentState = TAGPARSESTATE_IN_END_TAG_NAME; currentTagNameBuffer = newBuffer(); } else if (bTagDepth == 0) { if (isWhitespace(thisChar)) { // Not a tag. currentState = TAGPARSESTATE_NORMAL; if (noteNormalCharacter('<')) return true; if (noteNormalCharacter(thisChar)) return true; } else { currentState = TAGPARSESTATE_IN_TAG_NAME; currentTagNameBuffer = newBuffer(); currentTagNameBuffer.append(thisChar); } } else { // in btag, saw left angle, nothing recognizable after - must be a token if (noteBTagToken("<")) return true; if (!isWhitespace(thisChar)) { // Add char to current token buffer. currentTagNameBuffer = newBuffer(); currentTagNameBuffer.append(thisChar); } currentState = TAGPARSESTATE_NORMAL; } break; case TAGPARSESTATE_SAWEXCLAMATION: if (thisChar == '-') currentState = TAGPARSESTATE_SAWDASH; else if (thisChar == '[') { currentState = TAGPARSESTATE_IN_BRACKET_TOKEN; currentTagNameBuffer = newBuffer(); } else { bTagDepth++; currentState = TAGPARSESTATE_IN_BANG_TOKEN; currentTagNameBuffer = newBuffer(); if (!isWhitespace(thisChar)) currentTagNameBuffer.append(thisChar); } break; case TAGPARSESTATE_SAWDASH: if (thisChar == '-') currentState = TAGPARSESTATE_IN_COMMENT; else currentState = TAGPARSESTATE_NORMAL; break; case TAGPARSESTATE_IN_COMMENT: // We're in a comment. All we should look for is the end of the comment. if (thisChar == '-') currentState = TAGPARSESTATE_SAWCOMMENTDASH; break; case TAGPARSESTATE_SAWCOMMENTDASH: if (thisChar == '-') currentState = TAGPARSESTATE_SAWSECONDCOMMENTDASH; else currentState = TAGPARSESTATE_IN_COMMENT; break; case TAGPARSESTATE_SAWSECONDCOMMENTDASH: if (thisChar == '>') currentState = TAGPARSESTATE_NORMAL; else if (thisChar != '-') currentState = TAGPARSESTATE_IN_COMMENT; break; case TAGPARSESTATE_IN_QTAG_NAME: if (isWhitespace(thisChar)) { if (currentTagNameBuffer.length() > 0) { // Done with the tag name! currentTagName = currentTagNameBuffer.toString(); currentTagNameBuffer = null; currentAttrList = new ArrayList<AttrNameValue>(); currentState = TAGPARSESTATE_IN_QTAG_ATTR_NAME; currentAttrNameBuffer = newBuffer(); } } else if (thisChar == '?') { if (currentTagNameBuffer.length() > 0) { currentTagName = currentTagNameBuffer.toString(); currentTagNameBuffer = null; currentAttrList = new ArrayList<AttrNameValue>(); currentState = TAGPARSESTATE_IN_QTAG_SAW_QUESTION; if (noteQTag(currentTagName,currentAttrList)) return true; } else { currentState = TAGPARSESTATE_NORMAL; currentTagNameBuffer = null; } } else if (thisChar == '>') { if (currentTagNameBuffer.length() > 0) { currentTagName = currentTagNameBuffer.toString(); currentTagNameBuffer = null; currentAttrList = new ArrayList<AttrNameValue>(); } if (currentTagName != null) { if (noteQTag(currentTagName,currentAttrList)) return true; } currentState = TAGPARSESTATE_NORMAL; currentTagName = null; currentAttrList = null; } else currentTagNameBuffer.append(thisChar); break; case TAGPARSESTATE_IN_BRACKET_TOKEN: if (isWhitespace(thisChar)) { if (currentTagNameBuffer.length() > 0) { // Done with the bracket token! currentTagName = currentTagNameBuffer.toString(); currentTagNameBuffer = null; currentState = TAGPARSESTATE_NEED_FINAL_BRACKET; } } else if (thisChar == '[') { currentTagName = currentTagNameBuffer.toString(); currentTagNameBuffer = null; currentState = TAGPARSESTATE_IN_CDATA_BODY; if (noteEscaped(currentTagName)) return true; currentTagName = null; } else currentTagNameBuffer.append(thisChar); break; case TAGPARSESTATE_NEED_FINAL_BRACKET: if (thisChar == '[') { if (noteEscaped(currentTagName)) return true; currentTagName = null; currentState = TAGPARSESTATE_IN_CDATA_BODY; } break; case TAGPARSESTATE_IN_BANG_TOKEN: if (isWhitespace(thisChar)) { if (currentTagNameBuffer.length() > 0) { // Done with bang token currentTagName = currentTagNameBuffer.toString(); currentTagNameBuffer = null; if (noteBTag(currentTagName)) return true; currentTagName = null; currentState = TAGPARSESTATE_NORMAL; } } else if (thisChar == '>') { // Also done, but signal end too. currentTagName = currentTagNameBuffer.toString(); currentTagNameBuffer = null; if (noteBTag(currentTagName)) return true; currentTagName = null; currentState = TAGPARSESTATE_NORMAL; if (noteEndBTag()) return true; bTagDepth--; } else currentTagNameBuffer.append(thisChar); break; case TAGPARSESTATE_IN_TAG_NAME: if (isWhitespace(thisChar)) { if (currentTagNameBuffer.length() > 0) { // Done with the tag name! currentTagName = currentTagNameBuffer.toString(); currentTagNameBuffer = null; currentAttrList = new ArrayList<AttrNameValue>(); currentState = TAGPARSESTATE_IN_ATTR_NAME; currentAttrNameBuffer = newBuffer(); } } else if (thisChar == '/') { if (currentTagNameBuffer.length() > 0) { currentTagName = currentTagNameBuffer.toString(); currentTagNameBuffer = null; currentAttrList = new ArrayList<AttrNameValue>(); currentState = TAGPARSESTATE_IN_TAG_SAW_SLASH; if (noteTag(currentTagName,currentAttrList)) return true; } else { currentState = TAGPARSESTATE_NORMAL; currentTagNameBuffer = null; } } else if (thisChar == '>') { if (currentTagNameBuffer.length() > 0) { currentTagName = currentTagNameBuffer.toString(); currentTagNameBuffer = null; currentAttrList = new ArrayList<AttrNameValue>(); } if (currentTagName != null) { if (noteTag(currentTagName,currentAttrList)) return true; } currentState = TAGPARSESTATE_NORMAL; currentTagName = null; currentAttrList = null; } else currentTagNameBuffer.append(thisChar); break; case TAGPARSESTATE_IN_QTAG_ATTR_NAME: if (isWhitespace(thisChar)) { if (currentAttrNameBuffer.length() > 0) { // Done with attr name! currentAttrName = currentAttrNameBuffer.toString(); currentAttrNameBuffer = null; currentState = TAGPARSESTATE_IN_QTAG_ATTR_LOOKING_FOR_VALUE; } } else if (thisChar == '=') { if (currentAttrNameBuffer.length() > 0) { currentAttrName = currentAttrNameBuffer.toString(); currentAttrNameBuffer = null; currentState = TAGPARSESTATE_IN_QTAG_ATTR_VALUE; currentValueBuffer = newBuffer(); } } else if (thisChar == '?') { if (currentAttrNameBuffer.length() > 0) { currentAttrName = currentAttrNameBuffer.toString(); currentAttrNameBuffer = null; } if (currentAttrName != null) { currentAttrList.add(new AttrNameValue(currentAttrName,"")); currentAttrName = null; } if (noteQTag(currentTagName,currentAttrList)) return true; currentState = TAGPARSESTATE_IN_QTAG_SAW_QUESTION; } else if (thisChar == '>') { if (currentAttrNameBuffer.length() > 0) { currentAttrName = currentAttrNameBuffer.toString(); currentAttrNameBuffer = null; } if (currentAttrName != null) { currentAttrList.add(new AttrNameValue(currentAttrName,"")); currentAttrName = null; } currentState = TAGPARSESTATE_NORMAL; if (noteQTag(currentTagName,currentAttrList)) return true; currentTagName = null; currentAttrList = null; } else currentAttrNameBuffer.append(thisChar); break; case TAGPARSESTATE_IN_ATTR_NAME: if (isWhitespace(thisChar)) { if (currentAttrNameBuffer.length() > 0) { // Done with attr name! currentAttrName = currentAttrNameBuffer.toString(); currentAttrNameBuffer = null; currentState = TAGPARSESTATE_IN_ATTR_LOOKING_FOR_VALUE; } } else if (thisChar == '=') { if (currentAttrNameBuffer.length() > 0) { currentAttrName = currentAttrNameBuffer.toString(); currentAttrNameBuffer = null; currentState = TAGPARSESTATE_IN_ATTR_VALUE; currentValueBuffer = newBuffer(); } } else if (thisChar == '/') { if (currentAttrNameBuffer.length() > 0) { currentAttrName = currentAttrNameBuffer.toString(); currentAttrNameBuffer = null; } if (currentAttrName != null) { currentAttrList.add(new AttrNameValue(currentAttrName,"")); currentAttrName = null; } if (noteTag(currentTagName,currentAttrList)) return true; currentState = TAGPARSESTATE_IN_TAG_SAW_SLASH; } else if (thisChar == '>') { if (currentAttrNameBuffer.length() > 0) { currentAttrName = currentAttrNameBuffer.toString(); currentAttrNameBuffer = null; } if (currentAttrName != null) { currentAttrList.add(new AttrNameValue(currentAttrName,"")); currentAttrName = null; } currentState = TAGPARSESTATE_NORMAL; if (noteTag(currentTagName,currentAttrList)) return true; currentTagName = null; currentAttrList = null; } else currentAttrNameBuffer.append(thisChar); break; case TAGPARSESTATE_IN_QTAG_ATTR_LOOKING_FOR_VALUE: if (thisChar == '=') { currentState = TAGPARSESTATE_IN_QTAG_ATTR_VALUE; currentValueBuffer = newBuffer(); } else if (thisChar == '>') { currentState = TAGPARSESTATE_NORMAL; if (noteQTag(currentTagName,currentAttrList)) return true; currentTagName = null; currentAttrList = null; } else if (thisChar == '?') { currentState = TAGPARSESTATE_IN_QTAG_SAW_QUESTION; currentAttrList.add(new AttrNameValue(currentAttrName,"")); currentAttrName = null; if (noteQTag(currentTagName,currentAttrList)) return true; } else if (!isWhitespace(thisChar)) { currentAttrList.add(new AttrNameValue(currentAttrName,"")); currentState = TAGPARSESTATE_IN_QTAG_ATTR_NAME; currentAttrNameBuffer = newBuffer(); currentAttrNameBuffer.append(thisChar); currentAttrName = null; } break; case TAGPARSESTATE_IN_ATTR_LOOKING_FOR_VALUE: if (thisChar == '=') { currentState = TAGPARSESTATE_IN_ATTR_VALUE; currentValueBuffer = newBuffer(); } else if (thisChar == '>') { currentState = TAGPARSESTATE_NORMAL; if (noteTag(currentTagName,currentAttrList)) return true; currentTagName = null; currentAttrList = null; } else if (thisChar == '/') { currentState = TAGPARSESTATE_IN_TAG_SAW_SLASH; currentAttrList.add(new AttrNameValue(currentAttrName,"")); currentAttrName = null; if (noteTag(currentTagName,currentAttrList)) return true; } else if (!isWhitespace(thisChar)) { currentAttrList.add(new AttrNameValue(currentAttrName,"")); currentState = TAGPARSESTATE_IN_ATTR_NAME; currentAttrNameBuffer = newBuffer(); currentAttrNameBuffer.append(thisChar); currentAttrName = null; } break; case TAGPARSESTATE_IN_QTAG_ATTR_VALUE: if (thisChar == '\'') currentState = TAGPARSESTATE_IN_QTAG_SINGLE_QUOTES_ATTR_VALUE; else if (thisChar == '"') currentState = TAGPARSESTATE_IN_QTAG_DOUBLE_QUOTES_ATTR_VALUE; else if (!isWhitespace(thisChar)) { currentState = TAGPARSESTATE_IN_QTAG_UNQUOTED_ATTR_VALUE; currentValueBuffer.append(thisChar); } break; case TAGPARSESTATE_IN_ATTR_VALUE: if (thisChar == '\'') currentState = TAGPARSESTATE_IN_SINGLE_QUOTES_ATTR_VALUE; else if (thisChar == '"') currentState = TAGPARSESTATE_IN_DOUBLE_QUOTES_ATTR_VALUE; else if (thisChar == '/') currentState = TAGPARSESTATE_IN_UNQUOTED_ATTR_VALUE_SAW_SLASH; else if (!isWhitespace(thisChar)) { currentState = TAGPARSESTATE_IN_UNQUOTED_ATTR_VALUE; currentValueBuffer.append(thisChar); } break; case TAGPARSESTATE_IN_QTAG_SAW_QUESTION: if (thisChar == '>') { // No end-tag notification for this one currentState = TAGPARSESTATE_NORMAL; currentTagName = null; currentAttrList = null; } break; case TAGPARSESTATE_IN_TAG_SAW_SLASH: if (thisChar == '>') { if (noteEndTag(currentTagName)) return true; currentState = TAGPARSESTATE_NORMAL; currentTagName = null; currentAttrList = null; } break; case TAGPARSESTATE_IN_END_TAG_NAME: if (isWhitespace(thisChar)) { if (currentTagNameBuffer != null && currentTagNameBuffer.length() > 0) { // Done with the tag name! currentTagName = currentTagNameBuffer.toString(); currentTagNameBuffer = null; } } else if (thisChar == '>') { if (currentTagNameBuffer != null && currentTagNameBuffer.length() > 0) { currentTagName = currentTagNameBuffer.toString(); currentTagNameBuffer = null; } if (currentTagName != null) { if (noteEndTag(currentTagName)) return true; } currentTagName = null; currentState = TAGPARSESTATE_NORMAL; } else if (currentTagNameBuffer != null) currentTagNameBuffer.append(thisChar); break; case TAGPARSESTATE_IN_QTAG_SINGLE_QUOTES_ATTR_VALUE: if (thisChar == '\'' || thisChar == '\n' || thisChar == '\r') { currentAttrList.add(new AttrNameValue(currentAttrName,attributeDecode(currentValueBuffer.toString()))); currentAttrName = null; currentValueBuffer = null; currentState = TAGPARSESTATE_IN_QTAG_ATTR_NAME; currentAttrNameBuffer = newBuffer(); } else currentValueBuffer.append(thisChar); break; case TAGPARSESTATE_IN_SINGLE_QUOTES_ATTR_VALUE: if (thisChar == '\'' || thisChar == '\n' || thisChar == '\r') { currentAttrList.add(new AttrNameValue(currentAttrName,attributeDecode(currentValueBuffer.toString()))); currentAttrName = null; currentValueBuffer = null; currentState = TAGPARSESTATE_IN_ATTR_NAME; currentAttrNameBuffer = newBuffer(); } else currentValueBuffer.append(thisChar); break; case TAGPARSESTATE_IN_QTAG_DOUBLE_QUOTES_ATTR_VALUE: if (thisChar == '"' || thisChar == '\n' || thisChar == '\r') { currentAttrList.add(new AttrNameValue(currentAttrName,attributeDecode(currentValueBuffer.toString()))); currentAttrName = null; currentValueBuffer = null; currentState = TAGPARSESTATE_IN_QTAG_ATTR_NAME; currentAttrNameBuffer = newBuffer(); } else currentValueBuffer.append(thisChar); break; case TAGPARSESTATE_IN_DOUBLE_QUOTES_ATTR_VALUE: if (thisChar == '"' || thisChar == '\n' || thisChar == '\r') { currentAttrList.add(new AttrNameValue(currentAttrName,attributeDecode(currentValueBuffer.toString()))); currentAttrName = null; currentValueBuffer = null; currentState = TAGPARSESTATE_IN_ATTR_NAME; currentAttrNameBuffer = newBuffer(); } else currentValueBuffer.append(thisChar); break; case TAGPARSESTATE_IN_QTAG_UNQUOTED_ATTR_VALUE: if (isWhitespace(thisChar)) { currentAttrList.add(new AttrNameValue(currentAttrName,attributeDecode(currentValueBuffer.toString()))); currentAttrName = null; currentValueBuffer = null; currentState = TAGPARSESTATE_IN_QTAG_ATTR_NAME; currentAttrNameBuffer = newBuffer(); } else if (thisChar == '?') { currentAttrList.add(new AttrNameValue(currentAttrName,attributeDecode(currentValueBuffer.toString()))); if (noteTag(currentTagName,currentAttrList)) return true; currentState = TAGPARSESTATE_IN_QTAG_SAW_QUESTION; } else if (thisChar == '>') { currentAttrList.add(new AttrNameValue(currentAttrName,attributeDecode(currentValueBuffer.toString()))); currentAttrName = null; currentValueBuffer = null; currentState = TAGPARSESTATE_NORMAL; if (noteTag(currentTagName,currentAttrList)) return true; currentTagName = null; currentAttrList = null; } else currentValueBuffer.append(thisChar); break; case TAGPARSESTATE_IN_UNQUOTED_ATTR_VALUE_SAW_SLASH: if (isWhitespace(thisChar)) { currentValueBuffer.append('/'); currentAttrList.add(new AttrNameValue(currentAttrName,attributeDecode(currentValueBuffer.toString()))); currentAttrName = null; currentValueBuffer = null; currentState = TAGPARSESTATE_IN_ATTR_NAME; currentAttrNameBuffer = newBuffer(); } else if (thisChar == '/') { currentValueBuffer.append('/'); } else if (thisChar == '>') { currentValueBuffer.append('/'); currentAttrList.add(new AttrNameValue(currentAttrName,attributeDecode(currentValueBuffer.toString()))); currentAttrName = null; currentValueBuffer = null; currentState = TAGPARSESTATE_NORMAL; if (noteTag(currentTagName,currentAttrList)) return true; currentTagName = null; currentAttrList = null; } else { currentValueBuffer.append('/'); currentValueBuffer.append(thisChar); currentState = TAGPARSESTATE_IN_UNQUOTED_ATTR_VALUE; } break; case TAGPARSESTATE_IN_UNQUOTED_ATTR_VALUE: if (isWhitespace(thisChar)) { currentAttrList.add(new AttrNameValue(currentAttrName,attributeDecode(currentValueBuffer.toString()))); currentAttrName = null; currentValueBuffer = null; currentState = TAGPARSESTATE_IN_ATTR_NAME; currentAttrNameBuffer = newBuffer(); } else if (thisChar == '/') { currentState = TAGPARSESTATE_IN_UNQUOTED_ATTR_VALUE_SAW_SLASH; } else if (thisChar == '>') { currentAttrList.add(new AttrNameValue(currentAttrName,attributeDecode(currentValueBuffer.toString()))); currentAttrName = null; currentValueBuffer = null; currentState = TAGPARSESTATE_NORMAL; if (noteTag(currentTagName,currentAttrList)) return true; currentTagName = null; currentAttrList = null; } else currentValueBuffer.append(thisChar); break; default: throw new ManifoldCFException("Invalid state: "+Integer.toString(currentState)); } return false; } /** Allocate the buffer. */ protected StringBuilder newBuffer() { accumBuffer.setLength(0); return accumBuffer; } /** Interpret ampersand buffer. */ protected boolean outputAmpBuffer() throws ManifoldCFException { if (ampBuffer.length() == 0 || (ampBuffer.length() == 1 && ampBuffer.charAt(0) == ';')) { // Length is zero; probably a mistake, so just output the whole thing if (noteNormalCharacter('&')) return true; if (dumpValues(ampBuffer.toString())) return true; return false; } else { // Is it a known entity? String entity = ampBuffer.toString(); if (entity.endsWith(";")) entity = entity.substring(0,entity.length()-1); String replacement = mapChunk(entity); if (replacement != null) { if (dumpValues(replacement)) return true; } return false; } } protected boolean dumpValues(String value) throws ManifoldCFException { for (int i = 0; i < value.length(); i++) { if (noteNormalCharacter(value.charAt(i))) return true; } return false; } /** This method gets called for every tag. Override this method to intercept tag begins. *@return true to halt further processing. */ protected boolean noteTag(String tagName, List<AttrNameValue> attributes) throws ManifoldCFException { if (Logging.misc.isDebugEnabled()) Logging.misc.debug(" Saw tag '"+tagName+"'"); return false; } /** This method gets called for every end tag. Override this method to intercept tag ends. *@return true to halt further processing. */ protected boolean noteEndTag(String tagName) throws ManifoldCFException { if (Logging.misc.isDebugEnabled()) Logging.misc.debug(" Saw end tag '"+tagName+"'"); return false; } /** This method is called for every <? ... ?> construct, or 'qtag'. * Override it to intercept such constructs. *@return true to halt further processing. */ protected boolean noteQTag(String tagName, List<AttrNameValue> attributes) throws ManifoldCFException { if (Logging.misc.isDebugEnabled()) Logging.misc.debug(" Saw QTag '"+tagName+"'"); return false; } /** This method is called for every <! <token> ... > construct, or 'btag'. * Override it to intercept these. *@return true to halt further processing. */ protected boolean noteBTag(String tagName) throws ManifoldCFException { if (Logging.misc.isDebugEnabled()) Logging.misc.debug(" Saw BTag '"+tagName+"'"); return false; } /** This method is called for the end of every btag, or any time * there's a naked '>' in the document. Override it if you want to intercept these. *@return true to halt further processing. */ protected boolean noteEndBTag() throws ManifoldCFException { Logging.misc.debug(" Saw end BTag"); return false; } /** Called for the start of every cdata-like tag, e.g. <![ <token> [ ... ]]> *@param token may be empty!!! *@return true to halt further processing. */ protected boolean noteEscaped(String token) throws ManifoldCFException { if (Logging.misc.isDebugEnabled()) Logging.misc.debug(" Saw escaped block '"+token+"'"); return false; } /** Called for the end of every cdata-like tag. *@return true to halt further processing. */ protected boolean noteEndEscaped() throws ManifoldCFException { Logging.misc.debug(" Saw end of escaped block"); return false; } /** This method gets called for every token inside a btag. *@return true to halt further processing. */ protected boolean noteBTagToken(String token) throws ManifoldCFException { return false; } /** This method gets called for every character that is not part of a tag etc. * Override this method to intercept such characters. *@return true to halt further processing. */ protected boolean noteNormalCharacter(char thisChar) throws ManifoldCFException { return false; } /** This method gets called for every character that is found within an * escape block, e.g. CDATA. * Override this method to intercept such characters. *@return true to halt further processing. */ protected boolean noteEscapedCharacter(char thisChar) throws ManifoldCFException { return false; } /** Decode an html attribute */ protected static String attributeDecode(String input) { StringBuilder output = new StringBuilder(); int i = 0; while (i < input.length()) { char x = input.charAt(i++); if (x == '&') { int index = input.indexOf(";",i); if (index != -1) { String chunk = input.substring(i,index); String replacement = mapChunk(chunk); if (replacement != null) { output.append(replacement); i = index + 1; continue; } } } output.append(x); } return output.toString(); } /** Map an entity reference back to a character */ protected static String mapChunk(String input) { if (input.startsWith("#")) { // Treat as a decimal value try { input = input.substring(1); int value; if (input.startsWith("x")) { // Hex value = Integer.decode("0"+input); } else { // Decimal value = Integer.parseInt(input); } StringBuilder sb = new StringBuilder(); sb.append((char)value); return sb.toString(); } catch (NumberFormatException e) { return null; } } else return mapLookup.get(input); } /** Is a character markup language whitespace? */ protected static boolean isWhitespace(char x) { return x <= ' '; } /** Is a character markup language punctuation? */ protected static boolean isPunctuation(char x) { return x == '%' || x == '|' || x == '&' || x == '!' || x == '^' || x == ',' || x == ';' || x == '[' || x == ']' || x == '(' || x == ')' || x == ':' || x == '/' || x == '\\' || x == '+' || x == '='; } }