/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.wicket.protocol.http.documentvalidation;

import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Pattern;

import org.apache.wicket.util.string.Strings;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Lightweight document parser for HTML. This parser is only intended to process well formed and
 * simple HTML of the kind that would generally be utilized during testing.
 * <p>
 * The parser is a pull tokenizer: each call to {@link #getNextToken()} consumes one token and
 * returns its type; the matching getter ({@link #getTag()}, {@link #getAttributes()},
 * {@link #getComment()} or {@link #getText()}) then exposes the content of that token. Getters
 * that do not belong to the most recent token keep whatever value an earlier token left behind.
 * <p>
 * Known limitation: attributes are separated on single spaces, so attribute values that
 * themselves contain spaces are not parsed as one value.
 *
 * @author Chris Turner
 * @deprecated Will be removed in Wicket 9.0 together with {@link HtmlDocumentValidator}
 */
@Deprecated
public class HtmlDocumentParser
{
	private static final Logger log = LoggerFactory.getLogger(HtmlDocumentParser.class);

	/** Constant for close tag token. */
	public static final int CLOSE_TAG = 4;

	/** Constant for comment token. */
	public static final int COMMENT = 1;

	/** Constant for end token. */
	public static final int END = 0;

	/** Constant for open tag token. */
	public static final int OPEN_TAG = 2;

	/** Constant for open/close tag token. */
	public static final int OPENCLOSE_TAG = 3;

	/** Constant for text token. */
	public static final int TEXT = 5;

	/** Constant for unknown token. */
	public static final int UNKNOWN = -1;

	/** Matches input that starts with a complete comment. */
	private static final Pattern COMMENT_PATTERN = Pattern.compile("<!--.*-->.*");

	/** Matches input that starts with a closing tag. */
	private static final Pattern CLOSE_TAG_PATTERN = Pattern.compile("</.*>.*");

	/**
	 * Matches input that starts with a self-closing tag. The tag name part excludes '&gt;' so that
	 * the match cannot run past the end of the first tag into a later self-closing tag.
	 */
	private static final Pattern OPENCLOSE_TAG_PATTERN = Pattern.compile("<[^/>]+[^>]*/>.*");

	/** Matches input that starts with an opening tag. */
	private static final Pattern OPEN_TAG_PATTERN = Pattern.compile("<[^/>]+.*>.*");

	/** Maximum number of characters of offending markup quoted in the error log. */
	private static final int MAX_ERROR_SNIPPET = 30;

	/** Attributes of the most recently parsed open or open/close tag. */
	private Map<String, String> attributes;

	/** Content of the most recently parsed comment. */
	private String comment;

	/** The document being parsed, with line breaks removed and tabs replaced by spaces. */
	private final String document;

	/** Index into {@link #document} of the next character to be tokenized. */
	private int pos;

	/** Lower-cased name of the most recently parsed tag. */
	private String tag;

	/** Content of the most recently parsed text token. */
	private String text;

	/**
	 * Create the parser for the current document. Line feeds and carriage returns are removed and
	 * tabs are replaced by single spaces before parsing starts.
	 *
	 * @param document
	 *            The document to parse
	 */
	public HtmlDocumentParser(final String document)
	{
		CharSequence tmp = Strings.replaceAll(document, "\n", "");
		tmp = Strings.replaceAll(tmp, "\r", "");
		this.document = Strings.replaceAll(tmp, "\t", " ").toString();
		pos = 0;
	}

	/**
	 * Get the attributes of the tag.
	 *
	 * @return The attributes of the last open or open/close tag, keyed by lower-cased name;
	 *         <code>null</code> if no such tag has been parsed yet
	 */
	public Map<String, String> getAttributes()
	{
		return attributes;
	}

	/**
	 * Get the comment.
	 *
	 * @return The trimmed content of the last comment; <code>null</code> if none has been parsed
	 *         yet
	 */
	public String getComment()
	{
		return comment;
	}

	/**
	 * Iterates through the document searching for tokens. Returns the type of token that was found.
	 * <p>
	 * If unexpected markup is encountered, the fact is logged as an error and {@link #UNKNOWN} is
	 * returned. The parse position is <em>not</em> advanced in that case, so further calls keep
	 * returning {@link #UNKNOWN}; callers should stop parsing when they receive it.
	 *
	 * @return The token that was found, or {@link #END} once the whole document has been consumed
	 */
	public int getNextToken()
	{
		if (pos >= document.length())
		{
			return END;
		}
		return (document.charAt(pos) == '<') ? processDirective() : processText();
	}

	/**
	 * Get the tag name.
	 *
	 * @return The lower-cased name of the last tag; <code>null</code> if none has been parsed yet
	 */
	public String getTag()
	{
		return tag;
	}

	/**
	 * Get the text.
	 *
	 * @return The content of the last text token; <code>null</code> if none has been parsed yet
	 */
	public String getText()
	{
		return text;
	}

	/**
	 * Extract attributes from the given string. Elements are separated by single spaces; each
	 * element is either <code>name</code> (stored with an empty value) or <code>name=value</code>,
	 * where one leading and one trailing double quote are stripped from the value. Names are
	 * lower-cased.
	 *
	 * @param attributeString
	 *            The string
	 * @return The map of attributes
	 */
	private Map<String, String> extractAttributes(String attributeString)
	{
		final Map<String, String> result = new HashMap<String, String>();
		attributeString = Strings.replaceAll(attributeString.trim(), "\t", " ").toString();
		attributeString = Strings.replaceAll(attributeString, " = ", "=").toString();

		for (String attributeElement : Strings.split(attributeString, ' '))
		{
			final String[] bits = Strings.split(attributeElement, '=');
			if (bits.length == 0)
			{
				// Defensive: an empty element (e.g. produced by consecutive spaces) may split
				// into no parts at all; there is nothing to record for it.
				continue;
			}
			if (bits.length == 1)
			{
				result.put(bits[0].trim().toLowerCase(Locale.ROOT), "");
				continue;
			}

			// The value may itself contain '=' characters, so re-join everything after the name.
			final StringBuilder joined = new StringBuilder();
			for (int j = 1; j < bits.length; j++)
			{
				if (j > 1)
				{
					joined.append('=');
				}
				joined.append(bits[j]);
			}

			String value = joined.toString().trim();
			if (value.startsWith("\""))
			{
				value = value.substring(1);
			}
			if (value.endsWith("\""))
			{
				value = value.substring(0, value.length() - 1);
			}
			result.put(bits[0].trim().toLowerCase(Locale.ROOT), value);
		}
		return result;
	}

	/**
	 * Process a directive starting at the current position. Recognizes, in this order, a comment,
	 * a closing tag, an open/close (self-closing) tag and an opening tag, stores the extracted
	 * content and moves the position past the directive.
	 *
	 * @return The token found, or {@link #UNKNOWN} (position unchanged) if the markup is not
	 *         recognized
	 */
	private int processDirective()
	{
		final String part = document.substring(pos);

		if (COMMENT_PATTERN.matcher(part).matches())
		{
			// Search from index 4 so that the dashes of the opening "<!--" are never mistaken
			// for the start of the terminator (e.g. in "<!-->...-->").
			final int end = part.indexOf("-->", 4);
			comment = part.substring(4, end).trim();
			pos += end + 3;
			return COMMENT;
		}

		if (CLOSE_TAG_PATTERN.matcher(part).matches())
		{
			final int end = part.indexOf('>');
			tag = part.substring(2, end).trim().toLowerCase(Locale.ROOT);
			pos += end + 1;
			return CLOSE_TAG;
		}

		if (OPENCLOSE_TAG_PATTERN.matcher(part).matches())
		{
			final int end = part.indexOf("/>");
			readTagContent(part.substring(1, end));
			pos += end + 2;
			return OPENCLOSE_TAG;
		}

		if (OPEN_TAG_PATTERN.matcher(part).matches())
		{
			final int end = part.indexOf('>');
			readTagContent(part.substring(1, end));
			pos += end + 1;
			return OPEN_TAG;
		}

		final String snippet = part.substring(0, Math.min(part.length(), MAX_ERROR_SNIPPET));
		log.error("Unexpected markup found: {}...", snippet);
		return UNKNOWN;
	}

	/**
	 * Split the inside of a tag into its name and attributes and store both. The name ends at the
	 * first space inside the tag; a tag without any space has no attributes.
	 *
	 * @param content
	 *            The text between the leading '&lt;' and the closing '&gt;' or '/&gt;'
	 */
	private void readTagContent(final String content)
	{
		final int nameEnd = content.indexOf(' ');
		if (nameEnd < 0)
		{
			tag = content.toLowerCase(Locale.ROOT);
			attributes = new HashMap<String, String>();
		}
		else
		{
			tag = content.substring(0, nameEnd).toLowerCase(Locale.ROOT);
			attributes = extractAttributes(content.substring(nameEnd));
		}
	}

	/**
	 * Process text up to the next token. Everything from the current position up to (but not
	 * including) the next '&lt;', or up to the end of the document, becomes the text.
	 *
	 * @return The token code
	 */
	private int processText()
	{
		if (pos >= document.length())
		{
			return END;
		}
		final int nextDirective = document.indexOf('<', pos);
		final int end = (nextDirective < 0) ? document.length() : nextDirective;
		text = document.substring(pos, end);
		pos = end;
		return TEXT;
	}
}