/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.wicket.protocol.http.documentvalidation;
import java.util.HashMap;
import java.util.Map;
import org.apache.wicket.util.string.Strings;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Lightweight document parser for HTML. This parser is only intended to process well formed and
* simple HTML of the kind that would generally be utilized during testing.
*
* @author Chris Turner
* @deprecated Will be removed in Wicket 9.0 together with {@link HtmlDocumentValidator}
*/
@Deprecated
public class HtmlDocumentParser
{
private static final Logger log = LoggerFactory.getLogger(HtmlDocumentParser.class);
/** Constant for close tag token. */
public static final int CLOSE_TAG = 4;
/** Constant for comment token. */
public static final int COMMENT = 1;
/** Constant for end token. */
public static final int END = 0;
/** Constant for open tag token. */
public static final int OPEN_TAG = 2;
/** Constant for open/close tag token. */
public static final int OPENCLOSE_TAG = 3;
/** Constant for text token. */
public static final int TEXT = 5;
/** constant for unknown token. */
public static final int UNKNOWN = -1;
private Map<String, String> attributes;
/** Extracted content */
private String comment;
/** Document parse elements */
private final String document;
private int pos;
private String tag;
private String text;
/**
* Create the parser for the current document.
*
* @param document
* The document to parse
*/
public HtmlDocumentParser(final String document)
{
CharSequence tmp = Strings.replaceAll(document, "\n", "");
tmp = Strings.replaceAll(tmp, "\r", "");
this.document = Strings.replaceAll(tmp, "\t", " ").toString();
pos = 0;
}
/**
* Get the attributes of the tag.
*
* @return The attributes
*/
public Map<String, String> getAttributes()
{
return attributes;
}
/**
* Get the comment.
*
* @return The comment
*/
public String getComment()
{
return comment;
}
/**
* Iterates through the document searching for tokens. Returns the type of token that was found.
* If an unexpected token was encountered then the parser writes this fact to the console and
* continues
*
* @return The token that was found
*/
public int getNextToken()
{
if (pos < document.length())
{
char ch = document.charAt(pos);
if (ch == '<')
{
return processDirective();
}
else
{
return processText();
}
}
return END;
}
/**
* Get the tag name.
*
* @return The tag name
*/
public String getTag()
{
return tag;
}
/**
* Get the text.
*
* @return The text
*/
public String getText()
{
return text;
}
/**
* Extract attributes from the given string.
*
* @param attributeString
* The string
* @return The map of attributes
*/
private Map<String, String> extractAttributes(String attributeString)
{
Map<String, String> m = new HashMap<String, String>();
attributeString = Strings.replaceAll(attributeString.trim(), "\t", " ").toString();
attributeString = Strings.replaceAll(attributeString, " = ", "=").toString();
String[] attributeElements = Strings.split(attributeString, ' ');
for (String attributeElement : attributeElements)
{
String[] bits = Strings.split(attributeElement, '=');
if (bits.length == 1)
{
m.put(bits[0].trim().toLowerCase(), "");
}
else
{
bits[0] = bits[0].trim();
StringBuilder value = new StringBuilder();
for (int j = 1; j < bits.length; j++)
{
value.append(bits[j]);
if (j < (bits.length - 1))
{
value.append('=');
}
}
bits[1] = value.toString().trim();
if (bits[1].startsWith("\""))
{
bits[1] = bits[1].substring(1);
}
if (bits[1].endsWith("\""))
{
bits[1] = bits[1].substring(0, bits[1].length() - 1);
}
m.put(bits[0].toLowerCase(), bits[1]);
}
}
return m;
}
/**
* Process a directive starting at the current position.
*
* @return The token found
*/
private int processDirective()
{
String part = document.substring(pos);
if (part.matches("<!--.*-->.*"))
{
// This is a comment
comment = part.substring(4, part.indexOf("-->")).trim();
pos += part.indexOf("-->") + 3;
return COMMENT;
}
else if (part.matches("</.*>.*"))
{
// This is a closing tag
tag = part.substring(2, part.indexOf('>')).trim().toLowerCase();
pos += part.indexOf(">") + 1;
return CLOSE_TAG;
}
else if (part.matches("<[^/]+[^>]*/>.*"))
{
// This is an openclose tag
if (part.matches("<([a-zA-Z]+:)?[a-zA-Z]+/>.*"))
{
// No attributes
tag = part.substring(1, part.indexOf("/>")).toLowerCase();
attributes = new HashMap<String, String>();
}
else
{
// Attributes
tag = part.substring(1, part.indexOf(' ')).toLowerCase();
String attributeString = part.substring(part.indexOf(' '), part.indexOf("/>"));
attributes = extractAttributes(attributeString);
}
pos += part.indexOf("/>") + 2;
return OPENCLOSE_TAG;
}
else if (part.matches("<[^/>]+.*>.*"))
{
// This is an opening tag
if (part.matches("<([a-zA-Z]+:)?[a-zA-Z0-9]*>.*"))
{
// No attributes
tag = part.substring(1, part.indexOf('>')).toLowerCase();
attributes = new HashMap<String, String>();
}
else
{
// Attributes
tag = part.substring(1, part.indexOf(' ')).toLowerCase();
String attributeString = part.substring(part.indexOf(' '), part.indexOf('>'));
attributes = extractAttributes(attributeString);
}
pos += part.indexOf(">") + 1;
return OPEN_TAG;
}
else
{
int size = (part.length() > 30) ? 30 : part.length();
log.error("Unexpected markup found: " + part.substring(0, size) + "...");
return UNKNOWN;
}
}
/**
* Process text up to the next token.
*
* @return The token code
*/
private int processText()
{
StringBuilder buf = new StringBuilder();
while (pos < document.length())
{
char ch = document.charAt(pos);
if (ch == '<')
{
text = buf.toString();
return TEXT;
}
else
{
buf.append(ch);
}
pos++;
}
if (buf.length() > 0)
{
text = buf.toString();
return TEXT;
}
return END;
}
}