/*
 *  This file is part of the Wayback archival access software
 *   (http://archive-access.sourceforge.net/projects/wayback/).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual
 *  contributors.
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.archive.wayback.util.htmllex;

import org.htmlparser.Node;
import org.htmlparser.lexer.Lexer;
import org.htmlparser.util.ParserException;

/**
 * Wrapper around an htmlparser {@link Lexer} that keeps track of SCRIPT and
 * STYLE elements while handing out nodes.
 *
 * <p>The Lexer that comes with htmlparser does not handle non-escaped HTML
 * entities within SCRIPT tags. By default, something like:
 *
 * <pre>{@code
 * <script>
 *   for(var i=0; i<23; i++) { j+=i; }
 * </script>
 * }</pre>
 *
 * can cause the lexer to skip over a large part of the document. Technically
 * the above is not legitimate HTML, but documents like it are common.
 *
 * <p>To cope with that, this class is given a {@link ParseContext} at
 * construction. It observes SCRIPT and STYLE tags as they go by, records
 * what it saw on the ParseContext, and uses that recorded state to issue a
 * {@code parseCDATA()} call instead of a {@code nextNode()} call at the
 * right time, to try to keep the parsing in sync with the document.
 *
 * @author brad
 */
public class ContextAwareLexer extends NodeUtils {

	/** Underlying lexer every node is read from. */
	private final Lexer lexer;

	/** Shared state recording whether we are inside SCRIPT / STYLE content. */
	private final ParseContext context;

	/**
	 * @param lexer   the lexer to read nodes from
	 * @param context the state object updated as SCRIPT and STYLE tags are
	 *                encountered
	 */
	public ContextAwareLexer(Lexer lexer, ParseContext context) {
		this.lexer = lexer;
		this.context = context;
	}

	/**
	 * Returns the next node from the wrapped lexer, updating the
	 * ParseContext along the way.
	 *
	 * <p>While the context says we are in JS, or in script text, the content
	 * is first requested via {@code parseCDATA(true)}. If that yields a
	 * node it is returned directly; in the in-JS case the context is also
	 * switched from "in JS" to "in script text". If it yields nothing, the
	 * call falls through to a regular {@code nextNode(...)} read, whose
	 * result is inspected for SCRIPT / STYLE open and close tags.
	 *
	 * @return the next node, or {@code null} when the wrapped lexer returns
	 *         {@code null}
	 * @throws ParserException if the wrapped lexer throws it
	 */
	public Node nextNode() throws ParserException {
		final boolean justOpenedScript = context.isInJS();

		// Script bodies are attempted as one raw CDATA chunk before falling
		// back to ordinary tokenizing.
		if (justOpenedScript || context.isInScriptText()) {
			final Node cdata = lexer.parseCDATA(true);
			if (cdata != null) {
				if (justOpenedScript) {
					// Body consumed: from here on we are in the script's
					// text, no longer directly after its open tag.
					context.setInScriptText(true);
					context.setInJS(false);
				}
				return cdata;
			}
		}

		final Node next = lexer.nextNode(context.isInJS());
		if (next == null) {
			return null;
		}
		trackElementState(next);
		return next;
	}

	/**
	 * Updates the ParseContext flags when {@code node} is an opening or
	 * closing SCRIPT or STYLE tag; any other node leaves the context alone.
	 */
	private void trackElementState(Node node) {
		if (isNonEmptyOpenTagNodeNamed(node, SCRIPT_TAG_NAME)) {
			context.setInJS(true);
			return;
		}
		if (isCloseTagNodeNamed(node, SCRIPT_TAG_NAME)) {
			context.setInJS(false);
			context.setInScriptText(false);
			return;
		}
		if (isNonEmptyOpenTagNodeNamed(node, STYLE_TAG_NAME)) {
			context.setInCSS(true);
			return;
		}
		if (isCloseTagNodeNamed(node, STYLE_TAG_NAME)) {
			context.setInCSS(false);
		}
	}
}