// HTMLParser Library $Name: v1_6_20060319 $ - A java-based parser for HTML // http://sourceforge.org/projects/htmlparser // Copyright (C) 2003 Somik Raha // // Revision Control Information // // $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/ScriptScanner.java,v $ // $Author: derrickoswald $ // $Date: 2005/03/12 17:53:10 $ // $Revision: 1.63 $ // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // package org.htmlparser.scanners; import java.util.Vector; import org.htmlparser.Attribute; import org.htmlparser.Node; import org.htmlparser.NodeFactory; import org.htmlparser.PrototypicalNodeFactory; import org.htmlparser.Remark; import org.htmlparser.Tag; import org.htmlparser.Text; import org.htmlparser.lexer.Cursor; import org.htmlparser.lexer.Lexer; import org.htmlparser.lexer.Page; import org.htmlparser.scanners.ScriptDecoder; import org.htmlparser.tags.ScriptTag; import org.htmlparser.util.NodeList; import org.htmlparser.util.ParserException; /** * The ScriptScanner handles script CDATA. */ public class ScriptScanner extends CompositeTagScanner { /** * Strict parsing of CDATA flag. * If this flag is set true, the parsing of script is performed without * regard to quotes. This means that erroneous script such as: * <pre> * document.write("</script>"); * </pre> * will be parsed in strict accordance with appendix * <a href="http://www.w3.org/TR/html4/appendix/notes.html#notes-specifying-data"> * B.3.2 Specifying non-HTML data</a> of the * <a href="http://www.w3.org/TR/html4/">HTML 4.01 Specification</a> and * hence will be split into two or more nodes. Correct javascript would * escape the ETAGO: * <pre> * document.write("<\/script>"); * </pre> * If true, CDATA parsing will stop at the first ETAGO ("</") no matter * whether it is quoted or not. If false, balanced quotes (either single or * double) will shield an ETAGO. Beacuse of the possibility of quotes within * single or multiline comments, these are also parsed. In most cases, * users prefer non-strict handling since there is so much broken script * out in the wild. */ public static boolean STRICT = false; /** * Create a script scanner. */ public ScriptScanner() { } /** * Scan for script. * Accumulates text from the page, until </[a-zA-Z] is encountered. * @param tag The tag this scanner is responsible for. * @param lexer The source of CDATA. * @param stack The parse stack, <em>not used</em>. */ public Tag scan (Tag tag, Lexer lexer, NodeList stack) throws ParserException { String language; String code; Node content; int position; Node node; Attribute attribute; Vector vector; if (tag instanceof ScriptTag) { language = ((ScriptTag)tag).getLanguage (); if ((null != language) && (language.equalsIgnoreCase ("JScript.Encode") || language.equalsIgnoreCase ("VBScript.Encode"))) { code = ScriptDecoder.Decode (lexer.getPage (), lexer.getCursor ()); ((ScriptTag)tag).setScriptCode (code); } } content = lexer.parseCDATA (!STRICT); position = lexer.getPosition (); node = lexer.nextNode (false); if (null != node) if (!(node instanceof Tag) || !( ((Tag)node).isEndTag () && ((Tag)node).getTagName ().equals (tag.getIds ()[0]))) { lexer.setPosition (position); node = null; } // build new end tag if required if (null == node) { attribute = new Attribute ("/script", null); vector = new Vector (); vector.addElement (attribute); node = lexer.getNodeFactory ().createTagNode ( lexer.getPage (), position, position, vector); } tag.setEndTag ((Tag)node); if (null != content) { tag.setChildren (new NodeList (content)); content.setParent (tag); } node.setParent (tag); tag.doSemanticAction (); return (tag); } }